def _update_type(df, added_cols):
    """Infer and record column types for the columns named in ``added_cols``.

    Each listed column is parsed with ``pd.to_numeric(..., errors='coerce')``
    and classified from the outcome:

    * more than 90% of the rows fail to parse: categorical data or text
      (decided by ``HelperFunction.is_categorical``), structural type ``str``;
    * otherwise, more than 90% of the rows are whole numbers: integer,
      structural type ``int``;
    * otherwise: float, structural type ``float``.

    The chosen semantic type replaces whatever was recorded before and is
    always paired with the ``Attribute`` semantic type.

    Args:
        df: frame exposing ``columns``, ``iloc`` and a ``metadata`` object
            with ``query``/``update`` (presumably a d3m DataFrame -- confirm
            against the caller).
        added_cols: iterable of column labels to re-type.

    Returns:
        The same ``df`` object, with ``df.metadata`` rebound to the updated
        metadata.
    """
    categorical_uri = "https://metadata.datadrivendiscovery.org/types/CategoricalData"
    attribute_uri = "https://metadata.datadrivendiscovery.org/types/Attribute"

    # Resolve every label to its position before any metadata is modified.
    positions = [df.columns.get_loc(label) for label in added_cols]

    for position in positions:
        selector = (mbase.ALL_ELEMENTS, position)
        column_meta = dict(df.metadata.query(selector))

        # Unparseable entries become NaN, so the NaN share measures how
        # non-numeric the column is.
        parsed = pd.to_numeric(df.iloc[:, position], errors='coerce')
        row_count = parsed.shape[0]
        unparsed = parsed.isnull().sum()

        if unparsed / row_count > 0.9:
            if HelperFunction.is_categorical(df.iloc[:, position]):
                semantic = categorical_uri
            else:
                semantic = "http://schema.org/Text"
            structural = str
        else:
            # NaN % 1 is NaN, so rows that failed to parse never count as whole.
            whole = (parsed % 1) == 0
            if np.sum(whole) / row_count > 0.9:
                semantic, structural = "http://schema.org/Integer", int
            else:
                semantic, structural = "http://schema.org/Float", float

        column_meta['semantic_types'] = (semantic, attribute_uri)
        column_meta['structural_type'] = structural
        df.metadata = df.metadata.update(selector, column_meta)

    return df
def update_types(self, col_name):
    """Re-derive the semantic types recorded for column ``col_name`` of ``self.df``.

    The column is parsed with ``pd.to_numeric(..., errors='coerce')``. When
    more than 90% of the rows fail to parse it is labelled categorical data
    or text (decided by ``HelperFunction.is_categorical``); otherwise it is
    labelled integer when more than 90% of the rows are whole numbers, and
    float in the remaining case. The ``Attribute`` semantic type is always
    appended. Only ``semantic_types`` is rewritten here; ``structural_type``
    is left as found.

    Args:
        col_name: label of the column in ``self.df`` to re-type.

    Returns:
        None. ``self.df.metadata`` is rebound to the updated metadata.
    """
    position = self.df.columns.get_loc(col_name)
    selector = (mbase.ALL_ELEMENTS, position)
    column_meta = dict(self.df.metadata.query(selector))

    # Unparseable entries become NaN under errors='coerce'.
    parsed = pd.to_numeric(self.df[col_name], errors='coerce')
    row_count = parsed.shape[0]
    unparsed = parsed.isnull().sum()

    if unparsed / row_count > 0.9:
        if HelperFunction.is_categorical(self.df[col_name]):
            semantic = "https://metadata.datadrivendiscovery.org/types/CategoricalData"
        else:
            semantic = "http://schema.org/Text"
    elif np.sum((parsed % 1) == 0) / row_count > 0.9:
        semantic = "http://schema.org/Integer"
    else:
        semantic = "http://schema.org/Float"

    column_meta['semantic_types'] = (
        semantic,
        "https://metadata.datadrivendiscovery.org/types/Attribute",
    )
    self.df.metadata = self.df.metadata.update(selector, column_meta)
def _relabel_categorical(inputs: Input) -> Output:
    """Drop the categorical label from columns that fail the categorical check.

    For every column whose recorded semantic types include ``CategoricalData``
    but for which ``HelperFunction.is_categorical`` returns a falsy value, the
    label is removed and a replacement is added (unless already present):

    * ``Text`` when more than 90% of the rows fail ``pd.to_numeric``;
    * ``Integer`` when, otherwise, more than 90% of the rows are whole numbers;
    * ``Float`` in the remaining case.

    Neither the column data nor ``structural_type`` is changed. Every column's
    metadata is written back, whether or not it was modified.

    Args:
        inputs: frame exposing ``shape``, ``iloc`` and a ``metadata`` object
            with ``query``/``update``.

    Returns:
        The same ``inputs`` object, with ``inputs.metadata`` rebound.
    """
    categorical_uri = 'https://metadata.datadrivendiscovery.org/types/CategoricalData'

    for position in range(inputs.shape[1]):
        selector = (mbase.ALL_ELEMENTS, position)
        column_meta = dict(inputs.metadata.query(selector))
        recorded = column_meta.get('semantic_types', [])

        # Short-circuit keeps the categorical check off columns that were
        # never labelled categorical.
        mislabelled = (
            categorical_uri in recorded
            and not HelperFunction.is_categorical(inputs.iloc[:, position])
        )
        if mislabelled:
            kept = tuple(
                label for label in column_meta['semantic_types']
                if label != categorical_uri
            )

            # Unparseable entries become NaN under errors='coerce'.
            parsed = pd.to_numeric(inputs.iloc[:, position], errors='coerce')
            row_count = parsed.shape[0]

            if parsed.isnull().sum() / row_count > 0.9:
                replacement = "http://schema.org/Text"
            elif np.sum((parsed % 1) == 0) / row_count > 0.9:
                replacement = "http://schema.org/Integer"
            else:
                replacement = "http://schema.org/Float"
            # NOTE: the numeric cases deliberately leave structural_type and
            # the column values untouched (the original had that code
            # commented out).

            if replacement not in kept:
                kept += (replacement,)
            column_meta['semantic_types'] = kept

        inputs.metadata = inputs.metadata.update(selector, column_meta)

    return inputs
def _update_structural_type(self):
    """Align the data and metadata of ``self._input_data_copy`` column by column.

    Columns that already carry semantic types:
        if ``Integer`` (checked first) or ``Float`` is among them, the column
        values are replaced by ``pd.to_numeric(..., errors='coerce')`` and
        ``structural_type`` is set to ``int`` / ``float``. Other columns are
        left as they are.

    Columns with no (or empty) semantic types:
        the type is inferred. More than 90% unparseable rows gives
        categorical data or text (decided by
        ``HelperFunction.is_categorical``) with the data untouched. Otherwise
        the column is labelled integer (more than 90% whole-number rows) or
        float, ``structural_type`` is set to match, and the values are
        replaced by the parsed numbers. ``Attribute`` is appended in every
        inferred case.

    Every column's metadata is written back, whether or not it was modified.

    Returns:
        None. ``self._input_data_copy`` is modified in place and its
        ``metadata`` attribute rebound.
    """
    integer_uri = "http://schema.org/Integer"
    float_uri = "http://schema.org/Float"
    attribute_uri = "https://metadata.datadrivendiscovery.org/types/Attribute"

    frame = self._input_data_copy

    for position in range(frame.shape[1]):
        selector = (mbase.ALL_ELEMENTS, position)
        column_meta = dict(frame.metadata.query(selector))
        declared = column_meta.get('semantic_types', None)

        if declared:
            # Trust the declared numeric type; Integer wins if both appear.
            if integer_uri in declared:
                structural = int
            elif float_uri in declared:
                structural = float
            else:
                structural = None

            if structural is not None:
                frame.iloc[:, position] = pd.to_numeric(
                    frame.iloc[:, position], errors='coerce')
                column_meta['structural_type'] = structural
        else:
            # Nothing declared: infer the type from how the values parse.
            parsed = pd.to_numeric(frame.iloc[:, position], errors='coerce')
            row_count = parsed.shape[0]
            unparsed = parsed.isnull().sum()

            if unparsed / row_count > 0.9:
                if HelperFunction.is_categorical(frame.iloc[:, position]):
                    semantic = "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                else:
                    semantic = "http://schema.org/Text"
                column_meta['semantic_types'] = (semantic, attribute_uri)
            else:
                if np.sum((parsed % 1) == 0) / row_count > 0.9:
                    semantic, structural = integer_uri, int
                else:
                    semantic, structural = float_uri, float
                column_meta['semantic_types'] = (semantic, attribute_uri)
                column_meta['structural_type'] = structural
                # Store the parsed numbers; unparseable entries become NaN.
                frame.iloc[:, position] = parsed

        frame.metadata = frame.metadata.update(selector, column_meta)
def update_type(extends, df_origin):
    """Append the columns in ``extends`` to ``df_origin`` and type them.

    ``extends`` is turned into a frame via ``pd.DataFrame.from_dict``, wrapped
    as a ``d3m_DataFrame`` with generated metadata, given a copy of
    ``df_origin``'s index (unless ``extends`` equals ``{}``) and appended with
    ``d3m_DataFrame.append_columns``. Each appended column is then parsed with
    ``pd.to_numeric(..., errors='coerce')`` and labelled:

    * more than 90% of the rows fail to parse: categorical data or text
      (decided by ``HelperFunction.is_categorical``);
    * otherwise, more than 90% of the rows are whole numbers: integer;
    * otherwise: float.

    The ``Attribute`` semantic type is always appended. Only
    ``semantic_types`` is rewritten; ``structural_type`` is left as generated.

    Args:
        extends: mapping of new column label to column values.
        df_origin: frame the new columns are appended to.

    Returns:
        The frame produced by ``append_columns``, with updated metadata.
    """
    attribute_uri = "https://metadata.datadrivendiscovery.org/types/Attribute"

    extension = d3m_DataFrame(pd.DataFrame.from_dict(extends),
                              generate_metadata=True)
    if extends != {}:
        # Share the original's index so the new columns line up with its rows.
        extension.index = df_origin.index.copy()

    combined = d3m_DataFrame.append_columns(df_origin, extension)

    # Resolve every new label to its position in the combined frame first.
    positions = [combined.columns.get_loc(label) for label in extends]

    for position in positions:
        selector = (mbase.ALL_ELEMENTS, position)
        column_meta = dict(combined.metadata.query(selector))

        # Unparseable entries become NaN under errors='coerce'.
        parsed = pd.to_numeric(combined.iloc[:, position], errors='coerce')
        row_count = parsed.shape[0]
        unparsed = parsed.isnull().sum()

        if unparsed / row_count > 0.9:
            if HelperFunction.is_categorical(combined.iloc[:, position]):
                semantic = "https://metadata.datadrivendiscovery.org/types/CategoricalData"
            else:
                semantic = "http://schema.org/Text"
        elif np.sum((parsed % 1) == 0) / row_count > 0.9:
            semantic = "http://schema.org/Integer"
        else:
            semantic = "http://schema.org/Float"

        column_meta['semantic_types'] = (semantic, attribute_uri)
        combined.metadata = combined.metadata.update(selector, column_meta)

    return combined