def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    categorical_attributes = common_utils.list_columns_with_semantic_types(
        metadata=self._training_data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData"
        ])
    all_attributes = common_utils.list_columns_with_semantic_types(
        metadata=self._training_data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"
        ])
    self._s_cols = container.List(
        set(all_attributes).intersection(categorical_attributes))
    print("[INFO] %d categorical attributes found." % len(self._s_cols))

    if len(self._s_cols) > 0:
        # Fit one LabelEncoder per categorical column, keyed by column name
        temp_model = defaultdict(LabelEncoder)
        self._training_data.iloc[:, self._s_cols].apply(
            lambda x: temp_model[x.name].fit(x))
        self._model = dict(temp_model)
        self._fitted = True
    else:
        self._fitted = False

    return CallResult(None, has_finished=self._fitted)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    self._fitted = True
    categorical_attributes = common_utils.list_columns_with_semantic_types(
        metadata=self._training_data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData"
        ])
    all_attributes = common_utils.list_columns_with_semantic_types(
        metadata=self._training_data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"
        ])
    self._s_cols = container.List(
        set(all_attributes).intersection(categorical_attributes))
    _logger.debug("%d categorical attributes found.", len(self._s_cols))

    if len(self._s_cols) > 0:
        # temp_model = defaultdict(LabelEncoder)
        # self._training_data.iloc[:, self._s_cols].apply(lambda x: temp_model[x.name].fit(x))
        # self._model = dict(temp_model)
        # Instead of fitting LabelEncoders (see the variant above), record the
        # unique non-null values of each categorical column.
        self._model = {}
        for col_index in self._s_cols:
            self._model[col_index] = \
                self._training_data.iloc[:, col_index].dropna().unique()

    return CallResult(None, has_finished=True)
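# A minimal, self-contained sketch of the two fitting strategies above, using
# made-up column data (the DataFrame and column names are illustrative, not
# from the source): per-column sklearn LabelEncoders vs. simply recording each
# column's unique non-null values.
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", None]})

# Strategy 1: one LabelEncoder per column, keyed by column name.
encoders = defaultdict(LabelEncoder)
df.dropna().apply(lambda col: encoders[col.name].fit(col))
print(encoders["color"].classes_)   # ['blue' 'red']

# Strategy 2: record unique non-null values per column index.
uniques = {i: df.iloc[:, i].dropna().unique() for i in range(df.shape[1])}
print(uniques[1])                   # ['S' 'M']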
def __get_fitted(self):
    attribute = utils.list_columns_with_semantic_types(
        self._train_x.metadata,
        ['https://metadata.datadrivendiscovery.org/types/Attribute'])

    # Mean for numerical columns
    self._numeric_columns = utils.list_columns_with_semantic_types(
        self._train_x.metadata,
        ['http://schema.org/Integer', 'http://schema.org/Float'])
    self._numeric_columns = [x for x in self._numeric_columns if x in attribute]
    _logger.debug('numeric columns %s', str(self._numeric_columns))

    # Convert selected columns with to_numeric, compute each column's mean,
    # then convert to a dict keyed by column name
    self.mean_values = self._train_x.iloc[:, self._numeric_columns].apply(
        lambda col: pd.to_numeric(col, errors='coerce')).mean(axis=0).to_dict()
    for name in self.mean_values.keys():
        if pd.isnull(self.mean_values[name]):
            self.mean_values[name] = 0.0

    # Mode for categorical columns
    self._categoric_columns = utils.list_columns_with_semantic_types(
        self._train_x.metadata,
        ['https://metadata.datadrivendiscovery.org/types/CategoricalData',
         'http://schema.org/Boolean'])
    self._categoric_columns = [x for x in self._categoric_columns if x in attribute]
    _logger.debug('categorical columns %s', str(self._categoric_columns))

    mode_values = self._train_x.iloc[:, self._categoric_columns].mode(
        axis=0).iloc[0].to_dict()
    for name in mode_values.keys():
        if pd.isnull(mode_values[name]):
            # mode is nan
            rest = self._train_x[name].dropna()
            if rest.shape[0] == 0:
                # every value is nan
                mode = 0
            else:
                mode = rest.mode().iloc[0]
            mode_values[name] = mode
    self.mean_values.update(mode_values)

    if self._verbose:
        import pprint
        print('mean imputation:')
        pprint.pprint(self.mean_values)

    _logger.debug('Mean values:')
    for name, value in self.mean_values.items():
        _logger.debug('  %s %s', name, str(value))
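# A toy illustration (made-up data) of the mean/mode bookkeeping above: means
# for numeric columns and modes for categorical ones, merged into one dict
# that a produce() step could use to fill missing values.
import pandas as pd

train = pd.DataFrame({
    "age": [20.0, None, 40.0],      # numeric -> mean
    "city": ["NY", "NY", None],     # categorical -> mode
})

fill_values = train[["age"]].mean(axis=0).to_dict()                  # {'age': 30.0}
fill_values.update(train[["city"]].mode(axis=0).iloc[0].to_dict())   # {'city': 'NY'}
print(train.fillna(fill_values))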
def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> CallResult[Outputs]:
    primary_key_cols = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"])
    unfold_cols = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=self.hyperparams["unfold_semantic_types"])

    if not primary_key_cols:
        warnings.warn("Did not find primary key column for grouping. Will not unfold")
        return CallResult(inputs)

    if not unfold_cols:
        warnings.warn("Did not find any column to unfold. Will not unfold")
        return CallResult(inputs)

    primary_key_col_names = [inputs.columns[pos] for pos in primary_key_cols]
    unfold_col_names = [inputs.columns[pos] for pos in unfold_cols]

    if self.hyperparams["use_pipeline_id_semantic_type"]:
        pipeline_id_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/PipelineId"])
        if len(pipeline_id_cols) >= 2:
            warnings.warn("Multiple pipeline id columns found. Will use first.")
        if pipeline_id_cols:
            inputs = inputs.sort_values(
                primary_key_col_names
                + [inputs.columns[pos] for pos in pipeline_id_cols])
            self._sorted_pipe_ids = sorted(inputs.iloc[:, pipeline_id_cols[0]].unique())
        else:
            warnings.warn(
                "No pipeline id column found by "
                "'https://metadata.datadrivendiscovery.org/types/PipelineId'")

    new_df = self._get_new_df(inputs=inputs, use_cols=primary_key_cols + unfold_cols)

    groupby_df = inputs.groupby(primary_key_col_names)[unfold_col_names].aggregate(
        lambda x: container.List(x)).reset_index(drop=False)

    ret_df = container.DataFrame(groupby_df)
    ret_df.metadata = new_df.metadata
    ret_df = self._update_metadata_dimension(df=ret_df)

    split_col_names = [inputs.columns[pos] for pos in unfold_cols]
    ret_df = self._split_aggregated(df=ret_df, split_col_names=split_col_names)
    ret_df = common_utils.remove_columns(
        inputs=ret_df,
        column_indices=[ret_df.columns.get_loc(name) for name in split_col_names])

    return CallResult(ret_df)
def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> CallResult[Outputs]:
    index_col = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"])
    if not index_col:
        warnings.warn("Did not find primary key column. Can not vote, output origin")
        return CallResult(inputs)

    predict_target_col = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PredictedTarget"])
    if not predict_target_col:
        warnings.warn("Did not find PredictedTarget column. Can not vote, output origin")
        return CallResult(inputs)

    df = inputs.copy()
    new_df = self._get_index_and_target_df(inputs=df,
                                           use_cols=index_col + predict_target_col)

    if self.hyperparams["ensemble_method"] == 'majority':
        groupby_df = new_df.groupby([
            new_df.columns[pos] for pos in index_col
        ]).agg(lambda x: x.value_counts().index[0]).reset_index(drop=False)
        ret_df = container.DataFrame(groupby_df)
        ret_df.metadata = new_df.metadata

    if self.hyperparams["ensemble_method"] == 'max':
        groupby_df = new_df.groupby([
            new_df.columns[pos] for pos in index_col
        ]).max().reset_index(drop=False)
        ret_df = container.DataFrame(groupby_df)
        ret_df.metadata = new_df.metadata

    if self.hyperparams["ensemble_method"] == 'min':
        groupby_df = new_df.groupby([
            new_df.columns[pos] for pos in index_col
        ]).min().reset_index(drop=False)
        ret_df = container.DataFrame(groupby_df)
        ret_df.metadata = new_df.metadata

    return CallResult(self._update_metadata(df=ret_df))
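# A minimal sketch (toy data, illustrative column names) of the 'majority'
# branch above: group predictions by primary key and keep the most frequent
# value per group via value_counts().
import pandas as pd

preds = pd.DataFrame({
    "d3mIndex": [0, 0, 0, 1, 1],
    "prediction": ["cat", "cat", "dog", "dog", "dog"],
})
voted = preds.groupby("d3mIndex").agg(
    lambda x: x.value_counts().index[0]).reset_index(drop=False)
print(voted)  # index 0 -> "cat", index 1 -> "dog"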
def fit(self, *, timeout: float = None, iterations: int = None) -> None:
    if self._fitted:
        return

    if self._input_data is None:
        raise ValueError('Missing training(fitting) data.')

    # Look at attribute columns only
    # print('fit in', self._input_data.columns)
    data = self._input_data.copy()
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    # Remove columns with all empty values, structural type str
    numeric = utils.list_columns_with_semantic_types(
        data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    numeric = [x for x in numeric if x in all_attributes]
    for element in numeric:
        if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
            # errors='coerce' so non-numeric strings become NaN instead of raising
            if pd.isnull(pd.to_numeric(data.iloc[:, element], errors='coerce')).sum() == data.shape[0]:
                self._empty_columns.append(element)

    # Remove columns with all empty values, structural numeric
    is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
    for i in all_attributes:
        if is_empty.iloc[i]:
            self._empty_columns.append(i)

    _logger.debug('Removing entirely empty columns: {}'.format(
        data.columns[self._empty_columns]))
    data = utils.remove_columns(data, self._empty_columns,
                                source='ISI DSBox Data Encoder')

    categorical_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData"
        ])
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._cat_col_index = list(set(all_attributes).intersection(categorical_attributes))
    self._cat_columns = data.columns[self._cat_col_index].tolist()
    _logger.debug('Encoding columns: {}'.format(self._cat_columns))

    mapping = {}
    for column_name in self._cat_columns:
        col = data[column_name]
        temp = self._trim_features(col, self.hyperparams['n_limit'])
        if temp:
            mapping[temp[0]] = temp[1]
    self._mapping = mapping
    self._fitted = True
def set_training_data(self, *, inputs: Input) -> None:
    """
    Sets training data of this primitive.

    Parameters
    ----------
    inputs : Input
        The inputs.
    """
    attribute = utils.list_columns_with_semantic_types(
        inputs.metadata,
        ['https://metadata.datadrivendiscovery.org/types/Attribute'])

    # Count missing values across all attribute columns; object columns may
    # also encode missing values as empty strings
    nan_sum = 0
    for col in attribute:
        if str(inputs.dtypes[inputs.columns[col]]) != "object":
            nan_sum += inputs.iloc[:, col].isnull().sum()
        else:
            for i in range(inputs.shape[0]):
                if inputs.iloc[i, col] == "" or pd.isnull(inputs.iloc[i, col]):
                    nan_sum += 1

    if nan_sum == 0:  # no missing value exists
        if self._verbose:
            print("Warning: no missing value in train dataset")
        _logger.info('no missing value in train dataset')

    self._train_x = inputs
    self._is_fitted = False
def _get_date_cols(data):
    dates = common_utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Time"])
    return dates
def _find_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                          res_id: int) -> typing.Optional[int]:
    indices = utils.list_columns_with_semantic_types(
        inputs_metadata, cls._semantic_types, at=(res_id,))
    for i in indices:
        if cls._is_csv_file_column(inputs_metadata, res_id, i):
            return i
    return None
def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> CallResult[Outputs]:
    new_df = pd.concat([x for x in inputs],
                       ignore_index=self.hyperparams["ignore_index"])
    if self.hyperparams["sort_on_primary_key"]:
        primary_key_col = common_utils.list_columns_with_semantic_types(
            metadata=new_df.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"])

        if not primary_key_col:
            warnings.warn("No PrimaryKey column found. Will not sort on PrimaryKey")
            return CallResult(self._update_metadata(new_df))

        new_df = new_df.sort_values([new_df.columns[pos] for pos in primary_key_col])

    return CallResult(self._update_metadata(new_df))
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    numerical_attributes = utils.list_columns_with_semantic_types(
        metadata=self._training_data.metadata,
        semantic_types=["http://schema.org/Float", "http://schema.org/Integer"])
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=self._training_data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._s_cols = list(set(all_attributes).intersection(numerical_attributes))
    # print(" %d columns scaled" % (len(self._s_cols)))

    if len(self._s_cols) > 0:
        self._model.fit(self._training_data.iloc[:, self._s_cols])
        self._fitted = True
    else:
        self._fitted = False

    return CallResult(None, has_finished=self._fitted)
def _split_column(self, inputs):
    """
    Inner function to sample part of the columns of the input dataset
    """
    input_dataset_shape = inputs[self._main_resource_id].shape
    # find target columns; we should not split these columns
    target_column = utils.list_columns_with_semantic_types(
        self._training_inputs.metadata,
        ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
        at=(self._main_resource_id,))
    if not target_column:
        self._logger.warn("No target column found from the input dataset.")

    index_column = utils.get_index_columns(self._training_inputs.metadata,
                                           at=(self._main_resource_id,))
    if not index_column:
        self._logger.warn("No index column found from the input dataset.")

    outputs = copy.copy(inputs)
    if self._status is Status.TRAIN:
        # check again on the number of attribute columns only;
        # we only need to sample when it exceeds the threshold
        attribute_column_length = (input_dataset_shape[1] - len(index_column)
                                   - len(target_column))
        if attribute_column_length > self._threshold_column_length:
            attribute_column = set(range(input_dataset_shape[1]))
            for each_target_column in target_column:
                attribute_column.remove(each_target_column)
            for each_index_column in index_column:
                attribute_column.remove(each_index_column)

            # sample the remaining column indices randomly and sort them
            self._column_remained = random.sample(
                attribute_column, self._threshold_column_length)
            self._column_remained.extend(target_column)
            self._column_remained.extend(index_column)
            self._column_remained.sort()

    # use the common primitive RemoveColumnsPrimitive's inner function to finish sampling
    if len(self._column_remained) > 0:
        # Just to make sure.
        outputs.metadata = inputs.metadata.set_for_value(outputs, generate_metadata=False)
        outputs[self._main_resource_id] = inputs[self._main_resource_id].iloc[:, self._column_remained]
        outputs.metadata = RemoveColumnsPrimitive._select_columns_metadata(
            outputs.metadata, self._main_resource_id, self._column_remained)

    return outputs
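# A toy version (illustrative names, not the primitive's API) of the column
# sampling above: keep index and target columns, randomly sample the remaining
# attribute columns down to a threshold, and preserve the original column order.
import random

def sample_columns(n_cols, index_cols, target_cols, threshold, seed=42):
    random.seed(seed)
    attrs = set(range(n_cols)) - set(index_cols) - set(target_cols)
    kept = random.sample(sorted(attrs), min(threshold, len(attrs)))
    kept.extend(index_cols)
    kept.extend(target_cols)
    return sorted(kept)

print(sample_columns(10, [0], [9], threshold=3))  # e.g. [0, 1, 3, 5, 9]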
def __iterativeRegress(self, data, iterations):
    '''
    init with simple imputation, then apply regression to impute iteratively
    '''
    # for now, cancel the evaluation part for iterativeRegress
    # is_eval = False
    # if (label_col_name==None or len(label_col_name)==0):
    #     is_eval = False
    # else:
    #     is_eval = True

    # indices for numeric attribute columns only
    attribute = utils.list_columns_with_semantic_types(
        data.metadata,
        ['https://metadata.datadrivendiscovery.org/types/Attribute'])
    numeric = utils.list_columns_with_semantic_types(
        data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    numeric = [x for x in numeric if x in attribute]

    keys = data.keys()
    missing_col_id = []
    numeric_data = data.iloc[:, numeric].apply(
        lambda col: pd.to_numeric(col, errors='coerce'))
    data = mvp.df2np(numeric_data, missing_col_id, self._verbose)

    # Impute numerical attributes only
    missing_col_id = [x for x in missing_col_id if x in numeric]
    missing_col_data = data[:, missing_col_id]

    # If all values in a column are missing, set that column to zero
    all_missing = np.sum(np.isnan(missing_col_data), axis=0) == missing_col_data.shape[0]
    for col, col_missing in enumerate(all_missing):
        if col_missing:
            missing_col_data[:, col] = 0

    imputed_data = np.zeros([data.shape[0], len(missing_col_id)])
    imputed_data_lastIter = missing_col_data
    # coeff_matrix = np.zeros([len(missing_col_id), data.shape[1]-1])  # coefficient vector for each missing-value column
    model_list = [None] * len(missing_col_id)  # store the regression model per column

    epoch = iterations
    counter = 0
    # mean-init all missing-value columns
    init_imputation = ["mean"] * len(missing_col_id)
    next_data = mvp.imputeData(data, missing_col_id, init_imputation, self._verbose)

    while (counter < epoch):
        for i in range(len(missing_col_id)):
            target_col = missing_col_id[i]
            # restore the column that is to be imputed
            next_data[:, target_col] = missing_col_data[:, i]
            data_clean, model_list[i] = mvp.bayeImpute(next_data, target_col, self._verbose)
            # update the Bayesian-imputed column
            next_data[:, target_col] = data_clean[:, target_col]
            # add the imputed data
            imputed_data[:, i] = data_clean[:, target_col]
            # if (is_eval):
            #     self.__evaluation(data_clean, label)

        # if (counter > 0):
        #     distance = np.square(imputed_data - imputed_data_lastIter).sum()
        #     if self._verbose: print("changed distance: {}".format(distance))
        imputed_data_lastIter = np.copy(imputed_data)
        counter += 1

    data[:, missing_col_id] = imputed_data_lastIter

    # convert model_list to dict
    model_dict = {}
    for i in range(len(model_list)):
        model_dict[keys[missing_col_id[i]]] = model_list[i]

    return data, model_dict
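# The loop above hand-rolls iterative regression imputation: mean-initialize,
# then repeatedly re-predict each missing column from the others. A sketch of
# the same idea using scikit-learn's IterativeImputer -- a related, swapped-in
# technique, not the mvp.bayeImpute helper used by the source (though its
# default BayesianRidge estimator is in the same spirit).
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

X = np.array([[1.0, 2.0], [3.0, np.nan], [5.0, 6.0]])
imputer = IterativeImputer(max_iter=10, initial_strategy="mean", random_state=0)
print(imputer.fit_transform(X))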
def _find_real_vector_column(
        cls, inputs_metadata: metadata_base.DataMetadata) -> typing.Optional[int]:
    indices = utils.list_columns_with_semantic_types(inputs_metadata, cls._semantic_types)
    return indices[0] if len(indices) > 0 else None
def produce(self, *, inputs: Input, timeout: float = None,
            iterations: int = None) -> CallResult[Output]:
    """
    precond: run fit() before

    Parameters:
    ----------
    data: pandas dataframe
    """
    if (not self._is_fitted):
        # todo: specify a NotFittedError, like in sklearn
        raise ValueError("Calling produce before fitting.")

    # if (pd.isnull(inputs).sum().sum() == 0):  # no missing value exists
    #     if self._verbose: print("Warning: no missing value in test dataset")
    #     self._has_finished = True
    #     return CallResult(inputs, self._has_finished, self._iterations_done)

    if (timeout is None):
        timeout = 2**31 - 1

    if isinstance(inputs, pd.DataFrame):
        data = inputs.copy()
    else:
        data = inputs[0].copy()

    # setup the timeout
    with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
        assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

        # start completing data...
        if self._verbose:
            print("=========> impute by mean value of the attribute:")

        data.iloc[:, self._numeric_columns] = data.iloc[:, self._numeric_columns].apply(
            lambda col: pd.to_numeric(col, errors='coerce'))

        # assume the features of the testing data are the same as the training
        # data; therefore, imputing with mean_values alone should yield a clean dataset
        attribute = utils.list_columns_with_semantic_types(
            data.metadata,
            ['https://metadata.datadrivendiscovery.org/types/Attribute'])
        for col in attribute:
            if str(inputs.dtypes[inputs.columns[col]]) != "object":
                data.iloc[:, col].fillna(self.mean_values[data.columns[col]], inplace=True)
            else:
                for i in range(data.shape[0]):
                    if data.iloc[i, col] == "" or pd.isnull(data.iloc[i, col]):
                        data.iloc[i, col] = self.mean_values[data.columns[col]]
        data_clean = data

        # Update metadata
        for col in self._numeric_columns:
            old_metadata = dict(data_clean.metadata.query((mbase.ALL_ELEMENTS, col)))
            dtype = data_clean.iloc[:, col].dtype
            if str(dtype).lower().startswith("int"):
                if "http://schema.org/Integer" not in old_metadata['semantic_types']:
                    old_metadata['semantic_types'] += ("http://schema.org/Integer",)
                old_metadata["structural_type"] = int
            elif str(dtype).lower().startswith("float"):
                if "http://schema.org/Float" not in old_metadata['semantic_types']:
                    old_metadata['semantic_types'] += ("http://schema.org/Float",)
                old_metadata["structural_type"] = float
            data_clean.metadata = data_clean.metadata.update(
                (mbase.ALL_ELEMENTS, col), old_metadata)

    value = None
    if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
        self._has_finished = True
        self._iterations_done = True
        value = data_clean
    elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
        _logger.warn('Produce timed out')
        self._has_finished = False
        self._iterations_done = False
    return CallResult(value, self._has_finished, self._iterations_done)
def __imputationGreedySearch(self, data, label):
    """
    running greedy search over imputation combinations
    """
    # indices for numeric attribute columns only
    attribute = utils.list_columns_with_semantic_types(
        data.metadata,
        ['https://metadata.datadrivendiscovery.org/types/Attribute'])
    numeric = utils.list_columns_with_semantic_types(
        data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    d3m_index = utils.list_columns_with_semantic_types(
        data.metadata,
        ['https://metadata.datadrivendiscovery.org/types/PrimaryKey'])
    numeric = [x for x in numeric if x in attribute]

    col_names = data.keys()
    # 1. convert to np array and get missing value column ids
    missing_col_id = []
    data = mvp.df2np(data, missing_col_id, self._verbose)
    # Impute numerical attribute columns only. Should consider imputing category attributes
    missing_col_id = [x for x in missing_col_id if x in numeric]
    label = label.values

    # init for the permutation
    permutations = [0] * len(missing_col_id)  # same length as missing_col_id; each value is an index into self._imputation_strategies
    pos = len(permutations) - 1
    min_score = float("inf")
    max_score = -float("inf")
    max_strategy_id = 0
    best_combo = [0] * len(missing_col_id)  # init for best combo

    # greedy search for the best permutation
    iteration = 1
    while (iteration > 0):
        for i in range(len(permutations)):
            max_strategy_id = permutations[i]
            for strategy in range(len(self._imputation_strategies)):
                permutations[i] = strategy
                imputation_list = [self._imputation_strategies[x] for x in permutations]
                data_clean = mvp.imputeData(data, missing_col_id, imputation_list, self._verbose)
                if self._verbose:
                    print("for the missing value imputation combination: {} ".format(permutations))
                score = self.__evaluation(data_clean, label)
                if (score > max_score):
                    max_score = score
                    max_strategy_id = strategy
                    best_combo = list(permutations)  # copy; permutations keeps mutating
                min_score = min(score, min_score)
            permutations[i] = max_strategy_id
        iteration -= 1

    if self._verbose:
        print("max score is {}, min score is {}\n".format(max_score, min_score))
        print("and the best score is given by the imputation combination: ")

    best_imputation = {}  # key: col_name; value: imputation strategy
    for i in range(len(best_combo)):
        best_imputation[col_names[missing_col_id[i]]] = self._imputation_strategies[best_combo[i]]
        if self._verbose:
            print(self._imputation_strategies[best_combo[i]]
                  + " for the column {}".format(col_names[missing_col_id[i]]))

    return best_imputation
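# A compact sketch of the greedy coordinate search above under stated
# assumptions: impute() and evaluate() are caller-supplied placeholders
# standing in for mvp.imputeData and the source's __evaluation; for each
# missing-value column, try every strategy, keep the best-scoring one, then
# move to the next column.
import numpy as np

def greedy_impute_search(X, missing_cols, strategies, impute, evaluate):
    best = [0] * len(missing_cols)  # strategy index chosen per column
    for i in range(len(missing_cols)):
        best_score = -np.inf
        for s in range(len(strategies)):
            trial = list(best)
            trial[i] = s
            score = evaluate(impute(X, missing_cols, [strategies[j] for j in trial]))
            if score > best_score:
                best_score, best[i] = score, s
    return {missing_cols[i]: strategies[best[i]] for i in range(len(missing_cols))}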
def produce(self, *, inputs: Input, timeout: float = None,
            iterations: int = None) -> CallResult[Output]:
    """
    precond: run fit() before

    to complete the data, based on the learned parameters, supports:
    -> greedy search

    also supports the untrainable methods:
    -> iterative regression
    -> other

    Parameters:
    ----------
    data: pandas dataframe
    label: pandas series, used for the evaluation of imputation

    TODO:
    ----------
    1. add evaluation part for __simpleImpute()
    """
    # inputs = inputs.convert_objects(convert_numeric=True)
    attribute = utils.list_columns_with_semantic_types(
        inputs.metadata,
        ['https://metadata.datadrivendiscovery.org/types/Attribute'])
    numeric = utils.list_columns_with_semantic_types(
        inputs.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    numeric = [x for x in numeric if x in attribute]

    # keys = data.keys()
    # missing_col_id = []
    inputs = inputs.iloc[:, numeric].apply(
        lambda col: pd.to_numeric(col, errors='coerce'))
    # data = mvp.df2np(numeric_data, missing_col_id, self._verbose)
    for i in numeric:
        old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
        old_metadata["structural_type"] = inputs.iloc[:, i].values.dtype.type
        inputs.metadata = inputs.metadata.update((mbase.ALL_ELEMENTS, i), old_metadata)

    # Impute numerical attributes only
    if (not self._is_fitted):
        # todo: specify a NotFittedError, like in sklearn
        raise ValueError("Calling produce before fitting.")

    if (pd.isnull(inputs).sum().sum() == 0):  # no missing value exists
        if self._verbose:
            print("Warning: no missing value in test dataset")
        self._has_finished = True
        return CallResult(inputs, self._has_finished, self._iterations_done)

    if (timeout is None):
        timeout = 2**31 - 1
    if (iterations is None):
        self._iterations_done = True
        iterations = 30  # only used by the iterative regression method

    data = inputs.copy()
    # record keys:
    keys = data.keys()
    index = data.index

    # setup the timeout
    with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
        assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

        # start completing data...
        if self._verbose:
            print("=========> iteratively regress method:")
        data_clean = self.__regressImpute(data, self._best_imputation, iterations)

    value = None
    if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
        self._is_fitted = True
        self._has_finished = True
        value = pd.DataFrame(data_clean, index, keys)
        value = container.DataFrame(value)
        value.metadata = data.metadata
    elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
        print("Timed Out...")
        self._is_fitted = False
        self._has_finished = False
        self._iterations_done = False
    return CallResult(value, self._has_finished, self._iterations_done)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Need training data from set_training_data first.
    The encoder records the specified columns to encode and the column
    values to unary encode later in the produce step.
    """
    if self._fitted:
        return

    if self._training_inputs is None:
        raise ValueError('Missing training(fitting) data.')

    data = self._training_inputs.copy()
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    # Remove columns with all empty values, structural type str
    numeric = utils.list_columns_with_semantic_types(
        data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    numeric = [x for x in numeric if x in all_attributes]
    for element in numeric:
        if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
            if pd.isnull(pd.to_numeric(data.iloc[:, element], errors='coerce')).sum() == data.shape[0]:
                self._empty_columns.append(element)

    # Remove columns with all empty values, structural numeric
    is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
    for i in all_attributes:
        if is_empty.iloc[i]:
            self._empty_columns.append(i)

    self._empty_columns = list(set(self._empty_columns))
    self._empty_columns.reverse()
    self._empty_columns = container.List(self._empty_columns)
    data = utils.remove_columns(data, self._empty_columns)
    # print('fit', data.shape)

    categorical_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData"
        ])
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._cat_col_index = container.List(set(all_attributes).intersection(numeric))
    self._cat_columns = container.List(data.columns[self._cat_col_index].tolist())

    numerical_values = data.iloc[:, self._cat_col_index].apply(
        lambda col: pd.to_numeric(col, errors='coerce'))

    self._all_columns = set(data.columns)

    # mapping: column name -> sorted unique values seen in training
    idict = {}
    for name in self._cat_columns:
        col = numerical_values[name]
        idict[name] = sorted(col.unique())
    self._mapping = idict

    if self._text2int:
        texts = data.drop(self._mapping.keys(), axis=1)
        texts = texts.select_dtypes(include=[object])
        le = Label_encoder()
        le.fit_pd(texts)
        self._textmapping = le.get_params()

    # determine whether to run the unary encoder on each given column or not
    data_enc = data.iloc[:, self._cat_col_index].apply(
        lambda col: pd.to_numeric(col, errors='coerce'))
    for column_name in data_enc:
        col = data_enc[column_name]
        col.is_copy = False
        # only apply the unary encoder when the column has fewer than 13 unique values
        if col.unique().shape[0] < 13:
            self._requirement[column_name] = True
        else:
            self._requirement[column_name] = False

    self._fitted = True
    return CallResult(None, has_finished=True, iterations_done=1)
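# The fit above records sorted unique values per low-cardinality numeric
# column. A toy sketch of what unary (thermometer) encoding of a value against
# those recorded uniques could look like -- an illustration of the concept,
# not the primitive's actual produce() logic:
def unary_encode(value, sorted_uniques):
    # one indicator per recorded level: 1 while value >= level, else 0
    return [1 if value >= level else 0 for level in sorted_uniques]

print(unary_encode(3, [1, 2, 3, 5]))  # [1, 1, 1, 0]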
def produce(self, *, inputs: container.DataFrame, timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:
    # make sure the target column is of a valid type
    target_idx = self.hyperparams['target_col_index']
    if not self._can_use_column(inputs.metadata, target_idx):
        raise exceptions.InvalidArgumentValueError(
            'column idx=' + str(target_idx) + ' from ' + str(inputs.columns)
            + ' does not contain continuous or discrete type')

    # check if the target is discrete or continuous
    semantic_types = inputs.metadata.query_column(target_idx)['semantic_types']
    discrete = len(set(semantic_types).intersection(self._discrete_types)) > 0

    # make a copy of the inputs and clean out any missing data
    feature_df = inputs.copy()
    feature_df.dropna(inplace=True)

    # split out the target feature
    target_df = feature_df.iloc[:, target_idx]

    # drop features that are not compatible with ranking
    feature_indices = set(
        utils.list_columns_with_semantic_types(inputs.metadata, self._semantic_types))
    role_indices = set(
        utils.list_columns_with_semantic_types(inputs.metadata, self._roles))
    feature_indices = feature_indices.intersection(role_indices)

    all_indices = set(range(0, inputs.shape[1]))
    skipped_indices = all_indices.difference(feature_indices)
    skipped_indices.add(target_idx)  # drop the target too
    for v in skipped_indices:
        feature_df.drop(inputs.columns[v], axis=1, inplace=True)

    # figure out the discrete and continuous feature indices and create an
    # array that flags them
    discrete_indices = utils.list_columns_with_semantic_types(
        inputs.metadata, self._discrete_types)
    discrete_flags = [False] * feature_df.shape[1]
    for v in discrete_indices:
        col_name = inputs.columns[v]
        if col_name in feature_df:
            col_idx = feature_df.columns.get_loc(col_name)
            discrete_flags[col_idx] = True

    target_np = target_df.values
    feature_np = feature_df.values

    # compute mutual information for a discrete or continuous target
    ranked_features_np = None
    if discrete:
        ranked_features_np = mutual_info_classif(
            feature_np, target_np,
            discrete_features=discrete_flags,
            random_state=self._random_seed)
    else:
        ranked_features_np = mutual_info_regression(
            feature_np, target_np,
            discrete_features=discrete_flags,
            random_state=self._random_seed)

    # merge back into a single list of col idx / rank value tuples
    data: typing.List[typing.Tuple[int, str, float]] = []
    data = self._append_rank_info(inputs, data, ranked_features_np, feature_df)
    cols = ['idx', 'name', 'rank']
    results = container.DataFrame(data=data, columns=cols)
    results = results.sort_values(by=['rank'], ascending=False).reset_index(drop=True)

    # wrap as a D3M container - metadata should be auto generated
    return base.CallResult(results)
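# A small self-contained example (synthetic data) of the ranking core above:
# mutual_info_classif scores each feature against a discrete target, with a
# flag array marking which features are discrete.
import numpy as np
from sklearn.feature_selection import mutual_info_classif

rng = np.random.RandomState(0)
y = rng.randint(0, 2, 200)
informative = y + rng.normal(0, 0.1, 200)   # correlated with the target
noise = rng.normal(0, 1, 200)               # unrelated
X = np.column_stack([informative, noise])

scores = mutual_info_classif(X, y, discrete_features=[False, False], random_state=0)
print(scores)  # the first feature should score much higher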
def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> CallResult[Outputs]:
    index_col = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"])
    if not index_col:
        warnings.warn("Did not find primary key column. Can not vote, output origin")
        return CallResult(inputs)

    predict_target_col = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PredictedTarget"])
    if not predict_target_col:
        warnings.warn("Did not find PredictedTarget column. Can not vote, output origin")
        return CallResult(inputs)

    df = inputs.copy()

    # temporary fix for the index type problem:
    # coerce each index column to the correct dtype here
    for each_col in index_col:
        col_semantic_type = df.metadata.query(
            (ALL_ELEMENTS, each_col))['semantic_types']
        if 'http://schema.org/Integer' in col_semantic_type \
                and df[df.columns[each_col]].dtype == 'O':
            df[df.columns[each_col]] = df[df.columns[each_col]].astype(int)

    new_df = self._get_index_and_target_df(inputs=df,
                                           use_cols=index_col + predict_target_col)

    if self.hyperparams["ensemble_method"] == 'majority':
        groupby_df = new_df.groupby([
            new_df.columns[pos] for pos in index_col
        ]).agg(lambda x: x.value_counts().index[0]).reset_index(drop=False)
        ret_df = container.DataFrame(groupby_df)
        ret_df.metadata = new_df.metadata

    if self.hyperparams["ensemble_method"] == 'max':
        groupby_df = new_df.groupby([
            new_df.columns[pos] for pos in index_col
        ]).max().reset_index(drop=False)
        ret_df = container.DataFrame(groupby_df)
        ret_df.metadata = new_df.metadata

    if self.hyperparams["ensemble_method"] == 'min':
        groupby_df = new_df.groupby([
            new_df.columns[pos] for pos in index_col
        ]).min().reset_index(drop=False)
        ret_df = container.DataFrame(groupby_df)
        ret_df.metadata = new_df.metadata

    return CallResult(self._update_metadata(df=ret_df))
def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:
    dataframe_resource_id, dataframe = base_utils.get_tabular_resource(
        inputs, self.hyperparams["dataframe_resource"])

    # get attribute columns
    hyperparams_class = (
        dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()
        ["primitive_code"]["class_type_arguments"]["Hyperparams"])
    primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(
        hyperparams=hyperparams_class.defaults())
    dataframe_meta = primitive.produce(inputs=inputs).value

    attributes = list_columns_with_semantic_types(
        metadata=dataframe_meta.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    base_file_path = "/".join(
        inputs.metadata._current_metadata.metadata["location_uris"][0].split("/")[:-1])
    edge_list = pd.read_csv(
        os.path.join(base_file_path, "graphs", "edgeList.csv"), index_col=0)
    if len(edge_list.columns) > 2:
        graph = nx.from_pandas_edgelist(
            edge_list,
            source=edge_list.columns[0],
            target=edge_list.columns[1],
            edge_attr=edge_list.columns[2])
    else:
        graph = nx.from_pandas_edgelist(
            edge_list,
            source=edge_list.columns[0],
            target=edge_list.columns[1])

    if len(attributes) > 1:
        # add attributes to nodes
        attribute_node_map = dataframe_meta[dataframe_meta.columns[attributes]]
        attribute_node_map["nodeID"] = attribute_node_map["nodeID"].astype(int)
        attribute_node_map.index = attribute_node_map["nodeID"]
        attribute_cols = attribute_node_map.columns
        # drop() returns a copy, so assign the result back
        attribute_node_map = attribute_node_map.drop(["nodeID"], axis=1)
        attribute_node_map = attribute_node_map.to_dict(orient="index")

        for i in graph.nodes:
            default = {attribute: 0 for attribute in attribute_cols
                       if attribute != "nodeID"}
            graph.nodes[i].update(attribute_node_map.get(i, default))
            # featurizer expects at a minimum nodeIDs to be present
            graph.nodes[i]["nodeID"] = i
    else:
        # featurizer expects at a minimum nodeIDs to be present
        for i in graph.nodes:
            graph.nodes[i]["nodeID"] = i

    # int2str_map = dict(zip(graph.nodes, [str(n) for n in graph.nodes]))
    # graph = nx.relabel_nodes(graph, mapping=int2str_map)

    dataframe.metadata = self._update_metadata(inputs.metadata, dataframe_resource_id)
    assert isinstance(dataframe, container.DataFrame), type(dataframe)

    U_train = {"graph": graph}
    y_train = self.produce_target(inputs=inputs).value
    X_train = dataframe
    # TODO: use attributes in vertex classification
    X_train = self._typify_dataframe(X_train)
    X_train.value = pd.DataFrame(X_train.value["nodeID"])
    return base.CallResult([X_train, y_train, U_train])
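# A toy sketch of the graph-building pattern above: load an edge list into
# networkx, then push per-node attributes from a DataFrame onto the nodes
# (data and column names here are illustrative, not the dataset's).
import networkx as nx
import pandas as pd

edges = pd.DataFrame({"source": [0, 1], "target": [1, 2], "weight": [1.0, 2.0]})
graph = nx.from_pandas_edgelist(edges, source="source", target="target",
                                edge_attr="weight")

node_attrs = pd.DataFrame({"nodeID": [0, 1, 2], "feat": [0.5, 0.1, 0.9]})
node_attrs.index = node_attrs["nodeID"]
attr_map = node_attrs.drop(["nodeID"], axis=1).to_dict(orient="index")
for n in graph.nodes:
    graph.nodes[n].update(attr_map.get(n, {"feat": 0}))
    graph.nodes[n]["nodeID"] = n
print(graph.nodes(data=True))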