def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """Cluster the rows of the input dataframe and append the labels as a column.

    Parameters
    ----------
    inputs : dataframe

    Returns
    -------
    Outputs
        The input dataframe with a single new integer column of cluster labels
        appended. When the 'task_type' hyperparameter is 'classification' the
        column is named 'cluster_labels' and marked as an Attribute; otherwise
        it takes the target's name and is marked as the PredictedTarget.
        For timeseries datasets the output is the dimensions concatenated to
        the timeseries filename dataframe.
    """
    # Locate target column(s); fall back to suggested targets when no true
    # target is annotated. (The original code queried TrueTarget twice.)
    targets = inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    if not len(targets):
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
    target_names = [list(inputs)[t] for t in targets]
    index = inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')

    # Feature matrix: drop the primary key and target columns, guarding
    # against their absence (previously index[0] raised on empty index).
    X_test = inputs.copy()
    if len(index):
        X_test = X_test.drop(columns=list(inputs)[index[0]])
    if len(target_names):
        X_test = X_test.drop(columns=target_names)
    X_test = X_test.values

    # special semi-supervised case - during training, only produce rows with labels
    series = inputs[target_names] != ''
    if series.any().any():
        inputs = dataframe_utils.select_rows(inputs, np.flatnonzero(series))
        X_test = X_test[np.flatnonzero(series)]

    sc_df = d3m_DataFrame(
        pandas.DataFrame(self.sc.fit_predict(X_test), columns=['cluster_labels']))

    # Annotate the single new column ('cluster_labels') with type metadata.
    col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict['structural_type'] = type(1)
    if self.hyperparams['task_type'] == 'classification':
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        col_dict['name'] = 'cluster_labels'
    else:
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
        col_dict['name'] = target_names[0]
    sc_df.metadata = sc_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), col_dict)

    # Refresh dataframe-level dimension metadata for the one-column frame.
    df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
    df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
    df_dict['dimension'] = df_dict_1
    df_dict_1['name'] = 'columns'
    df_dict_1['semantic_types'] = (
        'https://metadata.datadrivendiscovery.org/types/TabularColumn',)
    df_dict_1['length'] = 1
    sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, ), df_dict)

    return CallResult(utils_cp.append_columns(inputs, sc_df))
def _scalar_filter(self, inputs, vector_column):
    """Filter rows of `inputs` by comparing every element of the vector-valued
    column at `vector_column` against scalar min/max bounds.

    A row is kept (or dropped, when 'inclusive' is False) only if ALL elements
    of its vector fall within [mins, maxs].

    Parameters
    ----------
    inputs : dataframe whose column `vector_column` holds numeric vectors
    vector_column : positional index of the vector column

    Returns
    -------
    The input dataframe restricted to the selected rows.
    """
    max_value = self.hyperparams["maxs"]
    min_value = self.hyperparams["mins"]
    indices = inputs.index.tolist()

    # Open-ended bounds: a missing bound means +/- infinity.
    # BUG FIX: the original computed float("-inf")/float("inf") but never
    # assigned them, so None leaked into the comparisons below.
    if min_value is None:
        min_value = float("-inf")
    if max_value is None:
        max_value = float("inf")

    try:
        # Fast path: all vectors share one length and can be stacked into a
        # 2-D array for vectorised comparison.
        rows = np.stack(inputs.iloc[:, vector_column], axis=0)
        rows = np.logical_and(
            self._min_comparison_op(rows, min_value),
            self._max_comparision_op(rows, max_value),
        )
        rows_to_keep = rows.sum(axis=1) == rows.shape[1]
    except ValueError:
        # Ragged vectors (np.stack failed): compare each row individually.
        rows = inputs.iloc[:, vector_column]

        def _filter_r(row, min_val, max_val):
            # Element-wise bounds check for a single vector.
            return np.logical_and(
                self._min_comparison_op(row, min_val),
                self._max_comparision_op(row, max_val),
            )

        rows = rows.apply(_filter_r, args=(min_value, max_value))
        # A row passes when every element passed (sum of booleans == length).
        rows_to_keep = rows.apply(np.sum) == rows.apply(np.shape).apply(
            np.take, args=([0]))

    # Translate the boolean mask into row labels, inverting when exclusive.
    if self.hyperparams["inclusive"]:
        rows_to_keep = [
            indices[j] for j in range(len(indices)) if rows_to_keep[j]
        ]
    else:
        rows_to_keep = [
            indices[j] for j in range(len(indices)) if not rows_to_keep[j]
        ]

    return dataframe_utils.select_rows(inputs, rows_to_keep)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """Cluster the input dataframe with the fitted clusterer.

    Parameters
    ----------
    inputs : dataframe with attached metadata for semi-supervised or
        unsupervised data

    Returns
    -------
    Outputs
        Depends on the 'required_output' hyperparameter: either the input
        dataframe with the cluster ID of each row appended as an additional
        'cluster_labels' feature, or a two-column dataframe (d3mIndex,
        predicted target) where each entry is the cluster ID.
    """
    # Find target column(s); fall back to suggested targets when no true
    # target is annotated. (The original code queried TrueTarget twice.)
    targets = inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    if not len(targets):
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
        )
    target_names = [list(inputs)[t] for t in targets]
    index = inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    index_names = [list(inputs)[i] for i in index]

    # Feature matrix: everything except the primary key and target columns.
    X_test = inputs.copy()
    if len(index):
        X_test = X_test.drop(columns=list(inputs)[index[0]])
    if len(target_names):
        X_test = X_test.drop(columns=target_names)
    X_test = X_test.values

    # special semi-supervised case - during training, only produce rows with labels
    series = inputs[target_names] != ''
    if series.any().any():
        inputs = dataframe_utils.select_rows(inputs, np.flatnonzero(series))
        X_test = X_test[np.flatnonzero(series)]

    if self.hyperparams['required_output'] == 'feature':
        hdb_df = d3m_DataFrame(
            pandas.DataFrame(self.clf.fit_predict(X_test),
                             columns=['cluster_labels']))

        # Annotate the single new column ('cluster_labels').
        col_dict = dict(
            hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        col_dict['name'] = 'cluster_labels'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        hdb_df.metadata = hdb_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        # Dataframe-level dimension metadata for the one-column frame.
        df_dict = dict(
            hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(
            hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn',
        )
        df_dict_1['length'] = 1
        hdb_df.metadata = hdb_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)

        return CallResult(utils_cp.append_columns(inputs, hdb_df))
    else:
        hdb_df = d3m_DataFrame(
            pandas.DataFrame(self.clf.fit_predict(X_test),
                             columns=[target_names[0]]))
        # NOTE(review): concat aligns on positional index; assumes inputs'
        # index is the default RangeIndex after any row selection — confirm.
        hdb_df = pandas.concat([inputs.d3mIndex, hdb_df], axis=1)

        # First column: the primary key (d3mIndex).
        col_dict = dict(
            hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        col_dict['name'] = index_names[0]
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        hdb_df.metadata = hdb_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        # Second column: the predicted target (cluster IDs).
        col_dict = dict(
            hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type(1)
        col_dict['name'] = target_names[0]
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
        )
        hdb_df.metadata = hdb_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        # Dataframe-level dimension metadata for the two-column frame.
        df_dict = dict(
            hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(
            hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn',
        )
        df_dict_1['length'] = 2
        hdb_df.metadata = hdb_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)

        return CallResult(hdb_df)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """Cluster time series and append each series' cluster label as a column.

    Parameters
    ----------
    inputs : numpy ndarray of size (number_of_time_series,
        time_series_length) containing new time series

    Returns
    -------
    Outputs
        The dataframe view of the dataset with an appended 'cluster_labels'
        column containing each associated series' cluster number.
    """
    hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query(
    )['primitive_code']['class_type_arguments']['Hyperparams']
    ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
        hyperparams=hyperparams_class.defaults().replace(
            {"dataframe_resource": "learningData"}))
    metadata_inputs = ds2df_client.produce(inputs=inputs).value

    # temporary (until Uncharted adds conversion primitive to repo)
    if not self.hyperparams['long_format']:
        formatted_inputs = TimeSeriesFormatterPrimitive(
            hyperparams=self._hp).produce(inputs=inputs).value['0']
    else:
        # Reuse the conversion result computed above instead of invoking
        # ds2df_client.produce() a second time on the same inputs.
        formatted_inputs = d3m_DataFrame(metadata_inputs)

    # Store information on target variable; fall back to suggested targets
    # when no true target is annotated. (Originally queried TrueTarget twice.)
    targets = metadata_inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    if not len(targets):
        targets = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
        )
    target_names = [list(metadata_inputs)[t] for t in targets]
    index = metadata_inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')

    # Parse values from the output of the time series formatter: one row per
    # series (wide) vs. one row per observation (long, reshaped to 2-D).
    n_ts = len(formatted_inputs.d3mIndex.unique())
    if n_ts == formatted_inputs.shape[0]:
        X_test = formatted_inputs.drop(
            columns=list(formatted_inputs)[index[0]])
        X_test = X_test.drop(columns=target_names).values
    else:
        ts_sz = int(formatted_inputs.shape[0] / n_ts)
        X_test = np.array(formatted_inputs.value).reshape(n_ts, ts_sz)

    # special semi-supervised case - during training, only produce rows with labels
    series = metadata_inputs[target_names] != ''
    if series.any().any():
        metadata_inputs = dataframe_utils.select_rows(
            metadata_inputs, np.flatnonzero(series))
        X_test = X_test[np.flatnonzero(series)]

    sloth_df = d3m_DataFrame(
        pandas.DataFrame(self.clf.fit_predict(X_test),
                         columns=['cluster_labels']))

    # Annotate the new column ('cluster_labels').
    col_dict = dict(
        sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict['structural_type'] = type(1)
    col_dict['name'] = 'cluster_labels'
    col_dict['semantic_types'] = (
        'http://schema.org/Integer',
        'https://metadata.datadrivendiscovery.org/types/Attribute',
        'https://metadata.datadrivendiscovery.org/types/CategoricalData')
    sloth_df.metadata = sloth_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), col_dict)

    # Dataframe-level dimension metadata for the one-column frame.
    df_dict = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
    df_dict_1 = dict(
        sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
    df_dict['dimension'] = df_dict_1
    df_dict_1['name'] = 'columns'
    df_dict_1['semantic_types'] = (
        'https://metadata.datadrivendiscovery.org/types/TabularColumn',
    )
    df_dict_1['length'] = 1
    sloth_df.metadata = sloth_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, ), df_dict)

    return CallResult(utils_cp.append_columns(metadata_inputs, sloth_df))
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    """Filter dataframe rows whose float-vector column falls outside
    per-element [mins, maxs] bounds (or inside, when 'inclusive' is False).

    Parameters
    ----------
    inputs : dataframe that may contain a float-vector column

    Returns
    -------
    The input dataframe restricted to the rows passing the filter; returned
    unchanged when no float-vector column is present.
    """
    vector_column = self._get_floatvector_column(inputs.metadata)
    if vector_column is None:
        return base.CallResult(inputs)

    maxs = self.hyperparams["maxs"]
    mins = self.hyperparams["mins"]

    # Scalar bounds apply one min/max to every vector element.
    if type(mins) == float or type(mins) == int:
        return base.CallResult(self._scalar_filter(inputs, vector_column))

    indices = inputs.index.tolist()
    # Open-ended per-element bounds: None means +/- infinity.
    # (Use 'is None', not '== None'.)
    mins = [float("-inf") if i is None else i for i in mins]
    maxs = [float("inf") if i is None else i for i in maxs]

    try:
        # Fast path: vectors all share one length and can be stacked.
        rows = np.stack(inputs.iloc[:, vector_column], axis=0)
        filter_length = rows.shape[1]
        rows = np.logical_and(
            self._min_comparison_op(
                rows[:, :filter_length],
                mins,
            ),
            self._max_comparision_op(rows[:, :filter_length], maxs),
        )
        rows_to_keep = rows.sum(axis=1) == filter_length
    except ValueError:
        # rows had uneven length
        rows = inputs.iloc[:, vector_column]

        # get length of each vector
        vector_lengths = rows.apply(np.shape).apply(np.take, args=([0]))
        filter_lengths = vector_lengths.values

        # need this to loop over lengths array while keeping vectorised
        # apply function over rows
        count_for_ref = [0]

        def _filter_r(row, filter_lengths, mins, maxs, counter):
            # in case fewer filters than row length
            filterable_range = min(filter_lengths[counter[0]], len(mins))
            mins_for_filter = np.array(mins[:filterable_range])
            maxs_for_filter = np.array(maxs[:filterable_range])

            filtered_row = np.logical_and(
                self._min_comparison_op(row[:filterable_range],
                                        mins_for_filter),
                self._max_comparision_op(
                    row[:filterable_range],
                    maxs_for_filter,
                ),
            )
            counter[0] += 1
            return filtered_row

        rows = rows.apply(
            _filter_r,
            args=(filter_lengths, mins, maxs, count_for_ref),
        )
        rows_to_keep = rows.apply(np.sum).values == filter_lengths

    # Translate the boolean mask into row labels, inverting when exclusive.
    # (Removed a dead np.empty pre-allocation that was always overwritten.)
    if self.hyperparams["inclusive"]:
        indices_to_keep = [
            indices[j] for j in range(len(indices)) if rows_to_keep[j]
        ]
    else:
        indices_to_keep = [
            indices[j] for j in range(len(indices)) if not rows_to_keep[j]
        ]

    outputs = dataframe_utils.select_rows(inputs, indices_to_keep)
    return base.CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None
            ) -> CallResult[container.pandas.DataFrame]:
    """Assign each input series/row to a fitted k-means cluster.

    Parameters
    ----------
    inputs : D3M dataframe with associated metadata.

    Returns
    -------
    Outputs
        For unsupervised problems: a two-column dataframe (d3mIndex,
        predicted target) where each entry is the associated series' cluster
        number. For semi-supervised problems: the input df containing an
        additional feature - cluster_labels.
    """
    # Store information on target variable; fall back to suggested targets
    # when no true target is annotated. (Originally queried TrueTarget twice;
    # commented-out DatasetToDataFrame scaffolding removed.)
    targets = inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    if not len(targets):
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
        )
    target_names = [list(inputs)[t] for t in targets]
    index = inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    index_names = [list(inputs)[i] for i in index]

    # Load and reshape the data: one row per series (wide) vs. one row per
    # observation (long, reshaped to 3-D for the time-series clusterer).
    n_ts = len(inputs.d3mIndex.unique())
    if n_ts == inputs.shape[0]:
        X_test = inputs.drop(columns=list(inputs)[index[0]])
        X_test = X_test.drop(columns=target_names).values
    else:
        ts_sz = int(inputs.shape[0] / n_ts)
        X_test = np.array(inputs.value).reshape(n_ts, ts_sz, 1)

    if self.clustering:
        # Unsupervised: emit (d3mIndex, predicted cluster) predictions frame.
        sloth_df = d3m_DataFrame(
            pandas.DataFrame(self._kmeans.predict(X_test),
                             columns=[target_names[0]]))
        sloth_df = pandas.concat([inputs.d3mIndex, sloth_df], axis=1)

        # first column ('d3mIndex')
        col_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = index_names[0]
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        )
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        # second column ('Class')
        col_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = target_names[0]
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
        )
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        # Dataframe-level dimension metadata for the two-column frame.
        df_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn',
        )
        df_dict_1['length'] = 2
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)

        return CallResult(sloth_df)
    else:
        # special semi-supervised case - during training, only produce rows
        # with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = dataframe_utils.select_rows(inputs,
                                                 np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        sloth_df = d3m_DataFrame(
            pandas.DataFrame(self._kmeans.predict(X_test),
                             columns=['cluster_labels']))

        # add clusters as a feature in the main dataframe - last column ('clusters')
        col_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        col_dict['name'] = 'cluster_labels'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
            'https://metadata.datadrivendiscovery.org/types/CategoricalData'
        )
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        # Dataframe-level dimension metadata for the one-column frame.
        df_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn',
        )
        df_dict_1['length'] = 1
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)

        return CallResult(utils_cp.append_columns(inputs, sloth_df))