def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """Return the SDNE embedding, learning it first when the primitive is not fitted.

    Parameters
    ----------
    inputs : Input
        Handed to ``self._parse_inputs``. When the ``return_list``
        hyperparameter is set, ``inputs[1]`` and ``inputs[2]`` are forwarded
        unchanged in the returned list.
    timeout : float
        Not used by this method.
    iterations : int
        Not used by this method.

    Returns
    -------
    CallResult[Output]
        With ``return_list`` set: a list ``[embedding, inputs[1], inputs[2]]``.
        Otherwise: the learning dataframe without target columns and without
        ``nodeID``, with one float attribute column appended per embedding
        dimension.
    """
    graph_data, learning_df, _nodes_df, _edges_df = self._parse_inputs(
        inputs, return_all=True)

    if not self.fitted:
        # NOTE(review): ``args`` is not defined anywhere in this method; it
        # has to come from an enclosing scope -- confirm before relying on
        # the unfitted path.
        self._sdne = sdne.SDNE(d=self.hyperparams['dimension'],
                               alpha=self.hyperparams['alpha'],
                               beta=self.hyperparams['beta'],
                               **args)
        self._sdne.learn_embedding(
            graph=networkx.from_scipy_sparse_matrix(graph_data))
        self._model = self._sdne._model
    embedding = self._sdne._Y

    if self.hyperparams['return_list']:
        embedding_array = container.ndarray(embedding, generate_metadata=True)
        bundled = d3m_List([embedding_array, inputs[1], inputs[2]],
                           generate_metadata=True)
        return CallResult(bundled, True, 1)

    # Target columns are stripped from the learning data before the
    # embedding columns are attached.
    excluded_types = [
        'https://metadata.datadrivendiscovery.org/types/TrueTarget',
        'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
    ]
    learn_df = d3m_DataFrame(learning_df, generate_metadata=True)
    learn_df = get_columns_not_of_type(learn_df, excluded_types)
    learn_df = learn_df.remove_columns([learn_df.columns.get_loc('nodeID')])

    result_df = d3m_DataFrame(embedding, generate_metadata=True)
    # Keep only embedding rows whose index value appears in d3mIndex.
    keep_rows = result_df.index.isin(learning_df['d3mIndex'].values)
    result_df = result_df.loc[keep_rows]

    n_existing = learn_df.shape[1]
    for position in range(result_df.shape[1]):
        selector = (ALL_ELEMENTS, position)
        column_meta = dict(result_df.metadata.query(selector))
        column_meta['structural_type'] = type(1.0)
        # New columns are named by their position in the combined frame.
        column_meta['name'] = str(n_existing + position)
        column_meta['semantic_types'] = (
            'http://schema.org/Float',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        result_df.metadata = result_df.metadata.update(selector, column_meta)

    result_df.index = learn_df.index.copy()
    output = utils.append_columns(learn_df, result_df)
    return CallResult(output, True, 1)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Cluster the non-index, non-target columns and append the cluster labels.

    Parameters
    ----------
    inputs : dataframe with attached metadata
        Target columns are located through the TrueTarget semantic type
        (falling back to SuggestedTarget) and the index through PrimaryKey.

    Returns
    ----------
    Outputs
        The input dataframe with one extra integer column holding the result
        of ``self.sc.fit_predict``. If any target cell is non-empty, only the
        rows selected by ``np.flatnonzero`` on that mask are kept. With
        ``task_type == 'classification'`` the new column is an Attribute
        named ``cluster_labels``; otherwise it is a PredictedTarget named
        after the first target column.

    Raises
    ----------
    IndexError
        If ``task_type`` is not ``'classification'`` and no target column
        is found.
    """
    metadata = inputs.metadata
    targets = metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    if not len(targets):
        targets = metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
    column_names = list(inputs)
    target_names = [column_names[t] for t in targets]
    index = metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')

    # Drop index/target columns only when they exist, so frames without a
    # PrimaryKey column no longer fail with IndexError.
    X_test = inputs
    if len(index):
        X_test = X_test.drop(columns=column_names[index[0]])
    if len(target_names):
        X_test = X_test.drop(columns=target_names)
    X_test = X_test.values

    # special semi-supervised case - during training, only produce rows with labels
    series = inputs[target_names] != ''
    if series.any().any():
        labelled_rows = np.flatnonzero(series)
        inputs = dataframe_utils.select_rows(inputs, labelled_rows)
        X_test = X_test[labelled_rows]

    sc_df = d3m_DataFrame(
        pandas.DataFrame(self.sc.fit_predict(X_test),
                         columns=['cluster_labels']))

    # metadata for the single cluster-label column
    column_selector = (metadata_base.ALL_ELEMENTS, 0)
    col_dict = dict(sc_df.metadata.query(column_selector))
    col_dict['structural_type'] = type(1)
    if self.hyperparams['task_type'] == 'classification':
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        col_dict['name'] = 'cluster_labels'
    else:
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
        col_dict['name'] = target_names[0]
    sc_df.metadata = sc_df.metadata.update(column_selector, col_dict)

    # describe the column dimension (one column)
    row_selector = (metadata_base.ALL_ELEMENTS, )
    df_dict = dict(sc_df.metadata.query(row_selector))
    df_dict_1 = dict(sc_df.metadata.query(row_selector))
    df_dict['dimension'] = df_dict_1
    df_dict_1['name'] = 'columns'
    df_dict_1['semantic_types'] = (
        'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
    df_dict_1['length'] = 1
    sc_df.metadata = sc_df.metadata.update(row_selector, df_dict)

    return CallResult(utils_cp.append_columns(inputs, sc_df))
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """Append the model's latent factors to ``inputs`` as float attribute columns.

    Parameters
    ----------
    inputs : Input
        Dataframe whose columns are all passed to ``self.model.transform``.
    timeout : float
        Not used by this method.
    iterations : int
        Stored in ``self.max_iter``; 10000 is stored when it is ``None``.

    Returns
    -------
    CallResult[Output]
        ``inputs`` with one column appended per latent factor; the reported
        iteration count is ``self.max_iter``.

    Raises
    ------
    ValueError
        If the primitive has not been fitted.
    """
    self.columns = list(inputs)
    X_ = inputs[self.columns].values

    self.max_iter = 10000 if iterations is None else iterations

    if not self.fitted:
        raise ValueError('Please fit before calling produce')

    self.latent_factors = self.model.transform(X_)

    out_df = d3m_DataFrame(inputs, generate_metadata=True)
    corex_df = d3m_DataFrame(self.latent_factors, generate_metadata=True)

    n_existing = out_df.shape[1]
    for position in range(corex_df.shape[1]):
        selector = (ALL_ELEMENTS, position)
        column_meta = dict(corex_df.metadata.query(selector))
        column_meta['structural_type'] = type(1.0)
        # FIXME: assumes corex is applied once per template, otherwise
        # column names might duplicate. The name is just the column
        # position in the combined frame (no 'corex_' prefix).
        column_meta['name'] = str(n_existing + position)
        column_meta['semantic_types'] = (
            'http://schema.org/Float',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        corex_df.metadata = corex_df.metadata.update(selector, column_meta)

    corex_df.index = out_df.index.copy()
    out_df = utils.append_columns(out_df, corex_df)
    return CallResult(out_df, True, self.max_iter)
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """Fold the columns listed under ``foldable_columns`` in ``self._mapping``.

    ``inputs`` is returned unchanged when there is nothing to fold or when
    it has more than 20000 rows.

    Parameters
    ----------
    inputs : Input
        Dataframe to fold.
    timeout : float
        Not used by this method.
    iterations : int
        Not used by this method.

    Returns
    -------
    CallResult[Output]
        A frame built from the columns of ``inputs`` that survive folding,
        plus the columns introduced by ``self._fold_columns``.
    """
    columns_list_to_fold = self._mapping.get('foldable_columns', [])
    if len(columns_list_to_fold) == 0 or inputs.shape[0] > 20000:
        return CallResult(inputs, True, 1)

    if inputs is not None:
        self._column_names = list(inputs)
    else:
        self._column_names = []

    # NOTE(review): each pass folds the original ``inputs``, so only the
    # result of the last column group is kept -- confirm this is intended.
    df = None
    for columns_to_fold in columns_list_to_fold:
        df = self._fold_columns(inputs, columns_to_fold)

    # Input columns that disappeared during folding are removed by position.
    cols_to_drop = [
        position for position, name in enumerate(inputs.columns)
        if name not in df.columns
    ]
    inputs = utils.remove_columns(inputs, cols_to_drop)

    # Start from an empty slice of the input (keeps its metadata) and fill
    # it from the folded frame.
    new_df = inputs[0:0]
    for name in new_df.columns:
        new_df.loc[:, name] = df.loc[:, name]

    extends = {
        name: df.loc[:, name].tolist()
        for name in df.columns if name not in new_df.columns
    }

    if extends:
        extends_df = d3m_DataFrame.from_dict(extends)
        extends_df.index = new_df.index.copy()
        new_df = utils.append_columns(new_df, extends_df)
        new_df = self._update_type(new_df, list(extends.keys()))
        # Refresh the row count recorded in the top-level metadata.
        top_level = dict(new_df.metadata.query(()))
        top_level["dimension"] = dict(top_level["dimension"])
        top_level["dimension"]["length"] = new_df.shape[0]
        new_df.metadata = new_df.metadata.update((), top_level)

    if new_df is not None:
        return CallResult(new_df, True, 1)
    return CallResult(inputs, True, 1)
def update_type(extends, df_origin):
    """Append the ``extends`` columns to ``df_origin`` and tag their semantic types.

    For each appended column the values are coerced with ``pd.to_numeric``:

    * more than 90% non-numeric -> CategoricalData if
      ``HelperFunction.is_categorical`` says so, otherwise Text;
    * otherwise Integer when more than 90% of all rows are whole numbers,
      else Float.

    The Attribute semantic type is added in every case.

    Parameters
    ----------
    extends : dict
        Mapping of new column name to column values.
    df_origin : dataframe
        Frame the new columns are appended to.

    Returns
    -------
    dataframe
        The combined frame with updated column metadata.
    """
    extra_columns = d3m_DataFrame.from_dict(extends)
    if extends != {}:
        extra_columns.index = df_origin.index.copy()
    combined = utils.append_columns(df_origin, extra_columns)

    # Resolve all positions up front, before any metadata is rewritten.
    positions = [combined.columns.get_loc(key) for key in extends]

    for position in positions:
        selector = (mbase.ALL_ELEMENTS, position)
        column_meta = dict(combined.metadata.query(selector))

        as_numbers = pd.to_numeric(combined.iloc[:, position], errors='coerce')
        n_rows = as_numbers.shape[0]
        n_missing = as_numbers.isnull().sum()

        if n_missing / n_rows > 0.9:
            if HelperFunction.is_categorical(combined.iloc[:, position]):
                inferred = "https://metadata.datadrivendiscovery.org/types/CategoricalData"
            else:
                inferred = "http://schema.org/Text"
        else:
            is_whole = (as_numbers % 1) == 0
            if np.sum(is_whole) / n_rows > 0.9:
                inferred = "http://schema.org/Integer"
            else:
                inferred = "http://schema.org/Float"

        column_meta['semantic_types'] = (
            inferred,
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )
        combined.metadata = combined.metadata.update(selector, column_meta)

    return combined
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Cluster the non-index, non-target columns with ``self.clf.fit_predict``.

    Parameters
    ----------
    inputs : dataframe with attached metadata for semi-supervised or unsupervised data

    Returns
    ----------
    Outputs
        The output depends on the required_output hyperparameter. For
        ``'feature'`` it is the input dataframe with the cluster ID of each
        row appended as an additional ``cluster_labels`` attribute column.
        Otherwise it is a two-column dataframe: ``d3mIndex`` followed by the
        cluster ID, named after the first target column and marked as
        PredictedTarget.

    Raises
    ----------
    IndexError
        If required_output is not ``'feature'`` and the input has no target
        or no PrimaryKey column.
    """
    # find target and index variables
    metadata = inputs.metadata
    targets = metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    if not len(targets):
        targets = metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
    column_names = list(inputs)
    target_names = [column_names[t] for t in targets]
    index = metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    index_names = [column_names[i] for i in index]

    X_test = inputs.copy()
    if len(index):
        X_test = X_test.drop(columns=column_names[index[0]])
    if len(target_names):
        X_test = X_test.drop(columns=target_names)
    X_test = X_test.values

    # special semi-supervised case - during training, only produce rows with labels
    series = inputs[target_names] != ''
    if series.any().any():
        labelled_rows = np.flatnonzero(series)
        inputs = dataframe_utils.select_rows(inputs, labelled_rows)
        X_test = X_test[labelled_rows]

    cluster_ids = self.clf.fit_predict(X_test)
    row_selector = (metadata_base.ALL_ELEMENTS, )
    tabular_column = (
        'https://metadata.datadrivendiscovery.org/types/TabularColumn', )

    if self.hyperparams['required_output'] == 'feature':
        hdb_df = d3m_DataFrame(
            pandas.DataFrame(cluster_ids, columns=['cluster_labels']))

        # metadata for the appended cluster-label column
        col_dict = dict(
            hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        col_dict['name'] = 'cluster_labels'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        hdb_df.metadata = hdb_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        df_dict = dict(hdb_df.metadata.query(row_selector))
        df_dict_1 = dict(hdb_df.metadata.query(row_selector))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = tabular_column
        df_dict_1['length'] = 1
        hdb_df.metadata = hdb_df.metadata.update(row_selector, df_dict)

        return CallResult(utils_cp.append_columns(inputs, hdb_df))

    hdb_df = d3m_DataFrame(
        pandas.DataFrame(cluster_ids, columns=[target_names[0]]))
    hdb_df = pandas.concat([inputs.d3mIndex, hdb_df], axis=1)

    # first column: the index
    col_dict = dict(hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict['structural_type'] = type(1)
    col_dict['name'] = index_names[0]
    col_dict['semantic_types'] = (
        'http://schema.org/Integer',
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    hdb_df.metadata = hdb_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), col_dict)

    # second column: the predicted cluster
    col_dict = dict(hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
    col_dict['structural_type'] = type(1)
    col_dict['name'] = target_names[0]
    col_dict['semantic_types'] = (
        'http://schema.org/Integer',
        'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
    hdb_df.metadata = hdb_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 1), col_dict)

    df_dict = dict(hdb_df.metadata.query(row_selector))
    df_dict_1 = dict(hdb_df.metadata.query(row_selector))
    df_dict['dimension'] = df_dict_1
    df_dict_1['name'] = 'columns'
    df_dict_1['semantic_types'] = tabular_column
    df_dict_1['length'] = 2
    hdb_df.metadata = hdb_df.metadata.update(row_selector, df_dict)

    return CallResult(hdb_df)
def _process_files(self, inputs: Input):
    """Replace file-name attribute columns with the tokenized text of the files.

    A column is processed when its metadata carries both the FileName and
    the Attribute semantic types. Each referenced file is read as bytes,
    decoded one byte per character (latin-1), reduced to tokens of two or
    more word characters, and stored as a space-joined string.

    Parameters
    ----------
    inputs : Input
        Dataframe with attached metadata. File columns must provide
        ``location_base_uris`` in their column metadata.

    Returns
    -------
    ``inputs`` unchanged when no file-name column is found; otherwise
    ``inputs`` with one text attribute column appended per file column and
    the original file-name columns removed.
    """
    fn_attributes = DataMetadata.list_columns_with_semantic_types(
        self=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/FileName"])
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        self=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
    fn_columns = list(set(all_attributes).intersection(fn_attributes))

    # if no file name columns are detected, default to regular behavior
    if len(fn_columns) == 0:
        return inputs

    # create an empty DataFrame of the required size
    processed_cols = pd.DataFrame(
        "",
        index=copy.deepcopy(inputs.index),
        columns=['text_files_' + str(i) for i in range(len(fn_columns))])

    # compiled once instead of once per file
    token_pattern = re.compile(r"(?u)\b\w\w+\b")

    for out_position, column_index in enumerate(fn_columns):
        curr_column = inputs.iloc[:, column_index]  # read-only, no copy needed

        file_loc = inputs.metadata.query(
            (ALL_ELEMENTS, column_index))['location_base_uris']
        file_loc = file_loc[0]  # take the first elem of the tuple
        # assumes the URI starts with the 7-character 'file://' prefix
        file_loc = file_loc[7:]

        for row_index, text_file in enumerate(curr_column):
            file_path = file_loc + text_file
            with open(file_path, 'rb') as file:
                # latin-1 maps every byte to the code point of the same
                # value, i.e. the same result as "".join(map(chr, data))
                doc = file.read().decode('latin-1')
            doc_tokens = token_pattern.findall(doc)  # list of strings
            processed_cols.iloc[row_index, out_position] = " ".join(doc_tokens)

    # construct metadata for the newly generated columns
    processed_cols = d3m_DataFrame(processed_cols, generate_metadata=True)

    for column_index in range(processed_cols.shape[1]):
        selector = (ALL_ELEMENTS, column_index)
        col_dict = dict(processed_cols.metadata.query(selector))
        col_dict['structural_type'] = type("text")
        # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
        col_dict['name'] = 'processed_file_' + str(inputs.shape[1] + column_index)
        col_dict['semantic_types'] = (
            'http://schema.org/Text',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        processed_cols.metadata = processed_cols.metadata.update(selector, col_dict)

    # concatenate the input with the newly created columns
    updated_inputs = utils.append_columns(inputs, processed_cols)

    # remove the initial FileName columns from the df; doing this before
    # concatenating might produce an empty dataset error
    updated_inputs = utils.remove_columns(updated_inputs, fn_columns)

    return updated_inputs
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """Replace the text columns of ``inputs`` with CorEx or bag-of-words features.

    The text columns (``self.text_columns``) are concatenated row-wise and
    vectorised with ``self.bow``. When the vocabulary is larger than the
    ``threshold`` hyperparameter the CorEx model transforms the matrix;
    otherwise the dense bag-of-words matrix is used directly.

    Parameters
    ----------
    inputs : Input
        Dataframe with attached metadata; first passed through
        ``self._process_files``.
    timeout : float
        Not used by this method.
    iterations : int
        Stored in ``self.max_iter`` and on the model; 250 when ``None``.

    Returns
    -------
    CallResult[Output]
        ``inputs`` unchanged when ``self.do_nothing`` is set; otherwise the
        processed input with float attribute columns named ``corex_<n>``
        appended and the text columns removed.
    """
    # if corex didn't run for any reason, just return the given dataset
    if self.do_nothing:
        return CallResult(inputs, True, 1)

    inputs = self._process_files(inputs)

    self.max_iter = iterations if iterations is not None else 250
    self.model.max_iter = self.max_iter

    # concatenate the columns row-wise
    concat_cols = None
    for column_index in self.text_columns:
        if concat_cols is not None:
            concat_cols = concat_cols.str.cat(inputs.iloc[:, column_index], sep=" ")
        else:
            concat_cols = copy.deepcopy(inputs.iloc[:, column_index])

    bow = self.bow.transform(map(self._get_ngrams, concat_cols.ravel()))

    # choose between CorEx and the TfIdf matrix
    if bow.shape[1] > self.hyperparams['threshold']:
        latent = self.model.transform(bow).astype(float)  # use CorEx
    else:
        latent = bow.todense()  # just use the bag of words representation
    # Always hold the factors in a DataFrame: the column labels assigned
    # below cannot be set on a plain ndarray.
    self.latent_factors = pd.DataFrame(latent)

    # make the columns corex adds distinguishable from other columns
    out_df = d3m_DataFrame(inputs, generate_metadata=True)
    n_existing = out_df.shape[-1]
    self.latent_factors.columns = [
        str(n_existing + i) for i in range(self.latent_factors.shape[-1])
    ]

    # create metadata for the corex columns
    corex_df = d3m_DataFrame(self.latent_factors, generate_metadata=True)
    for column_index in range(corex_df.shape[1]):
        selector = (ALL_ELEMENTS, column_index)
        col_dict = dict(corex_df.metadata.query(selector))
        col_dict['structural_type'] = type(1.0)
        # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
        col_dict['name'] = 'corex_' + str(out_df.shape[1] + column_index)
        col_dict['semantic_types'] = (
            'http://schema.org/Float',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        corex_df.metadata = corex_df.metadata.update(selector, col_dict)

    # concatenate is --VERY-- slow without this next line
    corex_df.index = out_df.index.copy()
    out_df = utils.append_columns(out_df, corex_df)

    # remove the initial text columns from the df; doing this before CorEx
    # can produce an empty dataset error
    out_df = utils.remove_columns(out_df, self.text_columns)

    # TODO: incorporate timeout, max_iter
    return CallResult(out_df, True, 1)
def produce(
        self, *, inputs: Input, timeout: float = None, iterations: int = None
) -> CallResult[Output]:
    """Encode ``inputs`` and decode predictions from the encoded features.

    Parameters
    ----------
    inputs : Input
        Dataframe that includes a ``d3mIndex`` column; that column is removed
        before the data reaches the encoder.
    timeout : float
        Not used by this method.
    iterations : int
        Not used by this method.

    Returns
    -------
    CallResult[Output]
        With the ``use_as_modeling`` hyperparameter set: the predictions,
        combined with the input columns whose name contains "index".
        Otherwise: ``inputs`` with the encoder features followed by the
        predictions appended as float attribute columns.
    """
    self._extra_params()

    modeling = self.hyperparams['use_as_modeling']
    inp = self.model.input  # not used below; attribute access kept as in the original

    inps = inputs.remove_columns([inputs.columns.get_loc('d3mIndex')])

    features = self.enc_model.predict(inps, batch_size=self._batch)
    predictions = self.dec_model.predict(features, batch_size=self._batch)

    if self.label_encode is not None:
        # turn class scores into the original label values
        predictions = np.argmax(predictions, axis=-1)
        predictions = self.label_encode.inverse_transform(predictions)

    if modeling:
        output = d3m_DataFrame(predictions,
                               columns=self.output_columns,
                               generate_metadata=True,
                               source=self)
        self._training_indices = [
            name for name in inputs.columns
            if isinstance(name, str) and 'index' in name.lower()
        ]
        outputs = common_utils.combine_columns(
            return_result='new',
            add_index_columns=True,
            inputs=inputs,
            columns_list=[output],
            source=self,
            column_indices=self._training_indices)
    else:
        out_df = d3m_DataFrame(inputs, generate_metadata=True)

        features = np.array(features)
        # give predictions a trailing axis so they can be joined to features
        if len(predictions.shape) < len(features.shape):
            predictions = np.expand_dims(predictions, axis=-1)
        constructed = np.concatenate([features, predictions], axis=-1)

        corex_df = d3m_DataFrame(constructed, generate_metadata=True)
        n_existing = out_df.shape[1]
        for position in range(corex_df.shape[1]):
            selector = (ALL_ELEMENTS, position)
            column_meta = dict(corex_df.metadata.query(selector))
            column_meta['structural_type'] = type(1.0)
            # FIXME: assumes the primitive is applied once per template,
            # otherwise column names might duplicate
            column_meta['name'] = str(n_existing + position)
            column_meta['semantic_types'] = (
                'http://schema.org/Float',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            corex_df.metadata = corex_df.metadata.update(selector, column_meta)

        # concatenate is --VERY-- slow without this next line
        corex_df.index = out_df.index.copy()
        outputs = common_utils.append_columns(out_df, corex_df)

    return CallResult(outputs, True, 1)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Cluster the time series of a dataset and append the cluster labels.

    Parameters
    ----------
    inputs : dataset accepted by ``DatasetToDataFramePrimitive``; its
        ``learningData`` resource supplies the target and index metadata

    Returns
    ----------
    Outputs
        The ``learningData`` dataframe (restricted to labelled rows in the
        semi-supervised case) with a ``cluster_labels`` column appended,
        holding the result of ``self.clf.fit_predict``.
    """
    hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query(
    )['primitive_code']['class_type_arguments']['Hyperparams']
    ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
        hyperparams=hyperparams_class.defaults().replace(
            {"dataframe_resource": "learningData"}))
    metadata_inputs = ds2df_client.produce(inputs=inputs).value

    # temporary (until Uncharted adds conversion primitive to repo)
    if not self.hyperparams['long_format']:
        formatted_inputs = TimeSeriesFormatterPrimitive(
            hyperparams=self._hp).produce(inputs=inputs).value['0']
    else:
        # reuse the conversion above instead of converting the dataset twice
        formatted_inputs = d3m_DataFrame(metadata_inputs)

    # store information on target, index variable
    targets = metadata_inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    if not len(targets):
        targets = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
    metadata_columns = list(metadata_inputs)
    target_names = [metadata_columns[t] for t in targets]
    index = metadata_inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')

    # parse values from output of time series formatter
    n_ts = len(formatted_inputs.d3mIndex.unique())
    if n_ts == formatted_inputs.shape[0]:
        # one row per series: every non-index, non-target column is a feature
        X_test = formatted_inputs
        if len(index):
            X_test = X_test.drop(columns=list(formatted_inputs)[index[0]])
        X_test = X_test.drop(columns=target_names).values
    else:
        # several rows per series: reshape the 'value' column
        ts_sz = int(formatted_inputs.shape[0] / n_ts)
        X_test = np.array(formatted_inputs.value).reshape(n_ts, ts_sz)

    # special semi-supervised case - during training, only produce rows with labels
    series = metadata_inputs[target_names] != ''
    if series.any().any():
        labelled_rows = np.flatnonzero(series)
        metadata_inputs = dataframe_utils.select_rows(metadata_inputs, labelled_rows)
        X_test = X_test[labelled_rows]

    sloth_df = d3m_DataFrame(
        pandas.DataFrame(self.clf.fit_predict(X_test),
                         columns=['cluster_labels']))

    # metadata for the cluster-label column
    column_selector = (metadata_base.ALL_ELEMENTS, 0)
    col_dict = dict(sloth_df.metadata.query(column_selector))
    col_dict['structural_type'] = type(1)
    col_dict['name'] = 'cluster_labels'
    col_dict['semantic_types'] = (
        'http://schema.org/Integer',
        'https://metadata.datadrivendiscovery.org/types/Attribute',
        'https://metadata.datadrivendiscovery.org/types/CategoricalData')
    sloth_df.metadata = sloth_df.metadata.update(column_selector, col_dict)

    row_selector = (metadata_base.ALL_ELEMENTS, )
    df_dict = dict(sloth_df.metadata.query(row_selector))
    df_dict_1 = dict(sloth_df.metadata.query(row_selector))
    df_dict['dimension'] = df_dict_1
    df_dict_1['name'] = 'columns'
    df_dict_1['semantic_types'] = (
        'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
    df_dict_1['length'] = 1
    sloth_df.metadata = sloth_df.metadata.update(row_selector, df_dict)

    return CallResult(utils_cp.append_columns(metadata_inputs, sloth_df))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Embed the time series of a dataset with ``self.clf.fit_transform``.

    Parameters
    ----------
    inputs : dataset accepted by ``DatasetToDataFramePrimitive``; its
        ``learningData`` resource supplies the target and index metadata

    Returns
    ----------
    Outputs
        With the ``long_format`` hyperparameter set: a dataframe holding
        ``d3mIndex`` followed by ``n_components`` float columns named
        ``Dim0``, ``Dim1``, ... Otherwise: the ``learningData`` dataframe
        with ``n_components`` float attribute columns appended.

    Raises
    ----------
    IndexError
        If ``long_format`` is set and the data has no PrimaryKey column.
    """
    hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query(
    )['primitive_code']['class_type_arguments']['Hyperparams']
    ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
        hyperparams=hyperparams_class.defaults().replace(
            {"dataframe_resource": "learningData"}))
    metadata_inputs = ds2df_client.produce(inputs=inputs).value

    # temporary (until Uncharted adds conversion primitive to repo)
    if not self.hyperparams['long_format']:
        formatted_inputs = TimeSeriesFormatterPrimitive(
            hyperparams=self._hp).produce(inputs=inputs).value['0']
    else:
        # reuse the conversion above instead of converting the dataset twice
        formatted_inputs = d3m_DataFrame(metadata_inputs)

    # store information on target, index variable
    targets = metadata_inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    if not len(targets):
        targets = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
    metadata_columns = list(metadata_inputs)
    target_names = [metadata_columns[t] for t in targets]
    index = metadata_inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    index_names = [metadata_columns[i] for i in index]

    n_ts = len(formatted_inputs.d3mIndex.unique())
    if n_ts == formatted_inputs.shape[0]:
        # one row per series: every non-index, non-target column is a feature
        X_test = formatted_inputs
        if len(index):
            X_test = X_test.drop(columns=list(formatted_inputs)[index[0]])
        X_test = X_test.drop(columns=target_names).values
    else:
        # several rows per series: reshape the 'value' column
        ts_sz = int(formatted_inputs.shape[0] / n_ts)
        X_test = np.array(formatted_inputs.value).reshape(n_ts, ts_sz)

    # fit_transform data and create new dataframe
    n_components = self.hyperparams['n_components']
    col_names = ['Dim' + str(c) for c in range(0, n_components)]
    tsne_df = d3m_DataFrame(
        pandas.DataFrame(self.clf.fit_transform(X_test), columns=col_names))

    row_selector = (metadata_base.ALL_ELEMENTS, )
    tabular_column = (
        'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
    float_attribute = (
        'http://schema.org/Float',
        'https://metadata.datadrivendiscovery.org/types/Attribute')

    if self.hyperparams['long_format']:
        tsne_df = pandas.concat([formatted_inputs.d3mIndex, tsne_df], axis=1)

        # add index column metadata
        col_dict = dict(
            tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type('1')
        col_dict['name'] = index_names[0]
        # 'Integer' is the schema.org type name used by the sibling
        # primitives; the previous 'Int' matched none of them.
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        tsne_df.metadata = tsne_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        # add dimension columns metadata (shifted by one for the index)
        for c in range(1, n_components + 1):
            col_dict = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, c)))
            col_dict['structural_type'] = type(1.0)
            col_dict['name'] = 'Dim' + str(c - 1)
            col_dict['semantic_types'] = float_attribute
            tsne_df.metadata = tsne_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, c), col_dict)

        df_dict = dict(tsne_df.metadata.query(row_selector))
        df_dict_1 = dict(tsne_df.metadata.query(row_selector))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = tabular_column
        df_dict_1['length'] = n_components + 1
        tsne_df.metadata = tsne_df.metadata.update(row_selector, df_dict)

        return CallResult(tsne_df)

    for c in range(0, n_components):
        col_dict = dict(
            tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, c)))
        # embedding values are floats, so the structural type is float
        # (it was str, contradicting the Float semantic type)
        col_dict['structural_type'] = type(1.0)
        col_dict['name'] = str(c)
        col_dict['semantic_types'] = float_attribute
        tsne_df.metadata = tsne_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, c), col_dict)

    df_dict = dict(tsne_df.metadata.query(row_selector))
    df_dict_1 = dict(tsne_df.metadata.query(row_selector))
    df_dict['dimension'] = df_dict_1
    df_dict_1['name'] = 'columns'
    df_dict_1['semantic_types'] = tabular_column
    df_dict_1['length'] = n_components
    tsne_df.metadata = tsne_df.metadata.update(row_selector, df_dict)

    return CallResult(utils_cp.append_columns(metadata_inputs, tsne_df))
def produce(self,
            *,
            inputs: Inputs,
            timeout: float = None,
            iterations: int = None
            ) -> CallResult[container.pandas.DataFrame]:
    """
    Assign each series to a cluster with ``self._kmeans.predict``.

    Parameters
    ----------
    inputs : D3M dataframe with associated metadata.

    Returns
    -------
    Outputs
        When ``self.clustering`` is set: a two-column dataframe with
        ``d3mIndex`` and the cluster number, named after the first target
        column and marked as PredictedTarget.
        Otherwise: the input df (restricted to labelled rows when any
        target cell is non-empty) with an additional ``cluster_labels``
        feature.

    Raises
    ------
    IndexError
        If ``self.clustering`` is set and the input has no target or no
        PrimaryKey column.
    """
    # store information on target, index variable
    targets = inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    if not len(targets):
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
    column_names = list(inputs)
    target_names = [column_names[t] for t in targets]
    index = inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    index_names = [column_names[i] for i in index]

    # load and reshape the data
    n_ts = len(inputs.d3mIndex.unique())
    if n_ts == inputs.shape[0]:
        # one row per series: every non-index, non-target column is a feature
        X_test = inputs
        if len(index):
            X_test = X_test.drop(columns=column_names[index[0]])
        X_test = X_test.drop(columns=target_names).values
    else:
        # several rows per series: reshape the 'value' column
        ts_sz = int(inputs.shape[0] / n_ts)
        X_test = np.array(inputs.value).reshape(n_ts, ts_sz, 1)

    row_selector = (metadata_base.ALL_ELEMENTS, )
    tabular_column = (
        'https://metadata.datadrivendiscovery.org/types/TabularColumn', )

    if self.clustering:
        sloth_df = d3m_DataFrame(
            pandas.DataFrame(self._kmeans.predict(X_test),
                             columns=[target_names[0]]))
        sloth_df = pandas.concat([inputs.d3mIndex, sloth_df], axis=1)

        # first column ('d3mIndex')
        col_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = index_names[0]
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        )
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        # second column (the predicted target)
        col_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = target_names[0]
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        df_dict = dict(sloth_df.metadata.query(row_selector))
        df_dict_1 = dict(sloth_df.metadata.query(row_selector))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = tabular_column
        df_dict_1['length'] = 2
        sloth_df.metadata = sloth_df.metadata.update(row_selector, df_dict)

        return CallResult(sloth_df)

    # special semi-supervised case - during training, only produce rows with labels
    series = inputs[target_names] != ''
    if series.any().any():
        labelled_rows = np.flatnonzero(series)
        inputs = dataframe_utils.select_rows(inputs, labelled_rows)
        X_test = X_test[labelled_rows]

    sloth_df = d3m_DataFrame(
        pandas.DataFrame(self._kmeans.predict(X_test),
                         columns=['cluster_labels']))

    # add clusters as a feature in the main dataframe - last column
    col_dict = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict['structural_type'] = type(1)
    col_dict['name'] = 'cluster_labels'
    col_dict['semantic_types'] = (
        'http://schema.org/Integer',
        'https://metadata.datadrivendiscovery.org/types/Attribute',
        'https://metadata.datadrivendiscovery.org/types/CategoricalData')
    sloth_df.metadata = sloth_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), col_dict)

    df_dict = dict(sloth_df.metadata.query(row_selector))
    df_dict_1 = dict(sloth_df.metadata.query(row_selector))
    df_dict['dimension'] = df_dict_1
    df_dict_1['name'] = 'columns'
    df_dict_1['semantic_types'] = tabular_column
    df_dict_1['length'] = 1
    sloth_df.metadata = sloth_df.metadata.update(row_selector, df_dict)

    return CallResult(utils_cp.append_columns(inputs, sloth_df))
def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None) -> base.CallResult[container.DataFrame]:
    """Split a comma-separated vector column into one string column per element.

    Parameters
    ----------
    inputs : container.DataFrame
        Dataframe with attached metadata. The source column is taken from
        the ``vector_col_index`` hyperparameter or, when that is ``None``,
        from ``self._find_real_vector_column``.
    timeout : float
        Not used by this method.
    iterations : int
        Not used by this method.

    Returns
    -------
    base.CallResult[container.DataFrame]
        ``inputs`` with the element columns appended. Column names come from
        the ``labels`` hyperparameter, or are generated as
        ``<source name>_<position>`` when no labels are supplied.

    Raises
    ------
    exceptions.InvalidArgumentValueError
        If ``self._can_use_column`` rejects the selected column.
    """
    # if no column index is supplied use the first real vector column found in the dataset
    vector_idx = self.hyperparams['vector_col_index']
    if vector_idx is None:
        vector_idx = self._find_real_vector_column(inputs.metadata)

    # validate the column
    if not self._can_use_column(inputs.metadata, vector_idx):
        raise exceptions.InvalidArgumentValueError(
            'column idx=' + str(vector_idx) + ' from ' +
            str(inputs.columns) + ' does not contain float vectors')

    # Check for None BEFORE building the list: list(None) raises TypeError,
    # which made the original None check unreachable.
    configured_labels = self.hyperparams['labels']
    labels = list(configured_labels) if configured_labels is not None else []
    # flag label generation if none are supplied
    generate_labels = len(labels) == 0

    # create a dataframe to hold the new columns
    vector_dataframe = container.DataFrame(data=[])

    # loop over elements of the source vector column
    for i, v in enumerate(inputs.iloc[:, vector_idx]):
        elems = v.split(',')
        if i == 0:
            # the first row defines the columns: look up the source name once
            vector_label = inputs.metadata.query_column(vector_idx)['name']
        for j, e in enumerate(elems):
            if i == 0:
                # create an empty column for each element of the vector
                if generate_labels:
                    labels.append(vector_label + "_" + str(j))
                vector_dataframe[labels[j]] = ''
            # write vector elements into each column - force to string as d3m
            # convention is to store data as pandas 'obj' type until
            # explicitly cast
            vector_dataframe.at[i, labels[j]] = str(e.strip())

    # create default d3m metadata structures (rows, columns etc.) and copy the
    # semantic types from the source vector over, replacing FloatVector with Float
    vector_dataframe.metadata = vector_dataframe.metadata.set_for_value(
        vector_dataframe)

    source_semantic_types = list(
        inputs.metadata.query_column(vector_idx)['semantic_types'])
    source_semantic_types.remove(
        'https://metadata.datadrivendiscovery.org/types/FloatVector')
    # NOTE(review): other primitives in this file tag floats with
    # 'http://schema.org/Float' -- confirm this URI is the intended one.
    source_semantic_types.append(
        'https://metadata.datadrivendiscovery.org/types/Float')
    for i in range(0, len(labels)):
        vector_dataframe.metadata = vector_dataframe.metadata.update_column(
            i, {'semantic_types': source_semantic_types})

    output = utils.append_columns(inputs, vector_dataframe)

    # wrap as a D3M container - metadata should be auto generated
    return base.CallResult(output)