Example #1
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        produce_data, learning_df, nodes_df, edges_df = self._parse_inputs(
            inputs, return_all=True)
        if self.fitted:
            result = self._sdne._Y
        else:
            dim = self.hyperparams['dimension']
            alpha = self.hyperparams['alpha']
            beta = self.hyperparams['beta']
            self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta)

            produce_data = networkx.from_scipy_sparse_matrix(produce_data)
            self._sdne.learn_embedding(graph=produce_data)
            self._model = self._sdne._model
            result = self._sdne._Y

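        # semantic types that mark target columns; these are excluded from the returned feature frame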
        target_types = [
            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
        ]
        if self.hyperparams['return_list']:
            result_np = container.ndarray(result, generate_metadata=True)
            return_list = d3m_List([result_np, inputs[1], inputs[2]],
                                   generate_metadata=True)
            return CallResult(return_list, True, 1)
        else:
            learn_df = d3m_DataFrame(learning_df, generate_metadata=True)
            learn_df = get_columns_not_of_type(learn_df, target_types)

            learn_df = learn_df.remove_columns(
                [learn_df.columns.get_loc('nodeID')])

            result_df = d3m_DataFrame(result, generate_metadata=True)
            result_df = result_df.loc[result_df.index.isin(
                learning_df['d3mIndex'].values)]

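            # annotate each embedding column as a float Attribute so downstream steps treat it as a feature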
            for column_index in range(result_df.shape[1]):
                col_dict = dict(
                    result_df.metadata.query((ALL_ELEMENTS, column_index)))
                col_dict['structural_type'] = type(1.0)
                col_dict['name'] = str(learn_df.shape[1] + column_index)
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')

                result_df.metadata = result_df.metadata.update(
                    (ALL_ELEMENTS, column_index), col_dict)
            result_df.index = learn_df.index.copy()

            output = utils.append_columns(learn_df, result_df)
            return CallResult(output, True, 1)
Example #2
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : dataframe 

        Returns
        ----------
        Outputs
            The output is the input dataframe with an appended column containing the cluster label assigned to each row
        """ 
    
        targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]
 
        X_test = inputs.drop(columns = list(inputs)[index[0]])
        X_test = X_test.drop(columns = target_names).values
        
        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = dataframe_utils.select_rows(inputs, np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

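        # cluster the samples and wrap the predicted labels in a one-column d3m DataFrame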
        sc_df = d3m_DataFrame(pandas.DataFrame(self.sc.fit_predict(X_test), columns=['cluster_labels']))

        # add metadata for the new 'cluster_labels' column
        col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        if self.hyperparams['task_type'] == 'classification':
            col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')
            col_dict['name'] = 'cluster_labels'
        else:
            col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            col_dict['name'] = target_names[0]
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)
        
        df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, ))) 
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = ('https://metadata.datadrivendiscovery.org/types/TabularColumn',)
        df_dict_1['length'] = 1        
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS,), df_dict)
                
        return CallResult(utils_cp.append_columns(inputs, sc_df))
                  
Example #3
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:

        self.columns = list(inputs)
        X_ = inputs[self.columns].values

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 10000

        if not self.fitted:
            raise ValueError('Please fit before calling produce')

        self.latent_factors = self.model.transform(X_)

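        # wrap both the inputs and the latent factors as d3m DataFrames so column metadata can be attached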
        out_df = d3m_DataFrame(inputs, generate_metadata=True)
        corex_df = d3m_DataFrame(self.latent_factors, generate_metadata=True)

        for column_index in range(corex_df.shape[1]):
            col_dict = dict(
                corex_df.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type(1.0)
            # FIXME: assumes corex is applied only once per pipeline, otherwise column names might duplicate
            col_dict['name'] = str(out_df.shape[1] + column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Float',
                'https://metadata.datadrivendiscovery.org/types/Attribute')

            corex_df.metadata = corex_df.metadata.update(
                (ALL_ELEMENTS, column_index), col_dict)
        corex_df.index = out_df.index.copy()

        out_df = utils.append_columns(out_df, corex_df)

        return CallResult(out_df, True, self.max_iter)
Example #4
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        columns_list_to_fold = self._mapping.get('foldable_columns', [])
        if len(columns_list_to_fold) == 0:
            return CallResult(inputs, True, 1)
        if inputs.shape[0] > 20000:
            return CallResult(inputs, True, 1)
        self._column_names = list(inputs) if inputs is not None else []
        df = None
        for columns_to_fold in columns_list_to_fold:
            df = self._fold_columns(inputs, columns_to_fold)
        cols_to_drop = list()
        for col_idx, col_name in enumerate(inputs.columns):
            if col_name not in df.columns:
                cols_to_drop.append(col_idx)

        inputs = utils.remove_columns(inputs, cols_to_drop)
        new_df = inputs[0:0]
        for col_name in new_df.columns:
            new_df.loc[:, col_name] = df.loc[:, col_name]

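        # collect folded columns that do not already exist in the trimmed frame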
        extends = {}
        for col_name in df.columns:
            if col_name not in new_df.columns:
                extends[col_name] = df.loc[:, col_name].tolist()

        if extends:
            extends_df = d3m_DataFrame.from_dict(extends)
            extends_df.index = new_df.index.copy()
            new_df = utils.append_columns(new_df, extends_df)
            new_df = self._update_type(new_df, list(extends.keys()))

        old_metadata = dict(new_df.metadata.query(()))
        old_metadata["dimension"] = dict(old_metadata["dimension"])
        old_metadata["dimension"]["length"] = new_df.shape[0]
        new_df.metadata = new_df.metadata.update((), old_metadata)

        return CallResult(new_df, True,
                          1) if new_df is not None else CallResult(
                              inputs, True, 1)
Example #5
def update_type(extends, df_origin):
    extends_df = d3m_DataFrame.from_dict(extends)
    if extends != {}:
        extends_df.index = df_origin.index.copy()
    new_df = utils.append_columns(df_origin, extends_df)

    indices = list()
    for key in extends:
        indices.append(new_df.columns.get_loc(key))

    for idx in indices:
        old_metadata = dict(new_df.metadata.query((mbase.ALL_ELEMENTS, idx)))

        numerics = pd.to_numeric(new_df.iloc[:, idx], errors='coerce')
        length = numerics.shape[0]
        nans = numerics.isnull().sum()

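        # heuristic: if over 90% of values fail numeric coercion, treat the column as categorical or text; otherwise pick Integer vs. Float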
        if nans / length > 0.9:
            if HelperFunction.is_categorical(new_df.iloc[:, idx]):
                old_metadata['semantic_types'] = (
                    "https://metadata.datadrivendiscovery.org/types/CategoricalData",
                )
            else:
                old_metadata['semantic_types'] = ("http://schema.org/Text", )
        else:
            intcheck = (numerics % 1) == 0
            if np.sum(intcheck) / length > 0.9:
                old_metadata['semantic_types'] = (
                    "http://schema.org/Integer", )
            else:
                old_metadata['semantic_types'] = ("http://schema.org/Float", )

        old_metadata['semantic_types'] += (
            "https://metadata.datadrivendiscovery.org/types/Attribute", )

        new_df.metadata = new_df.metadata.update((mbase.ALL_ELEMENTS, idx),
                                                 old_metadata)

    return new_df
Example #6
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : dataframe with attached metadata for semi-supervised or unsupervised data

        Returns
        ----------
        Outputs
            The output depends on the required_output hyperparameter: either a dataframe containing a single column
            where each entry is the cluster ID, or the input dataframe with each row's cluster ID appended as an additional feature.
        """

        # find target and index variables
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]

        X_test = inputs.copy()
        if len(index):
            X_test = X_test.drop(columns=list(inputs)[index[0]])
        if len(target_names):
            X_test = X_test.drop(columns=target_names)
        X_test = X_test.values

        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = dataframe_utils.select_rows(inputs,
                                                 np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        if self.hyperparams['required_output'] == 'feature':

            hdb_df = d3m_DataFrame(
                pandas.DataFrame(self.clf.fit_predict(X_test),
                                 columns=['cluster_labels']))

            # add metadata for the new 'cluster_labels' column
            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = type(1)
            col_dict['name'] = 'cluster_labels'
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

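            # rebuild the table-level 'dimension' metadata to reflect the single appended column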
            df_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 1
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(utils_cp.append_columns(inputs, hdb_df))
        else:

            hdb_df = d3m_DataFrame(
                pandas.DataFrame(self.clf.fit_predict(X_test),
                                 columns=[target_names[0]]))

            hdb_df = pandas.concat([inputs.d3mIndex, hdb_df], axis=1)

            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = type(1)
            col_dict['name'] = index_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
            col_dict['structural_type'] = type(1)
            col_dict['name'] = target_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 1), col_dict)

            df_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 2
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(hdb_df)
Example #7
    def _process_files(self, inputs: Input):
        fn_attributes = inputs.metadata.list_columns_with_semantic_types(
            semantic_types=["https://metadata.datadrivendiscovery.org/types/FileName"])
        all_attributes = inputs.metadata.list_columns_with_semantic_types(
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
        fn_columns = list(set(all_attributes).intersection(fn_attributes))

        # if no file name columns are detected, default to regular behavior
        if len(fn_columns) == 0:
            return inputs

        # create an empty DataFrame of the required size
        processed_cols = pd.DataFrame("", index = copy.deepcopy(inputs.index), \
            columns = ['text_files_' + str(i) for i in range(len(fn_columns))])

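        # read each referenced text file and store its tokenized contents in the corresponding new column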
        for column_index in fn_columns:
            curr_column = copy.deepcopy(inputs.iloc[:, column_index])

            file_loc = inputs.metadata.query(
                (ALL_ELEMENTS, column_index))['location_base_uris']
            file_loc = file_loc[0]  # take the first elem of the tuple
            file_loc = file_loc[7:]  # get rid of 'file://' prefix

            for row_index in range(curr_column.shape[0]):
                text_file = curr_column.iloc[row_index]
                file_path = file_loc + text_file

                with open(file_path, 'rb') as file:
                    doc = file.read()
                doc = "".join(map(chr, doc))
                doc_tokens = re.compile(r"(?u)\b\w\w+\b").findall(
                    doc)  # list of strings

                processed_cols.iloc[row_index,
                                    fn_columns.index(column_index)] = " ".join(
                                        doc_tokens)

        # construct metadata for the newly generated columns
        processed_cols = d3m_DataFrame(processed_cols, generate_metadata=True)

        for column_index in range(processed_cols.shape[1]):
            col_dict = dict(
                processed_cols.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type("text")
            # FIXME: assumes this primitive runs only once per pipeline, otherwise column names might duplicate
            col_dict['name'] = 'processed_file_' + str(inputs.shape[1] +
                                                       column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Text',
                'https://metadata.datadrivendiscovery.org/types/Attribute')

            processed_cols.metadata = processed_cols.metadata.update(
                (ALL_ELEMENTS, column_index), col_dict)

        # concatenate the input with the newly created columns
        updated_inputs = utils.append_columns(inputs, processed_cols)

        # remove the initial FileName columns from the df, if we do this before concatenating we might get an empty dataset error
        updated_inputs = utils.remove_columns(updated_inputs, fn_columns)

        return updated_inputs
Example #8
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        # if corex didn't run for any reason, just return the given dataset
        if self.do_nothing:
            return CallResult(inputs, True, 1)

        inputs = self._process_files(inputs)
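        # after processing, any FileName columns have been replaced by extracted text columns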

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 250
        self.model.max_iter = self.max_iter

        # concatenate the columns row-wise
        concat_cols = None
        for column_index in self.text_columns:
            if concat_cols is not None:
                concat_cols = concat_cols.str.cat(inputs.iloc[:, column_index],
                                                  sep=" ")
            else:
                concat_cols = copy.deepcopy(inputs.iloc[:, column_index])
        bow = self.bow.transform(map(self._get_ngrams, concat_cols.ravel()))

        # choose between CorEx and the TfIdf matrix
        if bow.shape[1] > self.hyperparams['threshold']:
            # use CorEx
            self.latent_factors = self.model.transform(bow).astype(float)
        else:
            # just use the bag of words representation
            self.latent_factors = pd.DataFrame(bow.todense())
        # make the columns corex adds distinguishable from other columns

        # remove the selected columns from input and add the latent factors given by corex
        out_df = d3m_DataFrame(inputs, generate_metadata=True)

        self.latent_factors.columns = [
            str(out_df.shape[-1] + i)
            for i in range(self.latent_factors.shape[-1])
        ]

        # create metadata for the corex columns
        corex_df = d3m_DataFrame(self.latent_factors, generate_metadata=True)
        for column_index in range(corex_df.shape[1]):
            col_dict = dict(
                corex_df.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type(1.0)
            # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
            col_dict['name'] = 'corex_' + str(out_df.shape[1] + column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Float',
                'https://metadata.datadrivendiscovery.org/types/Attribute')

            corex_df.metadata = corex_df.metadata.update(
                (ALL_ELEMENTS, column_index), col_dict)

        # concatenate is --VERY-- slow without this next line
        corex_df.index = out_df.index.copy()

        out_df = utils.append_columns(out_df, corex_df)

        # remove the initial text columns from the df, if we do this before CorEx we can get an empty dataset error
        out_df = utils.remove_columns(out_df, self.text_columns)

        # TODO: incorporate timeout and max_iter
        return CallResult(out_df, True, 1)
Example #9
    def produce(
        self,
        *,
        inputs: Input,
        timeout: float = None,
        iterations: int = None
    ) -> CallResult[Output]:  # TAKES IN DF with index column
        self._extra_params()

        modeling = self.hyperparams['use_as_modeling']
        inp = self.model.input

        inps = inputs.remove_columns([inputs.columns.get_loc('d3mIndex')])

        features = self.enc_model.predict(inps, batch_size=self._batch)
        predictions = self.dec_model.predict(features, batch_size=self._batch)

        if self.label_encode is not None:
            predictions = np.argmax(predictions, axis=-1)
            predictions = self.label_encode.inverse_transform(predictions)

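        # 'use_as_modeling' returns the raw predictions; otherwise features and predictions are appended to the inputs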
        if modeling:
            output = d3m_DataFrame(predictions,
                                   columns=self.output_columns,
                                   generate_metadata=True,
                                   source=self)
        else:
            out_df = d3m_DataFrame(inputs, generate_metadata=True)

            # create metadata for the corex columns
            features = np.array(features)

            if len(predictions.shape) < len(features.shape):
                predictions = np.expand_dims(predictions, axis=-1)

            constructed = np.concatenate([features, predictions], axis=-1)
            corex_df = d3m_DataFrame(constructed, generate_metadata=True)

            for column_index in range(corex_df.shape[1]):
                col_dict = dict(
                    corex_df.metadata.query((ALL_ELEMENTS, column_index)))
                col_dict['structural_type'] = type(1.0)
                # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
                col_dict['name'] = str(out_df.shape[1] + column_index)
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')

                corex_df.metadata = corex_df.metadata.update(
                    (ALL_ELEMENTS, column_index), col_dict)

            # concatenate is --VERY-- slow without this next line
            corex_df.index = out_df.index.copy()

            outputs = common_utils.append_columns(out_df, corex_df)

        if modeling:
            self._training_indices = [
                c for c in inputs.columns
                if isinstance(c, str) and 'index' in c.lower()
            ]

            outputs = common_utils.combine_columns(
                return_result='new',  #self.hyperparams['return_result'],
                add_index_columns=True,  #self.hyperparams['add_index_columns'],
                inputs=inputs,
                columns_list=[output],
                source=self,
                column_indices=self._training_indices)

        return CallResult(outputs, True, 1)
Example #10
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : numpy ndarray of size (number_of_time_series, time_series_length) containing new time series 

        Returns
        ----------
        Outputs
            The output is a dataframe containing a single column where each entry is the associated series' cluster number.
        """

        hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query(
        )['primitive_code']['class_type_arguments']['Hyperparams']
        ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
            hyperparams=hyperparams_class.defaults().replace(
                {"dataframe_resource": "learningData"}))
        metadata_inputs = ds2df_client.produce(inputs=inputs).value

        # temporary (until Uncharted adds conversion primitive to repo)
        if not self.hyperparams['long_format']:
            formatted_inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            formatted_inputs = d3m_DataFrame(
                ds2df_client.produce(inputs=inputs).value)

        # store information on target, index variable
        targets = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = metadata_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(metadata_inputs)[t] for t in targets]
        index = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')

        # parse values from output of time series formatter
        n_ts = len(formatted_inputs.d3mIndex.unique())
        if n_ts == formatted_inputs.shape[0]:
            X_test = formatted_inputs.drop(
                columns=list(formatted_inputs)[index[0]])
            X_test = X_test.drop(columns=target_names).values
        else:
            ts_sz = int(formatted_inputs.shape[0] / n_ts)
            X_test = np.array(formatted_inputs.value).reshape(n_ts, ts_sz)

        # special semi-supervised case - during training, only produce rows with labels
        series = metadata_inputs[target_names] != ''
        if series.any().any():
            metadata_inputs = dataframe_utils.select_rows(
                metadata_inputs, np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

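        # cluster the (possibly reshaped) series and wrap the labels in a one-column d3m DataFrame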
        sloth_df = d3m_DataFrame(
            pandas.DataFrame(self.clf.fit_predict(X_test),
                             columns=['cluster_labels']))
        # add metadata for the new 'cluster_labels' column
        col_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        col_dict['name'] = 'cluster_labels'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
            'https://metadata.datadrivendiscovery.org/types/CategoricalData')
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)
        df_dict = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = 1
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)

        return CallResult(utils_cp.append_columns(metadata_inputs, sloth_df))
Example #11
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : numpy ndarray of size (number_of_time_series, time_series_length) containing new time series 

        Returns
        ----------
        Outputs
            The output is a transformed dataframe of X fit into an embedded space, n feature columns will equal n_components hyperparameter
            For timeseries datasets the output is the dimensions concatenated to the timeseries filename dataframe
        """

        hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query(
        )['primitive_code']['class_type_arguments']['Hyperparams']
        ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
            hyperparams=hyperparams_class.defaults().replace(
                {"dataframe_resource": "learningData"}))
        metadata_inputs = ds2df_client.produce(inputs=inputs).value

        # temporary (until Uncharted adds conversion primitive to repo)
        if not self.hyperparams['long_format']:
            formatted_inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            formatted_inputs = d3m_DataFrame(
                ds2df_client.produce(inputs=inputs).value)

        # store information on target, index variable
        targets = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = metadata_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(metadata_inputs)[t] for t in targets]
        index = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(metadata_inputs)[i] for i in index]

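        # when multiple rows share a d3mIndex, reshape the long-format values into (n_series, series_length)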
        n_ts = len(formatted_inputs.d3mIndex.unique())
        if n_ts == formatted_inputs.shape[0]:
            X_test = formatted_inputs.drop(
                columns=list(formatted_inputs)[index[0]])
            X_test = X_test.drop(columns=target_names).values
        else:
            ts_sz = int(formatted_inputs.shape[0] / n_ts)
            X_test = np.array(formatted_inputs.value).reshape(n_ts, ts_sz)

        # fit_transform data and create new dataframe
        n_components = self.hyperparams['n_components']
        col_names = ['Dim' + str(c) for c in range(0, n_components)]

        tsne_df = d3m_DataFrame(
            pandas.DataFrame(self.clf.fit_transform(X_test),
                             columns=col_names))
        if self.hyperparams['long_format']:
            tsne_df = pandas.concat([formatted_inputs.d3mIndex, tsne_df],
                                    axis=1)

            # add index column metadata
            col_dict = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = type('1')
            col_dict['name'] = index_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
            tsne_df.metadata = tsne_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            # add dimension columns metadata
            for c in range(1, n_components + 1):
                col_dict = dict(
                    tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, c)))
                col_dict['structural_type'] = type(1.0)
                col_dict['name'] = 'Dim' + str(c - 1)
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
                tsne_df.metadata = tsne_df.metadata.update(
                    (metadata_base.ALL_ELEMENTS, c), col_dict)

            df_dict = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = n_components + 1
            tsne_df.metadata = tsne_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(tsne_df)

        else:
            for c in range(0, n_components):
                col_dict = dict(
                    tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, c)))
                col_dict['structural_type'] = type(1.0)
                col_dict['name'] = str(c)
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
                tsne_df.metadata = tsne_df.metadata.update(
                    (metadata_base.ALL_ELEMENTS, c), col_dict)

            df_dict = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = n_components
            tsne_df.metadata = tsne_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(utils_cp.append_columns(metadata_inputs,
                                                      tsne_df))
Example #12
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None
                ) -> CallResult[container.pandas.DataFrame]:
        """
        Parameters
        ----------
        inputs : D3M dataframe with associated metadata.

        Returns
        -------
        Outputs
            For unsupervised problems: The output is a dataframe containing a single column where each entry is the associated series' cluster number.
            For semi-supervised problems: The output is the input df containing an additional feature - cluster_label
        """

        # store information on target, index variable
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]

        # load and reshape training data
        n_ts = len(inputs.d3mIndex.unique())
        if n_ts == inputs.shape[0]:
            X_test = inputs.drop(columns=list(inputs)[index[0]])
            X_test = X_test.drop(columns=target_names).values
        else:
            ts_sz = int(inputs.shape[0] / n_ts)
            X_test = np.array(inputs.value).reshape(n_ts, ts_sz, 1)

        # pure clustering case - return the d3mIndex with predicted cluster labels as the target
        if self.clustering:

            sloth_df = d3m_DataFrame(
                pandas.DataFrame(self._kmeans.predict(X_test),
                                 columns=[target_names[0]]))

            sloth_df = pandas.concat([inputs.d3mIndex, sloth_df], axis=1)

            # first column ('d3mIndex')

            col_dict = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = type("1")
            col_dict['name'] = index_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
            )
            sloth_df.metadata = sloth_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            # second column ('Class')
            col_dict = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
            col_dict['structural_type'] = type("1")
            col_dict['name'] = target_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            sloth_df.metadata = sloth_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 1), col_dict)

            df_dict = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 2
            sloth_df.metadata = sloth_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(sloth_df)

        else:
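            # special semi-supervised case - during training, only produce rows with labels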
            series = inputs[target_names] != ''
            if series.any().any():
                inputs = dataframe_utils.select_rows(inputs,
                                                     np.flatnonzero(series))
                X_test = X_test[np.flatnonzero(series)]

            sloth_df = d3m_DataFrame(
                pandas.DataFrame(self._kmeans.predict(X_test),
                                 columns=['cluster_labels']))

            # add clusters as a feature in the main dataframe - last column ('clusters')
            col_dict = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = type(1)
            col_dict['name'] = 'cluster_labels'
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute',
                'https://metadata.datadrivendiscovery.org/types/CategoricalData'
            )
            sloth_df.metadata = sloth_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)
            df_dict = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 1
            sloth_df.metadata = sloth_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(utils_cp.append_columns(inputs, sloth_df))
Example #13
    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        # if no column index is supplied use the first real vector column found in the dataset
        vector_idx = self.hyperparams['vector_col_index']
        if vector_idx is None:
            vector_idx = self._find_real_vector_column(inputs.metadata)
        # validate the column
        if not self._can_use_column(inputs.metadata, vector_idx):
            raise exceptions.InvalidArgumentValueError(
                'column idx=' + str(vector_idx) + ' from ' +
                str(inputs.columns) + ' does not contain float vectors')
        # flag label generation if none are supplied
        labels = self.hyperparams['labels']
        labels = list(labels) if labels is not None else []
        generate_labels = len(labels) == 0

        # create a dataframe to hold the new columns
        vector_dataframe = container.DataFrame(data=[])

        # loop over elements of the source vector column
        for i, v in enumerate(inputs.iloc[:, vector_idx]):
            elems = v.split(',')
            vector_length = len(elems)
            for j, e in enumerate(elems):
                # initialize columns when processing first row
                if i == 0:
                    # get the name of the source vector column
                    vector_col_metadata = inputs.metadata.query_column(
                        vector_idx)
                    vector_label = vector_col_metadata['name']

                    # create an empty column for each element of the vector
                    if generate_labels:
                        labels.append(vector_label + "_" + str(j))
                    vector_dataframe[labels[j]] = ''

                # write vector elements into each column - force to string as d3m convention is
                # to store data as pandas 'obj' type until explicitly cast
                vector_dataframe.at[i, labels[j]] = str(e.strip())

        # create default d3m metadata structures (rows, columns etc.) and copy the semantic types
        # from the source vector over, replacing FloatVector with Float
        vector_dataframe.metadata = vector_dataframe.metadata.set_for_value(
            vector_dataframe)
        source_semantic_types = list(
            inputs.metadata.query_column(vector_idx)['semantic_types'])
        source_semantic_types.remove(
            'https://metadata.datadrivendiscovery.org/types/FloatVector')
        source_semantic_types.append(
            'https://metadata.datadrivendiscovery.org/types/Float')
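        # apply the adjusted semantic types to each newly created element column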
        for i in range(0, len(labels)):
            vector_dataframe.metadata = vector_dataframe.metadata.\
                update_column(i, {'semantic_types': source_semantic_types})

        output = utils.append_columns(inputs, vector_dataframe)

        # wrap as a D3M container - metadata should be auto generated
        return base.CallResult(output)