Example #1
    def test_basic(self):
        dataset = test_utils.load_dataset(self._dataset_path)

        dataframe_hyperparams_class = (
            dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.
            get_hyperparams())
        dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(
            hyperparams=dataframe_hyperparams_class.defaults().replace(
                {"dataframe_resource": "0"}))
        dataframe = dataframe_primitive.produce(inputs=dataset).value
        image_hyperparams_class = (
            dataframe_image_reader.DataFrameImageReaderPrimitive.metadata.
            get_hyperparams())
        image_primitive = dataframe_image_reader.DataFrameImageReaderPrimitive(
            hyperparams=image_hyperparams_class.defaults().replace(
                {"return_result": "replace"}))
        images = image_primitive.produce(inputs=dataframe).value

        image_transfer_hyperparams = ImageTransferPrimitive.metadata.get_hyperparams()
        primitive_volumes = ImageTransferPrimitive.metadata.get_volumes()
        volumes = {
            primitive_volumes[0]["key"]:
            os.getenv("D3MSTATICDIR") + "/" +
            primitive_volumes[0]["file_digest"]
        }
        image_transfer_primitive = ImageTransferPrimitive(
            hyperparams=image_transfer_hyperparams.defaults().replace(
                {"filename_col": 0}),
            volumes=volumes,
        )
        result = image_transfer_primitive.produce(inputs=images).value
        self.assertEqual(result.shape[0], 5)
        self.assertEqual(result.shape[1], 512)
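
A note on the volume path built above: os.getenv("D3MSTATICDIR") returns None when the variable is unset, so the string concatenation fails with a TypeError. A minimal sketch of a safer lookup (the helper name resolve_volume_path is illustrative, not part of the d3m API):

    import os

    def resolve_volume_path(volume_entry):
        # fail fast with a clear message if the static directory is not configured
        static_dir = os.environ.get("D3MSTATICDIR")
        if static_dir is None:
            raise RuntimeError("D3MSTATICDIR environment variable is not set")
        # os.path.join avoids manual separator handling
        return os.path.join(static_dir, volume_entry["file_digest"])

    # usage: volumes = {v["key"]: resolve_volume_path(v) for v in primitive_volumes}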
Example #2
    def test_no_hyperparams_semantic_type(self):
        dataset = test_utils.load_dataset(self._dataset_path)

        dataframe_hyperparams_class = (
            dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams()
        )
        dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(
            hyperparams=dataframe_hyperparams_class.defaults()
        )
        audio_df = dataframe_primitive.produce(inputs=dataset).value

        audio_df.metadata = audio_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0), "http://schema.org/AudioObject"
        )

        audio_transfer_hyperparams = AudioTransferPrimitive.metadata.get_hyperparams()
        primitive_volumes = AudioTransferPrimitive.metadata.get_volumes()
        volumes = {
            primitive_volumes[0]["key"]: os.getenv("D3MSTATICDIR")
            + "/"
            + primitive_volumes[0]["file_digest"]
        }
        audio_transfer_primitive = AudioTransferPrimitive(
            hyperparams=audio_transfer_hyperparams.defaults(), volumes=volumes
        )

        result = audio_transfer_primitive.produce(inputs=audio_df).value
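
If needed, the added semantic type can be verified before calling produce. A minimal check, assuming the standard d3m metadata API (query_column returns the metadata dict for a column index):

    col_meta = audio_df.metadata.query_column(0)
    assert "http://schema.org/AudioObject" in col_meta.get("semantic_types", ())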
Example #3
    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        '''
        Sets the primitive's training data.

        Parameters
        ----------
        inputs: numpy ndarray of size (number_of_time_series, time_series_length, dimension) containing training time series

        outputs: numpy ndarray of size (number_of_time_series,) containing classes of training time series
        '''
        if not self.hyperparams['long_format']:
            inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
            ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
                hyperparams=hyperparams_class.defaults().replace(
                    {"dataframe_resource": "learningData"}))
            inputs = d3m_DataFrame(ds2df_client.produce(inputs=inputs).value)

        # load and reshape training data
        # 'series_id' and 'value' should be set by metadata
        n_ts = len(inputs.d3mIndex.unique())
        ts_sz = int(inputs.shape[0] / n_ts)
        self._X_train = np.array(inputs.value).reshape(n_ts, ts_sz, 1)
        self._y_train = np.array(inputs.label.iloc[::ts_sz]).reshape(-1)
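
The reshape above assumes every series has the same length, so the long-format rows stack evenly into shape (n_ts, ts_sz, 1). A toy illustration with plain numpy (values are made up):

    import numpy as np

    # two series of length three, stored long-format as six rows
    values = np.array([1.0, 2.0, 3.0, 10.0, 20.0, 30.0])
    n_ts, ts_sz = 2, 3
    X = values.reshape(n_ts, ts_sz, 1)
    assert X.shape == (2, 3, 1)
    assert X[1, 0, 0] == 10.0  # first value of the second series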
Example #4
    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        '''
        Sets the primitive's training data.

        Parameters
        ----------
        inputs: numpy ndarray of size (number_of_time_series, time_series_length) containing training time series
        '''
        hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults().replace({"dataframe_resource": "learningData"}))
        metadata_inputs = ds2df_client.produce(inputs=inputs).value
        if not self.hyperparams['long_format']:
            formatted_inputs = TimeSeriesFormatterPrimitive(hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            formatted_inputs = ds2df_client.produce(inputs=inputs).value

        # store information on target, index variable
        # fall back through the target semantic types: TrueTarget, then Target, then SuggestedTarget
        targets = metadata_inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = metadata_inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = metadata_inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        target_names = [list(metadata_inputs)[t] for t in targets]
        index = metadata_inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')

        # if no target labels are present, treat the task as pure clustering
        series = metadata_inputs[target_names] != ''
        self.clustering = 0
        if not series.any().any():
            self.clustering = 1

        # load and reshape training data
        n_ts = len(formatted_inputs.d3mIndex.unique())
        if n_ts == formatted_inputs.shape[0]:
            self._kmeans = sk_kmeans(n_clusters=self.hyperparams['nclusters'], n_init=self.hyperparams['n_init'], random_state=self.random_seed)
            self._X_train_all_data = formatted_inputs.drop(columns=list(formatted_inputs)[index[0]])
            self._X_train = self._X_train_all_data.drop(columns=target_names).values
        else:
            self._kmeans = KMeans(self.hyperparams['nclusters'], self.hyperparams['algorithm'])
            ts_sz = int(formatted_inputs.shape[0] / n_ts)
            self._X_train = np.array(formatted_inputs.value).reshape(n_ts, ts_sz, 1)
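
The target lookup above falls through a chain of semantic types. A hedged sketch of the same chain as a standalone helper (the function name find_target_columns is illustrative; get_columns_with_semantic_type is the standard d3m metadata call):

    TARGET_TYPES = (
        'https://metadata.datadrivendiscovery.org/types/TrueTarget',
        'https://metadata.datadrivendiscovery.org/types/Target',
        'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
    )

    def find_target_columns(metadata):
        # return column indices for the first target semantic type that matches
        for semantic_type in TARGET_TYPES:
            targets = metadata.get_columns_with_semantic_type(semantic_type)
            if len(targets):
                return targets
        return []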
Example #5
    def test_no_hyperparam(self):
        dataset = test_utils.load_dataset(self._dataset_path)

        dataframe_hyperparams_class = (
            dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.
            get_hyperparams())
        dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(
            hyperparams=dataframe_hyperparams_class.defaults().replace(
                {"dataframe_resource": "0"}))
        dataframe = dataframe_primitive.produce(inputs=dataset).value
        image_hyperparams_class = (
            dataframe_image_reader.DataFrameImageReaderPrimitive.metadata.
            get_hyperparams())
        image_primitive = dataframe_image_reader.DataFrameImageReaderPrimitive(
            hyperparams=image_hyperparams_class.defaults().replace(
                {"return_result": "replace"}))
        images = image_primitive.produce(inputs=dataframe).value
        images.metadata = images.metadata.add_semantic_type(
            (
                metadata_base.ALL_ELEMENTS,
                images.metadata.get_column_index_from_column_name("filename"),
            ),
            "http://schema.org/ImageObject",
        )

        image_transfer_hyperparams = ImageTransferPrimitive.metadata.get_hyperparams()
        primitive_volumes = ImageTransferPrimitive.metadata.get_volumes()
        volumes = {
            primitive_volumes[0]["key"]:
            os.getenv("D3MSTATICDIR") + "/" +
            primitive_volumes[0]["file_digest"]
        }
        image_transfer_primitive = ImageTransferPrimitive(
            hyperparams=image_transfer_hyperparams.defaults(), volumes=volumes)
        result = image_transfer_primitive.produce(inputs=images).value
        self.assertEqual(result.shape[0], 5)
        self.assertEqual(result.shape[1], 512)
Example #6
    # remainder of a commented-out stock-market test case (opening quotes restored)
    '''
    var.set_training_data(inputs = df, outputs = None)
    var.fit()
    test_dataset = container.Dataset.load('file:///datasets/seed_datasets_current/LL1_736_stock_market/TEST/dataset_TEST/datasetDoc.json')
    results = var.produce(inputs = d3m_DataFrame(ds2df_client.produce(inputs = test_dataset).value))
    #results = var.produce_weights(inputs = d3m_DataFrame(ds2df_client.produce(inputs = test_dataset).value))
    print(results.value)
    '''

    # acled reduced test case
    input_dataset = container.Dataset.load('file:///datasets/seed_datasets_current/LL0_acled_reduced/TRAIN/dataset_TRAIN/datasetDoc.json')
    hyperparams_class = dataset_remove_columns.RemoveColumnsPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    to_remove = (1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30)
    rm_client = dataset_remove_columns.RemoveColumnsPrimitive(hyperparams=hyperparams_class.defaults().replace({"columns": to_remove}))
    df = rm_client.produce(inputs=input_dataset).value

    hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults().replace({"dataframe_resource": "learningData"}))
    df = ds2df_client.produce(inputs=df).value
    print(df.head())

    var_hp = VAR.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    var = VAR(hyperparams=var_hp.defaults())
    var.set_training_data(inputs=df, outputs=None)
    var.fit()
    test_dataset = container.Dataset.load('file:///datasets/seed_datasets_current/LL0_acled_reduced/TEST/dataset_TEST/datasetDoc.json')
    #results = var.produce(inputs = ds2df_client.produce(inputs = rm_client.produce(inputs = test_dataset).value).value)
    results = var.produce_weights(inputs=d3m_DataFrame(ds2df_client.produce(inputs=test_dataset).value))
    print(results.value)
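
The to_remove tuple above is easier to audit as the complement of the columns that are kept. A small sketch, assuming the dataset has 31 columns (indices 0 through 30):

    # columns retained by the example: 0, 3, 6, 13, 27
    keep = {0, 3, 6, 13, 27}
    to_remove = tuple(sorted(set(range(31)) - keep))  # matches the literal tuple above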
    

Example #7
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce primitive's classifications for new time series data

        Parameters
        ----------
        inputs : numpy ndarray of size (number_of_time_series, time_series_length, dimension) containing new time series 

        Returns
        ----------
        Outputs
            The output is a numpy ndarray containing a predicted class for each of the input time series
        """
        # temporary (until Uncharted adds conversion primitive to repo)
        if not self.hyperparams['long_format']:
            inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
            ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
                hyperparams=hyperparams_class.defaults().replace(
                    {"dataframe_resource": "learningData"}))
            inputs = d3m_DataFrame(ds2df_client.produce(inputs=inputs).value)

        # parse values from output of time series formatter
        n_ts = len(inputs.d3mIndex.unique())
        ts_sz = int(inputs.shape[0] / n_ts)
        input_vals = np.array(inputs.value).reshape(n_ts, ts_sz, 1)

        # produce classifications using Shapelets
        classes = pandas.DataFrame(self._shapelets.predict(input_vals))
        output_df = pandas.concat(
            [pandas.DataFrame(inputs.d3mIndex.unique()), classes], axis=1)
        # column names are hardcoded here rather than read from metadata
        output_df.columns = ['d3mIndex', 'label']
        shallot_df = d3m_DataFrame(output_df)

        # first column ('d3mIndex')
        col_dict = dict(
            shallot_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = str
        # confirm that this metadata still exists
        #index = inputs['0'].metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        #col_dict['name'] = inputs.metadata.query_column(index[0])['name']
        col_dict['name'] = 'd3mIndex'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        )
        shallot_df.metadata = shallot_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)
        # second column ('predictions')
        col_dict = dict(
            shallot_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = str
        #index = inputs['0'].metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        #col_dict['name'] = inputs.metadata.query_column(index[0])['name']
        col_dict['name'] = 'label'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            'https://metadata.datadrivendiscovery.org/types/Target')
        shallot_df.metadata = shallot_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)
        return CallResult(shallot_df)
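
The produce method above repeats the same query/update pattern for every column. A hedged sketch of a small helper that factors it out (the helper name set_column_metadata is illustrative; metadata.query and metadata.update are standard d3m calls):

    from d3m.metadata import base as metadata_base

    def set_column_metadata(df, column_index, name, structural_type, semantic_types):
        # read the existing column metadata, overlay the new fields, and write it back
        col_dict = dict(df.metadata.query((metadata_base.ALL_ELEMENTS, column_index)))
        col_dict['structural_type'] = structural_type
        col_dict['name'] = name
        col_dict['semantic_types'] = semantic_types
        df.metadata = df.metadata.update(
            (metadata_base.ALL_ELEMENTS, column_index), col_dict)
        return df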
Example #8
        col_dict = dict(croc_df.metadata.query(
            (metadata_base.ALL_ELEMENTS, 5)))
        col_dict['structural_type'] = str
        col_dict['name'] = output_label + "_tokens"
        col_dict['semantic_types'] = (
            'http://schema.org/Text',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        croc_df.metadata = croc_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 5), col_dict)

        return CallResult(croc_df)


if __name__ == '__main__':
    volumes = {}  # d3m large primitive architecture dictionary of large files
    volumes["croc_weights"] = '/home/croc_weights'  # location of extracted required files archive
    client = croc(
        hyperparams={
            'target_columns': ['filename'],
            'output_labels': ['filename']
        },
        volumes=volumes)
    input_dataset = container.Dataset.load(
        "file:///home/datasets/seed_datasets_current/LL1_penn_fudan_pedestrian/TRAIN/dataset_TRAIN/datasetDoc.json"
    )
    # d3m primitives expect a Hyperparams instance, not a plain dict
    ds2df_hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
        hyperparams=ds2df_hyperparams_class.defaults().replace({"dataframe_resource": "0"}))
    df = d3m_DataFrame(ds2df_client.produce(inputs=input_dataset).value)
    result = client.produce(inputs=df)
    print(result.value)
Example #9
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : numpy ndarray of size (number_of_time_series, time_series_length) containing new time series 

        Returns
        ----------
        Outputs
            The output is a dataframe containing a single column where each entry is the associated series' cluster number.
        """

        hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
            hyperparams=hyperparams_class.defaults().replace(
                {"dataframe_resource": "learningData"}))
        metadata_inputs = ds2df_client.produce(inputs=inputs).value

        # temporary (until Uncharted adds conversion primitive to repo)
        if not self.hyperparams['long_format']:
            formatted_inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            formatted_inputs = d3m_DataFrame(
                ds2df_client.produce(inputs=inputs).value)

        # store information on target, index variable
        targets = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            # fall back to broader target semantic types
            targets = metadata_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = metadata_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        target_names = [list(metadata_inputs)[t] for t in targets]
        index = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')

        # parse values from output of time series formatter
        n_ts = len(formatted_inputs.d3mIndex.unique())
        if n_ts == formatted_inputs.shape[0]:
            X_test = formatted_inputs.drop(
                columns=list(formatted_inputs)[index[0]])
            X_test = X_test.drop(columns=target_names).values
        else:
            ts_sz = int(formatted_inputs.shape[0] / n_ts)
            X_test = np.array(formatted_inputs.value).reshape(n_ts, ts_sz)

        # special semi-supervised case - during training, only produce rows with labels
        series = metadata_inputs[target_names] != ''
        if series.any().any():
            metadata_inputs = dataframe_utils.select_rows(
                metadata_inputs, np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        sloth_df = d3m_DataFrame(
            pandas.DataFrame(self.clf.fit_predict(X_test),
                             columns=['cluster_labels']))
        # last column ('clusters')
        col_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = int
        col_dict['name'] = 'cluster_labels'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
            'https://metadata.datadrivendiscovery.org/types/CategoricalData')
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)
        df_dict = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = 1
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)

        return CallResult(utils_cp.append_columns(metadata_inputs, sloth_df))
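
The semi-supervised row selection above keeps only rows whose target is non-empty. A toy illustration of the masking with plain pandas and numpy (the data is made up):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'label': ['a', '', 'b', '']})
    mask = df[['label']] != ''
    rows_with_labels = np.flatnonzero(mask)  # array([0, 2])
    labeled = df.iloc[rows_with_labels]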
Example #10
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : numpy ndarray of size (number_of_time_series, time_series_length) containing new time series 

        Returns
        ----------
        Outputs
            The output is a transformed dataframe of X fit into an embedded space, n feature columns will equal n_components hyperparameter
            For timeseries datasets the output is the dimensions concatenated to the timeseries filename dataframe
        """

        hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
            hyperparams=hyperparams_class.defaults().replace(
                {"dataframe_resource": "learningData"}))
        metadata_inputs = ds2df_client.produce(inputs=inputs).value

        # temporary (until Uncharted adds conversion primitive to repo)
        if not self.hyperparams['long_format']:
            formatted_inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            formatted_inputs = d3m_DataFrame(
                ds2df_client.produce(inputs=inputs).value)

        # store information on target, index variable
        targets = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            # fall back to broader target semantic types
            targets = metadata_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = metadata_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        target_names = [list(metadata_inputs)[t] for t in targets]
        index = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(metadata_inputs)[i] for i in index]

        n_ts = len(formatted_inputs.d3mIndex.unique())
        if n_ts == formatted_inputs.shape[0]:
            X_test = formatted_inputs.drop(
                columns=list(formatted_inputs)[index[0]])
            X_test = X_test.drop(columns=target_names).values
        else:
            ts_sz = int(formatted_inputs.shape[0] / n_ts)
            X_test = np.array(formatted_inputs.value).reshape(n_ts, ts_sz)

        # fit_transform data and create new dataframe
        n_components = self.hyperparams['n_components']
        col_names = ['Dim' + str(c) for c in range(0, n_components)]

        tsne_df = d3m_DataFrame(
            pandas.DataFrame(self.clf.fit_transform(X_test),
                             columns=col_names))
        if self.hyperparams['long_format']:
            tsne_df = pandas.concat([formatted_inputs.d3mIndex, tsne_df],
                                    axis=1)

            # add index column metadata
            col_dict = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = str
            col_dict['name'] = index_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
            tsne_df.metadata = tsne_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            # add dimension columns metadata
            for c in range(1, n_components + 1):
                col_dict = dict(
                    tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, c)))
                col_dict['structural_type'] = float
                col_dict['name'] = 'Dim' + str(c - 1)
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
                tsne_df.metadata = tsne_df.metadata.update(
                    (metadata_base.ALL_ELEMENTS, c), col_dict)

            df_dict = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = n_components + 1
            tsne_df.metadata = tsne_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(tsne_df)

        else:
            for c in range(0, n_components):
                col_dict = dict(
                    tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, c)))
                col_dict['structural_type'] = str
                col_dict['name'] = str(c)
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
                tsne_df.metadata = tsne_df.metadata.update(
                    (metadata_base.ALL_ELEMENTS, c), col_dict)

            df_dict = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = n_components
            tsne_df.metadata = tsne_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(utils_cp.append_columns(metadata_inputs,
                                                      tsne_df))
Example #11
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : numpy ndarray of size (number_of_time_series, time_series_length) containing new time series 

        Returns
        ----------
        Outputs
            The output is a dataframe containing a single column where each entry is the associated series' cluster number.
        """

        # temporary (until Uncharted adds conversion primitive to repo)
        if not self.hyperparams['long_format']:
            inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
            ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
                hyperparams=hyperparams_class.defaults().replace(
                    {"dataframe_resource": "learningData"}))
            inputs = d3m_DataFrame(ds2df_client.produce(inputs=inputs).value)

        # parse values from output of time series formatter
        n_ts = len(inputs.d3mIndex.unique())
        ts_sz = int(inputs.shape[0] / n_ts)
        input_vals = np.array(inputs.value).reshape(n_ts, ts_sz)

        # use HP to produce DBSCAN clustering
        if self.hyperparams['algorithm'] == 'DBSCAN':
            #SimilarityMatrix = cluster.GenerateSimilarityMatrix(input_vals)
            _, labels, _ = cluster.ClusterSimilarityMatrix(
                input_vals, self.hyperparams['eps'],
                self.hyperparams['min_samples'])
        else:
            #SimilarityMatrix = cluster.GenerateSimilarityMatrix(input_vals)
            _, labels, _ = cluster.HClusterSimilarityMatrix(
                input_vals, self.hyperparams['min_cluster_size'],
                self.hyperparams['min_samples'])

        # transform labels for D3M classification task
        # (cluster ids 0..k-1 shift to 1..k; DBSCAN noise points (-1) also map to 1)
        labels = [x + 1 if x >= 0 else x + 2 for x in labels]

        # add metadata to output
        labels = pandas.DataFrame(labels)
        out_df = pandas.concat(
            [pandas.DataFrame(inputs.d3mIndex.unique()), labels], axis=1)
        # column names are hardcoded here rather than read from metadata
        out_df.columns = ['d3mIndex', 'label']
        hdbscan_df = d3m_DataFrame(out_df)

        # first column ('d3mIndex')
        col_dict = dict(
            hdbscan_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = str
        # confirm that this metadata still exists
        #index = inputs['0'].metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        #col_dict['name'] = inputs.metadata.query_column(index[0])['name']
        col_dict['name'] = 'd3mIndex'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        )
        hdbscan_df.metadata = hdbscan_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        # second column ('labels')
        col_dict = dict(
            hdbscan_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = str
        #index = inputs['0'].metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        #col_dict['name'] = inputs.metadata.query_column(index[0])['name']
        col_dict['name'] = 'label'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            'https://metadata.datadrivendiscovery.org/types/Target')
        hdbscan_df.metadata = hdbscan_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        return CallResult(hdbscan_df)
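
The label shift above maps cluster ids onto positive class labels. Note that DBSCAN noise points (label -1) map to 1, the same value cluster 0 receives, so noise is folded into the first cluster:

    raw = [-1, 0, 1, 2]
    shifted = [x + 1 if x >= 0 else x + 2 for x in raw]
    assert shifted == [1, 1, 2, 3]  # -1 and 0 both become 1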
Example #12
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : Input pandas frame where each row is a series. Series timestamps are stored in the column names.

        Returns
        -------
        Outputs
            The output is a dataframe containing a single column where each entry is the associated series' cluster number.
        """
        # temporary (until Uncharted adds conversion primitive to repo)
        if not self.hyperparams['long_format']:
            inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
            ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
                hyperparams=hyperparams_class.defaults().replace(
                    {"dataframe_resource": "learningData"}))
            inputs = d3m_DataFrame(ds2df_client.produce(inputs=inputs).value)

        # parse values from output of time series formatter
        n_ts = len(inputs.d3mIndex.unique())
        ts_sz = int(inputs.shape[0] / n_ts)
        input_vals = np.array(inputs.value).reshape(n_ts, ts_sz, 1)

        # concatenate predictions and d3mIndex
        labels = pandas.DataFrame(self._kmeans.predict(input_vals))
        # maybe change the d3mIndex key here to be programmatically generated
        out_df_sloth = pandas.concat(
            [pandas.DataFrame(inputs.d3mIndex.unique()), labels], axis=1)
        # column names are hardcoded here rather than read from metadata
        out_df_sloth.columns = ['d3mIndex', 'label']
        sloth_df = d3m_DataFrame(out_df_sloth)

        # first column ('d3mIndex')
        col_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = str
        #index = inputs['0'].metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        #col_dict['name'] = inputs.metadata.query_column(index[0])['name']
        col_dict['name'] = 'd3mIndex'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        )
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        # second column ('labels')
        col_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = str
        #index = inputs['0'].metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        #col_dict['name'] = inputs.metadata.query_column(index[0])['name']
        col_dict['name'] = 'label'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            'https://metadata.datadrivendiscovery.org/types/Target')
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        return CallResult(sloth_df)
Example #13
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[container.pandas.DataFrame]:
        """
        Parameters
        ----------
        inputs : Input pandas frame where each row is a series.  Series timestamps are store in the column names.

        Returns
        -------
        Outputs
            For unsupervised problems: The output is a dataframe containing a single column where each entry is the associated series' cluster number.
            For semi-supervised problems: The output is the input df containing an additional feature - cluster_label
        """
        hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(hyperparams = hyperparams_class.defaults().replace({"dataframe_resource":"learningData"}))
        metadata_inputs = ds2df_client.produce(inputs = inputs).value
        
        # temporary (until Uncharted adds conversion primitive to repo)
        if not self.hyperparams['long_format']:
            formatted_inputs = TimeSeriesFormatterPrimitive(hyperparams = self._hp).produce(inputs = inputs).value['0']
        else:
            formatted_inputs = ds2df_client.produce(inputs = inputs).value 
        
        # store information on target, index variable
        # fall back through the target semantic types: TrueTarget, then Target, then SuggestedTarget
        targets = metadata_inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = metadata_inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = metadata_inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        target_names = [list(metadata_inputs)[t] for t in targets]
        index = metadata_inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(metadata_inputs)[i] for i in index]

        # load and reshape training data
        n_ts = len(formatted_inputs.d3mIndex.unique())
        if n_ts == formatted_inputs.shape[0]:
            X_test = formatted_inputs.drop(columns=list(formatted_inputs)[index[0]])
            X_test = X_test.drop(columns=target_names).values
        else:
            ts_sz = int(formatted_inputs.shape[0] / n_ts)
            X_test = np.array(formatted_inputs.value).reshape(n_ts, ts_sz, 1)
        
        # unsupervised case: output d3mIndex plus a predicted cluster per series
        if self.clustering:
            sloth_df = d3m_DataFrame(pandas.DataFrame(self._kmeans.predict(X_test), columns=[target_names[0]]))
            sloth_df = pandas.concat([formatted_inputs.d3mIndex, sloth_df], axis=1)

            # first column ('d3mIndex')
            col_dict = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = str
            col_dict['name'] = index_names[0]
            col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey',)
            sloth_df.metadata = sloth_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)

            # second column ('Class')
            col_dict = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
            col_dict['structural_type'] = str
            col_dict['name'] = target_names[0]
            col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            sloth_df.metadata = sloth_df.metadata.update((metadata_base.ALL_ELEMENTS, 1), col_dict)

            df_dict = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = ('https://metadata.datadrivendiscovery.org/types/TabularColumn',)
            df_dict_1['length'] = 2
            sloth_df.metadata = sloth_df.metadata.update((metadata_base.ALL_ELEMENTS,), df_dict)

            return CallResult(sloth_df)

        else:
            # special semi-supervised case - during training, only produce rows with labels
            series = metadata_inputs[target_names] != ''
            if series.any().any():
                metadata_inputs = dataframe_utils.select_rows(metadata_inputs, np.flatnonzero(series))
                X_test = X_test[np.flatnonzero(series)]

            sloth_df = d3m_DataFrame(pandas.DataFrame(self._kmeans.predict(X_test), columns=['cluster_labels']))

            # add clusters as a feature in the main dataframe - last column ('clusters')
            col_dict = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = int
            col_dict['name'] = 'cluster_labels'
            col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/CategoricalData')
            sloth_df.metadata = sloth_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)
            df_dict = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = ('https://metadata.datadrivendiscovery.org/types/TabularColumn',)
            df_dict_1['length'] = 1
            sloth_df.metadata = sloth_df.metadata.update((metadata_base.ALL_ELEMENTS,), df_dict)

            return CallResult(utils_cp.append_columns(metadata_inputs, sloth_df))
Example #14
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:

        dataframe_resource_id, dataframe = base_utils.get_tabular_resource(
            inputs,
            self.hyperparams["dataframe_resource"])  # get attribute columns

        hyperparams_class = (
            dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()
            ["primitive_code"]["class_type_arguments"]["Hyperparams"])
        primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(
            hyperparams=hyperparams_class.defaults())

        dataframe_meta = primitive.produce(inputs=inputs).value

        attributes = list_columns_with_semantic_types(
            metadata=dataframe_meta.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Attribute"
            ],
        )

        # note: _current_metadata is private; the top-level metadata (including
        # location_uris) can also be read via inputs.metadata.query(())
        base_file_path = "/".join(
            inputs.metadata._current_metadata.metadata["location_uris"]
            [0].split("/")[:-1])
        edge_list = pd.read_csv(os.path.join(base_file_path, "graphs",
                                             "edgeList.csv"),
                                index_col=0)
        if len(edge_list.columns) > 2:
            graph = nx.from_pandas_edgelist(
                edge_list,
                source=edge_list.columns[0],
                target=edge_list.columns[1],
                edge_attr=edge_list.columns[2],
            )
        else:
            graph = nx.from_pandas_edgelist(edge_list,
                                            source=edge_list.columns[0],
                                            target=edge_list.columns[1])

        if len(attributes) > 1:
            # add attributes to nodes (nodeID is kept; the featurizer expects it)
            attribute_node_map = dataframe_meta[
                dataframe_meta.columns[attributes]]
            attribute_node_map["nodeID"] = attribute_node_map["nodeID"].astype(
                int)
            attribute_node_map.index = attribute_node_map["nodeID"]
            attribute_cols = attribute_node_map.columns
            attribute_node_map = attribute_node_map.to_dict(orient="index")

            for i in graph.nodes:
                default = {attribute: 0 for attribute in attribute_cols}
                default["nodeID"] = i
                graph.nodes[i].update(attribute_node_map.get(i, default))

        else:
            # featurizer expects at a minimum nodeids to be present
            for i in graph.nodes:
                default = {}
                default["nodeID"] = i
                graph.nodes[i].update(default)
        # int2str_map = dict(zip(graph.nodes, [str(n) for n in graph.nodes]))
        # graph = nx.relabel_nodes(graph, mapping=int2str_map)

        dataframe.metadata = self._update_metadata(inputs.metadata,
                                                   dataframe_resource_id)

        assert isinstance(dataframe, container.DataFrame), type(dataframe)

        U_train = {"graph": graph}
        y_train = self.produce_target(inputs=inputs).value
        X_train = dataframe  # TODO use attribute in vertex classification

        X_train = self._typify_dataframe(X_train)
        X_train.value = pd.DataFrame(X_train.value["nodeID"])
        return base.CallResult([X_train, y_train, U_train])
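
For reference, the edge-list branch above follows standard networkx usage. A minimal, self-contained sketch with a made-up weighted edge list:

    import networkx as nx
    import pandas as pd

    edge_list = pd.DataFrame({
        'source': [0, 0, 1],
        'target': [1, 2, 2],
        'weight': [0.5, 1.0, 2.0],
    })
    # a third column, when present, is attached as an edge attribute
    graph = nx.from_pandas_edgelist(
        edge_list, source='source', target='target', edge_attr='weight')
    assert graph[0][1]['weight'] == 0.5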