Example #1
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:

        if self._embedding is None:
            self._embedding = inputs[0]

        N, d = self._embedding.shape

        nodeIDs = inputs[1]
        nodeIDs = np.array([int(i) for i in nodeIDs])

        max_clusters = self.hyperparams['max_clusters']

        if max_clusters < self._embedding.shape[1]:
            self._embedding = self._embedding[:, :max_clusters].copy()

        gclust_object = graspyGCLUST(max_components=max_clusters,
                                     covariance_type="all")
        gclust_object.fit(self._embedding)
        model = gclust_object.model_

        pis, means, precs = model.weights_, model.means_, model.precisions_

        predictions = model.predict(self._embedding)

        D = np.zeros(shape=(N, N))

        if d == 1:
            for i in range(N):
                cluster_label = predictions[i]
                i_embedding = self._embedding[i]
                for j in range(i + 1, N):
                    j_embedding = self._embedding[j]
                    eucl_dist = i_embedding - j_embedding
                    Mahal_dist = eucl_dist * precs[cluster_label] * eucl_dist
                    D[i, j] = Mahal_dist
                    D[j, i] = Mahal_dist
        else:
            for i in range(N):
                cluster_label = predictions[i]
                i_embedding = self._embedding[i]
                for j in range(i + 1, N):
                    j_embedding = self._embedding[j]
                    eucl_dist = i_embedding - j_embedding
                    Mahal_dist = eucl_dist @ precs[cluster_label] @ eucl_dist[
                        None].T
                    D[i, j] = Mahal_dist[0]
                    D[j, i] = D[i, j]

        D_idx = np.zeros(shape=(N, N - 1))
        for i in range(N):
            D_idx[i] = np.argsort(D[i])[1:]

        columns = ['match%i' % (i + 1) for i in range(N - 1)]

        # May need to create nodeID <-> d3m index map
        output = container.DataFrame(D_idx, index=nodeIDs,
                                     columns=columns).astype(int)
        output.index.name = "d3mIndex"

        return base.CallResult(output)
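# --- Editor's sketch (not part of the example above) ---
# A minimal, self-contained illustration of the pairwise per-cluster
# Mahalanobis distances computed in produce() above. sklearn's GaussianMixture
# stands in for graspy's GCLUST here; the toy data and names are assumptions.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))                     # toy embedding, N x d

gmm = GaussianMixture(n_components=2, covariance_type="full").fit(X)
labels = gmm.predict(X)
precs = gmm.precisions_                          # shape (K, d, d)

N = X.shape[0]
D = np.zeros((N, N))
for i in range(N):
    P = precs[labels[i]]                         # precision of i's cluster
    for j in range(i + 1, N):
        diff = X[i] - X[j]
        D[i, j] = D[j, i] = diff @ P @ diff      # squared Mahalanobis distance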
Example #2
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[None]:
        """
        Inputs: Dataset dataFrame
        Returns: None
        """
        # If feature extract only, Skip Fit
        if self.hyperparams['feature_extract_only']:
            self._fitted = True
            return base.CallResult(None)

        if self._fitted:
            return base.CallResult(None)

        if self._training_inputs is None:
            raise ValueError("Missing training data.")

        # Get all Nested media files
        image_columns = self._training_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/FileName')  # [1]
        if len(image_columns) == 0:
            image_columns = self._training_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/Attribute'
            )  # [1]
        label_columns = self._training_outputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')  # [2]
        if len(label_columns) == 0:
            label_columns = self._training_outputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )  # [2]
        base_paths = [
            self._training_inputs.metadata.query(
                (metadata_base.ALL_ELEMENTS, t)) for t in image_columns
        ]  # Image Dataset column names
        base_paths = [
            base_paths[t]['location_base_uris'][0].replace('file:///', '/')
            for t in range(len(base_paths))
        ]  # Path + media
        all_img_paths = [[
            os.path.join(base_path, filename)
            for filename in self._training_inputs.iloc[:, col]
        ] for base_path, col in zip(base_paths, image_columns)]
        all_img_labls = [[
            os.path.join(label)
            for label in self._training_outputs.iloc[:, col]
        ] for col in label_columns]

        # Check if data is matched
        for idx in range(len(all_img_paths)):
            if len(all_img_paths[idx]) != len(all_img_labls[idx]):
                raise Exception(
                    'Size mismatch between training inputs and labels!')

        if np.array([all_img_labls[0][0]]).size > 1:
            raise Exception(
                'Primitive accepts labels to be in size (minibatch, 1)!,\
                             even for multiclass classification problems, it must be in\
                             the range from 0 to C-1 as the target')

        # Organize data into training format
        all_train_data = []
        for idx in range(len(all_img_paths)):
            img_paths = all_img_paths[idx]
            img_labls = all_img_labls[idx]
            for eachIdx in range(len(img_paths)):
                all_train_data.append([img_paths[eachIdx], img_labls[eachIdx]])

        # del to free memory
        del all_img_paths, all_img_labls

        if len(all_train_data) == 0:
            raise Exception('Cannot fit when no training data is present.')

        # Set all files
        _iterations = self.hyperparams['num_iterations']

        _minibatch_size = self.hyperparams['minibatch_size']
        if _minibatch_size > len(all_train_data):
            _minibatch_size = len(all_train_data)

        # Dataset Parameters
        train_params = {
            'batch_size': _minibatch_size,
            'shuffle': self.hyperparams['shuffle'],
            'num_workers': 4
        }

        # DataLoader
        training_set = Dataset(all_data=all_train_data,
                               preprocess=self.pre_process)

        # Data Generators
        training_generator = data.DataLoader(training_set, **train_params)

        # Set model to training mode
        self.model.train()

        # Loss function
        if self.hyperparams['loss_type'] == 'crossentropy':
            criterion = nn.CrossEntropyLoss().to(self.device)
        elif self.hyperparams['loss_type'] == 'mse':
            criterion = nn.MSELoss().to(self.device)
        elif self.hyperparams['loss_type'] == 'l1':
            criterion = nn.L1Loss().to(self.device)
        else:
            raise ValueError(
                'Unsupported loss_type: {}. Available options: crossentropy, mse, l1'
                .format(self.hyperparams['loss_type']))

        # Train functions
        start = time.time()
        self._iterations_done = 0

        # Set model to training
        self.model.train()

        for itr in range(_iterations):
            epoch_loss = 0.0
            iteration = 0
            for local_batch, local_labels in training_generator:
                # Zero the parameter gradients
                self.optimizer_instance.zero_grad()
                # Check Label shapes
                if len(local_labels.shape) < 2:
                    local_labels = local_labels.unsqueeze(1)
                elif len(local_labels.shape) > 2:
                    raise Exception(
                        'Primitive accepts labels to be in size (minibatch, 1)!,\
                                     even for multiclass classification problems, it must be in\
                                     the range from 0 to C-1 as the target')
                if self.hyperparams['loss_type'] == 'crossentropy':
                    local_labels = (local_labels.long()).to(self.device)
                else:
                    local_labels = (local_labels.float()).to(self.device)
                # Forward Pass
                local_outputs = self.model(
                    local_batch.to(self.device),
                    include_last_layer=self.include_last_layer)
                # Loss and backward pass
                local_loss = criterion(local_outputs, local_labels)
                # Backward pass
                local_loss.backward()
                # Update weights
                self.optimizer_instance.step()
                # Increment
                epoch_loss += local_loss.item()  # accumulate as a float so the autograd graph is not retained
                iteration += 1
            # Final epoch loss
            epoch_loss /= iteration
            self._iterations_done += 1
            logging.info('epoch loss: {} at Epoch: {}'.format(epoch_loss, itr))
            # print('epoch loss: {} at Epoch: {}'.format(epoch_loss, itr))
            if epoch_loss < self.hyperparams['fit_threshold']:
                self._fitted = True
                return base.CallResult(None)
        self._fitted = True

        return base.CallResult(None)
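# --- Editor's sketch (not part of the example above) ---
# fit() above relies on a Dataset(all_data=..., preprocess=...) wrapper that is
# not shown. A minimal, hypothetical version compatible with the training loop
# might look like this; the real implementation may differ (e.g. in how labels
# are cast).
import torch
from PIL import Image
from torch.utils import data


class Dataset(data.Dataset):
    def __init__(self, all_data, preprocess):
        self.all_data = all_data        # list of [image_path, label] pairs
        self.preprocess = preprocess    # e.g. a torchvision transform

    def __len__(self):
        return len(self.all_data)

    def __getitem__(self, idx):
        path, label = self.all_data[idx]
        image = self.preprocess(Image.open(path).convert('RGB'))
        return image, torch.tensor(float(label))  # assumes numeric labels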
Example #3
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> CallResult[container.DataFrame]:

        logger.debug(f"Producing {__name__}")

        # force a fit if it hasn't yet been done
        if self._needs_fit:
            self.fit()

        # drop any non-numeric columns
        num_cols = inputs.shape[1]
        inputs = inputs.select_dtypes(include="number")
        col_diff = num_cols - inputs.shape[1]
        if col_diff > 0:
            logger.warning(
                f"Removed {col_diff} unencoded columns from produce data.")

        # create dataframe to hold the result
        result = self._model.predict(inputs.values)
        if len(self._target_cols) > 1:
            result_df = container.DataFrame()
            for i, c in enumerate(self._target_cols):
                col = container.DataFrame({c: result[:, i]})
                result_df = pd.concat([result_df, col], axis=1)
            for c in range(result_df.shape[1]):
                result_df.metadata = result_df.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float")
        else:
            result_df = container.DataFrame({self._target_cols[0]: result},
                                            generate_metadata=True)
        # if we mapped values earlier map them back.
        if len(self._label_map) > 0:
            # TODO label map will not work if there are multiple output columns.
            result_df[self._target_cols[0]] = result_df[
                self._target_cols[0]].map(self._label_map)
        # mark the semantic types on the dataframe
        for i, _ in enumerate(result_df.columns):
            result_df.metadata = result_df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, i),
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
            )
        if (self._model.mode == "classification"
                and self.hyperparams["compute_confidences"]):
            confidence = self._model.predict_proba(inputs.values)
            if self._binary:
                pos_column = (0 if self.hyperparams["pos_label"]
                              == self._label_map[0] else 1)
                result_df.insert(result_df.shape[1], "confidence",
                                 confidence[:, pos_column])
                result_df.metadata = result_df.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
                    "http://schema.org/Float",
                )
            else:
                # add confidence scores as some metrics require them.
                confidence = pd.Series(confidence.tolist(), name="confidence")
                result_df = pd.concat([result_df, confidence], axis=1)

                confidences = [
                    item
                    for sublist in result_df["confidence"].values.tolist()
                    for item in sublist
                ]
                labels = np.array(
                    list(self._label_map.values()) * len(result_df))

                index = [
                    item for sublist in [[i] * len(np.unique(labels))
                                         for i in result_df.index]
                    for item in sublist
                ]
                result_df_temp = container.DataFrame()
                result_df_temp["Class"] = labels
                result_df_temp["confidence"] = confidences
                result_df_temp.metadata = result_df.metadata
                result_df_temp["index_temp"] = index
                result_df_temp = result_df_temp.set_index("index_temp")
                result_df = result_df_temp
                result_df.metadata = result_df.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
                    "https://metadata.datadrivendiscovery.org/types/FloatVector",
                )

            result_df.metadata = result_df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
                "https://metadata.datadrivendiscovery.org/types/Score",
            )
            result_df.metadata = result_df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
            )

        logger.debug(f"\n{result_df}")
        return base.CallResult(result_df)
Example #4
    def produce(
            self,
            *,
            left: Inputs,  # type: ignore
            right: Inputs,  # type: ignore
            timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:

        # attempt to extract the main table
        try:
            left_resource_id, left_df = d3m_base_utils.get_tabular_resource(
                left, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in left dataset") from error

        try:
            right_resource_id, right_df = d3m_base_utils.get_tabular_resource(
                right, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in right dataset") from error

        accuracy = self.hyperparams['accuracy']
        if accuracy <= 0.0 or accuracy > 1.0:
            raise exceptions.InvalidArgumentValueError('accuracy of ' +
                                                       str(accuracy) +
                                                       ' is out of range')

        left_col = self.hyperparams['left_col']
        right_col = self.hyperparams['right_col']

        # perform join based on semantic type
        join_type = self._get_join_semantic_type(left, left_resource_id,
                                                 left_col, right,
                                                 right_resource_id, right_col)
        joined: pd.DataFrame = None
        if join_type in self._STRING_JOIN_TYPES:
            joined = self._join_string_col(left_df, left_col, right_df,
                                           right_col, accuracy)
        elif join_type in self._NUMERIC_JOIN_TYPES:
            joined = self._join_numeric_col(left_df, left_col, right_df,
                                            right_col, accuracy)
        elif join_type in self._DATETIME_JOIN_TYPES:
            joined = self._join_datetime_col(left_df, left_col, right_df,
                                             right_col, accuracy)
        else:
            raise exceptions.InvalidArgumentValueError(
                'join not supported on type ' + str(join_type))

        # create a new dataset to hold the joined data
        resource_map = {}
        for resource_id, resource in left.items():  # type: ignore
            if resource_id == left_resource_id:
                resource_map[resource_id] = joined
            else:
                resource_map[resource_id] = resource
        result_dataset = container.Dataset(resource_map,
                                           generate_metadata=True)

        return base.CallResult(result_dataset)
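# --- Editor's sketch (not part of the example above) ---
# The numeric join above tolerates approximate key matches. With plain pandas a
# similar effect (though not the primitive's actual implementation) can be had
# with merge_asof; the column names and tolerance below are illustrative only.
import pandas as pd

left = pd.DataFrame({'key': [1.00, 2.00, 3.00], 'a': ['x', 'y', 'z']})
right = pd.DataFrame({'key': [1.01, 2.98], 'b': [10, 30]})

joined = pd.merge_asof(left.sort_values('key'), right.sort_values('key'),
                       on='key', direction='nearest', tolerance=0.05)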
Example #5
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Gaussian classification (i.e. seeded gaussian "clustering").

        Inputs
            D - An n x d feature numpy array
        Returns
            labels - Class labels for each unlabeled vertex
        """

        if not self._fitted:
            raise ValueError("Not fitted")

        n = self._embedding.shape[0]

        unique_labels = np.unique(self._labels)
        K = len(unique_labels)

        testing = inputs[0]

        try:
            testing_nodeIDs = np.asarray(testing['G1.nodeID'])
        except KeyError:
            testing_nodeIDs = np.asarray(testing['nodeID'])
        final_labels = np.zeros(len(testing))
        string_nodeIDs = np.array([str(i) for i in self._nodeIDs])
        #print(string_nodeIDs, file=sys.stderr)
        #print(testing_nodeIDs, file=sys.stderr)
        if self._PD and self._ENOUGH_SEEDS:
            for i in range(len(testing_nodeIDs)):
                temp = np.where(
                    string_nodeIDs == str(testing_nodeIDs[i]))[0][0]
                weighted_pdfs = np.array([
                    self._pis[j] *
                    MVN.pdf(self._embedding[temp, :], self._means[j],
                            self._covariances[j, :, :]) for j in range(K)
                ])
                label = np.argmax(weighted_pdfs)
                final_labels[i] = int(label)
        else:

            for i in range(len(testing_nodeIDs)):
                temp = np.where(
                    string_nodeIDs == str(testing_nodeIDs[i]))[0][0]
                try:
                    weighted_pdfs = np.array([
                        self._pis[j] *
                        MVN.pdf(self._embedding[temp, :], self._means[j],
                                self._covariances) for j in range(K)
                    ])
                except:
                    # regularize a singular covariance with a small ridge
                    self._covariances = self._covariances + np.ones(
                        self._covariances.shape) * 0.00001
                    weighted_pdfs = np.array([
                        self._pis[j] *
                        MVN.pdf(self._embedding[temp, :], self._means[j],
                                self._covariances) for j in range(K)
                    ])
                label = np.argmax(weighted_pdfs)
                final_labels[i] = int(label)

        if self._problem == "VN":
            testing['classLabel'] = final_labels
            outputs = container.DataFrame(testing[['d3mIndex', 'classLabel']])
            outputs[['d3mIndex',
                     'classLabel']] = outputs[['d3mIndex',
                                               'classLabel']].astype(int)
        else:
            testing['community'] = final_labels
            outputs = container.DataFrame(testing[['d3mIndex', 'community']])
            outputs[['d3mIndex',
                     'community']] = outputs[['d3mIndex',
                                              'community']].astype(int)

        return base.CallResult(outputs)
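# --- Editor's sketch (not part of the example above) ---
# The classification rule used in produce() above: assign a vertex to the
# component with the largest prior-weighted Gaussian density. The parameters
# and query point below are illustrative only.
import numpy as np
from scipy.stats import multivariate_normal as MVN

pis = np.array([0.6, 0.4])                       # class priors
means = np.array([[0.0, 0.0], [3.0, 3.0]])
covs = np.array([np.eye(2), np.eye(2)])

x = np.array([2.5, 2.8])                         # one embedded vertex
weighted_pdfs = np.array(
    [pis[k] * MVN.pdf(x, means[k], covs[k]) for k in range(len(pis))])
label = int(np.argmax(weighted_pdfs))            # -> 1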
Example #6
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        """

            Args:
                inputs: Container DataFrame

            Returns:
                Container DataFrame added with DCT coefficients in a column named 'column_name_dct_coeff'

        """
        assert isinstance(inputs, container.DataFrame), type(inputs)

        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            # self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            cols = [inputs.columns[x] for x in self._training_indices]
            sk_inputs = container.DataFrame(
                data=inputs.iloc[:, self._training_indices].values,
                columns=cols,
                generate_metadata=True)

        output_columns = []
        if len(self._training_indices) > 0:
            sk_output = self._clf.produce(sk_inputs)

            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)
            # if len(outputs.columns) == len(self._input_column_names):
            #     outputs.columns = self._input_column_names
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._training_indices,
            columns_list=output_columns)

        return base.CallResult(outputs)
Example #7
    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        # if no column index is supplied use the first real vector column found in the dataset
        vector_idx = self.hyperparams['vector_col_index']
        if vector_idx is None:
            vector_idx = self._find_real_vector_column(inputs.metadata)
        # validate the column
        if not self._can_use_column(inputs.metadata, vector_idx):
            raise exceptions.InvalidArgumentValueError(
                'column idx=' + str(vector_idx) + ' from ' +
                str(inputs.columns) + ' does not contain float vectors')
        # flag label generation if none are supplied
        labels = self.hyperparams['labels']
        labels = list(labels) if labels is not None else []
        generate_labels = len(labels) == 0

        # create a dataframe to hold the new columns
        vector_dataframe = container.DataFrame(data=[])

        # loop over elements of the source vector column
        for i, v in enumerate(inputs.iloc[:, vector_idx]):
            elems = v.split(',')
            vector_length = len(elems)
            for j, e in enumerate(elems):
                # initialize columns when processing first row
                if i == 0:
                    # get the name of the source vector column
                    vector_col_metadata = inputs.metadata.query_column(
                        vector_idx)
                    vector_label = vector_col_metadata['name']

                    # create an empty column for each element of the vector
                    if generate_labels:
                        labels.append(vector_label + "_" + str(j))
                    vector_dataframe[labels[j]] = ''

                # write vector elements into each column - force to string as d3m convention is
                # to store data as pandas 'obj' type until explicitly cast
                vector_dataframe.at[i, labels[j]] = str(e.strip())

        # create default d3m metadata structures (rows, columns etc.) and copy the semantic types
        # from the source vector over, replacing FloatVector with Float
        vector_dataframe.metadata = vector_dataframe.metadata.set_for_value(
            vector_dataframe)
        source_semantic_types = list(
            inputs.metadata.query_column(vector_idx)['semantic_types'])
        source_semantic_types.remove(
            'https://metadata.datadrivendiscovery.org/types/FloatVector')
        source_semantic_types.append(
            'https://metadata.datadrivendiscovery.org/types/Float')
        for i in range(0, len(labels)):
            vector_dataframe.metadata = vector_dataframe.metadata.\
                update_column(i, {'semantic_types': source_semantic_types})

        output = utils.append_columns(inputs, vector_dataframe)

        # wrap as a D3M container - metadata should be auto generated
        return base.CallResult(output)
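# --- Editor's sketch (not part of the example above) ---
# The loop above expands a comma-separated vector string into one column per
# element. With plain pandas the same expansion can be written as below; the
# column names and data are illustrative only.
import pandas as pd

df = pd.DataFrame({'vec': ['1.0,2.0,3.0', '4.0,5.0,6.0']})
expanded = df['vec'].str.split(',', expand=True)
expanded.columns = [f'vec_{j}' for j in range(expanded.shape[1])]
result = pd.concat([df, expanded], axis=1)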
Example #8
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[None]:
        return base.CallResult(None)
Example #9
    def produce(  # type: ignore
        self,
        *,
        inputs: Inputs,
        score_dataset: container.Dataset,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[Outputs]:
        if not self.hyperparams['metrics']:
            raise ValueError("\"metrics\" hyper-parameter cannot be empty.")

        truth, all_labels = self._get_truth(score_dataset)
        predictions = self._get_predictions(inputs)

        for target_column in self.hyperparams['all_labels']:
            all_labels[target_column['column_name']] = list(
                target_column['labels'])

        outputs: typing.Dict[str, typing.List] = {
            'metric': [],
            'value': [],
        }

        if self.hyperparams['add_normalized_scores']:
            outputs['normalized'] = []

        for metric_configuration in self.hyperparams['metrics']:
            metric = problem.PerformanceMetric[metric_configuration['metric']]
            metric_class = metric.get_class()

            params = {}

            if 'all_labels' in inspect.signature(
                    metric_class).parameters and all_labels:
                params['all_labels'] = all_labels

            for param_name, param_value in metric_configuration.items():
                if param_name == 'metric':
                    continue
                if param_value is None:
                    continue
                params[param_name] = param_value

            if metric.requires_confidence(
            ) and metrics.CONFIDENCE_COLUMN not in predictions.columns:
                raise exceptions.InvalidArgumentValueError(
                    f"Metric {metric.name} requires confidence column in predictions, but it is not available.",
                )
            if metric.requires_rank(
            ) and metrics.RANK_COLUMN not in predictions.columns:
                raise exceptions.InvalidArgumentValueError(
                    f"Metric {metric.name} requires rank column in predictions, but it is not available.",
                )

            score = metric_class(**params).score(truth, predictions)

            outputs['metric'].append(metric.name)
            outputs['value'].append(score)

            if self.hyperparams['add_normalized_scores']:
                outputs['normalized'].append(metric.normalize(score))

        # Dictionary key order is preserved in Python 3.6+ which makes column order as we want it.
        results = container.DataFrame(data=outputs,
                                      columns=list(outputs.keys()),
                                      generate_metadata=True)

        # Not really necessary, but it does not hurt. In theory somebody could list same metric multiple times
        # (maybe with different params), so we use "PrimaryMultiKey" here.
        results.metadata = results.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey',
        )
        results.metadata = results.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            'https://metadata.datadrivendiscovery.org/types/Score',
        )
        if self.hyperparams['add_normalized_scores']:
            results.metadata = results.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, 2),
                'https://metadata.datadrivendiscovery.org/types/Score',
            )

        return base.CallResult(results)
Example #10
    def produce_collection(
        self,
        *,
        inputs: container.Dataset,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Running {__name__}")

        # get the learning data (the dataset entry point)
        learning_id, learning_df = base_utils.get_tabular_resource(
            inputs, None, pick_entry_point=True)

        learning_df = learning_df.head(
            int(learning_df.shape[0] * self.hyperparams["sample"]))
        learning_df.metadata = self._update_metadata(inputs.metadata,
                                                     learning_id, learning_df)

        # find the column that is acting as the foreign key and extract the resource + column it references
        for i in range(
                learning_df.metadata.query(
                    (metadata_base.ALL_ELEMENTS, ))["dimension"]["length"]):
            column_metadata = learning_df.metadata.query_column(i)
            if ("foreign_key" in column_metadata
                    and column_metadata["foreign_key"]["type"] == "COLUMN"):
                resource_id = column_metadata["foreign_key"]["resource_id"]
                file_column_idx = column_metadata["foreign_key"][
                    "column_index"]

        # get the learning data (the dataset entry point)
        collection_id, collection_df = base_utils.get_tabular_resource(
            inputs, resource_id)

        collection_df = collection_df.head(learning_df.shape[0])
        collection_df.metadata = self._update_metadata(inputs.metadata,
                                                       collection_id,
                                                       collection_df)

        # get the base path
        base_path = collection_df.metadata.query(
            (metadata_base.ALL_ELEMENTS,
             file_column_idx))["location_base_uris"][0]

        # create fully resolved paths and load
        paths = learning_df.iloc[:, file_column_idx]  # TODO: remove, unused?

        file_paths = []
        for i, row in learning_df.iterrows():
            if i % 100 == 0:
                logger.debug(f"Loaded {i} / {len(learning_df.index)} files")
            try:
                start_end = row["start-end-time-slice-of-recording"]
                start, end = [float(x) for x in start_end.split(",")]
                file_paths.append((os.path.join(base_path,
                                                row["filename"]), start, end))
            except AttributeError as e:
                logger.warning("no start/end ts for {}".format(row))
                file_paths.append((os.path.join(base_path,
                                                row["filename"]), None, None))

        outputs = self._audio_load(self.hyperparams["n_jobs"], file_paths)

        logger.debug(f"\n{outputs}")

        result_df = pd.DataFrame({"audio":
                                  outputs})  # d3m container takes for_ever_
        return base.CallResult(
            container.DataFrame(result_df, generate_metadata=False))
Example #11
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        # obtain the path to dataset
        temp_json = inputs.to_json_structure()
        datasetDoc_uri = temp_json['location_uris'][0][7:]
        location_base_uri = '/'.join(datasetDoc_uri.split('/')[:-1])

        with open(datasetDoc_uri) as json_file:
            datasetDoc_json = json.load(json_file)
            dataResources = datasetDoc_json['dataResources']

        # get the task type from the task docs
        temp_path = datasetDoc_uri.split('/')
        problemDoc_uri = '/'.join(temp_path[:-2]) + '/' + '/'.join(
            temp_path[-2:]).replace('dataset', 'problem')

        with open(problemDoc_uri) as json_file:
            task_types = json.load(json_file)['about']['taskKeywords']

        # TODO consider avoiding explicit use of problem type throughout pipeline
        TASK = ""
        for task in task_types:
            if task in [
                    "communityDetection", "linkPrediction",
                    "vertexClassification", "graphMatching"
            ]:
                TASK = task
        if TASK == "":
            raise exceptions.NotSupportedError(
                "only graph tasks are supported")

        # load the graphs and convert to a networkx object
        graphs = []
        nodeIDs = []
        for i in dataResources:
            if i['resType'] == "table":
                if i['resID'] == 'learningData':
                    df = inputs['learningData']
                else:
                    node_list = pd.read_csv(location_base_uri + "/" +
                                            i['resPath'])

                    # assume it is a nodeList otherwise. currently, there
                    # aren't any D3M nodeList datasets that have more than one
                    # graph. furthermore, even if there were, there isn't
                    # a way to match an edgeList to a nodeList. hence, we have
                    # to assume that the nodeList corresponds to the first graph
                    graph = graphs[0]

                    # the following block essentially catches the VXTC synthetic
                    # dataset and overwrites nodeList indices with edgeList ones.
                    # without a doubt not an AutoML way, but it is necessary
                    first_idx_edge = str(
                        sorted(list(graph.nodes(data=False)))[0])
                    first_idx_node = str(sorted(list(node_list['nodeID']))[0])
                    if (first_idx_edge.isdigit() and first_idx_node.isdigit()
                            and int(first_idx_edge) != int(first_idx_node)):
                        node_list = node_list.sort_values(
                            'nodeID').reset_index(drop=True)
                        d3m_indices = np.sort(
                            np.array(list(
                                graph.nodes(data=False))).astype(int))
                        node_list['nodeID'] = d3m_indices

                    # make nodeID an index (so it is not used as an attribute)
                    node_list = node_list.set_index('nodeID')
                    node_list.index = node_list.index.astype(str)

                    # iterate over attributes and assign them to nodes
                    for attribute in node_list.columns.tolist():
                        series = pd.Series(node_list[attribute],
                                           index=node_list.index)
                        nx.set_node_attributes(graph, series.to_dict(),
                                               attribute)

            elif i['resType'] == 'graph':
                graph_temp = nx.read_gml(location_base_uri + "/" +
                                         i['resPath'])
                graphs.append(graph_temp)
                if TASK in ["communityDetection", "vertexClassification"]:
                    nodeIDs_temp = list(
                        nx.get_node_attributes(graphs[0], 'nodeID').values())
                    nodeIDs_temp = np.array([str(i) for i in nodeIDs_temp])
                    nodeIDs_temp = container.ndarray(nodeIDs_temp)
                    nodeIDs.append(nodeIDs_temp)
            elif i['resType'] == "edgeList":
                temp_graph = self._read_edgelist(
                    location_base_uri + "/" + i['resPath'],
                    i["columns"],
                )
                graphs.append(temp_graph)
                if TASK in ["communityDetection", "vertexClassification"]:
                    nodeIDs_temp = list(temp_graph.nodes)
                    nodeIDs_temp = np.array([str(i) for i in nodeIDs_temp])
                    nodeIDs_temp = container.ndarray(nodeIDs_temp)
                    nodeIDs.append(nodeIDs_temp)

        return base.CallResult(container.List([df, graphs, nodeIDs, TASK]))
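# --- Editor's sketch (not part of the example above) ---
# The _read_edgelist helper used above is not shown. A minimal, hypothetical
# equivalent with pandas and networkx might look like this; the column names
# are assumptions.
import networkx as nx
import pandas as pd

edges = pd.DataFrame({'V1_nodeID': [0, 1, 2], 'V2_nodeID': [1, 2, 0]})
graph = nx.from_pandas_edgelist(edges, source='V1_nodeID', target='V2_nodeID')
nodeIDs = [str(n) for n in graph.nodes]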
Example #12
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        np.random.seed(1234)

        G = inputs[0].copy()

        if type(G) == networkx.classes.graph.Graph:
            if networkx.is_weighted(G):
                E = int(networkx.number_of_edges(G))
                g = self._pass_to_ranks(G, nedges=E)
            else:
                E = int(networkx.number_of_edges(G))
                g = networkx.to_numpy_array(G)
        elif type(G) is np.ndarray:
            G = networkx.to_networkx_graph(G)
            E = int(networkx.number_of_edges(G))
            g = self._pass_to_ranks(G, nedges=E)
        else:
            raise ValueError("networkx Graph and n x d numpy arrays only")

        n = g.shape[0]

        if self.hyperparams['use_attributes']:
            adj = [g]
            MORE_ATTR = True
            attr_number = 1
            while MORE_ATTR:
                attr = 'attr'
                temp_attr = np.array(
                    list(
                        networkx.get_node_attributes(
                            G, 'attr' + str(attr_number)).values()))
                if len(temp_attr) == 0:
                    MORE_ATTR = False
                else:
                    adj.append(temp_attr)
                    attr_number += 1
            for i in range(1, len(adj)):
                adj[i] = self._pass_to_ranks(adj[i], nedges=E, matrix=True)

            if len(adj) > 1:
                g = self._omni(adj)
                D = np.linalg.pinv(np.diag(g.sum(axis=1))**(1 / 2))
                L = D @ g @ D

                M = len(adj)

                tsvd = TruncatedSVD(
                    n_components=self.hyperparams['max_dimension'])
                tsvd.fit(L)

                eig_vectors = tsvd.components_.T
                eig_values = tsvd.singular_values_

                d = self._get_elbows(eigenvalues=eig_values)

                X_hat = eig_vectors[:, :d].dot(np.diag(eig_values[:d]**0.5))
                avg = np.zeros(shape=(n, d))

                for i in range(M):
                    for j in range(n):
                        avg[j] += X_hat[i * n + j]
                for j in range(n):
                    avg[j, :] = avg[j, :] / M

                embedding = avg.copy()

                inputs[0] = container.ndarray(embedding)

                return base.CallResult(inputs)

        D = np.linalg.pinv(np.diag(g.sum(axis=1))**(1 / 2))

        L = D @ g @ D

        d_max = self.hyperparams['max_dimension']

        tsvd = TruncatedSVD(n_components=d_max)
        tsvd.fit(L)

        eig_vectors = tsvd.components_.T
        eig_values = tsvd.singular_values_

        eig_vectors_copy = eig_vectors[:, :].copy()

        X_hat = eig_vectors_copy.dot(np.diag(eig_values**0.5))

        inputs[0] = container.ndarray(X_hat)

        return base.CallResult(inputs)
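# --- Editor's sketch (not part of the example above) ---
# The core of produce() above: embed a graph by factoring the symmetrically
# normalized adjacency D^{-1/2} A D^{-1/2} with a truncated SVD. The toy graph
# and embedding dimension are illustrative only.
import networkx
import numpy as np
from sklearn.decomposition import TruncatedSVD

G = networkx.karate_club_graph()
A = networkx.to_numpy_array(G)

D_inv_sqrt = np.linalg.pinv(np.diag(A.sum(axis=1))**(1 / 2))
L = D_inv_sqrt @ A @ D_inv_sqrt

tsvd = TruncatedSVD(n_components=3).fit(L)
X_hat = tsvd.components_.T.dot(np.diag(tsvd.singular_values_**0.5))  # n x 3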
Example #13
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[None]:
        if self._fitted:
            return base.CallResult(None)

        self._embedding = self._training_inputs[1][0]

        self._nodeIDs = np.array(self._training_inputs[2])

        try:
            self._seeds = self._training_inputs[0]['G1.nodeID']
        except KeyError:
            self._seeds = self._training_inputs[0]['nodeID'].astype(
                float).astype(int)

        self._seeds = np.array([int(i) for i in self._seeds])

        try:
            self._labels = self._training_inputs[0]['classLabel']
            self._problem = 'VN'
        except KeyError:
            self._labels = self._training_inputs[0]['community']
            self._problem = 'CD'

        self._labels = np.array([int(i) for i in self._labels])

        unique_labels, label_counts = np.unique(self._labels,
                                                return_counts=True)
        K = len(unique_labels)

        n, d = self._embedding.shape

        if int(K) < d:
            self._embedding = self._embedding[:, :K].copy()
            d = int(K)

        self._ENOUGH_SEEDS = True  # For full estimation

        # get unique labels
        unique_labels, label_counts = np.unique(self._labels,
                                                return_counts=True)
        for i in range(K):
            if label_counts[i] < d * (d + 1) / 2:
                self._ENOUGH_SEEDS = False
                break

        self._pis = label_counts / len(self._seeds)

        # reindex labels if necessary
        for i in range(len(self._labels)):  # reset labels to [0,.., K-1]
            itemindex = np.where(unique_labels == self._labels[i])[0][0]
            self._labels[i] = int(itemindex)

        # gather the means
        x_sums = np.zeros(shape=(K, d))

        estimated_means = np.zeros((K, d))
        for i in range(K):
            temp_seeds = self._seeds[np.where(self._labels == i)[0]]
            estimated_means[i] = np.mean(self._embedding[temp_seeds], axis=0)
        #for i in range(len(self._seeds)):
        #    nodeID = np.where(self._nodeIDs == self._seeds[i])[0][0]
        #    temp_feature_vector = self._embedding[nodeID, :]
        #    temp_label = self._labels[i]
        #    x_sums[temp_label, :] += temp_feature_vector

        #estimated_means = [x_sums[i,:]/label_counts[i] for i in range(K)]

        mean_centered_sums = np.zeros(shape=(K, d, d))

        covs = np.zeros(shape=(K, d, d))
        for i in range(K):
            feature_vectors = self._embedding[self._seeds[self._labels ==
                                                          i], :]
            covs[i] = np.cov(feature_vectors, rowvar=False)

        if self._ENOUGH_SEEDS:
            estimated_cov = covs
        else:
            estimated_cov = np.zeros(shape=(d, d))
            for i in range(K):
                estimated_cov += covs[i] * (label_counts[i] - 1)
            estimated_cov = estimated_cov / (n - K)

        self._PD = True

        self._means = container.ndarray(estimated_means)
        self._covariances = container.ndarray(estimated_cov)

        self._fitted = True

        return base.CallResult(None)
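# --- Editor's sketch (not part of the example above) ---
# Standalone illustration of the seeded estimates in fit() above: per-class
# means, and a pooled covariance when classes have too few seeds. The toy
# embedding and labels are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 2))                     # embedding
y = rng.integers(0, 3, size=60)                  # seed labels

K = len(np.unique(y))
means = np.array([X[y == k].mean(axis=0) for k in range(K)])
covs = np.array([np.cov(X[y == k], rowvar=False) for k in range(K)])

# pooled estimate: weight each class covariance by (n_k - 1)
counts = np.bincount(y)
pooled = sum(covs[k] * (counts[k] - 1) for k in range(K)) / (len(y) - K)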
Example #14
    def produce(self,
                *,
                inputs: Inputs,
                iterations: int = None,
                timeout: float = None) -> base.CallResult[Outputs]:
        """
        Inputs: Dataset dataFrame
        Returns: Pandas DataFrame for the classification or regression task
        """
        # Get all Nested media files
        image_columns = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/FileName')  # [1]
        base_paths = [
            inputs.metadata.query((metadata_base.ALL_ELEMENTS, t))
            for t in image_columns
        ]  # Image Dataset column names
        base_paths = [
            base_paths[t]['location_base_uris'][0].replace('file:///', '/')
            for t in range(len(base_paths))
        ]  # Path + media
        all_img_paths = [[
            os.path.join(base_path, filename)
            for filename in inputs.iloc[:, col]
        ] for base_path, col in zip(base_paths, image_columns)]

        # Delete columns with path names of nested media files
        outputs = inputs.remove_columns(image_columns)

        # Set model to evaluate mode
        self.model.eval()

        return_argmax = False
        if self.hyperparams['loss_type'] == 'crossentropy':
            # Multi-class classification, therefore call argmax on inference
            return_argmax = True

        # Feature extraction without fitting
        if self.hyperparams['feature_extract_only']:
            features = []
            for idx in range(len(all_img_paths)):
                img_paths = all_img_paths[idx]
                for imagefile in img_paths:
                    if os.path.isfile(imagefile):
                        image = Image.open(imagefile)
                        image = self.val_pre_process(
                            image)  # To pytorch tensor
                        image = image.unsqueeze(0)  # 1 x C x H x W
                        feature = self.model(
                            image.to(self.device),
                            include_last_layer=self.include_last_layer)
                        if self.final_layer is not None:
                            feature = self.final_layer(feature)
                        if len(feature.shape) > 1:
                            feature = torch.flatten(feature)
                        feature = feature.data.cpu().numpy()
                        #print(feature.shape)
                    else:
                        logging.warning(
                            "No such file {}. Feature vector will be set to all zeros."
                            .format(imagefile))
                        feature = np.zeros((self.expected_feature_out_dim))
                    # Collect features
                    features.append(feature)
            # Feature vector data frame
            feature_vectors = container.DataFrame(features,
                                                  generate_metadata=True)

            # Update Metadata for each feature vector dataframe column
            for col in range(feature_vectors.shape[1]):
                col_dict = dict(
                    feature_vectors.metadata.query(
                        (metadata_base.ALL_ELEMENTS, col)))
                col_dict['structural_type'] = type(1.0)
                col_dict['name'] = "vector_" + str(col)
                col_dict["semantic_types"] = (
                    "http://schema.org/Float",
                    "https://metadata.datadrivendiscovery.org/types/Attribute",
                )
                feature_vectors.metadata = feature_vectors.metadata.update(
                    (metadata_base.ALL_ELEMENTS, col), col_dict)

            # Add the features to the input labels with data removed
            outputs = outputs.append_columns(feature_vectors)
        #-----------------------------------------------------------------------
        else:
            # Inference
            if not self._fitted and self.hyperparams['output_dim'] != 1000:
                raise Exception('Please fit the model before calling produce!')

            # Get Label Columns Names
            label_columns = self._training_outputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/TrueTarget'
            )  # [2]
            if label_columns is None or len(label_columns) == 0:
                label_columns = self._training_outputs.metadata.get_columns_with_semantic_type(
                    'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
                )  # [2]
            label_columns_names = [
                list(self._training_outputs.columns)[i] for i in label_columns
            ]

            predictions = []
            for idx in range(len(all_img_paths)):
                img_paths = all_img_paths[idx]
                for imagefile in img_paths:
                    if os.path.isfile(imagefile):
                        image = Image.open(imagefile)
                        image = self.val_pre_process(
                            image)  # To pytorch tensor
                        image = image.unsqueeze(0)  # 1 x C x H x W
                        _out = self.model(
                            image.to(self.device),
                            include_last_layer=self.include_last_layer)
                        if return_argmax:
                            _out = torch.argmax(_out, dim=-1, keepdim=False)
                        _out = torch.flatten(_out)
                        _out = _out.data.cpu().numpy()
                    else:
                        logging.warning(
                            "No such file {}. Prediction will be set to all zeros."
                            .format(imagefile))
                        _out = np.zeros((self.hyperparams['output_dim']))
                    # Collect features
                    predictions.append(_out)

            # Convert to d3m type with metadata
            preds = container.DataFrame(predictions, generate_metadata=True)

            # Update Metadata for each feature vector column
            for col in range(preds.shape[1]):
                col_dict = dict(
                    preds.metadata.query((metadata_base.ALL_ELEMENTS, col)))
                col_dict['structural_type'] = type(1.0)
                col_dict['name'] = label_columns_names[col]
                col_dict["semantic_types"] = (
                    "http://schema.org/Float",
                    "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
                )
                preds.metadata = preds.metadata.update(
                    (metadata_base.ALL_ELEMENTS, col), col_dict)

            # Add the features to the input labels with data removed
            outputs = outputs.append_columns(preds)
        #-----------------------------------------------------------------------

        return base.CallResult(outputs)
Example #15
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Input
            G: an n x n matrix or a networkx Graph
        Return
            The largest connected component of g

        """
        try:
            G = inputs['0']
        except KeyError:
            edge_list = inputs['1']  # for edge lists
            try:
                V1_nodeIDs = np.array(edge_list.V1_nodeID.values).astype(int)
                V2_nodeIDs = np.array(edge_list.V2_nodeID.values).astype(int)
                edge_weights = np.array(
                    edge_list.edge_weight.values).astype(float).astype(int)
            except AttributeError:
                V1_nodeIDs = np.array(edge_list.node1.values).astype(int)
                V2_nodeIDs = np.array(edge_list.node2.values).astype(int)
                edge_weights = np.ones(len(V1_nodeIDs))
            n_edges = len(V1_nodeIDs)

            unique_V1_nodeIDs = np.unique(V1_nodeIDs)
            unique_V2_nodeIDs = np.unique(V2_nodeIDs)

            concatenated_unique_IDs = np.concatenate(
                (unique_V1_nodeIDs, unique_V2_nodeIDs))

            unique_all = np.unique(concatenated_unique_IDs)

            n_nodes = len(unique_all)

            G = nx.Graph()
            G.add_nodes_from(unique_all)

            for i in range(n_edges):
                G.add_edge(V1_nodeIDs[i],
                           V2_nodeIDs[i],
                           weight=edge_weights[i])

        print(type(G), file=sys.stderr)

        csv = inputs['learningData']

        if len(csv) != 0:
            if len(list(nx.get_node_attributes(G, 'nodeID').values())) == 0:
                nx.set_node_attributes(G, -1, 'nodeID')
                for i in range(len(G)):
                    G.nodes[i]['nodeID'] = i

            nodeIDs = list(nx.get_node_attributes(G, 'nodeID').values())
            nodeIDs = container.ndarray(np.array([int(i) for i in nodeIDs]))

            return base.CallResult(container.List([G.copy(), nodeIDs, csv]))

        if type(G) == np.ndarray:
            if G.ndim == 2:
                if G.shape[0] == G.shape[1]:  # n x n matrix
                    G = nx.Graph(G)
                else:
                    raise TypeError(
                        "Networkx graphs or n x n numpy arrays only")

        subgraphs = [G.subgraph(i).copy() for i in nx.connected_components(G)]

        G_connected = [[0]]
        for i in subgraphs:
            if len(i) > len(G_connected[0]):
                G_connected = [i]

        nodeIDs = list(
            nx.get_node_attributes(G_connected[0], 'nodeID').values())
        nodeIDs = container.ndarray(np.array([int(i) for i in nodeIDs]))

        return base.CallResult(
            container.List([G_connected[0].copy(), nodeIDs, csv]))
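# --- Editor's sketch (not part of the example above) ---
# The core of produce() above: keep only the largest connected component of a
# graph. The toy graph is illustrative only.
import networkx as nx

G = nx.Graph([(0, 1), (1, 2), (3, 4)])
largest = max(nx.connected_components(G), key=len)
G_connected = G.subgraph(largest).copy()         # nodes {0, 1, 2}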
Example #16
    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        # make sure the target column is of a valid type
        target_idx = self.hyperparams['target_col_index']
        if not self._can_use_column(inputs.metadata, target_idx):
            raise exceptions.InvalidArgumentValueError(
                'column idx=' + str(target_idx) + ' from ' +
                str(inputs.columns) +
                ' does not contain continuous or discrete type')

        # check if target is discrete or continuous
        semantic_types = inputs.metadata.query_column(
            target_idx)['semantic_types']
        discrete = len(set(semantic_types).intersection(
            self._discrete_types)) > 0

        # make a copy of the inputs and clean out any missing data
        feature_df = inputs.copy()
        feature_df.dropna(inplace=True)

        # split out the target feature
        target_df = feature_df.iloc[:, target_idx]

        # drop features that are not compatible with ranking
        feature_indices = set(
            utils.list_columns_with_semantic_types(inputs.metadata,
                                                   self._semantic_types))
        role_indices = set(
            utils.list_columns_with_semantic_types(inputs.metadata,
                                                   self._roles))
        feature_indices = feature_indices.intersection(role_indices)

        all_indices = set(range(0, inputs.shape[1]))
        skipped_indices = all_indices.difference(feature_indices)
        skipped_indices.add(target_idx)  # drop the target too
        for i, v in enumerate(skipped_indices):
            feature_df.drop(inputs.columns[v], axis=1, inplace=True)

        # figure out the discrete and continuous feature indices and create an array
        # that flags them
        discrete_indices = utils.list_columns_with_semantic_types(
            inputs.metadata, self._discrete_types)
        discrete_flags = [False] * feature_df.shape[1]
        for v in discrete_indices:
            col_name = inputs.columns[v]
            if col_name in feature_df:
                col_idx = feature_df.columns.get_loc(col_name)
                discrete_flags[col_idx] = True

        target_np = target_df.values
        feature_np = feature_df.values

        # compute mutual information for discrete or continuous target
        ranked_features_np = None
        if discrete:
            ranked_features_np = mutual_info_classif(
                feature_np,
                target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)
        else:
            ranked_features_np = mutual_info_regression(
                feature_np,
                target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)

        # merge back into a single list of col idx / rank value tuples
        data: typing.List[typing.Tuple[int, str, float]] = []
        data = self._append_rank_info(inputs, data, ranked_features_np,
                                      feature_df)

        cols = ['idx', 'name', 'rank']
        results = container.DataFrame(data=data, columns=cols)
        results = results.sort_values(by=['rank'],
                                      ascending=False).reset_index(drop=True)

        # wrap as a D3M container - metadata should be auto generated
        return base.CallResult(results)
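# --- Editor's sketch (not part of the example above) ---
# Standalone illustration of the mutual-information ranking in produce() above,
# with a toy discrete target; the data, column names, and discrete flags are
# illustrative only.
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

rng = np.random.default_rng(0)
X = pd.DataFrame({'a': rng.normal(size=100),
                  'b': rng.integers(0, 3, size=100)})
y = (X['b'] > 0).astype(int)                     # target depends on 'b'

scores = mutual_info_classif(X.values, y.values,
                             discrete_features=np.array([False, True]),
                             random_state=0)
ranked = (pd.DataFrame({'name': X.columns, 'rank': scores})
          .sort_values('rank', ascending=False)
          .reset_index(drop=True))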
Example #17
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        # If only one resource is in the dataset, we do not have anything to do.
        if inputs.metadata.query(())['dimension']['length'] == 1:
            return base.CallResult(inputs)

        main_resource_id = self.hyperparams['starting_resource']

        if main_resource_id is None:
            for resource_id in inputs.keys():
                if 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint' in inputs.metadata.query(
                    (resource_id, )).get('semantic_types', []):
                    main_resource_id = resource_id
                    break

        if main_resource_id is None:
            raise ValueError(
                "Dataset has multiple resources but no entry point, and no starting resource was specified as a hyper-parameter."
            )

        main_data = inputs[main_resource_id]
        main_columns_length = inputs.metadata.query(
            (main_resource_id,
             metadata_base.ALL_ELEMENTS))['dimension']['length']

        # There is only one resource now.
        top_level_metadata = dict(inputs.metadata.query(()))
        top_level_metadata['dimension'] = dict(top_level_metadata['dimension'])
        top_level_metadata['dimension']['length'] = 1

        # !!! changed part: remove metadata for unloaded resources so the check function passes
        metadata = inputs.metadata.clear(
            top_level_metadata, source=self).set_for_value(None, source=self)
        other_keys = [*inputs]
        other_keys.remove(main_resource_id)
        for each_key in other_keys:
            metadata = metadata.remove(selector=(each_key, ), recursive=True)
        # changed finished

        #metadata = inputs.metadata.clear(top_level_metadata, source=self).set_for_value(None, source=self)

        # The resource is no longer an entry point.
        entry_point_metadata = dict(inputs.metadata.query(
            (main_resource_id, )))
        entry_point_metadata['semantic_types'] = [
            semantic_type
            for semantic_type in entry_point_metadata['semantic_types']
            if semantic_type !=
            'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'
        ]
        metadata = metadata.update((main_resource_id, ),
                                   entry_point_metadata,
                                   source=self)

        data = None

        for column_index in range(main_columns_length):
            column_metadata = inputs.metadata.query(
                (main_resource_id, metadata_base.ALL_ELEMENTS, column_index))

            if 'foreign_key' not in column_metadata:
                # We just copy over data and metadata.
                data, metadata = self._add_column(
                    main_resource_id, data, metadata,
                    self._get_column(main_data, column_index), column_metadata)
            else:
                assert column_metadata['foreign_key'][
                    'type'] == 'COLUMN', column_metadata

                if 'column_index' in column_metadata['foreign_key']:
                    data, metadata = self._join_by_index(
                        main_resource_id,
                        inputs,
                        column_index,
                        data,
                        metadata,
                        column_metadata['foreign_key']['resource_id'],
                        column_metadata['foreign_key']['column_index'],
                    )
                elif 'column_name' in column_metadata['foreign_key']:
                    data, metadata = self._join_by_name(
                        main_resource_id,
                        inputs,
                        column_index,
                        data,
                        metadata,
                        column_metadata['foreign_key']['resource_id'],
                        column_metadata['foreign_key']['column_name'],
                    )
                else:
                    assert False, column_metadata

        resources = {}
        resources[main_resource_id] = data

        # The number of columns has changed.
        all_rows_metadata = dict(
            inputs.metadata.query(
                (main_resource_id, metadata_base.ALL_ELEMENTS)))
        all_rows_metadata['dimension'] = dict(all_rows_metadata['dimension'])
        all_rows_metadata['dimension']['length'] = data.shape[1]
        metadata = metadata.update(
            (main_resource_id, metadata_base.ALL_ELEMENTS),
            all_rows_metadata,
            for_value=resources,
            source=self)

        # !!! changed part: load the whole dataset into resources
        '''
        # This change only works for d3m v2018.6.5. For v2018.7.10 even
        # "metadata.remove" checks the relationship between the resources and
        # the metadata, so all data has to be loaded into the resources before
        # the check/remove step.
        # !!! changed part: remove unloaded metadata to pass the check function
        other_keys = [*inputs]
        other_keys.remove(main_resource_id)
        for each_key in other_keys:
            metadata = metadata.remove(selector=(each_key,), recursive=True, source=resources)
        # changed finished
        '''
        metadata.check(resources)

        dataset = container.Dataset(resources, metadata)

        return base.CallResult(dataset)
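Stripped of the metadata bookkeeping, the foreign-key branch above amounts to a left join of the entry-point table against the referenced resource on the key column. A hedged sketch of that step with plain pandas (resource names, columns, and values below are invented):

# Sketch only: "author_id" stands in for a column whose metadata carries a
# 'foreign_key' reference to the "id" column of another resource; the join
# replaces the key with the referenced resource's columns.
import pandas as pd

learning_data = pd.DataFrame({'d3mIndex': [0, 1, 2], 'author_id': [10, 10, 11]})
authors = pd.DataFrame({'id': [10, 11], 'name': ['Ada', 'Grace']})

denormalized = (learning_data
                .merge(authors, left_on='author_id', right_on='id', how='left')
                .drop(columns=['id']))
print(denormalized)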
Example #18
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        """

        Args:
            inputs: Container DataFrame
            timeout: Default
            iterations: Default

        Returns:
            Container DataFrame containing the absolute sum (abs_sum) of the time series
        """
        self.logger.info('Statistical AbsSum Primitive called')

        # Get cols to fit.
        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            # self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        statistical_abs_sum_input = inputs
        if self.hyperparams['use_semantic_types']:
            statistical_abs_sum_input = inputs.iloc[:, self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:
            statistical_abs_sum_output = self._abs_sum(
                statistical_abs_sum_input, self.hyperparams["window_size"])

            if sparse.issparse(statistical_abs_sum_output):
                statistical_abs_sum_output = statistical_abs_sum_output.toarray(
                )
            outputs = self._wrap_predictions(inputs,
                                             statistical_abs_sum_output)

            #if len(outputs.columns) == len(self._input_column_names):
            # outputs.columns = self._input_column_names

            output_columns = [outputs]

        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._training_indices,
            columns_list=output_columns)

        self.logger.info('Statistical AbsSum Primitive returned')

        return base.CallResult(outputs)
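The `_abs_sum` helper is not shown in this snippet; assuming it computes a sliding-window sum of absolute values over each selected column, an equivalent computation in plain pandas looks roughly like the sketch below (column name and data invented):

# Sketch only: rolling sum of absolute values, the assumed behaviour of the
# unshown _abs_sum(inputs, window_size) helper.
import pandas as pd

series = pd.DataFrame({'value': [1.0, -2.0, 3.0, -1.0, 0.5]})
window_size = 3

abs_sum = series.abs().rolling(window=window_size, min_periods=1).sum()
abs_sum.columns = ['value_abs_sum']
print(abs_sum)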
Example #19
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:

        dataframe_resource_id, dataframe = base_utils.get_tabular_resource(
            inputs,
            self.hyperparams["dataframe_resource"])  # get attribute columns

        hyperparams_class = (
            dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()
            ["primitive_code"]["class_type_arguments"]["Hyperparams"])
        primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(
            hyperparams=hyperparams_class.defaults())

        dataframe_meta = primitive.produce(inputs=inputs).value

        attributes = list_columns_with_semantic_types(
            metadata=dataframe_meta.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Attribute"
            ],
        )

        base_file_path = "/".join(
            inputs.metadata._current_metadata.metadata["location_uris"]
            [0].split("/")[:-1])
        edge_list = pd.read_csv(os.path.join(base_file_path, "graphs",
                                             "edgeList.csv"),
                                index_col=0)
        if len(edge_list.columns) > 2:
            graph = nx.from_pandas_edgelist(
                edge_list,
                source=edge_list.columns[0],
                target=edge_list.columns[1],
                edge_attr=edge_list.columns[2],
            )
        else:
            graph = nx.from_pandas_edgelist(edge_list,
                                            source=edge_list.columns[0],
                                            target=edge_list.columns[1])

        if len(attributes) > 1:
            # add attributes to the nodes.
            attribute_node_map = dataframe_meta[
                dataframe_meta.columns[attributes]]
            attribute_node_map["nodeID"] = attribute_node_map["nodeID"].astype(
                int)
            attribute_node_map.index = attribute_node_map["nodeID"]
            attribute_cols = attribute_node_map.columns
            # Note: the original `.drop(["nodeID"], axis=1)` call here discarded its
            # result and had no effect; nodeID is kept so every node record carries it.
            attribute_node_map = attribute_node_map.to_dict(orient="index")

            for i in graph.nodes:
                default = {attribute: 0 for attribute in attribute_cols}
                default["nodeID"] = i
                graph.nodes[i].update(attribute_node_map.get(i, default))

        else:
            # the featurizer expects, at a minimum, a nodeID on every node
            for i in graph.nodes:
                default = {}
                default["nodeID"] = i
                graph.nodes[i].update(default)
        # int2str_map = dict(zip(graph.nodes, [str(n) for n in graph.nodes]))
        # graph = nx.relabel_nodes(graph, mapping=int2str_map)

        dataframe.metadata = self._update_metadata(inputs.metadata,
                                                   dataframe_resource_id)

        assert isinstance(dataframe, container.DataFrame), type(dataframe)

        U_train = {"graph": graph}
        y_train = self.produce_target(inputs=inputs).value
        X_train = dataframe  # TODO use attribute in vertex classification

        X_train = self._typify_dataframe(X_train)
        X_train.value = pd.DataFrame(X_train.value["nodeID"])
        return base.CallResult([X_train, y_train, U_train])
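Underneath the D3M plumbing, this example loads an edge list into networkx and attaches per-node attributes keyed by nodeID. A small sketch of that pattern with invented data:

# Sketch only: build a graph from an edge list and attach node attributes,
# mirroring the attribute_node_map update above.
import networkx as nx
import pandas as pd

edge_list = pd.DataFrame({'source': [0, 1, 2], 'target': [1, 2, 0],
                          'weight': [0.5, 1.0, 0.25]})
graph = nx.from_pandas_edgelist(edge_list, source='source', target='target',
                                edge_attr='weight')

node_attrs = pd.DataFrame({'nodeID': [0, 1, 2], 'score': [0.1, 0.7, 0.3]})
node_attrs.index = node_attrs['nodeID']
nx.set_node_attributes(graph, node_attrs.to_dict(orient='index'))

print(graph.nodes[0])   # -> {'nodeID': 0, 'score': 0.1}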