示例#1
0
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Pick the columns of ``inputs`` that should be fitted.

        When ``hyperparams['use_semantic_types']`` is falsy every column is
        kept. Otherwise the choice is delegated to
        ``base_utils.get_columns_to_use``, driven by the ``use_columns`` and
        ``exclude_columns`` hyperparameters and ``cls._can_produce_column``.

        Args:
            inputs: Container DataFrame
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            tuple: ``(selected_inputs, column_indices)`` where
            ``selected_inputs`` is ``inputs`` restricted to the chosen columns
            and ``column_indices`` are their positional indices.
        """
        if hyperparams['use_semantic_types']:
            metadata = inputs.metadata

            def can_produce_column(column_index: int) -> bool:
                return cls._can_produce_column(metadata, column_index,
                                               hyperparams)

            # The second element (columns that cannot be produced) is not
            # needed here.
            selected, _ = base_utils.get_columns_to_use(
                metadata,
                use_columns=hyperparams['use_columns'],
                exclude_columns=hyperparams['exclude_columns'],
                can_use_column=can_produce_column)
            return inputs.iloc[:, selected], selected

        # Semantic types are ignored: keep the frame as is and report every
        # positional index.
        return inputs, list(range(len(inputs.columns)))
示例#2
0
    def _get_columns(self, inputs_metadata: metadata_base.DataMetadata,
                     type_to_cast: type) -> typing.Sequence[int]:
        """
        Return the indices of the columns that can be cast to ``type_to_cast``.

        Args:
            inputs_metadata: metadata describing the input columns.
            type_to_cast: target type, forwarded to ``self._can_use_column``.

        Returns:
            The usable column indices reported by
            ``base_utils.get_columns_to_use``.

        Raises:
            ValueError: if no column can be used.
        """
        # https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/cast_to_type.py
        usable, skipped = base_utils.get_columns_to_use(
            inputs_metadata,
            self.hyperparams['use_columns'],
            self.hyperparams['exclude_columns'],
            lambda column_index: self._can_use_column(
                inputs_metadata, column_index, type_to_cast),
        )

        if not usable:
            raise ValueError("No columns to be cast to type '{type}'.".format(
                type=type_to_cast))

        # We prefer if all columns could be cast, not just specified columns,
        # so we warn always when there are columns which cannot be produced.
        if skipped:
            warning_args = {
                'type': type_to_cast,
                'columns': skipped,
            }
            self.logger.warning(
                "Not all columns can be cast to type '%(type)s'. Skipping columns: %(columns)s",
                warning_args)

        return usable
示例#3
0
    def _get_columns(
            self,
            inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]:
        """
        Return the indices of the columns this primitive will read.

        Selection is delegated to ``base_utils.get_columns_to_use`` using the
        ``use_columns`` / ``exclude_columns`` hyperparameters and
        ``self._can_use_column`` as the per-column predicate. A warning is
        logged when ``use_columns`` is set and some columns were rejected.
        """
        requested = self.hyperparams["use_columns"]

        def can_use_column(column_index: int) -> bool:
            return self._can_use_column(inputs_metadata, column_index)

        selected, rejected = base_utils.get_columns_to_use(
            inputs_metadata,
            requested,
            self.hyperparams["exclude_columns"],
            can_use_column,
        )

        # We are OK if no columns ended up being read.
        # "base_utils.combine_columns" will throw an error if it cannot work with this.

        # Only warn when the caller explicitly asked for columns and some of
        # them had to be skipped.
        if requested:
            if rejected:
                self.logger.warning(
                    "Not all specified columns contain filenames for supported media types. Skipping columns: %(columns)s",
                    {
                        "columns": rejected,
                    },
                )

        return selected
    def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
        """
        Extract the target columns from ``data``.

        When ``hyperparams['use_semantic_types']`` is falsy, all columns of
        ``data`` are treated as targets. Otherwise a column qualifies only if
        its metadata ``semantic_types`` include the ``TrueTarget`` type, subject
        to the ``use_outputs_columns`` / ``exclude_outputs_columns``
        hyperparameters.

        Returns:
            tuple: ``(targets, target_column_names, target_column_indices)``.
            ``targets`` is an empty list when no column was selected.
        """
        if not hyperparams['use_semantic_types']:
            return data, list(data.columns), list(range(len(data.columns)))

        metadata = data.metadata
        required_types = {"https://metadata.datadrivendiscovery.org/types/TrueTarget"}

        def can_produce_column(column_index: int) -> bool:
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            found_types = set(column_metadata.get('semantic_types', []))
            if not found_types:
                cls.logger.warning("No semantic types found in column metadata")
                return False
            # Every required semantic type must be present on the column.
            return required_types <= found_types

        target_column_indices, _ = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column,
        )

        # Fall back to an empty list when nothing was selected.
        targets = data.select_columns(target_column_indices) if target_column_indices else []
        target_column_names = [data.columns[idx] for idx in target_column_indices]
        return targets, target_column_names, target_column_indices
示例#5
0
	def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
		"""
		Choose the columns of ``inputs`` to fit.

		Args:
			inputs: Container DataFrame
			hyperparams: d3m.metadata.hyperparams.Hyperparams

		Returns:
			tuple: ``(selected_inputs, column_indices)``; all columns when
			``hyperparams['use_semantic_types']`` is falsy.
		"""
		if not hyperparams['use_semantic_types']:
			all_positions = list(range(len(inputs.columns)))
			return inputs, all_positions

		metadata = inputs.metadata

		def can_produce_column(column_index: int) -> bool:
			return cls._can_produce_column(metadata, column_index, hyperparams)

		selected, _ = base_utils.get_columns_to_use(
			metadata,
			use_columns=hyperparams['use_columns'],
			exclude_columns=hyperparams['exclude_columns'],
			can_use_column=can_produce_column,
		)

		# NOTE: observed with hyperparams['use_columns'] = (2,3) and
		# hyperparams['exclude_columns'] = (1,2): the selected columns are still
		# [2], i.e. the excluded column 2 is not removed. Presumably
		# get_columns_to_use ignores exclude_columns when use_columns is
		# given -- TODO confirm.
		subset = inputs.iloc[:, selected]
		return subset, selected
示例#6
0
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Select columns to fit.

        Args:
            inputs: Container DataFrame
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            tuple: ``(selected_inputs, column_indices)``. When
            ``hyperparams['use_semantic_types']`` is falsy, ``inputs`` is
            returned unchanged together with every positional column index.
        """
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        # Columns are selected by index only.
        # TODO: name-based selection (a 'columns_using_method' hyperparameter
        # with 'use_columns_name' / 'exclude_columns_name') was sketched here
        # but is not implemented.
        columns_to_produce, _ = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column,
        )
        return inputs.iloc[:, columns_to_produce], columns_to_produce
示例#7
0
    def _get_columns(
            self,
            inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]:
        """
        Return the indices of the columns to operate on.

        Originally from d3m.primitives.schema_discovery.profiler.Common.
        When the ``overwrite`` hyperparameter is truthy every column is
        accepted by the predicate; otherwise ``self._can_use_column`` decides.
        """
        def can_use_column(column_index: int) -> bool:
            # if overwrite, we detect on all columns
            return (True if self.hyperparams['overwrite'] else
                    self._can_use_column(inputs_metadata, column_index))

        requested = self.hyperparams['use_columns']
        selected, rejected = base_utils.get_columns_to_use(
            inputs_metadata,
            requested,
            self.hyperparams['exclude_columns'],
            can_use_column,
        )

        # We are OK if no columns ended up being parsed.
        # "base_utils.combine_columns" will throw an error if it cannot work with this.

        # Warn only about columns the caller explicitly asked for.
        if requested and rejected:
            details = {
                'columns': rejected,
            }
            self.logger.warning(
                "Not all specified columns can parsed. Skipping columns: %(columns)s",
                details)

        return selected
示例#8
0
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Select the columns to fit, minus the columns that must be dropped.

        Candidate columns are every column when
        ``hyperparams['use_semantic_types']`` is falsy, otherwise the columns
        chosen by ``base_utils.get_columns_to_use``. The indices returned by
        ``cls._get_columns_to_drop`` are then filtered out of the candidates.

        Args:
            inputs: Container DataFrame
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            tuple: ``(selected_inputs, columns_to_produce, columns_to_drop)``.
        """
        if not hyperparams['use_semantic_types']:
            columns_to_produce = list(range(len(inputs.columns)))

        else:
            inputs_metadata = inputs.metadata

            def can_produce_column(column_index: int) -> bool:
                return cls._can_produce_column(inputs_metadata, column_index,
                                               hyperparams)

            columns_to_produce, _ = base_utils.get_columns_to_use(
                inputs_metadata,
                use_columns=hyperparams['use_columns'],
                exclude_columns=hyperparams['exclude_columns'],
                can_use_column=can_produce_column)

        columns_to_drop = cls._get_columns_to_drop(inputs, columns_to_produce,
                                                   hyperparams)

        # Filter in one pass instead of calling list.remove() per column:
        # linear rather than quadratic, and a duplicate or unknown index in
        # columns_to_drop no longer raises ValueError.
        dropped = set(columns_to_drop)
        columns_to_produce = [
            col for col in columns_to_produce if col not in dropped
        ]

        return inputs.iloc[:,
                           columns_to_produce], columns_to_produce, columns_to_drop
示例#9
0
    def _get_columns(
            cls, inputs_metadata: metadata_base.DataMetadata,
            hyperparams: hyperparams.Hyperparams) -> typing.Sequence[int]:
        """
        Return the indices of the usable columns.

        Delegates to ``d3m_utils.get_columns_to_use`` with the ``use_columns``
        and ``exclude_columns`` hyperparameters, using ``cls._can_use_column``
        as the per-column predicate. The list of rejected columns is discarded.
        """
        usable, _rejected = d3m_utils.get_columns_to_use(
            inputs_metadata,
            hyperparams['use_columns'],
            hyperparams['exclude_columns'],
            lambda column_index: cls._can_use_column(inputs_metadata,
                                                     column_index),
        )
        return usable
示例#10
0
    def _get_columns_to_fit(cls, inputs: Input, hyperparams: UEncHyperparameter):
        """
        Return ``(selected_inputs, column_indices)`` for fitting.

        All columns are used when ``hyperparams['use_semantic_types']`` is
        falsy; otherwise ``utils.get_columns_to_use`` picks them based on the
        ``use_columns`` / ``exclude_columns`` hyperparameters and
        ``cls._can_produce_column``.
        """
        if hyperparams['use_semantic_types']:
            metadata = inputs.metadata

            def can_produce_column(column_index: int) -> bool:
                return cls._can_produce_column(metadata, column_index, hyperparams)

            chosen, _ = utils.get_columns_to_use(
                metadata=metadata,
                use_columns=hyperparams['use_columns'],
                exclude_columns=hyperparams['exclude_columns'],
                can_use_column=can_produce_column,
            )
            return inputs.iloc[:, chosen], chosen

        every_column = list(range(len(inputs.columns)))
        return inputs, every_column
示例#11
0
def get_columns_of_type(df, semantic_types):
    """
    Return ``df`` restricted to the columns matching ``semantic_types``.

    Candidate columns are those reported by
    ``df.metadata.list_columns_with_semantic_types(semantic_types)``; the
    selection is then run through ``base_utils.get_columns_to_use`` with empty
    include and exclude lists.

    Raises:
        ValueError: if no column is selected.
    """
    matching = df.metadata.list_columns_with_semantic_types(semantic_types)

    # Positional arguments: metadata, use_columns, exclude_columns, predicate.
    # No hyperparameters are involved, so both column lists are empty.
    selected, _ = base_utils.get_columns_to_use(
        df.metadata, [], [],
        lambda column_index: column_index in matching)

    if selected:
        return df.select_columns(selected)

    raise ValueError(
        "Input data has no columns matching semantic types: {semantic_types}"
        .format(semantic_types=semantic_types))
示例#12
0
    def _get_outputs_columns(self, outputs_metadata: metadata_base.DataMetadata) -> List[int]:
        """
        Return the indices of the outputs columns to use.

        Selection uses the ``use_outputs_columns`` / ``exclude_outputs_columns``
        hyperparameters with ``self._can_use_outputs_column`` as predicate.

        Raises:
            ValueError: if no column is usable and the ``error_on_no_columns``
                hyperparameter is truthy (otherwise a warning is logged).
        """
        requested = self.hyperparams['use_outputs_columns']

        def can_use_column(column_index: int) -> bool:
            return self._can_use_outputs_column(outputs_metadata, column_index)

        usable, skipped = base_utils.get_columns_to_use(
            outputs_metadata,
            requested,
            self.hyperparams['exclude_outputs_columns'],
            can_use_column,
        )

        if not usable:
            if self.hyperparams['error_on_no_columns']:
                raise ValueError("No outputs columns.")
            self.logger.warning("No outputs columns.")
        elif requested and skipped:
            # Some explicitly requested columns were rejected, but at least
            # one column remains usable.
            self.logger.warning(
                "Not all specified outputs columns can be used. Skipping columns: %(columns)s",
                {'columns': skipped},
            )

        return usable
示例#13
0
    def _get_columns(
            self,
            inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]:
        """
        Return the indices of the columns to operate on.

        ``base_utils.get_columns_to_use`` makes the selection from the
        ``use_columns`` / ``exclude_columns`` hyperparameters, asking
        ``self._can_use_column`` about each column. A warning is logged when
        ``use_columns`` is set and some columns were rejected.
        """
        explicitly_requested = self.hyperparams['use_columns']

        accepted, rejected = base_utils.get_columns_to_use(
            inputs_metadata,
            explicitly_requested,
            self.hyperparams['exclude_columns'],
            lambda column_index: self._can_use_column(inputs_metadata,
                                                      column_index),
        )

        # We are OK if no columns ended up being parsed.
        # "base_utils.combine_columns" will throw an error if it cannot work with this.

        should_warn = bool(explicitly_requested) and bool(rejected)
        if should_warn:
            self.logger.warning(
                "Not all specified columns can parsed. Skipping columns: %(columns)s",
                {'columns': rejected})

        return accepted
示例#14
0
    def _get_columns(
        self, inputs_metadata: metadata_base.DataMetadata
    ) -> typing.List[int]:
        """
        Return the indices of the columns to operate on.

        Every column is accepted by the predicate, so the result is governed
        by the ``use_columns`` / ``exclude_columns`` hyperparameters as
        interpreted by ``base_utils.get_columns_to_use``.
        """
        wanted = self.hyperparams["use_columns"]

        # The predicate places no restriction on which columns may be used.
        kept, skipped = base_utils.get_columns_to_use(
            inputs_metadata,
            wanted,
            self.hyperparams["exclude_columns"],
            lambda column_index: True,
        )

        if wanted:
            if skipped:
                self.logger.warning(
                    "Not all specified columns can parsed. Skipping columns: %(columns)s",
                    {"columns": skipped},
                )

        return kept