Example No. 1
    def _produce_column_metadata(
        self, inputs_metadata: metadata_base.DataMetadata, column_index: int, read_files: typing.Sequence[typing.Any],
    ) -> metadata_base.DataMetadata:
        column_metadata = inputs_metadata.select_columns([column_index])
        column_metadata = column_metadata.update_column(0, {
            'structural_type': self._file_structural_type,
            # Clear metadata which is only useful for filename columns.
            'location_base_uris': metadata_base.NO_VALUE,
            'media_types': metadata_base.NO_VALUE,
        })

        # It is not a filename anymore.
        column_metadata = column_metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/FileName')

        # At least one of the listed file semantic types should be set.
        semantic_types = column_metadata.query_column(0).get('semantic_types', [])
        if not set(semantic_types) & set(self._file_semantic_types):
            # Add the first one.
            column_metadata = column_metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), self._file_semantic_types[0])

        for row_index, file in enumerate(read_files):
            # Copy metadata only if we have a container type.
            if isinstance(file, types.Container):
                column_metadata = file.metadata.copy_to(column_metadata, (), (row_index, 0))

        column_metadata = column_metadata.compact(['name', 'structural_type', 'media_types', 'location_base_uris', 'semantic_types'])

        return column_metadata
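
A minimal, standalone sketch of the semantic-type bookkeeping used above, assuming only that the d3m package is installed; the column name "filename" and the ImageObject semantic type stand in for whatever the primitive's _file_semantic_types would contain:

from d3m.metadata import base as metadata_base

# Illustrative set of "file" semantic types the column should end up with.
file_semantic_types = ('http://schema.org/ImageObject',)

# Start from empty metadata and describe a single filename column.
metadata = metadata_base.DataMetadata()
metadata = metadata.update((metadata_base.ALL_ELEMENTS, 0), {
    'name': 'filename',
    'semantic_types': ('https://metadata.datadrivendiscovery.org/types/FileName',),
})

# The column no longer holds filenames once the files are read.
metadata = metadata.remove_semantic_type(
    (metadata_base.ALL_ELEMENTS, 0),
    'https://metadata.datadrivendiscovery.org/types/FileName',
)

# Guarantee at least one of the expected file semantic types is present.
semantic_types = metadata.query((metadata_base.ALL_ELEMENTS, 0)).get('semantic_types', [])
if not set(semantic_types) & set(file_semantic_types):
    metadata = metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), file_semantic_types[0])

print(metadata.query((metadata_base.ALL_ELEMENTS, 0))['semantic_types'])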
Example No. 2
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        columns_to_use = self._get_columns(inputs.metadata, self.hyperparams)
        _logger.debug(f'converting columns: {columns_to_use}')
        _logger.debug(
            f'converting columns: {inputs.iloc[:, columns_to_use].columns}')
        output = inputs.copy()
        for col in columns_to_use:
            output.iloc[:, col] = pd.to_numeric(output.iloc[:, col])
            column_metadata = output.metadata.query(
                (metadata_base.ALL_ELEMENTS, col))
            # Align the declared structural type with the column's semantic type.
            semantic_types = column_metadata.get('semantic_types', ())
            if 'http://schema.org/Integer' in semantic_types:
                output.metadata = output.metadata.update(
                    (metadata_base.ALL_ELEMENTS, col),
                    {'structural_type': int})
            elif 'http://schema.org/Float' in semantic_types:
                output.metadata = output.metadata.update(
                    (metadata_base.ALL_ELEMENTS, col),
                    {'structural_type': float})
            # What to do with missing values?
            # has_missing_value = pd.isnull(output.iloc[:, col]).sum() > 0
        if self.hyperparams['drop_non_numeric_columns']:
            _logger.debug(
                f'dropping columns: {list(np.where(output.dtypes == object)[0])}'
            )
            _logger.debug(
                f'dropping columns: {output.iloc[:, list(np.where(output.dtypes == object)[0])].columns}'
            )
            # np.where returns int64 instead of int, D3M metadata checks for int
            numeric_column_indices = [
                int(x) for x in np.where(output.dtypes != object)[0]
            ]
            output = output.iloc[:, numeric_column_indices]
            output.metadata = output.metadata.select_columns(
                numeric_column_indices)

        return base.CallResult(output)
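
Setting the D3M metadata updates aside, the conversion itself is plain pandas. A minimal sketch of that part, using only pandas and NumPy with made-up column names; errors='coerce' is shown as one possible answer to the open question about missing values above, not something the primitive itself does:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': ['1', '2', '3'],        # integers stored as strings
    'b': ['1.5', '2.5', 'x'],    # floats with one unparsable value
    'c': ['foo', 'bar', 'baz'],  # genuinely non-numeric
})

df['a'] = pd.to_numeric(df['a'])                   # raises if a value cannot be parsed
df['b'] = pd.to_numeric(df['b'], errors='coerce')  # unparsable values become NaN instead

# Analogue of the drop_non_numeric_columns hyperparameter: keep only non-object
# columns, casting np.where's int64 indices to plain int as the primitive does.
numeric_column_indices = [int(i) for i in np.where(df.dtypes != object)[0]]
df = df.iloc[:, numeric_column_indices]
print(df.dtypes)  # a: int64, b: float64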
Example No. 3
def combine_columns_metadata(
    inputs: metadata_base.DataMetadata,
    column_indices: typing.Sequence[int],
    columns_list: typing.Sequence[metadata_base.DataMetadata],
    *,
    return_result: str,
    add_index_columns: bool,
) -> metadata_base.DataMetadata:
    """
    Analogous to ``combine_columns`` but operates only on metadata.
    """

    if return_result == 'append':
        outputs = inputs
        for columns in columns_list:
            outputs = outputs.append_columns(columns)

    elif return_result == 'replace':
        if not column_indices:
            return combine_columns_metadata(
                inputs,
                column_indices,
                columns_list,
                return_result='append',
                add_index_columns=add_index_columns)

        outputs = inputs

        columns_replaced = 0
        for columns in columns_list:
            columns_length = columns.query_field(
                (metadata_base.ALL_ELEMENTS, ), 'dimension')['length']
            if columns_replaced < len(column_indices):
                # It is OK if the slice of "column_indices" is shorter than "columns". Only those columns
                # listed in the slice will be replaced and the others appended after the last replaced column.
                outputs = outputs.replace_columns(
                    columns, column_indices[columns_replaced:columns_replaced +
                                            columns_length])
            else:
                # We insert the rest of columns after the last columns we replaced. We know that "column_indices"
                # is non-empty and that the last item of "column_indices" points to the last column we replaced
                # for those listed in "column_indices". We replaced more columns though, so we have to add the
                # difference, and then add 1 to insert after the last column.
                outputs = outputs.insert_columns(
                    columns, column_indices[-1] +
                    (columns_replaced - len(column_indices)) + 1)
            columns_replaced += columns_length

        if columns_replaced < len(column_indices):
            outputs = outputs.remove_columns(
                column_indices[columns_replaced:len(column_indices)])

    elif return_result == 'new':
        if not any(
                columns_metadata.query_field(
                    (metadata_base.ALL_ELEMENTS, ), 'dimension')['length']
                for columns_metadata in columns_list):
            raise ValueError("No columns produced.")

        outputs = columns_list[0]
        for columns in columns_list[1:]:
            outputs = outputs.append_columns(columns)

        if add_index_columns:
            inputs_index_columns = inputs.get_index_columns()
            outputs_index_columns = outputs.get_index_columns()

            if inputs_index_columns and not outputs_index_columns:
                # Add index columns at the beginning.
                outputs = inputs.select_columns(
                    inputs_index_columns).append_columns(
                        outputs, use_right_metadata=True)

    else:
        raise exceptions.InvalidArgumentValueError(
            "\"return_result\" has an invalid value: {return_result}".format(
                return_result=return_result))

    return outputs
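
A short usage sketch for the 'replace' branch, with combine_columns_metadata defined as above and assuming the d3m package and its container types are available; the frames and column names (d3mIndex, raw, parsed) are made up for illustration:

import pandas as pd
from d3m import container
from d3m.metadata import base as metadata_base

left = container.DataFrame(
    pd.DataFrame({'d3mIndex': [0, 1], 'raw': ['a', 'b']}), generate_metadata=True)
right = container.DataFrame(
    pd.DataFrame({'parsed': [1, 2]}), generate_metadata=True)

# Replace column 1 ("raw") of "left" with the single column described by "right".
combined = combine_columns_metadata(
    left.metadata, [1], [right.metadata],
    return_result='replace', add_index_columns=False,
)

# Expected: still 2 columns, with column 1 now named "parsed".
print(combined.query_field((metadata_base.ALL_ELEMENTS,), 'dimension')['length'])
print(combined.query((metadata_base.ALL_ELEMENTS, 1)).get('name'))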