Example No. 1
 def _produce_threaded(
     self,
     *,
     index: int,
     left_df_full: container.DataFrame,  # type: ignore
     left_dfs: typing.Sequence[container.DataFrame],  # type: ignore
     right_df: container.DataFrame,  # type: ignore
     join_types: typing.Sequence[str],
     left_col: typing.Sequence[int],
     right_col: typing.Sequence[int],
     accuracy: typing.Sequence[float],
     absolute_accuracy: typing.Sequence[bool]
 ) -> typing.Tuple[int, typing.Optional[base.CallResult[Outputs]]]:
     if left_dfs[index].empty:
         # nothing to join in this partition; the caller treats None as a skip
         return (index, None)
     output = self._produce(
         left_df_full = left_df_full,
         left_df = left_dfs[index].reset_index(drop=True),
         right_df = right_df.copy(),
         join_types = join_types,
         left_col = left_col,
         right_col = right_col,
         accuracy = accuracy,
         absolute_accuracy = absolute_accuracy
     )
     return (index, output)
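The method returns its partition index alongside the result so the caller can restore order after threads finish out of sequence. A minimal sketch of that dispatch pattern, using plain pandas and a hypothetical join_partition worker standing in for _produce:

from concurrent.futures import ThreadPoolExecutor

import pandas as pd

def join_partition(index, part):
    # empty partitions are skipped, mirroring the (index, None) return above
    if part.empty:
        return (index, None)
    return (index, part.reset_index(drop=True))

parts = [pd.DataFrame({"a": [1, 2]}), pd.DataFrame(), pd.DataFrame({"a": [3]})]
with ThreadPoolExecutor() as pool:
    results = list(pool.map(lambda i: join_partition(i, parts[i]), range(len(parts))))
ordered = [df for _, df in sorted(results, key=lambda r: r[0]) if df is not None]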
Example No. 2
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Running {__name__}")

        # set values that only occur once to a special token
        outputs = inputs.copy()

        # determine columns to operate on
        cols = distil_utils.get_operating_columns(
            inputs, self.hyperparams["use_columns"], CATEGORICALS)

        for c in cols:
            vcs = inputs.iloc[:, c].value_counts()
            singletons = set(vcs[vcs == 1].index)
            if singletons:
                mask = outputs.iloc[:, c].isin(singletons)
                outputs.loc[mask, outputs.columns[c]] = SINGLETON_INDICATOR

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
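The loop above is the whole trick: count values per column, collect those that appear exactly once, and overwrite them with a sentinel. A standalone sketch in plain pandas (the SINGLETON_INDICATOR value is assumed; the real constant is defined elsewhere in the module):

import pandas as pd

SINGLETON_INDICATOR = "__singleton__"  # assumed sentinel; defined elsewhere in the source

df = pd.DataFrame({"color": ["red", "red", "blue", "green"]})
counts = df["color"].value_counts()
singletons = set(counts[counts == 1].index)   # {"blue", "green"}
df.loc[df["color"].isin(singletons), "color"] = SINGLETON_INDICATOR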
Example No. 3
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        logger.debug(f"Running {__name__}")

        # determine columns to operate on
        cols = distil_utils.get_operating_columns(
            inputs, self.hyperparams["use_columns"], CATEGORICALS)

        logger.debug(f"Found {len(cols)} categorical columns to evaluate")

        if len(cols) == 0:
            return base.CallResult(inputs)

        imputer = CategoricalImputer(
            strategy=self.hyperparams["strategy"],
            fill_value=self.hyperparams["fill_value"],
            missing_values="",
            tie_breaking="first",
        )
        outputs = inputs.copy()
        failures: List[int] = []
        for c in cols:
            input_col = inputs.iloc[:, c]
            try:
                imputer.fit(input_col)
                result = imputer.transform(input_col)
                outputs.iloc[:, c] = result
            except ValueError:
                # a ValueError is raised when all of the column's data is missing
                if not self.hyperparams["error_on_empty"]:
                    failures.append(c)
                else:
                    raise

        # for columns that failed using 'most_frequent' try again using 'constant'
        if not self.hyperparams["error_on_empty"]:
            imputer = CategoricalImputer(
                strategy="constant",
                fill_value=self.hyperparams["fill_value"],
                missing_values="",
                tie_breaking="first",
            )
            for f in failures:
                outputs_col = outputs.iloc[:, f]
                imputer.fit(outputs_col)
                result = imputer.transform(outputs_col)
                outputs.iloc[:, f] = result

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
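CategoricalImputer here comes from the surrounding package, not scikit-learn. A rough standalone equivalent of the most_frequent-then-constant fallback using scikit-learn's SimpleImputer (the fill value and column data are illustrative):

import numpy as np
from sklearn.impute import SimpleImputer

col = np.array([["a"], ["a"], [""], ["b"]], dtype=object)
if (col == "").all():
    # every value is missing: fall back to a constant fill, as above
    imputer = SimpleImputer(strategy="constant", fill_value="__missing__",
                            missing_values="")
else:
    imputer = SimpleImputer(strategy="most_frequent", missing_values="")
filled = imputer.fit_transform(col)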
Example No. 4
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        if len(self._cols) == 0:
            return base.CallResult(inputs)

        # add the binary encoded columns and remove the source columns
        outputs = inputs.copy()
        encoded_cols = container.DataFrame()
        encoded_cols_source = []
        bin_idx = 0
        for i, c in enumerate(self._cols):
            categorical_inputs = outputs.iloc[:, c]
            result = self._encoders[i].transform(categorical_inputs)
            for j in range(result.shape[1]):
                encoded_cols[f"__binary_{bin_idx}"] = result[:, j]
                encoded_cols_source.append(c)
                bin_idx += 1

        encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

        for c in range(encoded_cols.shape[1]):
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Integer"
            )
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), self._attribute_semantic
            )
            col_dict = dict(
                encoded_cols.metadata.query((metadata_base.ALL_ELEMENTS, c))
            )
            col_dict["source_column"] = outputs.metadata.query(
                (metadata_base.ALL_ELEMENTS, encoded_cols_source[c])
            )["name"]
            encoded_cols.metadata = encoded_cols.metadata.update(
                (metadata_base.ALL_ELEMENTS, c), col_dict
            )

        outputs = outputs.append_columns(encoded_cols)
        outputs = outputs.remove_columns(self._cols)

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
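Each __binary_* column holds one bit of the category index, so n categories need only ceil(log2(n)) columns rather than n one-hot columns. The encoder class used above isn't shown, so this sketch reimplements the idea directly with pandas and numpy:

import numpy as np
import pandas as pd

col = pd.Series(["cat", "dog", "fish", "dog"])
codes, _ = pd.factorize(col)                       # 0, 1, 2, 1
width = max(1, int(np.ceil(np.log2(codes.max() + 1))))
bits = (codes[:, None] >> np.arange(width)) & 1    # one bit per output column
encoded = pd.DataFrame(bits, columns=[f"__binary_{i}" for i in range(width)])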
Example No. 5
    def _remap_graphs(
            cls,
            data: container.DataFrame) -> Tuple[container.DataFrame, int, int]:
        assert data.shape[1] == 2

        data = data.copy()

        data.columns = ("user", "item")

        uusers = np.unique(data.user)
        user_lookup = dict(zip(uusers, range(len(uusers))))
        data.user = data.user.apply(user_lookup.get)

        uitems = np.unique(data.item)
        item_lookup = dict(zip(uitems, range(len(uitems))))
        data.item = data.item.apply(item_lookup.get)

        n_users = len(uusers)
        n_items = len(uitems)

        return data, n_users, n_items
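The remapping exists so that arbitrary user and item identifiers become contiguous integers 0..n-1, suitable for indexing embedding tables. A compact sketch of the same idea on a plain DataFrame:

import numpy as np
import pandas as pd

data = pd.DataFrame({"user": [10, 42, 10], "item": ["a", "b", "a"]})
user_lookup = {u: i for i, u in enumerate(np.unique(data["user"]))}
item_lookup = {v: i for i, v in enumerate(np.unique(data["item"]))}
data["user"] = data["user"].map(user_lookup)       # 0, 1, 0
data["item"] = data["item"].map(item_lookup)       # 0, 1, 0
n_users, n_items = len(user_lookup), len(item_lookup)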
Example No. 6
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        if len(self._cols) == 0:
            return base.CallResult(inputs)

        # encode using the previously identified categorical columns: each
        # list-valued cell is expanded into one column per position
        from itertools import zip_longest

        encoded_cols = container.DataFrame()
        for i in self._cols:
            col_name = inputs.columns[i]
            col = container.DataFrame.from_records(
                zip_longest(*inputs[col_name].values)).T
            col.columns = [f"{col_name}_{x}" for x in range(len(col.columns))]
            encoded_cols = pd.concat([encoded_cols, col], axis=1)

        # append the encoding columns and generate metadata
        outputs = inputs.copy()
        encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

        for c in range(encoded_cols.shape[1]):
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float")

        outputs = outputs.append_columns(encoded_cols)

        # drop the source columns
        outputs = outputs.remove_columns(self._cols)

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
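The zip_longest transpose above turns a column of variable-length sequences into one column per position, padding short rows with None. Isolated with plain pandas:

from itertools import zip_longest

import pandas as pd

df = pd.DataFrame({"vals": [[1, 2, 3], [4], [5, 6]]})
expanded = pd.DataFrame.from_records(zip_longest(*df["vals"].values)).T
expanded.columns = [f"vals_{i}" for i in range(expanded.shape[1])]
# row 1 becomes (4, None, None); row 2 becomes (5, 6, None)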
Example No. 7
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        # fallthrough if there's nothing to do
        if len(self._cols) == 0 or self._encoder is None:
            return base.CallResult(inputs)

        # map encoded cols to source column names
        feature_names = self._encoder.get_feature_names()
        encoded_cols_source = []
        # feature names are xA_YY where A is the source column index and YY is the value
        for name in feature_names:
            # take the first part of the name (xA) and remove the x
            encoded_feature_index = int(name.split("_")[0][1:])
            feature_index = self._cols[encoded_feature_index]
            encoded_cols_source.append(
                inputs.metadata.query((metadata_base.ALL_ELEMENTS, feature_index))[
                    "name"
                ]
            )

        # encode using the previously identified categorical columns
        input_cols = inputs.iloc[:, self._cols]
        result = self._encoder.transform(input_cols)

        # append the encoding columns and generate metadata
        outputs = inputs.copy()
        encoded_cols: container.DataFrame = container.DataFrame()

        for i in range(result.shape[1]):
            encoded_cols[f"__onehot_{i}"] = result[:, i]
        encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

        for c in range(encoded_cols.shape[1]):
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float"
            )
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), self._attribute_semantic
            )
            col_dict = dict(
                encoded_cols.metadata.query((metadata_base.ALL_ELEMENTS, c))
            )
            col_dict["source_column"] = encoded_cols_source[c]
            encoded_cols.metadata = encoded_cols.metadata.update(
                (metadata_base.ALL_ELEMENTS, c), col_dict
            )

        outputs = outputs.append_columns(encoded_cols)

        # drop the source columns
        outputs = outputs.remove_columns(self._cols)

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
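The source-column mapping relies on the older scikit-learn feature-name convention, where get_feature_names returns strings like x0_blue (column index 0, value blue); newer releases renamed this to get_feature_names_out with different naming. A small sketch of the parsing, on illustrative data:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["red", "s"], ["blue", "m"], ["red", "m"]], dtype=object)
encoder = OneHotEncoder(sparse=False).fit(X)       # older scikit-learn API
source_cols = ["color", "size"]
for name in encoder.get_feature_names():           # e.g. "x0_blue", "x1_m"
    src_idx = int(name.split("_")[0][1:])          # drop the leading "x"
    print(name, "->", source_cols[src_idx])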
Example No. 8
    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        cols = ["idx", "name", "rank"]

        # Make sure the target column is of a valid type and return no ranked features if it isn't.
        target_idx = self.hyperparams["target_col_index"]
        if not self._can_use_column(inputs.metadata, target_idx):
            return base.CallResult(container.DataFrame(data={}, columns=cols))

        # check if target is discrete or continuous
        semantic_types = inputs.metadata.query_column(
            target_idx)["semantic_types"]
        discrete = len(set(semantic_types).intersection(
            self._discrete_types)) > 0

        # make a copy of the inputs and clean out any missing data
        feature_df = inputs.copy()
        if self.hyperparams["sub_sample"]:
            sub_sample_size = min(self.hyperparams["sub_sample_size"],
                                  inputs.shape[0])
            rows = random.sample_without_replacement(inputs.shape[0],
                                                     sub_sample_size)
            feature_df = feature_df.iloc[rows, :]
        # if an entire column is NA, drop that column first so that dropna()
        # below doesn't remove every row
        cols_to_drop = feature_df.columns[feature_df.isna().sum() ==
                                          feature_df.shape[0]]
        feature_df.drop(columns=cols_to_drop, inplace=True)
        feature_df.dropna(inplace=True)

        # split out the target feature
        target_df = feature_df.iloc[:,
                                    feature_df.columns.
                                    get_loc(inputs.columns[target_idx])]

        # drop features that are not compatible with ranking
        feature_indices = set(
            inputs.metadata.list_columns_with_semantic_types(
                self._semantic_types))
        role_indices = set(
            inputs.metadata.list_columns_with_semantic_types(self._roles))
        feature_indices = feature_indices.intersection(role_indices)
        feature_indices.remove(target_idx)
        for categ_ind in inputs.metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/CategoricalData",
             )):
            if categ_ind in feature_indices:
                if (np.unique(inputs[inputs.columns[categ_ind]]).shape[0] ==
                        inputs.shape[0]):
                    feature_indices.remove(categ_ind)
                elif (inputs.metadata.query(
                    (metadata_base.ALL_ELEMENTS,
                     categ_ind))["structural_type"] == str):
                    feature_df[inputs.columns[categ_ind]] = pd.to_numeric(
                        feature_df[inputs.columns[categ_ind]])
        text_indices = inputs.metadata.list_columns_with_semantic_types(
            self._text_semantic)

        tfv = TfidfVectorizer(max_features=20)
        column_to_text_features = {}
        text_feature_indices = []
        for text_index in text_indices:
            if (text_index not in feature_indices
                    and text_index in role_indices
                    and text_index != target_idx):
                word_features = tfv.fit_transform(
                    feature_df[inputs.columns[text_index]])
                if issparse(word_features):
                    column_to_text_features[inputs.columns[
                        text_index]] = pd.DataFrame.sparse.from_spmatrix(
                            word_features)
                else:
                    column_to_text_features[
                        inputs.columns[text_index]] = word_features
                text_feature_indices.append(text_index)
        text_feature_indices = set(text_feature_indices)

        # return an empty result if all features were incompatible
        numeric_features = len(feature_indices) > 0
        if not numeric_features and len(column_to_text_features) == 0:
            return base.CallResult(container.DataFrame(data={}, columns=cols))

        all_indices = set(range(0, inputs.shape[1]))
        skipped_indices = all_indices.difference(
            feature_indices.union(text_feature_indices))
        # remove columns that were dropped
        feature_indices = feature_indices - set(
            [inputs.columns.get_loc(c) for c in cols_to_drop])
        for v in skipped_indices:
            feature_df.drop(inputs.columns[v], axis=1, inplace=True)

        # figure out the discrete and continuous feature indices and create an array
        # that flags them
        feature_columns = inputs.columns[list(feature_indices)]
        numeric_data = feature_df[feature_columns]
        discrete_indices = inputs.metadata.list_columns_with_semantic_types(
            self._discrete_types)
        discrete_flags = [False] * numeric_data.shape[1]
        for v in discrete_indices:
            col_name = inputs.columns[v]
            if col_name in numeric_data:
                # only mark columns with at least 1 duplicate value as discrete when
                # predicting a continuous target - there's a check in the bowels of
                # the MI code that will throw an exception otherwise
                if numeric_data[col_name].duplicated().any() and not discrete:
                    col_idx = numeric_data.columns.get_loc(col_name)
                    discrete_flags[col_idx] = True

        target_np = target_df.values

        # compute mutual information for discrete or continuous target
        ranked_features_np = np.empty([0])
        text_ranked_features_np = np.empty((len(column_to_text_features), ))
        if discrete:
            if numeric_features:
                ranked_features_np = mutual_info_classif(
                    numeric_data.values,
                    target_np,
                    discrete_features=discrete_flags,
                    n_neighbors=self.hyperparams["k"],
                    random_state=self._random_seed,
                )
            for i, column in enumerate(column_to_text_features):
                text_rankings = mutual_info_classif(
                    column_to_text_features[column],
                    target_np,
                    discrete_features=[False] *
                    column_to_text_features[column].shape[1],
                    n_neighbors=self.hyperparams["k"],
                    random_state=self._random_seed,
                )
                sum_text_rank = np.sum(text_rankings)
                text_ranked_features_np[i] = sum_text_rank
        else:
            if numeric_features:
                ranked_features_np = mutual_info_regression(
                    numeric_data.values,
                    target_np,
                    discrete_features=discrete_flags,
                    n_neighbors=self.hyperparams["k"],
                    random_state=self._random_seed,
                )
            for i, column in enumerate(column_to_text_features):
                text_rankings = mutual_info_regression(
                    column_to_text_features[column],
                    target_np,
                    discrete_features=[False] *
                    column_to_text_features[column].shape[1],
                    n_neighbors=self.hyperparams["k"],
                    random_state=self._random_seed,
                )
                sum_text_rank = np.sum(text_rankings)
                text_ranked_features_np[i] = sum_text_rank

        ranked_features_np, target_entropy = self._normalize(
            ranked_features_np,
            feature_df[feature_columns],
            target_np,
            discrete,
            discrete_flags,
        )
        text_ranked_features_np = self._normalize_text(
            text_ranked_features_np, column_to_text_features, target_entropy)

        if self.hyperparams["return_as_metadata"]:
            ranked_features_np = np.append(ranked_features_np,
                                           text_ranked_features_np)
            for i, f in enumerate(feature_indices.union(text_feature_indices)):
                column_metadata = inputs.metadata.query(
                    (metadata_base.ALL_ELEMENTS, f))
                rank_dict = dict(column_metadata)
                rank_dict["rank"] = ranked_features_np[i]
                inputs.metadata = inputs.metadata.update(
                    (metadata_base.ALL_ELEMENTS, f),
                    FrozenOrderedDict(rank_dict.items()),
                )
            return base.CallResult(inputs)

        # merge back into a single list of col idx / rank value tuples
        data: typing.List[typing.Tuple[int, str, float]] = []
        data = self._append_rank_info(inputs, data, ranked_features_np,
                                      feature_df[feature_columns])
        data = self._append_rank_info(
            inputs,
            data,
            text_ranked_features_np,
            feature_df[inputs.columns[list(text_feature_indices)]],
        )

        # wrap as a D3M container - metadata should be auto generated
        results = container.DataFrame(data=data,
                                      columns=cols,
                                      generate_metadata=True)
        results = results.sort_values(by=["rank"],
                                      ascending=False).reset_index(drop=True)
        return base.CallResult(results)
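At its core the ranking is a single call to scikit-learn's mutual information estimators, with a per-column mask marking which features are discrete. A minimal sketch of the discrete-target path on synthetic data:

import numpy as np
from sklearn.feature_selection import mutual_info_classif

rng = np.random.RandomState(0)
X = np.column_stack([rng.randint(0, 3, 200), rng.normal(size=200)])
y = (X[:, 0] > 0).astype(int)                  # depends only on the discrete feature
scores = mutual_info_classif(X, y, discrete_features=[True, False], random_state=0)
# scores[0] should clearly dominate scores[1]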
Example No. 9
    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        cols = ['idx', 'name', 'rank']

        # Make sure the target column is of a valid type and return no ranked features if it isn't.
        target_idx = self.hyperparams['target_col_index']
        if not self._can_use_column(inputs.metadata, target_idx):
            return base.CallResult(container.DataFrame(data={}, columns=cols))

        # check if target is discrete or continuous
        semantic_types = inputs.metadata.query_column(
            target_idx)['semantic_types']
        discrete = len(set(semantic_types).intersection(
            self._discrete_types)) > 0

        # make a copy of the inputs and clean out any missing data
        feature_df = inputs.copy()
        feature_df.dropna(inplace=True)

        # split out the target feature
        target_df = feature_df.iloc[:, target_idx]

        # drop features that are not compatible with ranking
        feature_indices = set(
            inputs.metadata.list_columns_with_semantic_types(
                self._semantic_types))
        role_indices = set(
            inputs.metadata.list_columns_with_semantic_types(self._roles))
        feature_indices = feature_indices.intersection(role_indices)
        feature_indices.remove(target_idx)

        # return an empty result if all features were incompatible
        if len(feature_indices) == 0:
            return base.CallResult(container.DataFrame(data={}, columns=cols))

        all_indices = set(range(0, inputs.shape[1]))
        skipped_indices = all_indices.difference(feature_indices)
        for v in skipped_indices:
            feature_df.drop(inputs.columns[v], axis=1, inplace=True)

        # figure out the discrete and continuous feature indices and create an array
        # that flags them
        discrete_indices = inputs.metadata.list_columns_with_semantic_types(
            self._discrete_types)
        discrete_flags = [False] * feature_df.shape[1]
        for v in discrete_indices:
            col_name = inputs.columns[v]
            if col_name in feature_df:
                # only mark columns with at least 1 duplicate value as discrete when
                # predicting a continuous target - there's a check in the bowels of
                # the MI code that will throw an exception otherwise
                if feature_df[col_name].duplicated().any() and not discrete:
                    col_idx = feature_df.columns.get_loc(col_name)
                    discrete_flags[col_idx] = True

        target_np = target_df.values
        feature_np = feature_df.values

        # compute mutual information for discrete or continuous target
        ranked_features_np = None
        if discrete:
            ranked_features_np = mutual_info_classif(
                feature_np,
                target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)
        else:
            ranked_features_np = mutual_info_regression(
                feature_np,
                target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)

        # merge back into a single list of col idx / rank value tuples
        data: typing.List[typing.Tuple[int, str, float]] = []
        data = self._append_rank_info(inputs, data, ranked_features_np,
                                      feature_df)

        # wrap as a D3M container - metadata should be auto generated
        results = container.DataFrame(data=data,
                                      columns=cols,
                                      generate_metadata=True)
        results = results.sort_values(by=['rank'],
                                      ascending=False).reset_index(drop=True)

        return base.CallResult(results)
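The tail of the method packages the scores as (idx, name, rank) rows and sorts descending by rank; that part is plain pandas, sketched here with illustrative scores:

import pandas as pd

scores = {0: 0.12, 2: 0.40}                    # illustrative column-index -> MI score
data = [(idx, f"col_{idx}", rank) for idx, rank in scores.items()]
results = pd.DataFrame(data, columns=["idx", "name", "rank"])
results = results.sort_values(by=["rank"], ascending=False).reset_index(drop=True)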
Example No. 10
    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        # make sure the target column is of a valid type
        target_idx = self.hyperparams['target_col_index']
        if not self._can_use_column(inputs.metadata, target_idx):
            raise exceptions.InvalidArgumentValueError(
                f'column idx={target_idx} from {str(inputs.columns)} '
                'does not contain a continuous or discrete type')

        # check if target is discrete or continuous
        semantic_types = inputs.metadata.query_column(
            target_idx)['semantic_types']
        discrete = len(set(semantic_types).intersection(
            self._discrete_types)) > 0

        # make a copy of the inputs and clean out any missing data
        feature_df = inputs.copy()
        feature_df.dropna(inplace=True)

        # split out the target feature
        target_df = feature_df.iloc[:, target_idx]

        # drop features that are not compatible with ranking
        feature_indices = set(
            utils.list_columns_with_semantic_types(inputs.metadata,
                                                   self._semantic_types))
        role_indices = set(
            utils.list_columns_with_semantic_types(inputs.metadata,
                                                   self._roles))
        feature_indices = feature_indices.intersection(role_indices)

        all_indices = set(range(0, inputs.shape[1]))
        skipped_indices = all_indices.difference(feature_indices)
        skipped_indices.add(target_idx)  # drop the target too
        for v in skipped_indices:
            feature_df.drop(inputs.columns[v], axis=1, inplace=True)

        # figure out the discrete and continuous feature indices and create an array
        # that flags them
        discrete_indices = utils.list_columns_with_semantic_types(
            inputs.metadata, self._discrete_types)
        discrete_flags = [False] * feature_df.shape[1]
        for v in discrete_indices:
            col_name = inputs.columns[v]
            if col_name in feature_df:
                col_idx = feature_df.columns.get_loc(col_name)
                discrete_flags[col_idx] = True

        target_np = target_df.values
        feature_np = feature_df.values

        # compute mutual information for discrete or continuous target
        ranked_features_np = None
        if discrete:
            ranked_features_np = mutual_info_classif(
                feature_np,
                target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)
        else:
            ranked_features_np = mutual_info_regression(
                feature_np,
                target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)

        # merge back into a single list of col idx / rank value tuples
        data: typing.List[typing.Tuple[int, str, float]] = []
        data = self._append_rank_info(inputs, data, ranked_features_np,
                                      feature_df)

        cols = ['idx', 'name', 'rank']
        results = container.DataFrame(data=data, columns=cols)
        results = results.sort_values(by=['rank'],
                                      ascending=False).reset_index(drop=True)

        # wrap as a D3M container - metadata should be auto generated
        return base.CallResult(results)
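The discrete_flags construction is worth isolating: it is a boolean mask aligned to the (possibly column-dropped) feature frame, built by translating column indices from the original inputs into positions in feature_df. A compact sketch with assumed column names:

import pandas as pd

feature_df = pd.DataFrame({"age": [34.0, 51.0], "group": [0, 1]})
discrete_col_names = {"group"}                 # names resolved from semantic types
discrete_flags = [name in discrete_col_names for name in feature_df.columns]
# discrete_flags == [False, True], aligned with feature_df's column order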