Example #1
    def _join_numeric_col(cls, left_df: container.DataFrame, left_col: str,
                          right_df: container.DataFrame, right_col: str,
                          accuracy: float) -> pd.DataFrame:
        # use d3mIndex from left col if present
        right_df = right_df.drop(columns='d3mIndex', errors='ignore')

        # fuzzy match each of the left join col against the right join col value and save the results as the left
        # dataframe index
        right_df[right_col] = pd.to_numeric(right_df[right_col])
        choices = right_df[right_col].unique()
        left_df[left_col] = pd.to_numeric(left_df[left_col])
        left_df.index = left_df[left_col].map(
            lambda x: cls._numeric_fuzzy_match(x, choices, accuracy))

        # make the right col the right dataframe index
        right_df = right_df.set_index(right_col)

        # inner join on the left / right indices
        joined = container.DataFrame(
            left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

        # sort on the d3m index if there, otherwise use the joined column
        if 'd3mIndex' in joined:
            joined = joined.sort_values(by=['d3mIndex'])
        else:
            joined = joined.sort_values(by=[left_col])
        joined = joined.reset_index(drop=True)

        return joined
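Note: the helper `_numeric_fuzzy_match` is not shown in this example. A minimal sketch consistent with the call site (a value, an array of candidates, and an accuracy in [0, 1]); the relative-tolerance rule is an assumption:

import numpy as np

def numeric_fuzzy_match(x, choices, accuracy):
    # the nearest candidate wins, provided it lies within the tolerance
    # implied by the requested accuracy (relative tolerance is assumed)
    choices = np.asarray(choices, dtype=float)
    idx = int(np.abs(choices - x).argmin())
    tolerance = (1.0 - accuracy) * abs(x)
    return choices[idx] if abs(choices[idx] - x) <= tolerance else np.nan

print(numeric_fuzzy_match(10.2, [1.0, 10.0, 50.0], accuracy=0.9))  # 10.0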
Example #2
    def _join_string_col(cls, left_df: container.DataFrame, left_col: str,
                         right_df: container.DataFrame, right_col: str,
                         accuracy: float) -> pd.DataFrame:
        # use d3mIndex from left col if present
        right_df = right_df.drop(columns='d3mIndex', errors='ignore')

        # pre-compute fuzzy matches
        left_keys = left_df[left_col].unique()
        right_keys = right_df[right_col].unique()
        matches: typing.Dict[str, typing.Optional[str]] = {}
        for left_key in left_keys:
            matches[left_key] = cls._string_fuzzy_match(
                left_key, right_keys, accuracy * 100)

        # look up pre-computed fuzzy match for each element in the left column
        left_df.index = left_df[left_col].map(lambda key: matches[key])

        # make the right col the right dataframe index
        right_df = right_df.set_index(right_col)

        # inner join on the left / right indices
        joined = container.DataFrame(
            left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

        # sort on the d3m index if there, otherwise use the joined column
        if 'd3mIndex' in joined:
            joined = joined.sort_values(by=['d3mIndex'])
        else:
            joined = joined.sort_values(by=[left_col])
        joined = joined.reset_index(drop=True)

        return joined
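Note: `_string_fuzzy_match` receives `accuracy * 100`, which matches the 0-100 scoring scale of fuzzy string libraries such as fuzzywuzzy. A hedged sketch of such a helper:

from fuzzywuzzy import process

def string_fuzzy_match(match, choices, min_score):
    # extractOne returns (best_match, score), or None when no candidate
    # reaches the score cutoff
    result = process.extractOne(match, choices, score_cutoff=min_score)
    return result[0] if result is not None else None

print(string_fuzzy_match("new yrok", ["new york", "boston"], min_score=80))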
Example #3
    def _join_datetime_col(cls,
                           left_df: container.DataFrame,
                           left_col: str,
                           right_df: container.DataFrame,
                           right_col: str,
                           accuracy: float) -> pd.DataFrame:
        # use d3mIndex from left col if present
        right_df = right_df.drop(columns='d3mIndex', errors='ignore')

        # compute a tolerance delta for time matching based on a percentage of the minimum left/right time
        # range
        choices = np.array([np.datetime64(parser.parse(dt)) for dt in right_df[right_col].unique()])
        left_keys = np.array([np.datetime64(parser.parse(dt)) for dt in left_df[left_col].values])
        time_tolerance = (1.0 - accuracy) * cls._compute_time_range(left_keys, choices)
        
        left_df.index = np.array([cls._datetime_fuzzy_match(dt, choices, time_tolerance) for dt in left_keys])

        # make the right col the right dataframe index
        right_df = right_df.set_index(right_col)

        # inner join on the left / right indices
        joined = container.DataFrame(left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

        # sort on the d3m index if there, otherwise use the joined column
        if 'd3mIndex' in joined:
            joined = joined.sort_values(by=['d3mIndex'])
        else:
            joined = joined.sort_values(by=[left_col])
        joined = joined.reset_index(drop=True)

        return joined
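Note: `_datetime_fuzzy_match` and `_compute_time_range` are not shown. A minimal sketch of the matching half, assuming the tolerance arrives as a numpy timedelta:

import numpy as np

def datetime_fuzzy_match(dt, choices, time_tolerance):
    # the nearest timestamp wins if it falls inside the tolerance window
    deltas = np.abs(choices - dt)
    idx = int(deltas.argmin())
    return choices[idx] if deltas[idx] <= time_tolerance else None

choices = np.array(["2020-01-01", "2020-06-01"], dtype="datetime64[D]")
print(datetime_fuzzy_match(np.datetime64("2020-01-03"), choices,
                           np.timedelta64(7, "D")))  # 2020-01-01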
Example #4
    def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return data, list(data.columns), list(range(len(data.columns)))

        metadata = data.metadata

        def can_produce_column(column_index: int) -> bool:
            accepted_semantic_types = set()
            accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            if len(semantic_types) == 0:
                cls.logger.warning("No semantic types found in column metadata")
                return False
            # Making sure all accepted_semantic_types are available in semantic_types
            if len(accepted_semantic_types - semantic_types) == 0:
                return True
            return False

        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column)
        targets = []
        if target_column_indices:
            targets = data.select_columns(target_column_indices)
        target_column_names = []
        for idx in target_column_indices:
            target_column_names.append(data.columns[idx])
        return targets, target_column_names, target_column_indices
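The subset test in `can_produce_column` reads: produce the column only if every accepted semantic type is present on it. A standalone illustration:

accepted = {"https://metadata.datadrivendiscovery.org/types/TrueTarget"}
semantic_types = {
    "https://metadata.datadrivendiscovery.org/types/TrueTarget",
    "https://metadata.datadrivendiscovery.org/types/Attribute",
}
print(len(accepted - semantic_types) == 0)  # True: the column is a target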
Example #5
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Running {__name__}")

        # set values that only occur once to a special token
        outputs = inputs.copy()

        # determine columns to operate on
        cols = distil_utils.get_operating_columns(
            inputs, self.hyperparams["use_columns"], CATEGORICALS)

        for c in cols:
            vcs = inputs.iloc[:, c].value_counts()
            singletons = set(vcs[vcs == 1].index)
            if singletons:
                mask = outputs.iloc[:, c].isin(singletons)
                outputs.loc[mask, outputs.columns[c]] = SINGLETON_INDICATOR

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
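The singleton-masking step in isolation, with a placeholder token standing in for `SINGLETON_INDICATOR` (whose actual value is not shown here):

import pandas as pd

s = pd.Series(["a", "b", "b", "c"])
vcs = s.value_counts()
singletons = set(vcs[vcs == 1].index)  # {"a", "c"}
s[s.isin(singletons)] = "<singleton>"
print(s.tolist())  # ['<singleton>', 'b', 'b', '<singleton>']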
Example #6
 def _update_metadata_dimension(
         df: container.DataFrame) -> container.DataFrame:
     old_metadata = dict(df.metadata.query(()))
     old_metadata["dimension"] = dict(old_metadata["dimension"])
     old_metadata["dimension"]["length"] = df.shape[0]
     df.metadata = df.metadata.update((), old_metadata)
     return df
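Hypothetical usage (the column name and data are illustrative): after a row-filtering step, refresh the row count stored in the metadata so downstream primitives see a consistent dimension:

from d3m import container

df = container.DataFrame({"value": [1, 0, 3]}, generate_metadata=True)
filtered = df[df["value"] > 0].reset_index(drop=True)
filtered = _update_metadata_dimension(filtered)  # metadata length is now 2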
Example #7
    def _split_aggregated(self, df: container.DataFrame,
                          split_col_names: list) -> container.DataFrame:
        lengths = [len(df.loc[0, col_name]) for col_name in split_col_names]

        for idx, col_name in enumerate(split_col_names):
            if self._sorted_pipe_ids:
                if len(self._sorted_pipe_ids) == lengths[idx]:
                    extend_col_names = [
                        "{}_{}".format(col_name, i)
                        for i in self._sorted_pipe_ids
                    ]
                else:
                    raise ValueError(
                        "Unique number of pipeline ids not equal to the number of aggregated values"
                    )
            else:
                extend_col_names = [
                    "{}_{}".format(col_name, i) for i in range(lengths[idx])
                ]

            extends = container.DataFrame(df.loc[:, col_name].values.tolist(),
                                          columns=extend_col_names)

            df = common_utils.horizontal_concat(left=df, right=extends)
            origin_metadata = dict(
                df.metadata.query(
                    (mbase.ALL_ELEMENTS, df.columns.get_loc(col_name))))

            for name in extend_col_names:
                col_idx = df.columns.get_loc(name)
                origin_metadata["name"] = name
                df.metadata = df.metadata.update((mbase.ALL_ELEMENTS, col_idx),
                                                 origin_metadata)

        return df
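The core expansion trick in isolation: a column whose cells hold lists is flattened into one column per element via `.values.tolist()`:

import pandas as pd

df = pd.DataFrame({"scores": [[1, 2], [3, 4]]})
extends = pd.DataFrame(df["scores"].values.tolist(),
                       columns=["scores_0", "scores_1"])
print(extends)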
Example #8
 def _produce_threaded(
     self,
     *,
     index: int,
     left_df_full: container.DataFrame, # type: ignore
     left_dfs: typing.Sequence[container.DataFrame],  # type: ignore
     right_df: container.DataFrame,  # type: ignore
     join_types: typing.Sequence[str],
     left_col: typing.Sequence[int],
     right_col: typing.Sequence[int],
     accuracy: typing.Sequence[float],
     absolute_accuracy: typing.Sequence[bool]
 ) -> typing.Tuple[int, base.CallResult[Outputs]]:
     if left_dfs[index].empty:
         return (index, None)
     output = self._produce(
         left_df_full = left_df_full,
         left_df = left_dfs[index].reset_index(drop=True),
         right_df = right_df.copy(),
         join_types = join_types,
         left_col = left_col,
         right_col = right_col,
         accuracy = accuracy,
         absolute_accuracy = absolute_accuracy
     )
     return (index, output)
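The surrounding caller is not shown; returning `(index, result)` pairs suggests the partitions are processed in a pool and reassembled by index. A generic sketch of that pattern (not the primitive's actual driver):

from concurrent.futures import ThreadPoolExecutor

def join_partition(index, part):
    # stand-in for the per-partition join
    return index, sorted(part)

partitions = [[3, 1], [], [2]]
with ThreadPoolExecutor() as pool:
    results = list(pool.map(lambda i: join_partition(i, partitions[i]),
                            range(len(partitions))))
print(results)  # [(0, [1, 3]), (1, []), (2, [2])]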
Example #9
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        logger.debug(f"Running {__name__}")

        # determine columns to operate on
        cols = distil_utils.get_operating_columns(
            inputs, self.hyperparams["use_columns"], CATEGORICALS)

        logger.debug(f"Found {len(cols)} categorical columns to evaluate")

        if len(cols) == 0:
            return base.CallResult(inputs)

        imputer = CategoricalImputer(
            strategy=self.hyperparams["strategy"],
            fill_value=self.hyperparams["fill_value"],
            missing_values="",
            tie_breaking="first",
        )
        outputs = inputs.copy()
        failures: List[int] = []
        for c in cols:
            input_col = inputs.iloc[:, c]
            try:
                imputer.fit(input_col)
                result = imputer.transform(input_col)
                outputs.iloc[:, c] = result
            except ValueError as e:
                # value error gets thrown when all data is missing
                if not self.hyperparams["error_on_empty"]:
                    failures.append(c)
                else:
                    raise e

        # for columns that failed using 'most_frequent' try again using 'constant'
        if not self.hyperparams["error_on_empty"]:
            imputer = CategoricalImputer(
                strategy="constant",
                fill_value=self.hyperparams["fill_value"],
                missing_values="",
                tie_breaking="first",
            )
            for f in failures:
                outputs_col = outputs.iloc[:, f]
                imputer.fit(outputs_col)
                result = imputer.transform(outputs_col)
                outputs.iloc[:, f] = result

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
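`CategoricalImputer` here appears to be a project-specific class, so the following only illustrates the 'most_frequent' strategy with a constant fallback in plain pandas:

import pandas as pd

col = pd.Series(["a", "", "a", "b"])
fill = col[col != ""].mode()
filled = col.mask(col == "", fill.iloc[0] if not fill.empty else "<fill_value>")
print(filled.tolist())  # ['a', 'a', 'a', 'b']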
Example #10
 def _generate_labels(self, inputs: container.DataFrame) -> None:
     self._labels = {}
     for col_idx, (label, col) in enumerate(inputs.items()):
         # Get all the unique data in the column and assign each element an int representation.
         # We reserve 0 for unseen labels so we increment the encodings by one
         unique_data = col.unique()
         self._labels[col_idx] = {
             label: encoded + 1
             for encoded, label in enumerate(unique_data)
         }
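The resulting lookup table, illustrated on a toy column:

import pandas as pd

col = pd.Series(["x", "y", "x"])
labels = {label: encoded + 1 for encoded, label in enumerate(col.unique())}
print(labels)  # {'x': 1, 'y': 2}; 0 stays reserved for unseen values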
Example #11
    def _remap_graphs(
            cls,
            data: container.DataFrame) -> Tuple[container.DataFrame, int, int]:
        assert data.shape[1] == 2

        data = data.copy()

        data.columns = ("user", "item")

        uusers = np.unique(data.user)
        user_lookup = dict(zip(uusers, range(len(uusers))))
        data.user = data.user.apply(user_lookup.get)

        uitems = np.unique(data.item)
        item_lookup = dict(zip(uitems, range(len(uitems))))
        data.item = data.item.apply(item_lookup.get)

        n_users = len(uusers)
        n_items = len(uitems)

        return data, n_users, n_items
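The id-remapping trick in isolation: unique values become consecutive integers starting at zero:

import numpy as np
import pandas as pd

users = pd.Series(["u9", "u2", "u9"])
uusers = np.unique(users)                       # ['u2', 'u9']
lookup = dict(zip(uusers, range(len(uusers))))  # {'u2': 0, 'u9': 1}
print(users.apply(lookup.get).tolist())         # [1, 0, 1]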
Example #12
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        df = inputs.select_columns(
            inputs.metadata.list_columns_with_semantic_types(
                ("http://schema.org/Float",)
            )
        )
        df = df.to_numpy().reshape(
            df.shape[0], 2048, self.hyperparams["height"], self.hyperparams["width"]
        )
        all_img_features = []
        batch_size = self.hyperparams["batch_size"]
        spatial_a = 2.0
        spatial_b = 2.0
        for i in range(math.ceil(df.shape[0] / batch_size)):
            features = df[i * batch_size : (i + 1) * batch_size]
            spatial_weight = features.sum(axis=1, keepdims=True)
            z = (spatial_weight ** spatial_a).sum(axis=(2, 3), keepdims=True)
            z = z ** (1.0 / spatial_a)
            spatial_weight = (spatial_weight / z) ** (1.0 / spatial_b)

            _, c, w, h = features.shape
            nonzeros = (features != 0).astype(float).sum(axis=(2, 3)) / 1.0 / (
                w * h
            ) + 1e-6
            channel_weight = np.log(nonzeros.sum(axis=1, keepdims=True) / nonzeros)

            features = features * spatial_weight
            features = features.sum(axis=(2, 3))
            features = features * channel_weight
            all_img_features.append(features)
        all_img_features = np.vstack(all_img_features)
        col_names = [f"feat_{i}" for i in range(0, all_img_features.shape[1])]
        feature_df = pd.DataFrame(all_img_features, columns=col_names)

        outputs = container.DataFrame(feature_df.head(1), generate_metadata=True)
        outputs.metadata = outputs.metadata.update(
            (metadata_base.ALL_ELEMENTS,),
            {"dimension": {"length": feature_df.shape[0]}},
        )
        outputs = outputs.append(feature_df.iloc[1:])
        for idx in range(outputs.shape[1]):
            outputs.metadata = outputs.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, idx), "http://schema.org/Float"
            )

        return base.CallResult(outputs)
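The weighting above resembles cross-dimensional (spatial plus channel) weighted pooling of convolutional features. The spatial half, reduced to a standalone snippet with the same exponents (spatial_a = spatial_b = 2.0):

import numpy as np

feats = np.random.rand(2, 4, 3, 3)            # (batch, channels, h, w)
sw = feats.sum(axis=1, keepdims=True)         # aggregate across channels
z = (sw ** 2.0).sum(axis=(2, 3), keepdims=True) ** (1.0 / 2.0)
sw = (sw / z) ** (1.0 / 2.0)                  # normalized spatial weights
pooled = (feats * sw).sum(axis=(2, 3))        # (batch, channels)
print(pooled.shape)  # (2, 4)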
Example #13
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        if len(self._cols) == 0:
            return base.CallResult(inputs)

        # add the binary encoded columns and remove the source columns
        outputs = inputs.copy()
        encoded_cols = container.DataFrame()
        encoded_cols_source = []
        bin_idx = 0
        for i, c in enumerate(self._cols):
            categorical_inputs = outputs.iloc[:, c]
            result = self._encoders[i].transform(categorical_inputs)
            for j in range(result.shape[1]):
                encoded_cols[(f"__binary_{bin_idx}")] = result[:, j]
                encoded_cols_source.append(c)
                bin_idx += 1

        encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

        for c in range(encoded_cols.shape[1]):
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Integer"
            )
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), self._attribute_semantic
            )
            col_dict = dict(
                encoded_cols.metadata.query((metadata_base.ALL_ELEMENTS, c))
            )
            col_dict["source_column"] = outputs.metadata.query(
                (metadata_base.ALL_ELEMENTS, encoded_cols_source[c])
            )["name"]
            encoded_cols.metadata = encoded_cols.metadata.update(
                (metadata_base.ALL_ELEMENTS, c), col_dict
            )

        outputs = outputs.append_columns(encoded_cols)
        outputs = outputs.remove_columns(self._cols)

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
Example #14
    def _is_unique_key(self, input_column: container.DataFrame) -> bool:
        column_values = input_column.iloc[:, 0]

        # There should be at least one row. This prevents a degenerate case
        # where we would mark a column of no rows as a unique key column.
        # (Otherwise we also get division by zero below.)
        if not len(column_values):
            return False

        # Here we look at every value as-is. Even empty strings and other missing/nan values.
        if any(input_column.duplicated()):
            return False

        return True
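`duplicated()` does the heavy lifting: any repeated row disqualifies the column as a unique key:

import pandas as pd

print(any(pd.DataFrame({"id": [1, 2, 2]}).duplicated()))  # True: not unique
print(any(pd.DataFrame({"id": [1, 2, 3]}).duplicated()))  # False: unique key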
Example #15
    def combine(self, prediction_groups: typing.Dict,
                inputs: container.DataFrame):
        all_results = []

        only_one = 'only_one_time_series' in prediction_groups
        for i, row in inputs.iterrows():
            # date = pd.Timestamp(et.time_indicator.get_datetime(row))
            date = self.time_indicator.get_datetime(row)
            if only_one:
                key = 'only_one_time_series'
            else:
                key = []
                for x in self.categorical_indices:
                    key.append(row.iloc[x])
                key = tuple(key)
            predictions = prediction_groups[key]
            all_results.append(predictions.loc[date, 0])
        return np.array(all_results).T
Example #16
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        columns_list_to_fold = self._mapping.get('foldable_columns', [])
        if len(columns_list_to_fold) == 0:
            return CallResult(inputs, True, 1)
        if inputs.shape[0] > 20000:
            return CallResult(inputs, True, 1)
        self._column_names = list(inputs) if inputs is not None else []
        df = None
        for columns_to_fold in columns_list_to_fold:
            df = self._fold_columns(inputs, columns_to_fold)
        cols_to_drop = list()
        for col_idx, col_name in enumerate(inputs.columns):
            if col_name not in df.columns:
                cols_to_drop.append(col_idx)

        inputs = utils.remove_columns(inputs, cols_to_drop)
        new_df = inputs[0:0]
        for col_name in new_df.columns:
            new_df.loc[:, col_name] = df.loc[:, col_name]

        extends = {}
        for col_name in df.columns:
            if col_name not in new_df.columns:
                extends[col_name] = df.loc[:, col_name].tolist()

        if extends:
            extends_df = d3m_DataFrame.from_dict(extends)
            extends_df.index = new_df.index.copy()
            new_df = utils.append_columns(new_df, extends_df)
            new_df = self._update_type(new_df, list(extends.keys()))

        old_metadata = dict(new_df.metadata.query(()))
        old_metadata["dimension"] = dict(old_metadata["dimension"])
        old_metadata["dimension"]["length"] = new_df.shape[0]
        new_df.metadata = new_df.metadata.update((), old_metadata)

        if new_df is not None:
            return CallResult(new_df, True, 1)
        return CallResult(inputs, True, 1)
Example #17
def update_type(extends, df_origin):
    extends_df = pd.DataFrame.from_dict(extends)
    extends_df = d3m_DataFrame(extends_df, generate_metadata=True)
    if extends != {}:
        extends_df.index = df_origin.index.copy()

    new_df = d3m_DataFrame.append_columns(df_origin, extends_df)

    indices = list()
    for key in extends:
        indices.append(new_df.columns.get_loc(key))

    for idx in indices:
        old_metadata = dict(new_df.metadata.query((mbase.ALL_ELEMENTS, idx)))

        numerics = pd.to_numeric(new_df.iloc[:, idx], errors='coerce')
        length = numerics.shape[0]
        nans = numerics.isnull().sum()

        if nans / length > 0.9:
            if HelperFunction.is_categorical(new_df.iloc[:, idx]):
                old_metadata['semantic_types'] = (
                    "https://metadata.datadrivendiscovery.org/types/CategoricalData",
                )
            else:
                old_metadata['semantic_types'] = ("http://schema.org/Text", )
        else:
            intcheck = (numerics % 1) == 0
            if np.sum(intcheck) / length > 0.9:
                old_metadata['semantic_types'] = (
                    "http://schema.org/Integer", )
            else:
                old_metadata['semantic_types'] = ("http://schema.org/Float", )

        old_metadata['semantic_types'] += (
            "https://metadata.datadrivendiscovery.org/types/Attribute", )

        new_df.metadata = new_df.metadata.update((mbase.ALL_ELEMENTS, idx),
                                                 old_metadata)

    return new_df
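The type-inference heuristic in isolation: coerce to numeric, then use the NaN ratio and the fraction of integral values to pick a semantic type:

import pandas as pd

col = pd.Series(["1", "2", "x", "4"])
numerics = pd.to_numeric(col, errors="coerce")
print(numerics.isnull().sum() / len(numerics))  # 0.25: mostly numeric
intcheck = (numerics % 1) == 0
print(intcheck.sum() / len(numerics))           # 0.75: below 0.9, so Float, not Integer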
Example #18
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        if len(self._cols) == 0:
            return base.CallResult(inputs)

        # encode using the previously identified categorical columns
        input_cols = inputs.iloc[:, self._cols]
        from itertools import zip_longest

        encoded_cols = container.DataFrame()
        for i in self._cols:
            col_name = inputs.columns[i]
            col = container.DataFrame.from_records(
                zip_longest(*inputs[col_name].values)).T
            col.columns = [f"{col_name}_{x}" for x in range(len(col.columns))]
            encoded_cols = pd.concat([encoded_cols, col], axis=1)

        # append the encoding columns and generate metadata
        outputs = inputs.copy()
        encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

        for c in range(encoded_cols.shape[1]):
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float")

        outputs = outputs.append_columns(encoded_cols)

        # drop the source columns
        outputs = outputs.remove_columns(self._cols)

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
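The `zip_longest` transpose trick expands ragged list-columns, padding shorter lists:

from itertools import zip_longest
import pandas as pd

col = pd.Series([[1, 2], [3]])
expanded = pd.DataFrame.from_records(zip_longest(*col.values)).T
print(expanded.values.tolist())  # [[1.0, 2.0], [3.0, nan]]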
Example #19
    def _update_type_info(self, semantic_types: Sequence[str],
                          outputs: container.DataFrame,
                          i: int) -> container.DataFrame:
        # update the structural / df type from the semantic type
        if "http://schema.org/Integer" in semantic_types:
            outputs.metadata = outputs.metadata.update_column(
                i, {"structural_type": int})
            outputs.iloc[:, i] = pd.to_numeric(outputs.iloc[:, i])
        elif "http://schema.org/Float" in semantic_types:
            outputs.metadata = outputs.metadata.update_column(
                i, {"structural_type": float})
            outputs.iloc[:, i] = pd.to_numeric(outputs.iloc[:, i])
        elif "http://schema.org/Boolean" in semantic_types:
            outputs.metadata = outputs.metadata.update_column(
                i, {"structural_type": bool})
            outputs.iloc[:, i] = outputs.iloc[:, i].astype("bool")

        return outputs
Example #20
def calculate_score(ground_truth: container.DataFrame, prediction: container.DataFrame,
                    performance_metrics: typing.List[typing.Dict],
                    task_type, regression_metric: set):
    """
    static method used to calculate the score based on given predictions and metric tpyes
    Parameters
    ---------
    ground_truth: the ground truth of target
    prediction: the predicted results of target
    performance_metrics: the metehod to calculate the score
    task_type: the task type of the problem
    """
    result_metrics = []
    target_amount = 0
    if prediction is not None:
        prediction = graph_problem_conversion(task_type, prediction)

    for metric_description in performance_metrics:
        metricDesc = metric_description['metric']
        params: typing.Dict = metric_description.get('params', {})
        if params:
            metric: problem.PerformanceMetric = metricDesc.get_class()(**params)
        else:
            metric = metricDesc.get_class()
        # updated for d3m v2019.5.8: we need to instantiate the metric class first if it was not done yet
        if type(metric) is AbstractMetaclass:
            metric = metric()

        # special design for objectDetectionAP
        if metric_description["metric"] == problem.PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION:

            if ground_truth is not None and prediction is not None:
                # training_image_name_column = ground_truth.iloc[:,
                #                              ground_truth.shape[1] - 2]
                # prediction.insert(loc=0, column='image_name',
                #                            value=training_image_name_column)

                ground_truth_to_send = ground_truth.iloc[:, ground_truth.shape[1] - 2: ground_truth.shape[1]]
                prediction_to_send = prediction  # .iloc[:, prediction.shape[1] - 2: prediction.shape[1]]
                if prediction_to_send['d3mIndex'].dtype.name != ground_truth_to_send['d3mIndex'].dtype.name:
                    ground_truth_to_send['d3mIndex'] = ground_truth_to_send['d3mIndex'].astype(str)
                    prediction_to_send['d3mIndex'] = prediction_to_send['d3mIndex'].astype(str)

                # truth = ground_truth_to_send.astype(str).values.tolist()
                # predictions = prediction_to_send.astype(str).values.tolist()
                value = metric.score(ground_truth_to_send, prediction_to_send)

                result_metrics.append({
                    'column_name': ground_truth.columns[-1],
                    'metric': metric_description['metric'],
                    'value': value
                })
            return result_metrics
        # END special design for objectDetectionAP

        do_regression_mode = metric_description["metric"] in regression_metric
        try:
            # generate the metrics for training results
            if ground_truth is not None and prediction is not None:
                if "d3mIndex" not in ground_truth.columns:
                    raise NotSupportedError("No d3mIndex found for ground truth!")
                else:
                    ground_truth_amount = len(ground_truth.columns) - 1

                if "d3mIndex" not in prediction.columns:
                    # for the condition that ground_truth have index but
                    # prediction don't have
                    target_amount = len(prediction.columns)
                    prediction.insert(0,'d3mIndex' ,ground_truth['d3mIndex'].copy())
                else:
                    target_amount = len(prediction.columns) - 1

                if prediction['d3mIndex'].dtype.name != ground_truth['d3mIndex'].dtype.name:
                    ground_truth['d3mIndex'] = ground_truth['d3mIndex'].astype(str).copy()
                    prediction['d3mIndex'] = prediction['d3mIndex'].astype(str).copy()

                if ground_truth_amount != target_amount:
                    _logger.error("Ground truth's column count and prediction's column count do not match")
                    _logger.error('prediction columns : ' + str(prediction.columns))
                    _logger.error('Ground truth columns: ' + str(ground_truth.columns))
                    raise ValueError("Ground truth's column count and prediction's column count do not match")
                #     from runtime import ForkedPdb
                #     ForkedPdb().set_trace()

                if do_regression_mode:
                    # regression mode requires the targets to be float
                    for each_column in range(-target_amount, 0, 1):
                        prediction.iloc[:,each_column] = prediction.iloc[:,each_column].astype(float).copy()

                # updated 2019.4.12: d3m v2019.4.4 has new metric functions, so the call has to change accordingly
                ground_truth_d3m_index_column_index = ground_truth.columns.tolist().index("d3mIndex")
                prediction_d3m_index_column_index = prediction.columns.tolist().index("d3mIndex")

                for each_column in range(-target_amount, 0, 1):
                    result_metrics.append({
                        'column_name': ground_truth.columns[each_column],
                        'metric': metric_description['metric'],
                        'value': metric.score(truth=ground_truth.iloc[:,[ground_truth_d3m_index_column_index,each_column]],
                                              predictions=prediction.iloc[:,[prediction_d3m_index_column_index,each_column]])
                    })
            elif ground_truth is None:
                raise NotSupportedError("Metric calculation failed because ground truth is None!")
            elif prediction is not None:
                raise NotSupportedError("Metric calculation failed because prediction is None!")

        except Exception:
            traceback.print_exc()
            raise NotSupportedError('[ERROR] metric calculation failed')
    # END for loop

    if len(result_metrics) > target_amount:
        _logger.warning("[WARN] The number of training metrics is larger than the number of targets.")

    # return the training and test metrics
    return result_metrics
Example #21
    def _produce(
        self,
        *,
        left_df_full: container.DataFrame, # type: ignore
        left_df: container.DataFrame,  # type: ignore
        right_df: container.DataFrame,  # type: ignore
        join_types: typing.Sequence[str],
        left_col: typing.Sequence[int],
        right_col: typing.Sequence[int],
        accuracy: typing.Sequence[float],
        absolute_accuracy: typing.Sequence[bool]
    ) -> base.CallResult[Outputs]:

        # cycle through the columns to join the dataframes
        right_cols_to_drop = []
        new_left_cols = []
        new_right_cols = []
        for col_index in range(len(left_col)):
            # depending on the joining type, make a new dataframe that has columns we will want to merge on
            # keep track of which columns we will want to drop later on
            if len(self._STRING_JOIN_TYPES.intersection(join_types[col_index])) > 0:
                new_left_df = self._create_string_merge_cols(
                    left_df,
                    left_col[col_index],
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                )
                left_df[new_left_df.columns] = new_left_df
                right_name = "righty_string" + str(col_index)
                right_df.rename(
                    columns={right_col[col_index]: right_name}, inplace=True
                )
                new_left_cols += list(new_left_df.columns)
                new_right_cols.append(right_name)
            elif len(self._NUMERIC_JOIN_TYPES.intersection(join_types[col_index])) > 0:
                new_left_df = self._create_numeric_merge_cols(
                    left_df,
                    left_col[col_index],
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                    absolute_accuracy[col_index],
                )
                left_df[new_left_df.columns] = new_left_df
                right_name = "righty_numeric" + str(col_index)
                right_df.rename(
                    columns={right_col[col_index]: right_name}, inplace=True
                )
                new_left_cols += list(new_left_df.columns)
                new_right_cols.append(right_name)
            elif len(self._GEO_JOIN_TYPES.intersection(join_types[col_index])) > 0:
                new_left_df, new_right_df = self._create_geo_vector_merging_cols(
                    left_df,
                    left_col[col_index],
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                    absolute_accuracy[col_index],
                )
                left_df[new_left_df.columns] = new_left_df
                right_df[new_right_df.columns] = new_right_df
                new_left_cols += list(new_left_df.columns)
                new_right_cols += list(new_right_df.columns)
                right_cols_to_drop.append(right_col[col_index])
            elif len(self._VECTOR_JOIN_TYPES.intersection(join_types[col_index])) > 0:
                new_left_df, new_right_df = self._create_vector_merging_cols(
                    left_df,
                    left_col[col_index],
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                    absolute_accuracy[col_index],
                )
                left_df[new_left_df.columns] = new_left_df
                right_df[new_right_df.columns] = new_right_df
                new_left_cols += list(new_left_df.columns)
                new_right_cols += list(new_right_df.columns)
                right_cols_to_drop.append(right_col[col_index])
            elif len(self._DATETIME_JOIN_TYPES.intersection(join_types[col_index])) > 0:
                tolerance = self._compute_datetime_tolerance(left_df_full, left_col[col_index], right_df, right_col[col_index], accuracy[col_index])
                new_left_df, new_right_df = self._create_datetime_merge_cols(
                    left_df,
                    left_col[col_index],
                    right_df,
                    right_col[col_index],
                    tolerance,
                    col_index,
                )
                left_df[new_left_df.columns] = new_left_df
                right_df[new_right_df.columns] = new_right_df
                new_left_cols += list(new_left_df.columns)
                new_right_cols += list(new_right_df.columns)
                right_cols_to_drop.append(right_col[col_index])
            else:
                raise exceptions.InvalidArgumentValueError(
                    "join not surpported on type " + str(join_types[col_index])
                )

        if "d3mIndex" in right_df.columns:
            right_cols_to_drop.append("d3mIndex")
        right_df.drop(columns=right_cols_to_drop, inplace=True)

        joined = pd.merge(
            left_df,
            right_df,
            how=self.hyperparams["join_type"],
            left_on=new_left_cols,
            right_on=new_right_cols,
            suffixes=["_left", "_right"],
        )

        # don't want to keep columns that were created specifically for merging
        # also, inner merge keeps the right column we merge on, we want to remove it
        joined.drop(columns=new_left_cols + new_right_cols, inplace=True)

        return joined
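The final merge-then-drop pattern, reduced to plain pandas: join on the generated key columns, then discard them so only payload columns remain:

import pandas as pd

left = pd.DataFrame({"k1": [1, 2], "v": ["a", "b"]})
right = pd.DataFrame({"k2": [1, 3], "w": ["x", "y"]})
joined = pd.merge(left, right, how="inner", left_on=["k1"], right_on=["k2"])
print(joined.drop(columns=["k1", "k2"]))  # one row: v='a', w='x'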
Example #22
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        start = time.time()
        logger.debug(f"Producing {__name__}")

        cols = self._get_columns(inputs.metadata)
        # outputs = container.DataFrame(generate_metadata=False)
        outputs = [None] * inputs.shape[1]

        parsing_semantics = self.hyperparams["parsing_semantics"]

        def fromstring(x: str) -> np.ndarray:
            # if column isn't a string, we'll just pass it through assuming it doesn't need to be parsed
            if type(x) is not str:
                return x

            return np.fromstring(x, dtype=float, sep=",")

        for col_index in range(len(inputs.columns)):
            if col_index in cols:
                column_metadata = inputs.metadata.query(
                    (metadata_base.ALL_ELEMENTS, col_index)
                )
                semantic_types = column_metadata.get("semantic_types", [])
                desired_semantics = set(semantic_types).intersection(parsing_semantics)
                if desired_semantics:
                    if (
                        "https://metadata.datadrivendiscovery.org/types/FloatVector"
                        in desired_semantics
                    ):
                        outputs[col_index] = inputs.iloc[:, col_index].apply(
                            fromstring, convert_dtype=False
                        )
                        if outputs[col_index].shape[0] > 0:
                            inputs.metadata = inputs.metadata.update_column(
                                col_index,
                                {"structural_type": type(outputs[col_index][0])},
                            )
                    elif "http://schema.org/DateTime" in desired_semantics:
                        outputs[col_index] = inputs.iloc[:, col_index].apply(
                            utils.parse_datetime_to_float,
                            fuzzy=self.hyperparams["fuzzy_time_parsing"],
                            convert_dtype=False,
                        )
                        inputs.metadata = inputs.metadata.update_column(
                            col_index, {"structural_type": float}
                        )
                    elif (
                        "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                        in desired_semantics
                    ):
                        # if a categorical value is stored as a numeric string, convert it
                        if inputs[inputs.columns[col_index]][0].isnumeric():
                            outputs[col_index] = pd.to_numeric(
                                inputs.iloc[:, col_index],
                                errors=self.hyperparams["error_handling"],
                            )
                            if outputs[col_index].shape[0] > 0:
                                updated_type = type(outputs[col_index][0].item())
                                inputs.metadata = inputs.metadata.update_column(
                                    col_index, {"structural_type": updated_type}
                                )
                        else:
                            # if it's categorical but not numerical, ensure the string stays
                            outputs[col_index] = inputs.iloc[:, col_index]
                    else:
                        outputs[col_index] = pd.to_numeric(
                            inputs.iloc[:, col_index],
                            errors=self.hyperparams["error_handling"],
                        )
                        # Update the structural type to reflect the results of the to_numeric call. We can't rely
                        # on the semantic type because error coercion may result in a float due to the presence of NaN.
                        if outputs[col_index].shape[0] > 0:
                            updated_type = type(outputs[col_index][0].item())
                            inputs.metadata = inputs.metadata.update_column(
                                col_index, {"structural_type": updated_type}
                            )
                else:
                    # columns without specified semantics need to be concatenated
                    outputs[col_index] = inputs.iloc[:, col_index]
            else:
                # columns not specified still need to be concatenated
                outputs[col_index] = inputs.iloc[:, col_index]

        outputs = container.DataFrame(pd.concat(outputs, axis=1))
        outputs.metadata = inputs.metadata
        end = time.time()
        logger.debug(f"Produce {__name__} completed in {end - start} ms")

        return base.CallResult(outputs)
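`np.fromstring` in text mode (an explicit `sep`) parses a delimited string into a float vector:

import numpy as np

print(np.fromstring("1.5,2.0,3.25", dtype=float, sep=","))  # [1.5  2.   3.25]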
Example #23
# NOTE: the original snippet begins mid-script; the setup below is an assumed
# reconstruction (import path, sizes, and rank are illustrative) so that the
# example runs end to end.
import numpy as np

import high_rank_imputer  # assumed import path
from d3m.container import DataFrame

# generate a synthetic high-rank matrix as a union of low-rank blocks
m, r, n, n_blocks, missing_rate = 50, 3, 20, 4, 0.3
X_org = np.empty((m, 0))
for _ in range(n_blocks):
    A = np.random.randn(m, r)
    B = np.random.randn(r, n)
    X_org = np.append(X_org, np.dot(A, B), axis=1)

# mask a fraction of entries randomly
X_incomplete = X_org.copy()
m, n = X_org.shape
for i in range(n):
    idx = np.random.choice(m, int(np.round(m * missing_rate)), replace=False)
    X_incomplete[idx, i] = np.nan

# recover the missing entries
# hp= hrmc_sf.Hyperparams.defaults()
hp = high_rank_imputer.Hyperparams({
    'd': 0,
    'alpha': 1,
    'beta': 1,
    'tol': 1e-4,
    'maxiter': 500
})
# if d=0, d will be automatically estimated; otherwise (d>=1), the value of d will be applied
sf = high_rank_imputer.HighRankImputer(hyperparams=hp)
df_incomplete = DataFrame(X_incomplete.T)
# the missing entries in the matrix must be noted by NaN
df_recovered = sf.produce(inputs=df_incomplete).value

X_recovered = df_recovered.values.T

# compute the recovery error (relative mean squared error, within [0,1], the smaller the better)
RMSE = np.square(X_recovered - X_org).sum() / np.square(X_org).sum()
print("RMSE:", RMSE)
Example #24
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        # fallthrough if there's nothing to do
        if len(self._cols) == 0 or self._encoder is None:
            return base.CallResult(inputs)

        # map encoded cols to source column names
        feature_names = self._encoder.get_feature_names()
        encoded_cols_source = []
        # feature names are xA_YY where A is the source column index and YY is the value
        for name in feature_names:
            # take the first part of the name (xA) and remove the x
            encoded_feature_index = int(name.split("_")[0][1:])
            feature_index = self._cols[encoded_feature_index]
            encoded_cols_source.append(
                inputs.metadata.query((metadata_base.ALL_ELEMENTS, feature_index))[
                    "name"
                ]
            )

        # encode using the previously identified categorical columns
        input_cols = inputs.iloc[:, self._cols]
        result = self._encoder.transform(input_cols)

        # append the encoding columns and generate metadata
        outputs = inputs.copy()
        encoded_cols: container.DataFrame = container.DataFrame()

        for i in range(result.shape[1]):
            encoded_cols[f"__onehot_{str(i)}"] = result[:, i]
        encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

        for c in range(encoded_cols.shape[1]):
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float"
            )
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), self._attribute_semantic
            )
            col_dict = dict(
                encoded_cols.metadata.query((metadata_base.ALL_ELEMENTS, c))
            )
            col_dict["source_column"] = encoded_cols_source[c]
            encoded_cols.metadata = encoded_cols.metadata.update(
                (metadata_base.ALL_ELEMENTS, c), col_dict
            )

        outputs = outputs.append_columns(encoded_cols)

        # drop the source columns
        outputs = outputs.remove_columns(self._cols)

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
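The feature-name parsing relies on the scikit-learn one-hot naming convention `x<column index>_<value>`:

name = "x12_blue"  # illustrative feature name
encoded_feature_index = int(name.split("_")[0][1:])
print(encoded_feature_index)  # 12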
Example #25
 def _detect_text(cls, X: container.DataFrame, thresh: int = 8) -> bool:
     """ returns true if median entry has more than `thresh` tokens"""
     X = X[X.notnull()]
     n_toks = X.apply(lambda xx: len(str(xx).split(" "))).values
     return np.median(n_toks) >= thresh
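A quick illustration of the median-token heuristic:

import numpy as np
import pandas as pd

X = pd.Series(["deep learning is great", "a b c", None])
X = X[X.notnull()]
n_toks = X.apply(lambda xx: len(str(xx).split(" "))).values
print(np.median(n_toks) >= 8)  # False: the median entry has fewer than 8 tokens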
Example #26
 def _convert_lists(dataframe: container.DataFrame) -> container.DataFrame:
     # convert column contents to a numpy array of values, similar to what extract semantic types would do
     for index, row in dataframe.iterrows():
         row["bravo"] = container.ndarray([int(i) for i in row["bravo"].split(",")])
     return dataframe
Example #27
 def _encode_labels(self,
                    inputs: container.DataFrame) -> container.DataFrame:
     for col_idx, (label, col) in enumerate(inputs.items()):
         encodes = [self._labels[col_idx].get(label, 0) for label in col]
         inputs.iloc[:, col_idx] = encodes
     return inputs
Example #28
def combine_columns(
    inputs: container.DataFrame,
    column_indices: typing.Sequence[int],
    columns_list: typing.Sequence[container.DataFrame],
    *,
    return_result: str,
    add_index_columns: bool,
) -> container.DataFrame:
    """
    Method which appends existing columns, replaces them, or creates new result from them, based on
    ``return_result`` argument, which can be ``append``, ``replace``, or ``new``.

    ``add_index_columns`` controls if when creating a new result, primary index columns should be added
    if they are not already among columns.

    ``inputs`` is a DataFrame to which we are appending or in which we are replacing columns, or,
    if we are creating a new result, from which a primary index column can be taken.

    ``column_indices`` controls which columns in ``inputs`` were used to create ``columns_list``,
    and which columns should be replaced when replacing them.

    ``columns_list`` is a list of DataFrames representing all together new columns. The reason it is a list is
    to make it easier to operate per-column when preparing ``columns_list`` and not have to concat them all
    together unnecessarily.

    Top-level metadata in ``columns_list`` is ignored, except when creating new result.
    In that case top-level metadata from the first element in the list is used.

    When ``column_indices`` columns are being replaced with ``columns_list``, existing metadata in ``column_indices``
    columns is not preserved but replaced with metadata in ``columns_list``. Ideally, metadata for ``columns_list``
    has been constructed by copying source metadata from ``column_indices`` columns and modifying it as
    necessary to adapt it to new columns. But ``columns_list`` also can have completely new metadata, if this
    is more reasonable, but it should be understood that in this case when replacing ``column_indices``
    columns, any custom additional metadata on those columns will be lost.

    ``column_indices`` and ``columns_list`` do not have to match in number of columns. Columns are first
    replaced in order for matching indices and columns. If then there are more ``column_indices`` than
    ``columns_list``, additional ``column_indices`` columns are removed. If there are more ``columns_list`` than
    ``column_indices`` columns, then additional ``columns_list`` are inserted after the last replaced column.

    If ``column_indices`` is empty, then the replacing behavior is equivalent to appending.
    """

    if return_result == 'append':
        outputs = inputs
        for columns in columns_list:
            outputs = outputs.append_columns(columns)

    elif return_result == 'replace':
        if not column_indices:
            return combine_columns(inputs,
                                   column_indices,
                                   columns_list,
                                   return_result='append',
                                   add_index_columns=add_index_columns)

        # Compute the difference in "columns"
        to_be_added = list(
            numpy.setdiff1d(numpy.arange(len(inputs.columns)), column_indices))
        columns_replaced = 0
        if len(to_be_added) < len(column_indices):
            # More efficient to concatenate than replace one-by-one
            outputs = pandas.concat(columns_list, axis=1)
            outputs = container.DataFrame(data=outputs,
                                          generate_metadata=False)
            indices = range(columns_list[0].shape[1])
            outputs.metadata = inputs.metadata.select_columns(
                columns=list(indices))

            c = 0
            for columns in columns_list:
                columns_length = columns.shape[1]
                if c == 0:
                    outputs.metadata = outputs.metadata.replace_columns(
                        columns.metadata, list(indices))
                else:
                    outputs.metadata = outputs.metadata.append_columns(
                        columns.metadata)
                c += 1

            for col in to_be_added:
                insert_index = col.item()
                if insert_index > outputs.shape[1]:
                    insert_index = outputs.shape[1]
                outputs = outputs.insert_columns(
                    inputs.select_columns([col.item()]), insert_index)
            outputs.metadata = outputs.metadata.compact(['structural_type'])
        else:
            # We copy here and disable copying inside "replace_columns" to copy only once.
            # We have to copy because "replace_columns" is modifying data in-place.
            outputs = copy.copy(inputs)
            for columns in columns_list:
                columns_length = columns.shape[1]
                if columns_replaced < len(column_indices):
                    # It is OK if the slice of "column_indices" is shorter than "columns". Only those columns
                    # listed in the slice will be replaced; the rest are appended after the last replaced column.
                    outputs = outputs.replace_columns(
                        columns,
                        column_indices[columns_replaced:columns_replaced +
                                       columns_length],
                        copy=False)
                else:
                    # We insert the rest of the columns after the last column we replaced. We know that "column_indices"
                    # is non-empty and that the last item of "column_indices" points to the last column we replaced
                    # among those listed in "column_indices". We replaced more columns though, so we have to add the
                    # difference, and then add 1 to insert after the last column.
                    outputs = outputs.insert_columns(
                        columns, column_indices[-1] +
                        (columns_replaced - len(column_indices)) + 1)
                columns_replaced += columns_length

            if columns_replaced < len(column_indices):
                outputs = outputs.remove_columns(
                    column_indices[columns_replaced:len(column_indices)])
    elif return_result == 'new':
        if not any(columns.shape[1] for columns in columns_list):
            raise ValueError("No columns produced.")

        outputs = columns_list[0]
        for columns in columns_list[1:]:
            outputs = outputs.append_columns(columns)

        if add_index_columns:
            inputs_index_columns = inputs.metadata.get_index_columns()
            outputs_index_columns = outputs.metadata.get_index_columns()

            if inputs_index_columns and not outputs_index_columns:
                # Add index columns at the beginning.
                outputs = inputs.select_columns(
                    inputs_index_columns).append_columns(
                        outputs, use_right_metadata=True)

    else:
        raise exceptions.InvalidArgumentValueError(
            "\"return_result\" has an invalid value: {return_result}".format(
                return_result=return_result))

    return outputs
Example #29
from d3m.container import DataFrame
from pyglrm_d3m.huber_pca import HuberPCA

A = DataFrame([[1, 2, 3, 4], [2, 4, 6, 8], [4, 5, 6, 7]])

model = HuberPCA(hyperparams={'k': 2})  # create a Huber PCA model instance
model.set_training_data(inputs=A)
model.fit()

# get parameter
parameter = model.get_params()
print("Initial parameter (Y): {}".format(parameter['Y'].values))

# modify parameter
print(
    "Now we change the (0,0) entry of Y to 0, and set the modified Y as parameter of the Huber PCA class."
)
parameter['Y'].values[0, 0] = 0
model.set_params(params={'Y': parameter['Y']})

# check that the parameter has been modified
parameter = model.get_params()
print("Modified parameter (Y): {}".format(parameter['Y'].values))
Example #30
from pyglrm_d3m.huber_pca import HuberPCA
from d3m.container import DataFrame

A = DataFrame([[1, 2, 3, 4], [2, 4, 6, 8], [4, 5, 6, 7]],
              generate_metadata=True)

model = HuberPCA(hyperparams={'k': 2})  # create a Huber PCA model instance
model.set_training_data(inputs=A)
model.fit()
a_new = DataFrame([[6, 7, 8, 9]])  # initialize a new row to be tested
x = model.produce(inputs=a_new).value.values  # get the latent representation of a_new

print(x)