Example #1
    def __init__(self,
                 other: typing.Optional[typing.Dict[str, typing.Any]] = None,
                 **values: typing.Any) -> None:
        if other is None:
            other = {}

        values = dict(other, **values)

        params_keys = set(self.__params_items__.keys())  # type: ignore
        values_keys = set(values.keys())

        missing = params_keys - values_keys
        if missing:
            raise exceptions.InvalidArgumentValueError(
                "Not all parameters are specified: {missing}".format(
                    missing=missing))

        extra = values_keys - params_keys
        if extra:
            raise exceptions.InvalidArgumentValueError(
                "Additional parameters are specified: {extra}".format(
                    extra=extra))

        for name, value in values.items():
            value_type = self.__params_items__[name]  # type: ignore
            if not utils.is_instance(value, value_type):
                raise exceptions.InvalidArgumentTypeError(
                    "Value '{value}' for parameter '{name}' is not an instance of the type: {value_type}"
                    .format(value=value, name=name, value_type=value_type))

        super().__init__(values)
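
The same three checks (missing keys, extra keys, per-value type check) can be seen in a minimal standalone sketch. Everything below is hypothetical: a stand-in `Params` class with a hand-written `__params_items__` schema, and the built-in `ValueError`/`TypeError` in place of `d3m.exceptions`:

import typing

class Params:
    # Hypothetical schema: parameter names mapped to their expected types
    # (in the real code __params_items__ is built elsewhere).
    __params_items__ = {'alpha': float, 'beta': str}

    def __init__(self, other: typing.Optional[typing.Dict[str, typing.Any]] = None,
                 **values: typing.Any) -> None:
        values = dict(other or {}, **values)
        missing = set(self.__params_items__) - set(values)
        if missing:
            raise ValueError("Not all parameters are specified: {}".format(missing))
        extra = set(values) - set(self.__params_items__)
        if extra:
            raise ValueError("Additional parameters are specified: {}".format(extra))
        for name, value in values.items():
            if not isinstance(value, self.__params_items__[name]):
                raise TypeError("Value {!r} for parameter '{}' is not a {}".format(
                    value, name, self.__params_items__[name]))

Params(alpha=0.5, beta='x')          # OK
Params({'alpha': 0.5}, beta='x')     # OK: the dict and keyword values are merged
# Params(alpha=0.5)                  # ValueError: {'beta'} missing
# Params(alpha=0.5, beta='x', c=1)   # ValueError: {'c'} extra
# Params(alpha='no', beta='x')       # TypeError: wrong type for 'alpha'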
Example #2
    def _geo_fuzzy_match(match, choices, col, accuracy, is_absolute):
        # the accuracy is assumed to be expressed in meters
        if not is_absolute:
            raise exceptions.InvalidArgumentTypeError(
                "geo fuzzy match requires an absolute accuracy parameter that specifies the tolerance in meters"
            )

        # keep the choices that fall within the acceptable distance
        return choices[choices[col].map(lambda x: hs.haversine(match, x, Unit.METERS)) < accuracy]
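
A small usage sketch of the same filter, assuming the `haversine` package is imported as `hs` (as in the snippet) and a hypothetical `choices` frame whose `loc` column holds `(lat, lon)` tuples:

import pandas as pd
import haversine as hs
from haversine import Unit

choices = pd.DataFrame({
    'city': ['Portland', 'Salem', 'Seattle'],
    'loc': [(45.5152, -122.6784), (44.9429, -123.0351), (47.6062, -122.3321)],
})
match = (45.5231, -122.6765)  # a point in Portland

# keep rows within 100 km of the match (the tolerance is in meters)
within = choices[choices['loc'].map(
    lambda x: hs.haversine(match, x, unit=Unit.METERS)) < 100_000]
print(within['city'].tolist())  # ['Portland', 'Salem']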
Example #3
 def _convert_value(
     self, value: typing.Any
 ) -> typing.Union[np.ndarray, typing.List, typing.Any]:
     if isinstance(value, container.ndarray):
         return value.view(np.ndarray)
     elif isinstance(value, container.List):
         return [self._convert_value(v) for v in value]
     elif isinstance(value, container.DataFrame):
         return value.values
     else:
         raise exceptions.InvalidArgumentTypeError(
             'Input value must be an instance of `container.ndarray`, `container.List`, or `container.DataFrame`.'
         )
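
The same type-dispatch recursion works with plain stand-ins; a minimal sketch using `np.ndarray`, `list`, and `pd.DataFrame` in place of the d3m `container` types:

import typing
import numpy as np
import pandas as pd

def convert_value(value: typing.Any) -> typing.Any:
    if isinstance(value, np.ndarray):
        return value.view(np.ndarray)              # strip any subclass, keep the data
    elif isinstance(value, list):
        return [convert_value(v) for v in value]   # convert elements recursively
    elif isinstance(value, pd.DataFrame):
        return value.values                        # plain 2-D ndarray of the frame's data
    raise TypeError('Unsupported input type: {}'.format(type(value)))

nested = [np.arange(3), pd.DataFrame({'a': [1, 2]})]
print(convert_value(nested))  # [array([0, 1, 2]), array([[1], [2]])]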
Example #4
def select_rows(
    resource: container_pandas.DataFrame, row_indices_to_keep: typing.Sequence[int]
) -> container_pandas.DataFrame:
    if not isinstance(resource, container_pandas.DataFrame):
        raise exceptions.InvalidArgumentTypeError(
            "Only DataFrame resources can have rows selected, not '{type}'.".format(
                type=type(resource)
            )
        )

    row_indices = sorted(row_indices_to_keep)
    resource = resource.iloc[row_indices, :].reset_index(drop=True)

    # TODO: Expose this as a general metadata method.
    #       In that case this has to be done recursively over all nested ALL_ELEMENTS.
    #       Here we are operating at resource level so we have to iterate only over first
    #       ALL_ELEMENTS and resource's element itself.

    # Change the metadata. Update the number of rows in the split.
    # This makes a copy so that we can modify metadata in-place.
    resource.metadata = resource.metadata.update(
        (),
        {
            "dimension": {
                "length": len(row_indices),
            },
        },
    )

    # Remove all rows not in this split and reorder those which are.
    for element_metadata_entry in [
        resource.metadata._current_metadata,
    ]:
        if element_metadata_entry is None:
            continue

        elements = element_metadata_entry.elements
        new_elements_evolver = utils.EMPTY_PMAP.evolver()
        for i, row_index in enumerate(row_indices):
            if row_index in elements:
                new_elements_evolver.set(i, elements[row_index])
        element_metadata_entry.elements = new_elements_evolver.persistent()
        element_metadata_entry.is_elements_empty = not element_metadata_entry.elements
        element_metadata_entry.update_is_empty()

    return resource
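
A possible usage sketch, assuming d3m is installed and the container DataFrame is built with `generate_metadata=True` so that `select_rows` has metadata to update:

from d3m.container import pandas as container_pandas

resource = container_pandas.DataFrame({'a': [10, 20, 30, 40]}, generate_metadata=True)

# request rows 3 and 1; select_rows sorts the indices, so original row order is kept
subset = select_rows(resource, [3, 1])
print(subset['a'].tolist())                              # [20, 40]
print(subset.metadata.query(())['dimension']['length'])  # 2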
Example #5
    def produce(
        self,
        *,
        left: Inputs,  # type: ignore
        right: Inputs,  # type: ignore
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[Outputs]:

        # attempt to extract the main table
        try:
            left_resource_id, left_df = d3m_base_utils.get_tabular_resource(left, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in left dataset"
            ) from error

        try:
            right_resource_id, right_df = d3m_base_utils.get_tabular_resource(
                right, None
            )
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in right dataset"
            ) from error

        accuracy = self.hyperparams["accuracy"]
        absolute_accuracy = self.hyperparams["absolute_accuracy"]

        # hyperparams may be parsed as tuples, and floats may arrive as ints
        # if a round number was passed in, so normalize both values
        if isinstance(accuracy, collections.abc.Iterable):
            accuracy = [float(a) for a in accuracy]
        else:
            accuracy = float(accuracy)
        if isinstance(absolute_accuracy, collections.abc.Iterable):
            absolute_accuracy = list(absolute_accuracy)

        if type(accuracy) == float and not type(absolute_accuracy) == bool:
            raise exceptions.InvalidArgumentValueError(
                "only 1 value of accuracy provided, but multiple values for absolute accuracy provided"
            )
        if (not type(accuracy) == float) and type(absolute_accuracy) == bool:
            raise exceptions.InvalidArgumentValueError(
                "only 1 for absolute accuracy provided, but multiple values of accuracy provided"
            )
        if type(accuracy) == float and not absolute_accuracy:
            if accuracy <= 0.0 or accuracy > 1.0:
                raise exceptions.InvalidArgumentValueError(
                    "accuracy of " + str(accuracy) + " is out of range"
                )
        elif type(accuracy) == list and type(absolute_accuracy) == list:
            if not len(accuracy) == len(absolute_accuracy):
                raise exceptions.InvalidArgumentValueError(
                    "the count of accuracy hyperparams does not match the count of absolute_accuracy hyperparams"
                )
            for i in range(len(accuracy)):
                if (accuracy[i] <= 0.0 or accuracy[i] > 1.0) and not absolute_accuracy[i]:
                    raise exceptions.InvalidArgumentValueError(
                        "accuracy of " + str(acc) + " is out of range"
                    )

        left_col = self.hyperparams["left_col"]
        right_col = self.hyperparams["right_col"]

        if type(left_col) != type(right_col) or (
            type(left_col) == list
            and len(left_col) != len(right_col)
            and type(accuracy) != list
            and len(accuracy) != len(left_col)
        ):
            raise exceptions.InvalidArgumentTypeError(
                "both left_col and right_col need to have same data type and if they are lists, the same list lengths"
            )
        if type(left_col) == str:
            left_col = [left_col]
            right_col = [right_col]
            accuracy = [accuracy]
            absolute_accuracy = [absolute_accuracy]

        join_types = [
            self._get_join_semantic_type(
                left,
                left_resource_id,
                left_col[i],
                right,
                right_resource_id,
                right_col[i],
            )
            for i in range(len(left_col))
        ]

        num_splits = 32
        joined_split = [None for i in range(num_splits)]
        left_df_split = np.array_split(left_df, num_splits)
        jobs = [delayed(self._produce_threaded)(
            index=i,
            left_df_full=left_df,
            left_dfs=left_df_split,
            right_df=right_df,
            join_types=join_types,
            left_col=left_col,
            right_col=right_col,
            accuracy=accuracy,
            absolute_accuracy=absolute_accuracy,
        ) for i in range(num_splits)]
        joined_data = Parallel(n_jobs=self.hyperparams["n_jobs"], backend="loky", verbose=10)(jobs)

        # joined data needs to maintain order to mimic the result of a non-split join
        for i, d in joined_data:
            joined_split[i] = d
        joined = pd.concat(joined_split, ignore_index=True)

        # create a new dataset to hold the joined data
        resource_map = {}
        float_vector_columns = {}
        for resource_id, resource in left.items():  # type: ignore
            if resource_id == left_resource_id:
                for column in joined.columns:
                    # work around a bug in container.Dataset: it mishandles vector
                    # columns, so stash them and restore them after creation
                    if type(joined[column].iloc[0]) == np.ndarray:
                        float_vector_columns[column] = joined[column]
                        joined[column] = np.nan
                resource_map[resource_id] = joined
            else:
                resource_map[resource_id] = resource

        # Generate metadata for the dataset using only the first row of the resource for speed -
        # metadata generation runs over each cell in the dataframe, but we only care about column
        # level generation.  Once that's done, set the actual dataframe value.
        result_dataset = container.Dataset(
            {k: v.head(1) for k, v in resource_map.items()}, generate_metadata=True
        )
        for k, v in resource_map.items():
            result_dataset[k] = v
            result_dataset.metadata = result_dataset.metadata.update(
                (k,), {"dimension": {"length": v.shape[0]}}
            )

        for key in float_vector_columns.keys():
            df = result_dataset[left_resource_id]
            df[key] = float_vector_columns[key]
            float_vec_loc = df.columns.get_loc(key)
            float_vec_col_indices = df.metadata.list_columns_with_semantic_types(
                ("https://metadata.datadrivendiscovery.org/types/FloatVector",)
            )
            if float_vec_loc not in float_vec_col_indices:
                df.metadata = df.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, float_vec_loc),
                    "https://metadata.datadrivendiscovery.org/types/FloatVector",
                )

        return base.CallResult(result_dataset)
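
The split-process-reassemble pattern used above (split with `np.array_split`, fan the chunks out with joblib, then restore the original order before `pd.concat`) in a standalone sketch with a hypothetical `double` worker:

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def double(index, chunk):
    # return the chunk's position so results can be re-ordered after the parallel run
    return index, chunk.assign(doubled=chunk['x'] * 2)

df = pd.DataFrame({'x': range(100)})
num_splits = 8
chunks = np.array_split(df, num_splits)

results = Parallel(n_jobs=4, backend='loky')(
    delayed(double)(i, chunk) for i, chunk in enumerate(chunks)
)

# joblib preserves submission order, but indexing explicitly mirrors the code above
ordered = [None] * num_splits
for i, part in results:
    ordered[i] = part
combined = pd.concat(ordered, ignore_index=True)
print(combined['doubled'].iloc[-1])  # 198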
Example #6
def register_primitive(primitive_path: str,
                       primitive: typing.Type[base.PrimitiveBase]) -> None:
    """
    Registers a primitive under ``d3m.primitives`` namespace.

    This is useful for registering primitives that are not necessarily installed
    on the system or that are generated at runtime. It is also useful for testing purposes.

    ``primitive_path`` has to start with ``d3m.primitives``.

    Parameters
    ----------
    primitive_path:
        A primitive path to register a primitive under.
    primitive:
        A primitive class to register.
    """

    if not primitive_path:
        raise exceptions.InvalidArgumentValueError(
            "Path under which to register a primitive is required.")

    if not primitive_path.startswith('d3m.primitives.'):
        raise exceptions.InvalidArgumentValueError(
            "Path under which to register a primitive does not start with \"d3m.primitives\"."
        )

    if not inspect.isclass(primitive):
        raise exceptions.InvalidArgumentTypeError(
            "Primitive to register has to be a class.")

    if not issubclass(primitive, base.PrimitiveBase):
        raise exceptions.InvalidArgumentTypeError(
            "Primitive to register is not a subclass of PrimitiveBase.")

    if primitive.metadata.query()['python_path'] != primitive_path:
        raise exceptions.InvalidArgumentValueError(
            "Primitive's \"python_path\" in metadata does not match the path under which to register it: {python_path} vs. {primitive_path}"
            .format(
                python_path=primitive.metadata.query()['python_path'],
                primitive_path=primitive_path,
            ))

    modules_path, name = primitive_path.rsplit('.', 1)
    # We remove "d3m.primitives" from the list of modules.
    modules = modules_path.split('.')[2:]

    if 'd3m.primitives' not in sys.modules:
        import d3m.primitives  # type: ignore

    # Create any modules which do not yet exist.
    current_path = 'd3m.primitives'
    for module_name in modules:
        module_path = current_path + '.' + module_name

        if module_path not in sys.modules:
            try:
                importlib.import_module(module_path)
            except ModuleNotFoundError:
                # This can happen if this module is not listed in any of entry points. But we want to allow
                # registering primitives also outside of existing entry points, so we create a module here.

                # Because we just could not load the module, we know that if the attribute exists,
                # it has to be something else, which we do not want to clobber.
                if hasattr(sys.modules[current_path], module_name):
                    raise ValueError(
                        "'{module_path}' is already defined.".format(
                            module_path=module_path))

                module_spec = importlib.machinery.ModuleSpec(
                    module_path, namespace.Loader(), is_package=True)
                module = importlib.util.module_from_spec(module_spec)
                module_spec.loader.exec_module(module)

                sys.modules[module_path] = module
                setattr(sys.modules[current_path], module_name, module)

        current_path = module_path

    if hasattr(sys.modules[current_path], name):
        existing_value = getattr(sys.modules[current_path], name)
        # Registering the same primitive twice is a no-op.
        if existing_value is primitive:
            return

        # Maybe we are just registering this primitive. But if not...
        if existing_value is not _SENTINEL:
            raise ValueError(
                "'{module}.{name}' is already defined as '{existing_value}'.".
                format(module=current_path,
                       name=name,
                       existing_value=existing_value))

    setattr(sys.modules[current_path], name, primitive)
    _loaded_primitives.add(primitive)
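
The dynamic module creation at the heart of this function can be reproduced standalone; a sketch with a hypothetical module name, using a plain spec (loader ``None``) in place of d3m's `namespace.Loader`:

import importlib.machinery
import importlib.util
import sys

# Create an empty package module at runtime, the same way register_primitive
# fills in d3m.primitives submodules that no entry point provides.
spec = importlib.machinery.ModuleSpec('demo_namespace', None, is_package=True)
module = importlib.util.module_from_spec(spec)
sys.modules['demo_namespace'] = module

import demo_namespace  # now importable like any installed package
setattr(demo_namespace, 'answer', 42)
print(demo_namespace.answer)  # 42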