Пример #1
0
    def test_maybe_download_passes_params(self, downloader: Downloader,
                                          requests_mock, csv_data: bytes):
        """
        Download the same data as CSV and as JSON, each URL twice.

        For every URL the second call must return the very same object as the
        first one without issuing another HTTP request, and the parsed frame
        must match the reference frame read directly from ``csv_data``.
        """
        base_url = downloader._options.url
        csv_url = urljoin(base_url, "foobar/?format=csv")
        json_url = urljoin(base_url, "foobar/?format=json")
        expected = pd.read_csv(BytesIO(csv_data))

        # serialize the reference frame so the JSON endpoint serves equal data
        json_buffer = StringIO()
        expected.to_json(json_buffer)
        json_payload = bytes(json_buffer.getvalue(), encoding="utf-8")

        requests_mock.register_uri("GET", csv_url, content=csv_data)
        requests_mock.register_uri("GET", json_url, content=json_payload)

        first = downloader.maybe_download(csv_url, callback=pd.read_csv)
        second = downloader.maybe_download(csv_url, callback=pd.read_csv)

        assert first is second
        assert requests_mock.called_once
        for attr in ("index", "columns", "values"):
            np.testing.assert_array_equal(getattr(first, attr),
                                          getattr(expected, attr))

        first = downloader.maybe_download(json_url, callback=pd.read_json)
        second = downloader.maybe_download(json_url, callback=pd.read_json)

        assert first is second
        # one request for the CSV url, one for the JSON url
        assert len(requests_mock.request_history) == 2
        for attr in ("index", "columns", "values"):
            np.testing.assert_array_equal(getattr(first, attr),
                                          getattr(expected, attr))
Пример #2
0
    def test_maybe_download_wrong_callable(self, downloader: Downloader,
                                           requests_mock, csv_data: bytes):
        """Parsing the ``csv_data`` payload with ``pd.read_json`` must raise."""
        target = urljoin(downloader._options.url, "foobar")
        requests_mock.register_uri("GET", target, content=csv_data)

        expected_error = pytest.raises(ValueError,
                                       match=r"Expected object or value")
        with expected_error:
            downloader.maybe_download(target, callback=pd.read_json)
Пример #3
0
    def test_maybe_download_no_cache(self, downloader: Downloader,
                                     requests_mock, csv_data: bytes):
        """
        Clear the cache between two downloads of the same URL.

        The second call must issue a new request and return a distinct object
        with the same content as the first one.
        """
        target = urljoin(downloader._options.url, "foobar")
        requests_mock.register_uri("GET", target, content=csv_data)

        before = downloader.maybe_download(target, callback=pd.read_csv)
        downloader._options.cache.clear()
        after = downloader.maybe_download(target, callback=pd.read_csv)

        assert before is not after
        assert len(requests_mock.request_history) == 2
        for attr in ("index", "columns", "values"):
            np.testing.assert_array_equal(getattr(before, attr),
                                          getattr(after, attr))
Пример #4
0
    def test_maybe_download_is_not_final(self, downloader: Downloader,
                                         requests_mock, csv_data: bytes):
        """
        Pass only the endpoint together with ``is_final=False``.

        The mocked URL is the endpoint joined onto the downloader's base URL,
        so exactly one request against it is expected.
        """
        endpoint = "barbaz"
        expected = pd.read_csv(BytesIO(csv_data))
        full_url = urljoin(downloader._options.url, endpoint)
        requests_mock.register_uri("GET", full_url, content=csv_data)

        frame = downloader.maybe_download(endpoint,
                                          callback=pd.read_csv,
                                          is_final=False)

        assert requests_mock.called_once
        for attr in ("index", "columns", "values"):
            np.testing.assert_array_equal(getattr(frame, attr),
                                          getattr(expected, attr))
Пример #5
0
 def test_maybe_download_not_callable(self, downloader: Downloader):
     """A ``callback`` of ``None`` must be rejected with ``TypeError``."""
     not_callable = None
     with pytest.raises(TypeError):
         downloader.maybe_download("foo", callback=not_callable)
Пример #6
0
class OmnipathRequestABC(ABC, metaclass=OmnipathRequestMeta):
    """
    Base class for all :mod:`omnipath` requests.

    Subclasses have to implement :meth:`_post_process` and
    :meth:`_resource_filter`.
    """

    # Column names handled by :meth:`_convert_dtypes`: columns listed here are
    # converted to strings, booleans and categoricals, respectively.
    __string__ = frozenset({"uniprot", "genesymbol"})
    __logical__ = frozenset()
    __categorical__ = frozenset()

    # Parsers used as the download callback, selected by the requested format
    # in :meth:`_convert_params`.
    _json_reader = _error_handler(partial(pd.read_json, typ="frame"))
    # NOTE: ``squeeze`` is intentionally not passed. ``False`` was its default
    # and the keyword was removed from :func:`pandas.read_csv` in pandas 2.0,
    # where passing it raises a ``TypeError``.
    _tsv_reader = _error_handler(
        partial(pd.read_csv,
                sep="\t",
                header=0,
                low_memory=False))
    # Query definition; ``None`` here, presumably set by concrete subclasses.
    _query_type: Optional[QueryType] = None

    def __init__(self):
        """Create the :class:`Downloader` used by this request from ``options``."""
        downloader = Downloader(options)
        self._downloader = downloader

    @classmethod
    @d.dedent
    def resources(cls, **kwargs) -> Tuple[str]:
        """%(query_resources)s"""
        # NOTE(review): the docstring is a template key, presumably expanded
        # by ``d.dedent`` - it must stay exactly as it is.
        request = cls()
        return request._resources(**kwargs)

    @classmethod
    @d.dedent
    def params(cls) -> Dict[str, Any]:
        """%(query_params)s"""
        # NOTE(review): the docstring is a template key, presumably expanded
        # by ``d.dedent`` - it must stay exactly as it is.
        accepted = {}
        for query in cls._query_type.value:
            accepted[query.param] = query.valid
        return accepted

    @classmethod
    def _annotations(cls) -> Dict[str, type]:
        """Map every query parameter name to its type annotation."""
        return dict(
            (query.param, query.annotation) for query in cls._query_type.value)

    @classmethod
    def _docs(cls) -> Dict[str, Optional[str]]:
        """Map every query parameter name to its documentation, if any."""
        docs = {}
        for query in cls._query_type.value:
            docs[query.param] = query.doc
        return docs

    def _get(self, **kwargs) -> pd.DataFrame:
        """
        Prepare the parameters, download the data and post-process it.

        Parameters
        ----------
        **kwargs
            Raw query parameters.

        Returns
        -------
        :class:`pandas.DataFrame`
            The result of :meth:`_post_process`.
        """
        # parameter pipeline: every stage consumes the output of the previous
        params = self._modify_params(kwargs)
        params = self._inject_fields(params)
        params, reader = self._convert_params(params)
        params = self._validate_params(params)
        params = self._finalize_params(params)

        downloader = self._downloader
        frame = downloader.maybe_download(self._query_type.endpoint,
                                          params=params,
                                          callback=reader,
                                          is_final=False)

        if downloader._options.convert_dtypes:
            frame = self._convert_dtypes(frame)

        return self._post_process(frame)

    def _convert_params(
            self, params: Dict[str, Any]) -> Tuple[Dict[str, Any], Callable]:
        """
        Normalize the organism, format, license and password parameters.

        Both the singular and the plural spelling of a key are removed from
        ``params``; the singular one takes precedence. ``params`` is modified
        in place.

        Parameters
        ----------
        params
            The query parameters.

        Returns
        -------
        The modified ``params`` and the reader matching the requested format.
        """
        # organism
        plural = params.pop("organisms", None)
        organism = params.pop("organism", plural)
        if organism is not None:
            organism = Organism(organism)
            try:
                params[self._query_type("organism").param] = organism.code
            except ValueError:
                # presumably this query has no organism parameter
                pass

        # requested format, TSV unless stated otherwise
        plural = params.pop("formats", None)
        fmt = params.pop("format", plural)
        if fmt is None:
            fmt = Format.TSV
        fmt = Format(fmt)
        if fmt not in (Format.TSV, Format.JSON):
            logging.warning(
                f"Invalid `{Key.FORMAT.s}={fmt.s!r}`. Using `{Key.FORMAT.s}={Format.TSV.s!r}`"
            )
            fmt = Format.TSV
        if fmt == Format.TSV:
            callback = self._tsv_reader
        else:
            callback = self._json_reader
        try:
            params[self._query_type("format").param] = fmt.s
        except ValueError:
            pass

        # license, falling back to the one stored in the options
        fallback = self._downloader._options.license
        plural = params.pop("licenses", fallback)
        license_ = params.pop("license", plural)
        if license_ is not None:
            license_ = License(license_)
            try:
                params[self._query_type("license").param] = license_
            except ValueError:
                pass

        # password from the options, unless one has been passed explicitly
        password = self._downloader._options.password
        if password is not None:
            params.setdefault(Key.PASSWORD.s, password)

        return params, callback

    def _inject_fields(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Add the default fields of this query to ``params``, if there are any.

        Failures are never propagated: a missing default is silently skipped,
        anything else is logged as a warning.

        Parameters
        ----------
        params
            The query parameters, handed over to ``_inject_params``.

        Returns
        -------
        The same ``params`` object.
        """
        try:
            target = self._query_type(Key.FIELDS.value).param
            default = getattr(DEFAULT_FIELD, self._query_type.name).value
            _inject_params(params, key=target, value=default)
        except AttributeError:
            # no default field for this query
            pass
        except Exception as e:
            logging.warning(
                f"Unable to inject `{Key.FIELDS.value}` for `{self}`. Reason: `{e}`"
            )

        return params

    def _validate_params(
        self, params: Dict[str, Any]
    ) -> Dict[str, Optional[Union[str, Sequence[str]]]]:
        """
        Validate every passed parameter.

        Each key is looked up in ``self._query_type``; the returned object
        provides the final parameter name and is called on the value.
        """
        validated = {}
        for name, value in params.items():
            validator = self._query_type(name)
            validated[validator.param] = validator(value)
        return validated

    def _finalize_params(self, params: Dict[str, Any]) -> Dict[str, str]:
        """
        Convert all the parameters to strings.

        Parameters
        ----------
        params
            The parameters to convert.

        Returns
        -------
        :class:`dict`
            The converted parameters, sorted by key. Enum members are replaced
            by their value, booleans become ``'0'``/``'1'``, other iterables
            are sorted and comma-joined. ``None`` values are dropped, values of
            any other type are dropped with a warning.
        """
        # this is largely redundant
        res = {}
        for k, v in params.items():
            if isinstance(v, Enum):
                # must come first: members mixing in `str`/`int`/`float` would
                # otherwise match the branches below and, for numeric mixins,
                # be rendered as `Class.MEMBER` instead of their value
                res[k] = str(v.value)
            elif isinstance(v, str):
                res[k] = v
            elif isinstance(v, bool):
                # `bool` is a subclass of `int`, so it has to be checked before
                res[k] = str(int(v))
            elif isinstance(v, (int, float)):
                res[k] = str(v)
            elif isinstance(v, Iterable):
                res[k] = ",".join(sorted(v))
            elif v is not None:
                logging.warning(
                    f"Unable to process parameter `{k}={v}`. Ignoring")

        return dict(sorted(res.items(), key=itemgetter(0)))

    def _convert_dtypes(self, res: pd.DataFrame, **_) -> pd.DataFrame:
        """
        Automatically convert dtypes for this type of query.

        Columns are selected by name using the class-level ``__logical__``,
        ``__categorical__`` and ``__string__`` sets and are converted in this
        order. The conversion happens in place.

        Parameters
        ----------
        res
            The downloaded data.
        **_
            Ignored.

        Returns
        -------
        :class:`pandas.DataFrame`
            The same object as ``res``.

        Raises
        ------
        TypeError
            If ``res`` is not a :class:`pandas.DataFrame`.
        """
        def to_logical(col: pd.Series) -> pd.Series:
            # numbers are truthy when positive, everything else is compared
            # case-insensitively against a fixed list of "true" spellings
            if is_numeric_dtype(col):
                return col > 0
            return col.astype(str).str.lower().isin(
                ("y", "t", "yes", "true", "1"))

        def handle_logical(df: pd.DataFrame, columns: frozenset) -> None:
            cols = list(frozenset(df.columns) & columns)
            if cols:
                df[cols] = df[cols].apply(to_logical)

        def handle_categorical(df: pd.DataFrame, columns: frozenset) -> None:
            # index with a list, as in `handle_logical`: set-like objects are
            # not a supported column indexer
            cols = list(frozenset(df.columns) & columns)
            # floating point columns are left untouched
            cols = [
                col for col, dtype in zip(cols, df[cols].dtypes)
                if not is_float_dtype(dtype)
            ]
            if cols:
                df[cols] = df[cols].astype("category")

        def handle_string(df: pd.DataFrame, columns: frozenset) -> None:
            for col in frozenset(df.columns) & columns:
                # remember the missing values, `astype(str)` may stringify them
                mask = pd.isnull(df[col])
                df[col] = df[col].astype(str)
                df.loc[mask, col] = None

        if not isinstance(res, pd.DataFrame):
            raise TypeError(
                f"Expected the result to be of type `pandas.DataFrame`, found `{type(res).__name__}`."
            )

        handle_logical(res, self.__logical__)
        handle_categorical(res, self.__categorical__)
        handle_string(res, self.__string__)

        return res

    def _resources(self, **kwargs) -> Tuple[str]:
        """
        Return available resources for this type of query.

        A resource is kept if its queries contain the endpoint of this query
        and :meth:`_resource_filter` accepts the data stored for that endpoint.

        Parameters
        ----------
        **kwargs
            Keyword arguments used for filtering unwanted resources.

        Returns
        -------
        tuple
            Unique and sorted resources.
        """
        selected = []
        for name, info in self._downloader.resources.items():
            if self._query_type.endpoint not in info.get(Key.QUERIES.s, {}):
                continue
            endpoint_data = info[Key.QUERIES.s][self._query_type.endpoint]
            if self._resource_filter(endpoint_data, **kwargs):
                selected.append(name)

        return tuple(sorted(selected))

    def _modify_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Hook for adjusting the parameters before any other processing.

        This implementation returns ``params`` unchanged.

        Parameters
        ----------
        params
            The parameters to filter.

        Returns
        -------
        :class:`dict`
            The filtered parameters.
        """
        return params

    @abstractmethod
    def _post_process(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Post process the result, e.g. by adding extra columns.

        Parameters
        ----------
        df
            The result from :meth:`get`.

        Returns
        -------
        :class:`pandas.DataFrame`
            The maybe modified result.
        """
        return None

    @abstractmethod
    def _resource_filter(self, data: Mapping[str, Any], **kwargs) -> bool:
        """
        Filter out resources relevant to this query.

        Parameters
        ----------
        data
            Data which is used as a basis for the filtering.
        **kwargs
            Additional keyword arguments.

        Returns
        -------
        bool
            `True` if the resource should be included, otherwise `False`.
        """
        return None

    def __str__(self) -> str:
        """Return the class name wrapped in angle brackets."""
        class_name = self.__class__.__name__
        return f"<{class_name}>"

    def __repr__(self) -> str:
        """Return the same text as ``str(self)``."""
        text = str(self)
        return text