示例#1
0
文件: base.py 项目: pykeen/pykeen
    def _load_helper(
        self,
        relative_path: pathlib.PurePath,
        entity_to_id: Optional[Mapping[str, Any]] = None,
        relation_to_id: Optional[Mapping[str, Any]] = None,
    ) -> TriplesFactory:
        """Build a triples factory from one CSV member of the dataset's zip archive.

        The archive at ``self.path`` is downloaded from ``self.url`` first when it
        is not present on disk.

        :param relative_path: Location of the CSV member inside the archive.
        :param entity_to_id: Optional entity mapping forwarded to
            :meth:`TriplesFactory.from_labeled_triples`.
        :param relation_to_id: Optional relation mapping forwarded to
            :meth:`TriplesFactory.from_labeled_triples`.
        :returns: The factory created from the head, relation and tail columns of the member.
        :raises ValueError: If the archive is missing and no URL is configured.
        """
        archive_missing = not self.path.is_file()
        if archive_missing and self.url is None:
            raise ValueError("url should be set")
        if archive_missing:
            logger.info("downloading data from %s to %s", self.url, self.path)
            download(url=self.url, path=self.path)

        # Member names inside a zip archive use POSIX separators on every platform,
        # hence ``as_posix()`` instead of ``str()``.
        with zipfile.ZipFile(file=self.path) as archive, archive.open(relative_path.as_posix()) as member:
            logger.debug("loading %s", relative_path)
            wanted_columns = [self.head_column, self.relation_column, self.tail_column]
            frame = pd.read_csv(
                member,
                usecols=wanted_columns,
                header=self.header,
                sep=self.sep,
            )
            factory_kwargs = dict(
                triples=frame.values,
                create_inverse_triples=self.create_inverse_triples,
                metadata={"path": relative_path},
                entity_to_id=entity_to_id,
                relation_to_id=relation_to_id,
            )
            return TriplesFactory.from_labeled_triples(**factory_kwargs)
示例#2
0
    def _load(self) -> None:
        """Fetch the OpenEA archive if required, read the relation triples, and split them.

        The resulting factories are stored on ``self._training``, ``self._testing``
        and ``self._validation``.
        """
        archive_path = self.cache_root.joinpath("OpenEA_dataset_v2.0.zip")

        # Download when the archive is absent, or whenever ``self.force`` is set.
        needs_download = not archive_path.is_file() or self.force
        if needs_download:
            logger.info(f"Downloading file from Dropbox (Link: {self.__class__.DROPBOX_LINK})")
            download(
                url=self.__class__.DROPBOX_LINK,
                path=archive_path,
                hexdigests={"sha512": self.SHA512},
            )

        # Options for reading the headerless, tab-separated member as strings.
        csv_options = dict(
            header=None,
            names=[LABEL_HEAD, LABEL_RELATION, LABEL_TAIL],
            sep="\t",
            encoding="utf8",
            dtype=str,
        )
        frame = read_zipfile_csv(
            path=archive_path,
            inner_path=str(self._relative_path_relations),
            **csv_options,
        )

        # One factory over all triples, which is then split into three parts.
        full_factory = TriplesFactory.from_labeled_triples(
            triples=frame.values,
            create_inverse_triples=self.create_inverse_triples,
            metadata={"path": archive_path},
        )
        parts = full_factory.split(ratios=self.ratios, random_state=self.random_state)
        self._training, self._testing, self._validation = cast(
            Tuple[TriplesFactory, TriplesFactory, TriplesFactory],
            parts,
        )
        logger.info("[%s] done splitting data from %s", self.__class__.__name__, archive_path)
示例#3
0
文件: base.py 项目: sednanref/pykeen
    def _load(self) -> None:
        """Download and unpack the dataset if needed, then load and split its triples.

        The archive returned by ``self._get_path()`` is downloaded from ``self.url``
        when it is not on disk. The member ``self._relative_path`` is extracted into
        ``self.cache_root`` when the extracted file is missing. The extracted CSV is
        read with ``self.delimiter`` as separator, turned into a triples factory and
        split into ``self._training``, ``self._testing`` and ``self._validation``.
        """
        # Resolve the archive location once instead of on every use.
        archive_path = self._get_path()
        if not archive_path.is_file():
            download(self.url, archive_path)  # noqa:S310

        _actual_path = self.cache_root.joinpath(self._relative_path)
        if not _actual_path.is_file():
            # Unpacking on a cache miss is a routine step, not a failure, so it is
            # reported at INFO level rather than ERROR.
            logger.info(
                '[%s] untaring from %s (%s) to %s',
                self.__class__.__name__,
                archive_path,
                self._relative_path,
                _actual_path,
            )
            with tarfile.open(archive_path) as tar_file:
                # The member name is handed to tarfile as a plain string, not a path object.
                tar_file.extract(str(self._relative_path), self.cache_root)

        df = pd.read_csv(_actual_path, sep=self.delimiter)
        tf = TriplesFactory.from_labeled_triples(
            triples=df.values,
            create_inverse_triples=self.create_inverse_triples,
            # The metadata records the archive path, not the extracted file.
            metadata={'path': archive_path},
        )
        self._training, self._testing, self._validation = cast(
            Tuple[TriplesFactory, TriplesFactory, TriplesFactory],
            tf.split(
                ratios=self.ratios,
                random_state=self.random_state,
            ),
        )
        logger.info('[%s] done splitting data from %s',
                    self.__class__.__name__, archive_path)
示例#4
0
文件: base.py 项目: pykeen/pykeen
    def __init__(
        self,
        training_url: str,
        testing_url: str,
        validation_url: str,
        cache_root: Optional[str] = None,
        force: bool = False,
        eager: bool = False,
        create_inverse_triples: bool = False,
        load_triples_kwargs: Optional[Mapping[str, Any]] = None,
        download_kwargs: Optional[Mapping[str, Any]] = None,
    ):
        """Initialize the dataset from three remote files.

        Each URL is downloaded into the cache directory (unless the file is already
        there and ``force`` is false) and the resulting local paths are handed to
        the parent initializer.

        :param training_url: The URL of the training file
        :param testing_url: The URL of the testing file
        :param validation_url: The URL of the validation file
        :param cache_root:
            An optional directory to store the extracted files. If none is given, the default PyKEEN directory is
            used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.data/pykeen``.
        :param force: If true, redownload any cached files
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
            and ultimately through to :func:`pykeen.triples.utils.load_triples`.
        :param download_kwargs: Keyword arguments to pass to :func:`pystow.utils.download`.
            ``backend`` is set to ``"urllib"`` unless the caller provides it.
        """
        self.cache_root = self._help_cache(cache_root)

        self.training_url = training_url
        self.testing_url = testing_url
        self.validation_url = validation_url

        # The local file name of each split is derived from its URL.
        urls = (self.training_url, self.testing_url, self.validation_url)
        training_path, testing_path, validation_path = (
            self.cache_root.joinpath(name_from_url(url)) for url in urls
        )

        # Work on a private copy so the caller's mapping is never modified.
        options = dict(download_kwargs) if download_kwargs is not None else {}
        options.setdefault("backend", "urllib")

        for url, path in zip(urls, (training_path, testing_path, validation_path)):
            if not force and path.is_file():
                # Already cached and no refresh requested.
                continue
            download(url, path, **options)

        super().__init__(
            training_path=training_path,
            testing_path=testing_path,
            validation_path=validation_path,
            eager=eager,
            create_inverse_triples=create_inverse_triples,
            load_triples_kwargs=load_triples_kwargs,
        )
示例#5
0
文件: base.py 项目: pykeen/pykeen
    def _get_df(self) -> pd.DataFrame:
        """Read the dataset's CSV member straight out of its zip archive.

        The archive is downloaded from ``self.url`` first when it is not on disk.

        :returns: The member ``self._relative_path`` parsed with ``self.delimiter`` as separator.
        """
        archive_path = self._get_path()
        if not archive_path.is_file():
            download(self.url, self._get_path())  # noqa:S310

        # The member is addressed by its POSIX-style name inside the archive.
        with zipfile.ZipFile(archive_path) as archive, archive.open(self._relative_path.as_posix()) as member:
            return pd.read_csv(member, sep=self.delimiter)
示例#6
0
文件: base.py 项目: wuxiaoxue/pykeen
    def __init__(
        self,
        training_url: str,
        testing_url: str,
        validation_url: str,
        cache_root: Optional[str] = None,
        stream: bool = True,
        force: bool = False,
        eager: bool = False,
        create_inverse_triples: bool = False,
        load_triples_kwargs: Optional[Mapping[str, Any]] = None,
    ):
        """Initialize the dataset from three remote files.

        Each URL is downloaded into the cache directory (unless the file already
        exists and ``force`` is false) and the resulting local paths are handed to
        the parent initializer.

        :param training_url: The URL of the training file
        :param testing_url: The URL of the testing file
        :param validation_url: The URL of the validation file
        :param cache_root:
            An optional directory to store the extracted files. If none is given, the default PyKEEN directory is
            used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to ``~/.pykeen``.
        :param stream: Forwarded as the ``stream`` keyword to the download call, which always uses the
            ``requests`` backend.
        :param force: If true, redownload any cached files
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
            and ultimately through to :func:`pykeen.triples.utils.load_triples`.
        """
        self.cache_root = self._help_cache(cache_root)

        self.training_url = training_url
        self.testing_url = testing_url
        self.validation_url = validation_url

        # The local file name of each split is derived from its URL.
        sources = (self.training_url, self.testing_url, self.validation_url)
        training_path, testing_path, validation_path = (
            os.path.join(self.cache_root, name_from_url(source)) for source in sources
        )

        for url, path in zip(sources, (training_path, testing_path, validation_path)):
            # Fetch when a refresh is requested or nothing is cached yet.
            if force or not os.path.exists(path):
                download(url, path, stream=stream, backend='requests')

        super().__init__(
            training_path=training_path,
            testing_path=testing_path,
            validation_path=validation_path,
            eager=eager,
            create_inverse_triples=create_inverse_triples,
            load_triples_kwargs=load_triples_kwargs,
        )
示例#7
0
 def test_hash_success(self):
     """Check that downloading with ``self.expected_md5`` as the MD5 hexdigest raises no error."""
     self.assertFalse(self.path.exists())
     digests = {"md5": self.expected_md5}
     download(url=TEST_TXT.as_uri(), path=self.path, hexdigests=digests)
示例#8
0
 def test_hash_error(self):
     """Check that a mismatching MD5 hexdigest makes the download raise ``HexDigestError``."""
     self.assertFalse(self.path.exists())
     wrong_digests = {"md5": self.mismatching_md5_hexdigest}
     with self.assertRaises(HexDigestError):
         download(url=TEST_TXT.as_uri(), path=self.path, hexdigests=wrong_digests)
示例#9
0
 def test_hash_remote_error(self):
     """Check that a strict check against ``TEST_TXT_WRONG_MD5`` raises ``HexDigestError``."""
     self.assertFalse(self.path.exists())
     remote_digests = {"md5": TEST_TXT_WRONG_MD5.as_uri()}
     with self.assertRaises(HexDigestError):
         download(
             url=TEST_TXT.as_uri(),
             path=self.path,
             hexdigests_remote=remote_digests,
             hexdigests_strict=True,
         )
示例#10
0
 def test_hash_remote_verbose_success(self):
     """Check a non-strict remote MD5 check against ``TEST_TXT_VERBOSE_MD5``.

     The download must not raise and the target file must exist afterwards.
     """
     self.assertFalse(self.path.exists())
     remote_digests = {"md5": TEST_TXT_VERBOSE_MD5.as_uri()}
     download(
         url=TEST_TXT.as_uri(),
         path=self.path,
         hexdigests_remote=remote_digests,
         hexdigests_strict=False,
     )
     self.assertTrue(self.path.exists())
示例#11
0
 def test_hash_remote_verbose_failure(self):
     """Check that a strict remote MD5 check against ``TEST_TXT_VERBOSE_MD5`` raises ``HexDigestError``."""
     self.assertFalse(self.path.exists())
     remote_digests = {"md5": TEST_TXT_VERBOSE_MD5.as_uri()}
     with self.assertRaises(HexDigestError):
         download(
             url=TEST_TXT.as_uri(),
             path=self.path,
             hexdigests_remote=remote_digests,
             hexdigests_strict=True,
         )
示例#12
0
文件: base.py 项目: pykeen/pykeen
    def _get_df(self) -> pd.DataFrame:
        """Load the dataset file into a data frame, downloading it first if it is absent.

        The file is parsed with ``self.read_csv_kwargs``. When those contain
        ``usecols``, the frame is re-indexed so its columns follow that order.

        :returns: The parsed data frame.
        """
        if not self._get_path().is_file():
            logger.info("downloading data from %s to %s", self.url,
                        self._get_path())
            download(url=self.url, path=self._get_path())  # noqa:S310
        frame = pd.read_csv(self._get_path(), **self.read_csv_kwargs)

        columns = self.read_csv_kwargs.get("usecols")
        if columns is None:
            return frame

        # ``read_csv`` ignores the element order of ``usecols``, so the requested
        # order is restored by selecting the columns explicitly.
        logger.info("reordering columns: %s", columns)
        return frame[columns]
示例#13
0
文件: base.py 项目: wuxiaoxue/pykeen
    def _get_df(self) -> pd.DataFrame:
        """Load the dataset file into a data frame, downloading it first if it is absent.

        The file is parsed with ``self.read_csv_kwargs``. When those contain
        ``usecols``, the frame is re-indexed so its columns follow that order.

        :returns: The parsed data frame.
        """
        already_cached = os.path.exists(self._get_path())
        if not already_cached:
            logger.info('downloading data from %s to %s', self.url,
                        self._get_path())
            download(url=self.url, path=self._get_path())  # noqa:S310
        frame = pd.read_csv(self._get_path(), **self.read_csv_kwargs)

        columns = self.read_csv_kwargs.get('usecols')
        if columns is None:
            return frame

        # ``read_csv`` ignores the element order of ``usecols``, so the requested
        # order is restored by selecting the columns explicitly.
        logger.info('reordering columns: %s', columns)
        return frame[columns]
示例#14
0
    def test_force(self):
        """Check that a forced download over a pre-existing file does not raise.

        The target is pre-filled with unrelated text. With ``force=True`` the
        existing content is expected to be replaced rather than hash-checked.
        """
        self.path.write_text("test file content")
        self.assertTrue(self.path.exists())

        digests = {"md5": self.expected_md5}
        download(
            url=TEST_TXT.as_uri(),
            path=self.path,
            hexdigests=digests,
            force=True,
        )
示例#15
0
    def test_override_hash_error(self):
        """Check that an unforced download over a pre-existing file raises ``HexDigestError``.

        The target is pre-filled with unrelated text and checked against
        ``self.expected_md5`` with ``force=False``.
        """
        self.path.write_text("test file content")
        self.assertTrue(self.path.exists())

        digests = {"md5": self.expected_md5}
        with self.assertRaises(HexDigestError):
            download(
                url=TEST_TXT.as_uri(),
                path=self.path,
                hexdigests=digests,
                force=False,
            )
示例#16
0
    def test_override_hash_remote_error(self):
        """Check that an unforced, strict remote check over a pre-existing file raises ``HexDigestError``.

        The target is pre-filled with unrelated text and checked against the
        remote reference ``TEST_TXT_MD5`` with ``force=False``.
        """
        self.path.write_text("test file content")
        self.assertTrue(self.path.exists())

        remote_digests = {"md5": TEST_TXT_MD5.as_uri()}
        with self.assertRaises(HexDigestError):
            download(
                url=TEST_TXT.as_uri(),
                path=self.path,
                hexdigests_remote=remote_digests,
                hexdigests_strict=True,
                force=False,
            )
示例#17
0
文件: base.py 项目: pykeen/pykeen
    def _get_df(self) -> pd.DataFrame:
        """Download and unpack the dataset if needed, then read it into a data frame.

        The archive returned by ``self._get_path()`` is downloaded from ``self.url``
        when it is not on disk. The member ``self._relative_path`` is extracted into
        ``self.cache_root`` when the extracted file is missing.

        :returns: The extracted CSV parsed with ``self.delimiter`` as separator.
        """
        # Resolve the archive location once instead of on every use.
        archive_path = self._get_path()
        if not archive_path.is_file():
            download(self.url, archive_path)  # noqa:S310

        _actual_path = self.cache_root.joinpath(self._relative_path)
        if not _actual_path.is_file():
            # Unpacking on a cache miss is a routine step, not a failure, so it is
            # reported at INFO level rather than ERROR.
            logger.info(
                "[%s] untaring from %s (%s) to %s",
                self.__class__.__name__,
                archive_path,
                self._relative_path,
                _actual_path,
            )
            with tarfile.open(archive_path) as tar_file:
                # The member name is handed to tarfile as a plain string, not a path object.
                tar_file.extract(str(self._relative_path), self.cache_root)

        return pd.read_csv(_actual_path, sep=self.delimiter)
示例#18
0
def _ensure_obo_path(
    prefix: str,
    url: Optional[str] = None,
    force: bool = False,
) -> str:
    """Return the local path of the OBO file for ``prefix``, downloading it if missing.

    Sources are tried in this order: an explicitly given ``url`` (deprecated),
    a curated URL for the prefix, an already existing local file, and finally
    the ``source_url`` of the prefix's OBO Foundry build entry.

    :param prefix: The prefix whose OBO file is wanted.
    :param url: Explicit download location. Passing it emits a ``DeprecationWarning``.
    :param force: Forwarded to the download helpers.
    :returns: The path to the OBO file.
    :raises NoOboFoundry: If the OBO Foundry has no entry for the prefix.
    :raises MissingOboBuild: If the entry has no build, or the build has no source URL.
    """
    if url is not None:
        warnings.warn('Should make curations in the bioregistry instead',
                      DeprecationWarning)
        explicit_path = get_prefix_obo_path(prefix).as_posix()
        download(url=url, path=explicit_path, force=force)
        return explicit_path

    curated_url = get_curated_urls().get(prefix)
    if curated_url:
        logger.debug('[%s] checking for OBO at curated URL: %s', prefix,
                     curated_url)
        return ensure_path(prefix, url=curated_url, force=force)

    local_path = get_prefix_obo_path(prefix)
    if os.path.exists(local_path):
        logger.debug('[%s] OBO already exists at %s', prefix, local_path)
        return local_path.as_posix()

    # Nothing curated and nothing cached: fall back to the OBO Foundry metadata.
    entry = get_obofoundry(mappify=True).get(prefix)
    if entry is None:
        raise NoOboFoundry(f'OBO Foundry is missing the prefix: {prefix}')

    build = entry.get('build')
    if build is None:
        raise MissingOboBuild(f'OBO Foundry is missing a build for: {prefix}')

    source_url = build.get('source_url')
    if source_url is None:
        raise MissingOboBuild(
            f'OBO Foundry build is missing a URL for: {prefix}, {build}')

    return ensure_path(prefix, url=source_url, force=force)
示例#19
0
文件: base.py 项目: wuxiaoxue/pykeen
    def _load_helper(self, relative_path: str) -> TriplesFactory:
        """Build a triples factory from one CSV member of the dataset's zip archive.

        The archive at ``self.path`` is downloaded from ``self.url`` first when it
        does not exist on disk.

        :param relative_path: Name of the CSV member inside the archive.
        :returns: The factory created from the head, relation and tail columns of the member.
        :raises ValueError: If the archive is missing and no URL is configured.
        """
        archive_missing = not os.path.exists(self.path)
        if archive_missing and self.url is None:
            raise ValueError('url should be set')
        if archive_missing:
            logger.info('downloading data from %s to %s', self.url, self.path)
            download(url=self.url, path=self.path)

        with zipfile.ZipFile(file=self.path) as archive, archive.open(relative_path) as member:
            logger.debug('loading %s', relative_path)
            wanted_columns = [self.head_column, self.relation_column, self.tail_column]
            frame = pd.read_csv(
                member,
                usecols=wanted_columns,
                header=self.header,
                sep=self.sep,
            )
            return TriplesFactory.from_labeled_triples(
                triples=frame.values,
                create_inverse_triples=self.create_inverse_triples,
                metadata={'path': relative_path},
            )