Exemplo n.º 1
0
    def __init__(
        self,
        training_url: str,
        testing_url: str,
        validation_url: str,
        cache_root: Optional[str] = None,
        force: bool = False,
        eager: bool = False,
        create_inverse_triples: bool = False,
        load_triples_kwargs: Optional[Mapping[str, Any]] = None,
        download_kwargs: Optional[Mapping[str, Any]] = None,
    ):
        """Fetch the three split files into the cache (when needed) and initialize from the local copies.

        :param training_url: The URL of the training file
        :param testing_url: The URL of the testing file
        :param validation_url: The URL of the validation file
        :param cache_root:
            An optional directory to store the downloaded files. If none is given, the default PyKEEN directory
            is used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.data/pykeen``.
        :param force: If true, redownload files even if a cached copy already exists
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
            and ultimately through to :func:`pykeen.triples.utils.load_triples`.
        :param download_kwargs: Keyword arguments to pass to :func:`pystow.utils.download`. The mapping is
            copied before use, and ``backend`` defaults to ``"urllib"`` when not given.
        """
        self.cache_root = self._help_cache(cache_root)

        self.training_url = training_url
        self.testing_url = testing_url
        self.validation_url = validation_url

        # The local file name of each split is derived from its URL via ``name_from_url``.
        urls = (self.training_url, self.testing_url, self.validation_url)
        training_path, testing_path, validation_path = (
            self.cache_root.joinpath(name_from_url(url)) for url in urls
        )

        # Work on a private copy so the caller's mapping is never modified.
        fetch_kwargs = dict(download_kwargs) if download_kwargs is not None else {}
        fetch_kwargs.setdefault("backend", "urllib")

        for url, path in zip(urls, (training_path, testing_path, validation_path)):
            if not force and path.is_file():
                continue  # already cached and no refresh requested
            download(url, path, **fetch_kwargs)

        super().__init__(
            training_path=training_path,
            testing_path=testing_path,
            validation_path=validation_path,
            eager=eager,
            create_inverse_triples=create_inverse_triples,
            load_triples_kwargs=load_triples_kwargs,
        )
Exemplo n.º 2
0
    def __init__(
        self,
        training_url: str,
        testing_url: str,
        validation_url: str,
        cache_root: Optional[str] = None,
        stream: bool = True,
        force: bool = False,
        eager: bool = False,
        create_inverse_triples: bool = False,
        load_triples_kwargs: Optional[Mapping[str, Any]] = None,
    ):
        """Retrieve the three split files into the cache (when needed) and initialize from the local copies.

        :param training_url: The URL of the training file
        :param testing_url: The URL of the testing file
        :param validation_url: The URL of the validation file
        :param cache_root:
            An optional directory to store the downloaded files. If none is given, the default PyKEEN directory
            is used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.pykeen``.
        :param stream: If true, use :mod:`requests` for the download, otherwise use :mod:`urllib`
        :param force: If true, redownload files even if a cached copy already exists
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
            and ultimately through to :func:`pykeen.triples.utils.load_triples`.
        """
        self.cache_root = self._help_cache(cache_root)

        self.training_url = training_url
        self.testing_url = testing_url
        self.validation_url = validation_url

        # The local file name of each split is derived from its URL via ``name_from_url``.
        urls = (self.training_url, self.testing_url, self.validation_url)
        training_path, testing_path, validation_path = (
            os.path.join(self.cache_root, name_from_url(url)) for url in urls
        )

        for url, path in zip(urls, (training_path, testing_path, validation_path)):
            # Only hit the network when the file is missing or a refresh was requested.
            if not os.path.exists(path) or force:
                _urlretrieve(url, path, stream=stream)

        super().__init__(
            training_path=training_path,
            testing_path=testing_path,
            validation_path=validation_path,
            eager=eager,
            create_inverse_triples=create_inverse_triples,
            load_triples_kwargs=load_triples_kwargs,
        )
Exemplo n.º 3
0
    def __init__(
        self,
        training_url: str,
        testing_url: str,
        validation_url: str,
        cache_root: Optional[str] = None,
        eager: bool = False,
        create_inverse_triples: bool = False,
        stream: bool = True,
        force: bool = False,
    ):
        """Retrieve the three split files into the cache (when needed) and initialize from the local copies.

        :param training_url: The URL of the training file
        :param testing_url: The URL of the testing file
        :param validation_url: The URL of the validation file
        :param cache_root:
            An optional directory to store the downloaded files. If none is given, the default PyKEEN directory
            is used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.pykeen``.
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param stream: Forwarded as the ``stream`` keyword to ``_urlretrieve`` for every download.
        :param force: If true, download every file again even when a local copy already exists.
        """
        self.cache_root = self._help_cache(cache_root)

        self.training_url = training_url
        self.testing_url = testing_url
        self.validation_url = validation_url

        # Pair each remote URL with its local target; the file name comes from ``name_from_url``.
        targets = [
            (url, os.path.join(self.cache_root, name_from_url(url)))
            for url in (self.training_url, self.testing_url, self.validation_url)
        ]

        for url, path in targets:
            # Only hit the network when the file is missing or a refresh was requested.
            if not os.path.exists(path) or force:
                _urlretrieve(url, path, stream=stream)

        (_, training_path), (_, testing_path), (_, validation_path) = targets
        super().__init__(
            training_path=training_path,
            testing_path=testing_path,
            validation_path=validation_path,
            eager=eager,
            create_inverse_triples=create_inverse_triples,
        )
Exemplo n.º 4
0
def ensure_path(
    prefix: str,
    *parts: str,
    url: str,
    version: VersionHint = None,
    name: Optional[str] = None,
    force: bool = False,
    stream: bool = False,
    urlretrieve_kwargs: Optional[Mapping[str, Any]] = None,
    error_on_missing: bool = False,
) -> str:
    """Download a file if it doesn't exist and return its local path.

    :param prefix: Forwarded as the first argument to ``prefix_directory_join``
    :param parts: Additional positional parts forwarded to ``prefix_directory_join``
    :param url: The URL to download the file from
    :param version: Forwarded as ``version`` to ``prefix_directory_join``
    :param name: The local file name. If not given, it is derived from the URL with ``name_from_url``.
    :param force: If true, download the file even if it already exists locally
    :param stream: Forwarded as ``stream`` to ``_urlretrieve``
    :param urlretrieve_kwargs: Extra keyword arguments forwarded to ``_urlretrieve``
    :param error_on_missing: If true, raise instead of downloading when the file is not present locally
    :returns: The local path of the file, rendered with forward slashes (``as_posix``)
    :raises FileNotFoundError: if the file does not exist locally and ``error_on_missing`` is true
    """
    if name is None:
        name = name_from_url(url)

    path = prefix_directory_join(prefix, *parts, name=name, version=version)

    # Check the file system once; both decisions below depend on the same answer.
    missing = not path.exists()

    if missing and error_on_missing:
        # Include the path so the caller can tell which file was expected.
        raise FileNotFoundError(f"file does not exist and error_on_missing is set: {path}")

    if missing or force:
        _urlretrieve(url=url,
                     path=path,
                     stream=stream,
                     **(urlretrieve_kwargs or {}))

    return path.as_posix()
Exemplo n.º 5
0
 def test_name_from_url(self):
     """Check that :func:`name_from_url` yields the expected file name for each sample URL."""
     # Maps each input URL to the file name expected from it.
     expected_by_url = {
         "https://example.com/test.tsv": "test.tsv",
         "https://example.com/deeper/test.tsv": "test.tsv",
         "https://example.com/deeper/test.tsv.gz": "test.tsv.gz",
     }
     for url, name in expected_by_url.items():
         with self.subTest(name=name, url=url):
             self.assertEqual(name, name_from_url(url))
Exemplo n.º 6
0
def ensure_path(
    prefix: str,
    url: str,
    *,
    path: Optional[str] = None,
    use_requests: bool = False,
    force: bool = False,
    bucket: Optional[str] = None,
    s3_client: Optional[BaseClient] = None,
) -> str:
    """Download a file if it doesn't exist, optionally using an S3 bucket as a mirror.

    When a bucket is given, a missing local file is fetched from S3 if the object is
    available there; otherwise it is downloaded from ``url`` and then uploaded to S3.

    :param prefix: Passed to ``prefix_directory_join`` and ``_get_s3_key``
    :param url: The source URL to download from
    :param path: The name handed to ``prefix_directory_join`` together with ``prefix``.
        If not given, it is derived from the URL with ``name_from_url``.
    :param use_requests: If true, download with :func:`requests.get`, otherwise with ``urlretrieve``
    :param force: If set to true, will re-download from source and re-upload to S3
    :param bucket: The S3 bucket to use as a mirror. If none is given, S3 is not used at all.
    :param s3_client: An S3 client, passed through ``_ensure_s3_client`` before use
        (presumably a default client is created when none is given - confirm against that helper).
    :returns: The local path of the file
    :raises requests.HTTPError: if ``use_requests`` is true and the server answers with an error status
    """
    if path is None:
        path = name_from_url(url)

    path = prefix_directory_join(prefix, path)

    s3_key = None
    if bucket is not None:
        s3_client = _ensure_s3_client(s3_client)
        s3_key = _get_s3_key(prefix, path)

    if not os.path.exists(path) or force:
        # Try downloading from AWS if available. The object must actually exist in the
        # bucket for this to work (the previous check was inverted and attempted the S3
        # download only when the object was absent).
        if bucket is not None and not force and _has_file(s3_client, bucket=bucket, key=s3_key):
            logger.info('downloading from AWS (bucket=%s): %s to %s',
                        bucket, s3_key, path)
            s3_client.download_file(bucket, s3_key, path)
            return path

        logger.info('downloading from source %s to %s', url, path)
        if use_requests:
            res = requests.get(url)
            # Fail loudly instead of caching the body of an HTTP error page as data.
            res.raise_for_status()
            with open(path, 'wb') as file:
                file.write(res.content)
        else:
            urlretrieve(url, path)  # noqa:S310

    if bucket is not None:
        if _has_file(s3_client, bucket=bucket, key=s3_key) and not force:
            logger.debug('already available on S3. Not uploading again.')
            return path

        logger.info('uploading to AWS (bucket=%s): %s to %s', bucket, path,
                    s3_key)
        s3_client.upload_file(path, bucket, s3_key)

    return path
Exemplo n.º 7
0
    def __init__(
        self,
        relative_training_path: Union[str, pathlib.PurePath],
        relative_testing_path: Union[str, pathlib.PurePath],
        relative_validation_path: Union[str, pathlib.PurePath],
        url: Optional[str] = None,
        name: Optional[str] = None,
        cache_root: Optional[str] = None,
        eager: bool = False,
        create_inverse_triples: bool = False,
    ):
        """Initialize dataset.

        :param relative_training_path: The path inside the zip file for the training data
        :param relative_testing_path: The path inside the zip file for the testing data
        :param relative_validation_path: The path inside the zip file for the validation data
        :param url:
            The url where to download the dataset from
        :param name:
            The name of the file. If not given, tries to get the name from the end of the URL
        :param cache_root:
            An optional directory to store the extracted files. If none is given, the default PyKEEN directory
            is used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.pykeen``.
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.

        :raises ValueError: if neither a name nor a URL is given, or if there's no URL specified and there is
            no data already at the calculated path
        """
        self.cache_root = self._help_cache(cache_root)

        # Without a name or a URL there is nothing to derive the file name from; fail with the
        # documented exception type instead of handing ``None`` to ``name_from_url``.
        if not name and not url:
            raise ValueError("must specify at least one of name or url to determine the file path")

        self.name = name or name_from_url(url)
        self.path = self.cache_root.joinpath(self.name)
        logger.debug("file path at %s", self.path)

        self.url = url
        if not self.path.is_file() and not self.url:
            raise ValueError(
                f"must specify url to download from since path does not exist: {self.path}"
            )

        # Normalize to pure paths regardless of whether strings or paths were passed in.
        self.relative_training_path = pathlib.PurePath(relative_training_path)
        self.relative_testing_path = pathlib.PurePath(relative_testing_path)
        self.relative_validation_path = pathlib.PurePath(
            relative_validation_path)
        self.create_inverse_triples = create_inverse_triples
        if eager:
            self._load()
            self._load_validation()
Exemplo n.º 8
0
    def __init__(
        self,
        url: str,
        name: Optional[str] = None,
        cache_root: Optional[str] = None,
        eager: bool = False,
        create_inverse_triples: bool = False,
        random_state: TorchRandomHint = None,
        read_csv_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Initialize dataset.

        :param url:
            The url where to download the dataset from
        :param name:
            The name of the file. If not given, tries to get the name from the end of the URL
        :param cache_root:
            An optional directory to store the extracted files. If none is given, the default PyKEEN directory
            is used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.pykeen``.
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param random_state: An optional random state to make the training/testing/validation split reproducible.
        :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`.
            The dictionary is copied, and ``sep`` defaults to a tab when not given.

        :raises ValueError: if there's no URL specified and there is no data already at the calculated path
        """
        super().__init__(
            cache_root=cache_root,
            create_inverse_triples=create_inverse_triples,
            random_state=random_state,
            eager=False,  # because it gets hooked below
        )

        self.name = name or name_from_url(url)

        # Copy before applying defaults so the caller's dictionary is never modified.
        self.read_csv_kwargs = dict(read_csv_kwargs or {})
        self.read_csv_kwargs.setdefault("sep", "\t")

        self.url = url
        if not self._get_path().is_file() and not self.url:
            raise ValueError(
                f"must specify url to download from since path does not exist: {self._get_path()}"
            )

        if eager:
            self._load()
Exemplo n.º 9
0
    def __init__(
        self,
        url: str,
        name: Optional[str] = None,
        cache_root: Optional[str] = None,
        eager: bool = False,
        create_inverse_triples: bool = False,
        random_state: TorchRandomHint = None,
        read_csv_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Initialize dataset.

        :param url:
            The url where to download the dataset from
        :param name:
            The name of the file. If not given, tries to get the name from the end of the URL
        :param cache_root:
            An optional directory to store the extracted files. If none is given, the default PyKEEN directory
            is used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.pykeen``.
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param random_state: Forwarded as ``random_state`` to the parent initializer.
        :param read_csv_kwargs: Stored as ``self.read_csv_kwargs`` (presumably forwarded to
            :func:`pandas.read_csv` when loading - confirm in ``_load``). The dictionary is copied,
            and ``sep`` defaults to a tab when not given.

        :raises ValueError: if there's no URL specified and there is no data already at the calculated path
        """
        super().__init__(
            cache_root=cache_root,
            create_inverse_triples=create_inverse_triples,
            random_state=random_state,
            eager=False,  # because it gets hooked below
        )

        self.name = name or name_from_url(url)

        # Copy before applying defaults so the caller's dictionary is never modified.
        self.read_csv_kwargs = dict(read_csv_kwargs or {})
        self.read_csv_kwargs.setdefault('sep', '\t')

        self.url = url
        if not os.path.exists(self._get_path()) and not self.url:
            raise ValueError(
                f'must specify url to download from since path does not exist: {self._get_path()}'
            )

        if eager:
            self._load()
Exemplo n.º 10
0
    def __init__(
        self,
        relative_training_path: str,
        relative_testing_path: str,
        relative_validation_path: str,
        url: Optional[str] = None,
        name: Optional[str] = None,
        cache_root: Optional[str] = None,
        eager: bool = False,
        create_inverse_triples: bool = False,
    ):
        """Initialize dataset.

        :param relative_training_path: Stored as ``self.relative_training_path``
            (presumably the location of the training data inside the downloaded file - confirm in ``_load``)
        :param relative_testing_path: Stored as ``self.relative_testing_path``
            (presumably the location of the testing data inside the downloaded file - confirm in ``_load``)
        :param relative_validation_path: Stored as ``self.relative_validation_path``
            (presumably the location of the validation data inside the downloaded file - confirm in
            ``_load_validation``)
        :param url:
            The url where to download the dataset from
        :param name:
            The name of the file. If not given, tries to get the name from the end of the URL
        :param cache_root:
            An optional directory to store the extracted files. If none is given, the default PyKEEN directory
            is used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.pykeen``.
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.

        :raises ValueError: if neither a name nor a URL is given, or if there's no URL specified and there is
            no data already at the calculated path
        """
        self.cache_root = self._help_cache(cache_root)

        # Without a name or a URL there is nothing to derive the file name from; fail with a
        # clear error instead of handing ``None`` to ``name_from_url``.
        if not name and not url:
            raise ValueError('must specify at least one of name or url to determine the file path')

        self.name = name or name_from_url(url)
        self.path = os.path.join(self.cache_root, self.name)
        logger.debug('file path at %s', self.path)

        self.url = url
        if not os.path.exists(self.path) and not self.url:
            raise ValueError(
                f'must specify url to download from since path does not exist: {self.path}'
            )

        self.relative_training_path = relative_training_path
        self.relative_testing_path = relative_testing_path
        self.relative_validation_path = relative_validation_path
        self.create_inverse_triples = create_inverse_triples
        if eager:
            self._load()
            self._load_validation()
Exemplo n.º 11
0
    def __init__(
        self,
        url: str,
        relative_path: str,
        name: Optional[str] = None,
        cache_root: Optional[str] = None,
        eager: bool = False,
        create_inverse_triples: bool = False,
        delimiter: Optional[str] = None,
        random_state: TorchRandomHint = None,
        randomize_cleanup: bool = False,
    ):
        """Record the dataset configuration and optionally load the data right away.

        :param url:
            The url where to download the dataset from
        :param relative_path:
            The path inside the archive to the contained dataset.
        :param name:
            The name of the file. If not given, tries to get the name from the end of the URL
        :param cache_root:
            An optional directory to store the extracted files. If none is given, the default PyKEEN directory
            is used. This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to
            ``~/.pykeen``.
        :param eager: If true, ``self._load()`` is called at the end of initialization. Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param delimiter:
            The delimiter for the contained dataset. A tab is used when none (or an empty one) is given.
        :param random_state:
            An optional random state to make the training/testing/validation split reproducible.
        :param randomize_cleanup:
            Stored as ``self.randomize_cleanup``; its effect is decided by code not visible here
            (TODO: document once confirmed).
        """
        self.cache_root = self._help_cache(cache_root)

        # Identity of the remote resource and where the data lives inside it.
        self.name = name if name else name_from_url(url)
        self.url = url
        self._relative_path = relative_path

        # Parsing and splitting options.
        self.delimiter = delimiter if delimiter else '\t'
        self.random_state = random_state
        self.randomize_cleanup = randomize_cleanup
        self.create_inverse_triples = create_inverse_triples

        if not eager:
            return
        self._load()