Example #1

from __future__ import absolute_import
import contextlib
import gzip
import os

import requests

from gutenberg._domain_model.persistence import local_path
from gutenberg._domain_model.types import validate_etextno
from gutenberg._util.os import makedirs
from gutenberg._util.os import remove


_TEXT_CACHE = local_path('text')


def _format_download_uri(etextno):
    """Returns the download location on the Project Gutenberg servers for a
    given text.

    """
    uri_root = r'http://www.gutenberg.lib.md.us'

    if 0 < etextno < 10:
        oldstyle_files = (
            'when11',
            'bill11',
            'jfk11',
            'getty11',
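
The snippet above breaks off inside the oldstyle_files tuple, so the branch for e-text numbers of 10 and above is not shown. As a hedged sketch only, the digit-split mirror layout, the candidate extensions, and the _guess_download_uri name below are assumptions rather than code from the snippet:

import requests

def _guess_download_uri(etextno, uri_root='http://www.gutenberg.lib.md.us'):
    # Assumes etextno >= 10; single-digit texts use the old-style names above.
    path = '/'.join(str(etextno)[:-1])      # e.g. 2701 -> '2/7/0'
    for extension in ('.txt', '-8.txt', '-0.txt'):
        uri = '{root}/{path}/{etextno}/{etextno}{ext}'.format(
            root=uri_root, path=path, etextno=etextno, ext=extension)
        if requests.head(uri).ok:           # header-only probe, no body download
            return uri
    return None

# Example: _guess_download_uri(2701) would try
# http://www.gutenberg.lib.md.us/2/7/0/2701/2701.txt first.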
Example #2
import contextlib
import os
import shutil
import tempfile

try:
    import urllib2
except ImportError:
    import urllib.request as urllib2

from rdflib.graph import Graph
from rdflib.term import URIRef

from gutenberg._domain_model.persistence import local_path
from gutenberg._domain_model.vocabulary import DCTERMS
from gutenberg._domain_model.vocabulary import PGTERMS
from gutenberg._util.logging import disable_logging
from gutenberg._util.os import makedirs
from gutenberg._util.os import remove

_METADATA_CACHE = local_path(
    os.path.join('~/Desktop/machineLearning/metadata', 'metadata.db'))
_METADATA_DATABASE_SINGLETON = None


@contextlib.contextmanager
def _download_metadata_archive():
    """Makes a remote call to the Project Gutenberg servers and downloads the
    entire Project Gutenberg meta-data catalog. The catalog describes the texts
    on Project Gutenberg in RDF. The function returns a file-pointer to the
    catalog.

    """
    data_url = r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    with tempfile.NamedTemporaryFile(delete=False) as metadata_archive:
        shutil.copyfileobj(urllib2.urlopen(data_url), metadata_archive)
    yield metadata_archive.name
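
The context manager above streams the rdf-files.tar.bz2 catalog into a named temporary file and yields that file's path; because the file is created with delete=False, the path stays valid after the inner with block has closed. A hedged usage sketch follows; the tarfile handling is an assumption about what a caller might do with the yielded path, and only the "with ... as" shape comes from the snippet:

import tarfile

with _download_metadata_archive() as archive_path:
    # Open the downloaded catalog; each member is expected to be a per-text RDF file.
    with tarfile.open(archive_path, mode='r:bz2') as archive:
        for member in archive.getnames()[:5]:
            print(member)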
Example #3
import contextlib
import os
import shutil
import tempfile

try:
    import urllib2
except ImportError:
    import urllib.request as urllib2

from rdflib.graph import Graph
from rdflib.term import URIRef

from gutenberg._domain_model.persistence import local_path
from gutenberg._domain_model.vocabulary import DCTERMS
from gutenberg._domain_model.vocabulary import PGTERMS
from gutenberg._util.logging import disable_logging
from gutenberg._util.os import makedirs
from gutenberg._util.os import remove

_METADATA_CACHE = local_path(os.path.join('metadata', 'metadata.db'))
_METADATA_DATABASE_SINGLETON = None


@contextlib.contextmanager
def _download_metadata_archive():
    """Makes a remote call to the Project Gutenberg servers and downloads the
    entire Project Gutenberg meta-data catalog. The catalog describes the texts
    on Project Gutenberg in RDF. The function returns a file-pointer to the
    catalog.

    """
    data_url = r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    with tempfile.NamedTemporaryFile(delete=False) as metadata_archive:
        shutil.copyfileobj(urllib2.urlopen(data_url), metadata_archive)
    yield metadata_archive.name
Example #4
import abc
import os

from rdflib.graph import Graph
from six import with_metaclass

from gutenberg._domain_model.exceptions import CacheAlreadyExistsException
from gutenberg._domain_model.exceptions import InvalidCacheException
from gutenberg._domain_model.persistence import local_path
from gutenberg._domain_model.vocabulary import DCTERMS
from gutenberg._domain_model.vocabulary import PGTERMS
from gutenberg._util.logging import disable_logging
from gutenberg._util.os import makedirs
from gutenberg._util.os import remove
from gutenberg._util.url import urlopen

_GUTENBERG_CATALOG_URL = \
    r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
_DB_IDENTIFIER = 'urn:gutenberg:metadata'
_DB_PATH = local_path(os.path.join('metadata', 'metadata.db'))


class MetadataCache(with_metaclass(abc.ABCMeta, object)):
    """Super-class for all metadata cache implementations.

    """
    def __init__(self, store, cache_uri):
        self.store = store
        self.cache_uri = cache_uri
        self.graph = Graph(store=self.store, identifier=_DB_IDENTIFIER)
        self.is_open = False
        self.catalog_source = _GUTENBERG_CATALOG_URL

    @property
    def exists(self):
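
The excerpt is cut off at the exists property, but the class header already shows the six.with_metaclass(abc.ABCMeta, object) idiom for declaring an abstract base class that works on both Python 2 and 3. Below is a minimal, self-contained sketch of that idiom; BaseCache and SqliteMetadataCache are hypothetical names, not classes from the project:

import abc

from six import with_metaclass


class BaseCache(with_metaclass(abc.ABCMeta, object)):
    @abc.abstractmethod
    def populate(self):
        """Fill the cache from its catalog source."""


class SqliteMetadataCache(BaseCache):
    def populate(self):
        return 'populated'  # a real subclass would parse the RDF catalog here


cache = SqliteMetadataCache()  # instantiating BaseCache directly raises TypeError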
Example #5
"""Module to deal with text acquisition."""

from __future__ import absolute_import
import contextlib
import gzip
import os

import requests

from gutenberg._domain_model.persistence import local_path
from gutenberg._domain_model.types import validate_etextno
from gutenberg._util.os import makedirs
from gutenberg._util.os import remove

_TEXT_CACHE = local_path('text')


def _format_download_uri(etextno):
    """Returns the download location on the Project Gutenberg servers for a
    given text.

    """
    uri_root = r'http://www.gutenberg.lib.md.us'

    if 0 < etextno < 10:
        oldstyle_files = (
            'when11',
            'bill11',
            'jfk11',
            'getty11',
            'const11',
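
This variant again stops inside the oldstyle_files tuple. Its imports (gzip, requests) and the _TEXT_CACHE directory suggest that downloaded texts are kept gzip-compressed in a local cache. The round trip below is a hedged sketch of that idea; the helper names and the '<etextno>.txt.gz' layout are assumed, not taken from the snippet:

import gzip
import os

def _cache_write(cache_dir, etextno, text):
    # Store the text gzip-compressed under the cache directory.
    path = os.path.join(cache_dir, '{0}.txt.gz'.format(etextno))
    with gzip.open(path, 'wb') as handle:
        handle.write(text.encode('utf-8'))
    return path

def _cache_read(cache_dir, etextno):
    # Read a previously cached text back into a unicode string.
    path = os.path.join(cache_dir, '{0}.txt.gz'.format(etextno))
    with gzip.open(path, 'rb') as handle:
        return handle.read().decode('utf-8')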
Example #6
File: metadata.py Project: c-w/Gutenberg
import abc
import os

from rdflib.graph import Graph
from six import with_metaclass

from gutenberg._domain_model.exceptions import CacheAlreadyExistsException
from gutenberg._domain_model.exceptions import InvalidCacheException
from gutenberg._domain_model.persistence import local_path
from gutenberg._domain_model.vocabulary import DCTERMS
from gutenberg._domain_model.vocabulary import PGTERMS
from gutenberg._util.logging import disable_logging
from gutenberg._util.os import makedirs
from gutenberg._util.os import remove
from gutenberg._util.url import urlopen

_GUTENBERG_CATALOG_URL = \
    r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
_DB_IDENTIFIER = 'urn:gutenberg:metadata'
_DB_PATH = local_path(os.path.join('metadata', 'metadata.db'))


class MetadataCache(with_metaclass(abc.ABCMeta, object)):
    """Super-class for all metadata cache implementations.

    """
    def __init__(self, store, cache_uri):
        self.store = store
        self.cache_uri = cache_uri
        self.graph = Graph(store=self.store, identifier=_DB_IDENTIFIER)
        self.is_open = False
        self.catalog_source = _GUTENBERG_CATALOG_URL

    @property
    def exists(self):
Example #7
import contextlib
import os
import shutil
import tempfile

try:
    import urllib2
except ImportError:
    import urllib.request as urllib2

from rdflib.graph import Graph
from rdflib.term import URIRef

from gutenberg._domain_model.persistence import local_path
from gutenberg._domain_model.vocabulary import DCTERMS
from gutenberg._domain_model.vocabulary import PGTERMS
from gutenberg._util.logging import disable_logging
from gutenberg._util.os import makedirs
from gutenberg._util.os import remove


_METADATA_CACHE = local_path(os.path.join('metadata', 'metadata.db'))
_METADATA_DATABASE_SINGLETON = None


@contextlib.contextmanager
def _download_metadata_archive():
    """Makes a remote call to the Project Gutenberg servers and downloads the
    entire Project Gutenberg meta-data catalog. The catalog describes the texts
    on Project Gutenberg in RDF. The function returns a file-pointer to the
    catalog.

    """
    data_url = r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    with tempfile.NamedTemporaryFile(delete=False) as metadata_archive:
        shutil.copyfileobj(urllib2.urlopen(data_url), metadata_archive)
    yield metadata_archive.name
Example #8
import contextlib
import os
import shutil
import tempfile

try:
    import urllib2
except ImportError:
    import urllib.request as urllib2

from rdflib.graph import Graph
from rdflib.term import URIRef

from gutenberg._domain_model.persistence import local_path
from gutenberg._domain_model.vocabulary import DCTERMS
from gutenberg._domain_model.vocabulary import PGTERMS
from gutenberg._util.logging import disable_logging
from gutenberg._util.os import makedirs
from gutenberg._util.os import remove


_METADATA_CACHE = local_path(
    os.path.join('~/Desktop/machineLearning/metadata', 'metadata.db'))
_METADATA_DATABASE_SINGLETON = None


@contextlib.contextmanager
def _download_metadata_archive():
    """Makes a remote call to the Project Gutenberg servers and downloads the
    entire Project Gutenberg meta-data catalog. The catalog describes the texts
    on Project Gutenberg in RDF. The function returns a file-pointer to the
    catalog.

    """
    data_url = r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    with tempfile.NamedTemporaryFile(delete=False) as metadata_archive:
        shutil.copyfileobj(urllib2.urlopen(data_url), metadata_archive)
    yield metadata_archive.name
Example #9
File: text.py Project: hugovk/Gutenberg
from __future__ import absolute_import

import gzip
import os
from contextlib import closing

import requests

from gutenberg._domain_model.exceptions import UnknownDownloadUriException
from gutenberg._domain_model.persistence import local_path
from gutenberg._domain_model.types import validate_etextno
from gutenberg._util.os import makedirs
from gutenberg._util.os import remove


_TEXT_CACHE = local_path("text")


def _format_download_uri(etextno):
    """Returns the download location on the Project Gutenberg servers for a
    given text.

    Raises:
        UnknownDownloadUriException: If no download location can be found for the text.

    """
    uri_root = r"http://www.gutenberg.lib.md.us"

    if 0 < etextno < 10:
        oldstyle_files = ("when11", "bill11", "jfk11", "getty11", "const11", "liber11", "mayfl11", "linc211", "linc111")
        etextno = int(etextno)
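
For the nine earliest e-texts the function falls back to a fixed tuple of old-style file names indexed by e-text number. The sketch below is a hedged reconstruction of how such a lookup could be turned into a URI; the etext90 directory component and the _oldstyle_uri name are assumptions, and only the 1-based index into oldstyle_files follows from the code above:

from gutenberg._domain_model.exceptions import UnknownDownloadUriException

def _oldstyle_uri(etextno, uri_root=r"http://www.gutenberg.lib.md.us"):
    oldstyle_files = ("when11", "bill11", "jfk11", "getty11", "const11",
                      "liber11", "mayfl11", "linc211", "linc111")
    if not 0 < etextno < 10:
        raise UnknownDownloadUriException(
            "no old-style file name for e-text {0}".format(etextno))
    # Map e-text numbers 1..9 onto the tuple (1-based index).
    return "{root}/etext90/{name}.txt".format(
        root=uri_root, name=oldstyle_files[etextno - 1])

# Example: _oldstyle_uri(1) -> "http://www.gutenberg.lib.md.us/etext90/when11.txt"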