Пример #1
0
    def __init__(self,
                 config=None,
                 config_file=None,
                 debug=None,
                 http_adapter_kwargs=None):
        """Build an :class:`ArchiveSession <ArchiveSession>` from config.

        :type config: dict
        :param config: (optional) Config dict passed to ``get_config`` to
                       produce the session's ``config``.

        :type config_file: str
        :param config_file: (optional) Path to a config file; passed to
                            ``get_config`` and stored as ``config_file``.

        :type debug: bool
        :param debug: (optional) If truthy and a logging level is configured,
                      ``set_file_logger`` is called a second time with
                      ``'urllib3'``.

        :type http_adapter_kwargs: dict
        :param http_adapter_kwargs: (optional) Keyword arguments used to initialize the
                                    :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                    object.
        """
        super(ArchiveSession, self).__init__()
        adapter_kwargs = http_adapter_kwargs or {}
        verbose = bool(debug)

        self.config = get_config(config, config_file)
        self.config_file = config_file

        # Cookies from the config are loaded into the session's cookie jar.
        cookie_cfg = self.config.get('cookies', {})
        self.cookies.update(cookie_cfg)

        general_cfg = self.config.get('general', {})
        self.secure = general_cfg.get('secure', True)
        host = general_cfg.get('host', 'archive.org')
        if 'archive.org' not in host:
            # A host without 'archive.org' in it gets the domain appended.
            host += '.archive.org'
        self.host = host
        self.protocol = 'https:' if self.secure else 'http:'

        # Only the part of the cookie before the first ';' is kept, then
        # URL-unquoted.
        logged_in_user = cookie_cfg.get('logged-in-user')
        if logged_in_user:
            logged_in_user = unquote(logged_in_user.split(';')[0])
        self.user_email = logged_in_user

        s3_cfg = self.config.get('s3', {})
        self.access_key = s3_cfg.get('access')
        self.secret_key = s3_cfg.get('secret')
        self.http_adapter_kwargs = adapter_kwargs

        self.headers = default_headers()
        self.headers['User-Agent'] = self._get_user_agent_string()
        self.headers['Connection'] = 'close'

        self.mount_http_adapter()

        # File logging is only set up when the config names a level.
        log_cfg = self.config.get('logging', {})
        if log_cfg.get('level'):
            log_level = log_cfg.get('level', 'NOTSET')
            log_path = log_cfg.get('file', 'internetarchive.log')
            self.set_file_logger(log_level, log_path)
            # 10 is the numeric value of logging.DEBUG.
            if verbose or logger.level <= 10:
                self.set_file_logger(log_level, log_path, 'urllib3')
Пример #2
0
    def __init__(self,
                 config=None,
                 config_file=None,
                 debug=None,
                 http_adapter_kwargs=None):
        """Build an :class:`ArchiveSession <ArchiveSession>` from config.

        :type config: dict
        :param config: (optional) Config dict passed to ``get_config`` to
                       produce the session's ``config``.

        :type config_file: str
        :param config_file: (optional) Path to a config file, passed to
                            ``get_config``.

        :type debug: bool
        :param debug: (optional) If truthy and a logging level is configured,
                      ``set_file_logger`` is called a second time with
                      ``'requests.packages.urllib3'``.

        :type http_adapter_kwargs: dict
        :param http_adapter_kwargs: (optional) Keyword arguments used to initialize the
                                    :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                    object.
        """
        super(ArchiveSession, self).__init__()
        adapter_kwargs = http_adapter_kwargs or {}
        verbose = bool(debug)

        self.config = get_config(config, config_file)
        self.cookies.update(self.config.get('cookies', {}))

        # Avoid InsecurePlatformWarning errors on older versions of Python:
        # unless the config says otherwise, 'secure' defaults to False before
        # 2.7.9 and to True from 2.7.9 on.
        secure_by_default = sys.version_info >= (2, 7, 9)
        general_cfg = self.config.get('general', {})
        self.secure = general_cfg.get('secure', secure_by_default)
        if self.secure:
            self.protocol = 'https:'
        else:
            self.protocol = 'http:'

        s3_cfg = self.config.get('s3', {})
        self.access_key = s3_cfg.get('access')
        self.secret_key = s3_cfg.get('secret')
        self.http_adapter_kwargs = adapter_kwargs

        self.headers = default_headers()
        self.headers.update({'User-Agent': self._get_user_agent_string()})
        self._mount_http_adapter()

        # File logging is only set up when the config names a level.
        log_cfg = self.config.get('logging', {})
        if log_cfg.get('level'):
            log_level = log_cfg.get('level', 'NOTSET')
            log_path = log_cfg.get('file', 'internetarchive.log')
            self.set_file_logger(log_level, log_path)
            # 10 is the numeric value of logging.DEBUG.
            if verbose or logger.level <= 10:
                self.set_file_logger(log_level, log_path,
                                     'requests.packages.urllib3')
Пример #3
0
    def __init__(self,
                 config=None,
                 config_file=None,
                 debug=None,
                 http_adapter_kwargs=None):
        """Build an :class:`ArchiveSession <ArchiveSession>` from config.

        :type config: dict
        :param config: (optional) Config dict passed to ``get_config`` to
                       produce the session's ``config``.

        :type config_file: str
        :param config_file: (optional) Path to a config file; passed to
                            ``get_config`` and stored as ``config_file``.

        :type debug: bool
        :param debug: (optional) If truthy and a logging level is configured,
                      ``set_file_logger`` is called a second time with
                      ``'urllib3'``.

        :type http_adapter_kwargs: dict
        :param http_adapter_kwargs: (optional) Keyword arguments used to initialize the
                                    :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                    object.
        """
        super(ArchiveSession, self).__init__()
        adapter_kwargs = http_adapter_kwargs or {}
        verbose = bool(debug)

        self.config = get_config(config, config_file)
        self.config_file = config_file
        self.cookies.update(self.config.get('cookies', {}))

        self.secure = self.config.get('general', {}).get('secure', True)
        if self.secure:
            self.protocol = 'https:'
        else:
            self.protocol = 'http:'

        s3_cfg = self.config.get('s3', {})
        self.access_key = s3_cfg.get('access')
        self.secret_key = s3_cfg.get('secret')
        self.http_adapter_kwargs = adapter_kwargs

        self.headers = default_headers()
        self.headers['User-Agent'] = self._get_user_agent_string()
        self.headers['Connection'] = 'close'

        self.mount_http_adapter()

        # File logging is only set up when the config names a level.
        log_cfg = self.config.get('logging', {})
        if log_cfg.get('level'):
            log_level = log_cfg.get('level', 'NOTSET')
            log_path = log_cfg.get('file', 'internetarchive.log')
            self.set_file_logger(log_level, log_path)
            # 10 is the numeric value of logging.DEBUG.
            if verbose or logger.level <= 10:
                self.set_file_logger(log_level, log_path, 'urllib3')
Пример #4
0
    def __init__(self, config=None, config_file=None, debug=None, http_adapter_kwargs=None):
        """Build an :class:`ArchiveSession <ArchiveSession>` from config.

        :type config: dict
        :param config: (optional) Config dict passed to ``get_config`` to
                       produce the session's ``config``.

        :type config_file: str
        :param config_file: (optional) Path to a config file, passed to
                            ``get_config``.

        :type debug: bool
        :param debug: (optional) If truthy and a logging level is configured,
                      ``set_file_logger`` is called a second time with
                      ``"requests.packages.urllib3"``.

        :type http_adapter_kwargs: dict
        :param http_adapter_kwargs: (optional) Keyword arguments used to initialize the
                                    :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                    object.
        """
        super(ArchiveSession, self).__init__()
        adapter_kwargs = http_adapter_kwargs or {}
        verbose = bool(debug)

        self.config = get_config(config, config_file)
        self.cookies.update(self.config.get("cookies", {}))

        # Avoid InsecurePlatformWarning errors on older versions of Python:
        # unless the config says otherwise, "secure" defaults to False before
        # 2.7.9 and to True from 2.7.9 on.
        secure_by_default = sys.version_info >= (2, 7, 9)
        general_cfg = self.config.get("general", {})
        self.secure = general_cfg.get("secure", secure_by_default)
        if self.secure:
            self.protocol = "https:"
        else:
            self.protocol = "http:"

        s3_cfg = self.config.get("s3", {})
        self.access_key = s3_cfg.get("access")
        self.secret_key = s3_cfg.get("secret")
        self.http_adapter_kwargs = adapter_kwargs

        self.headers = default_headers()
        self.headers.update({"User-Agent": self._get_user_agent_string()})
        self._mount_http_adapter()

        # File logging is only set up when the config names a level.
        log_cfg = self.config.get("logging", {})
        if log_cfg.get("level"):
            log_level = log_cfg.get("level", "NOTSET")
            log_path = log_cfg.get("file", "internetarchive.log")
            self.set_file_logger(log_level, log_path)
            # 10 is the numeric value of logging.DEBUG.
            if verbose or logger.level <= 10:
                self.set_file_logger(log_level, log_path, "requests.packages.urllib3")
Пример #5
0
    def __init__(self,
                 config=None,
                 config_file=None,
                 debug=None,
                 http_adapter_kwargs=None):
        """Build an :class:`ArchiveSession <ArchiveSession>` from config.

        :type config: dict
        :param config: (optional) Config dict passed to ``get_config`` to
                       produce the session's ``config``.

        :type config_file: str
        :param config_file: (optional) Path to a config file, passed to
                            ``get_config``.

        :type debug: bool
        :param debug: (optional) If truthy and a logging level is configured,
                      ``set_file_logger`` is called a second time with
                      ``'requests.packages.urllib3'``.

        :type http_adapter_kwargs: dict
        :param http_adapter_kwargs: (optional) Keyword arguments used to initialize the
                                    :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                    object.
        """
        super(ArchiveSession, self).__init__()
        adapter_kwargs = http_adapter_kwargs or {}
        verbose = bool(debug)

        self.config = get_config(config, config_file)
        self.cookies.update(self.config.get('cookies', {}))

        # Avoid InsecurePlatformWarning errors on older versions of Python:
        # unless the config says otherwise, 'secure' defaults to False before
        # 2.7.9 and to True from 2.7.9 on. The key is read from the top level
        # of the config here.
        secure_by_default = sys.version_info >= (2, 7, 9)
        self.secure = self.config.get('secure', secure_by_default)
        if self.secure:
            self.protocol = 'https:'
        else:
            self.protocol = 'http:'

        s3_cfg = self.config.get('s3', {})
        self.access_key = s3_cfg.get('access')
        self.secret_key = s3_cfg.get('secret')
        self.http_adapter_kwargs = adapter_kwargs

        self.headers = default_headers()
        self.headers.update({'User-Agent': self._get_user_agent_string()})
        self._mount_http_adapter()

        # File logging is only set up when the config names a level.
        log_cfg = self.config.get('logging', {})
        if log_cfg.get('level'):
            log_level = log_cfg.get('level', 'NOTSET')
            log_path = log_cfg.get('file', 'internetarchive.log')
            self.set_file_logger(log_level, log_path)
            if verbose:
                self.set_file_logger(log_level, log_path,
                                     'requests.packages.urllib3')
Пример #6
0
    def __init__(self,
                 config: Mapping | None = None,
                 config_file: str = "",
                 debug: bool = False,
                 http_adapter_kwargs: MutableMapping | None = None) -> None:
        """Initialize :class:`ArchiveSession <ArchiveSession>` object with config.

        :param config: A config dict used for initializing the
                       :class:`ArchiveSession <ArchiveSession>` object.

        :param config_file: Path to config file used for initializing the
                            :class:`ArchiveSession <ArchiveSession>` object.

        :param debug: If true and a logging level is configured,
                      ``set_file_logger`` is called a second time with
                      ``'urllib3'``.

        :param http_adapter_kwargs: Keyword arguments used to initialize the
                                    :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                    object.
        """
        super().__init__()
        http_adapter_kwargs = http_adapter_kwargs or {}
        debug = bool(debug)

        self.config = get_config(config, config_file)
        self.config_file = config_file

        # Turn every configured cookie into a cookie-jar entry. The raw
        # "name=value" string is run through parse_dict_cookies, whose result
        # may carry 'domain' and 'path' entries; presumably these come from
        # ';'-separated attributes in the configured value -- verify against
        # parse_dict_cookies.
        for ck, cv in self.config.get('cookies', {}).items():
            raw_cookie = f'{ck}={cv}'
            cookie_dict = parse_dict_cookies(raw_cookie)
            if not cookie_dict.get(ck):
                # Skip cookies whose parsed value is missing or empty.
                continue
            cookie = create_cookie(ck,
                                   cookie_dict[ck],
                                   domain=cookie_dict.get(
                                       'domain', '.archive.org'),
                                   path=cookie_dict.get('path', '/'))
            self.cookies.set_cookie(cookie)

        self.secure: bool = self.config.get('general', {}).get('secure', True)
        self.host: str = self.config.get('general',
                                         {}).get('host', 'archive.org')
        if 'archive.org' not in self.host:
            # A host without 'archive.org' in it gets the domain appended.
            self.host += '.archive.org'
        self.protocol = 'https:' if self.secure else 'http:'

        # Only the part of the cookie before the first ';' is kept, then
        # URL-unquoted. Stays None when the cookie is not configured.
        user_email = self.config.get('cookies', {}).get('logged-in-user')
        if user_email:
            user_email = user_email.split(';')[0]
            user_email = unquote(user_email)
        # These three come from dict.get() without a default, so each one is
        # None when the corresponding config entry is absent.
        self.user_email: str | None = user_email
        self.access_key: str | None = self.config.get('s3', {}).get('access')
        self.secret_key: str | None = self.config.get('s3', {}).get('secret')
        # Already normalised to a mapping at the top of the method.
        self.http_adapter_kwargs: MutableMapping = http_adapter_kwargs

        self.headers = default_headers()
        self.headers.update({'User-Agent': self._get_user_agent_string()})
        self.headers.update({'Connection': 'close'})

        self.mount_http_adapter()

        # File logging is only set up when the config names a level.
        logging_config = self.config.get('logging', {})
        if logging_config.get('level'):
            self.set_file_logger(
                logging_config.get('level', 'NOTSET'),
                logging_config.get('file', 'internetarchive.log'))
            # 10 is the numeric value of logging.DEBUG.
            if debug or (logger.level <= 10):
                self.set_file_logger(
                    logging_config.get('level', 'NOTSET'),
                    logging_config.get('file', 'internetarchive.log'),
                    'urllib3')
Пример #7
0
such a way that py.test does not automatically run it.
"""

import internetarchive as ia
import datetime
import hashlib
import os
import StringIO
import string
import tempfile
import time

import internetarchive.config as ic


# Read the 'access_key' / 'secret_key' entries of the 's3' config section;
# each is None when the entry (or the whole section) is missing.
s3_conf = ic.get_config().get('s3', {})
access, secret = s3_conf.get('access_key'), s3_conf.get('secret_key')


# get_new_item()
#_________________________________________________________________________________________
def get_new_item():
    """Return an ``ia.Item`` for an identifier that does not exist yet.

    The identifier is ``test_upload_iawrapper_`` followed by the current UTC
    time at one-second resolution. Up to five identifiers are tried, pausing
    one second before each retry so every attempt gets a fresh timestamp.

    :raises KeyError: if every candidate identifier already exists.
    """
    max_tries = 5
    for attempt in range(max_tries):
        if attempt:
            # The name only changes once per second; wait for a new one.
            time.sleep(1)
        now = datetime.datetime.utcnow()
        item_name = 'test_upload_iawrapper_' + now.strftime('%Y_%m_%d_%H%M%S')
        item = ia.Item(item_name)
        if item.exists is False:
            return item

    # Call form of raise: valid on both Python 2 and Python 3.
    raise KeyError('Could not find a unique item name after %d tries' % max_tries)
Пример #8
0
This test script creates a new archive.org item, and therefore is named in
such a way that py.test does not automatically run it.
"""

import internetarchive as ia
import datetime
import hashlib
import os
import StringIO
import string
import tempfile
import time

import internetarchive.config as ic

# Read the 'access_key' / 'secret_key' entries of the 's3' config section;
# each is None when the entry (or the whole section) is missing.
s3_conf = ic.get_config().get('s3', {})
access, secret = s3_conf.get('access_key'), s3_conf.get('secret_key')


# get_new_item()
#_________________________________________________________________________________________
def get_new_item():
    """Return an ``ia.Item`` for an identifier that does not exist yet.

    The identifier is ``test_upload_iawrapper_`` followed by the current UTC
    time at one-second resolution. Up to five identifiers are tried, pausing
    one second before each retry so every attempt gets a fresh timestamp.

    :raises KeyError: if every candidate identifier already exists.
    """
    max_tries = 5
    for attempt in range(max_tries):
        if attempt:
            # The name only changes once per second; wait for a new one.
            time.sleep(1)
        now = datetime.datetime.utcnow()
        item_name = 'test_upload_iawrapper_' + now.strftime('%Y_%m_%d_%H%M%S')
        item = ia.Item(item_name)
        if item.exists is False:
            return item

    # Call form of raise: valid on both Python 2 and Python 3.
    raise KeyError('Could not find a unique item name after %d tries' % max_tries)
Пример #9
0
This test script creates a new archive.org item, and therefore is named in
such a way that py.test does not automatically run it.
"""

import internetarchive as ia
import datetime
import hashlib
import os
import StringIO
import tempfile
import time

import internetarchive.config as ic


# Read the "access_key" / "secret_key" entries of the "s3" config section;
# each is None when the entry (or the whole section) is missing.
s3_conf = ic.get_config().get("s3", {})
access, secret = s3_conf.get("access_key"), s3_conf.get("secret_key")


# get_new_item()
# _________________________________________________________________________________________
def get_new_item():
    """Return an ``ia.Item`` for an identifier that does not exist yet.

    The identifier is ``test_upload_iawrapper_`` followed by the current UTC
    time at one-second resolution. Up to five identifiers are tried, pausing
    one second before each retry so every attempt gets a fresh timestamp.

    :raises KeyError: if every candidate identifier already exists.
    """
    max_tries = 5
    for attempt in range(max_tries):
        if attempt:
            # The name only changes once per second; wait for a new one.
            time.sleep(1)
        now = datetime.datetime.utcnow()
        item_name = "test_upload_iawrapper_" + now.strftime("%Y_%m_%d_%H%M%S")
        item = ia.Item(item_name)
        if item.exists is False:
            return item

    # Call form of raise: valid on both Python 2 and Python 3.
    raise KeyError("Could not find a unique item name after %d tries" % max_tries)
Пример #10
0
from bs4 import BeautifulSoup
from internetarchive.config import get_config

from bgp.modules.terms import (FulltextProcessor, IsbnExtractorModule,
                               NGramProcessor, ReadingLevelModule,
                               UrlExtractorModule, WordFreqModule,
                               CopyrightPageDetectorModule, PageTypeProcessor)
from bgp.utils import STOP_WORDS
from subprocess import PIPE, Popen, STDOUT

# Configure the root logger at import time: records at INFO and above are
# written to 'obgp_errors.log' with a timestamped format.
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                    level=logging.INFO,
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename='obgp_errors.log')

# The 's3' section of the internetarchive config; None when it is absent.
s3_keys = get_config().get('s3')

# NOTE(review): the s3 section itself is passed as get_session's first
# argument -- confirm that is the shape get_session expects.
IA = ia.get_session(s3_keys)
# Rebind the module-level ia.get_item to this session's bound method, so that
# later ia.get_item(...) calls go through IA.
ia.get_item = IA.get_item

# Fetch the Punkt tokenizer data on first use if nltk cannot locate it.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print('Downloading Punkt Tokenizer...')
    nltk.download('punkt')


def _memoize_xml(self):
    if not hasattr(self, '_xml'):
        _memoize_xml_tic = time.perf_counter()
        try: