def __init__(self, config=None, config_file=None, debug=None, http_adapter_kwargs=None):
    """Set up a new :class:`ArchiveSession <ArchiveSession>` from configuration.

    :type config: dict
    :param config: (optional) Config dict passed to ``get_config`` to build
                   ``self.config``.

    :type config_file: str
    :param config_file: (optional) Path to a config file; passed to
                        ``get_config`` and stored on ``self.config_file``.

    :type debug: bool
    :param debug: (optional) When truthy, a file logger is also requested
                  for ``urllib3``.

    :type http_adapter_kwargs: dict
    :param http_adapter_kwargs: (optional) Keyword arguments used to
                                initialize the
                                :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                object.

    :returns: :class:`ArchiveSession` object.
    """
    super(ArchiveSession, self).__init__()
    adapter_options = http_adapter_kwargs or {}
    verbose = bool(debug)

    self.config = get_config(config, config_file)
    self.config_file = config_file

    # Cookies from the config seed the session's cookie jar.
    cookie_settings = self.config.get('cookies', {})
    self.cookies.update(cookie_settings)

    general_settings = self.config.get('general', {})
    self.secure = general_settings.get('secure', True)

    # A host that does not mention archive.org gets the domain appended.
    host = general_settings.get('host', 'archive.org')
    if 'archive.org' not in host:
        host += '.archive.org'
    self.host = host

    if self.secure:
        self.protocol = 'https:'
    else:
        self.protocol = 'http:'

    # Keep only the part of the 'logged-in-user' cookie before the first
    # ';' and URL-unquote it.
    logged_in_user = cookie_settings.get('logged-in-user')
    if logged_in_user:
        logged_in_user = unquote(logged_in_user.split(';')[0])
    self.user_email = logged_in_user

    s3_settings = self.config.get('s3', {})
    self.access_key = s3_settings.get('access')
    self.secret_key = s3_settings.get('secret')

    self.http_adapter_kwargs = adapter_options

    self.headers = default_headers()
    self.headers.update({
        'User-Agent': self._get_user_agent_string(),
        'Connection': 'close',
    })

    self.mount_http_adapter()

    log_settings = self.config.get('logging', {})
    if log_settings.get('level'):
        log_level = log_settings.get('level', 'NOTSET')
        log_file = log_settings.get('file', 'internetarchive.log')
        self.set_file_logger(log_level, log_file)
        # NOTE(review): this check is assumed to sit under the level check
        # above -- confirm against the properly indented original.
        if verbose or (logger.level <= 10):
            self.set_file_logger(log_level, log_file, 'urllib3')
def __init__(self, config=None, config_file=None, debug=None, http_adapter_kwargs=None):
    """Set up a new :class:`ArchiveSession <ArchiveSession>` from configuration.

    :type config: dict
    :param config: (optional) Config dict passed to ``get_config`` to build
                   ``self.config``.

    :type config_file: str
    :param config_file: (optional) Path to a config file, passed to
                        ``get_config``.

    :type debug: bool
    :param debug: (optional) When truthy, a file logger is also requested
                  for ``requests.packages.urllib3``.

    :type http_adapter_kwargs: dict
    :param http_adapter_kwargs: (optional) Keyword arguments used to
                                initialize the
                                :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                object.

    :returns: :class:`ArchiveSession` object.
    """
    super(ArchiveSession, self).__init__()
    adapter_options = http_adapter_kwargs or {}
    verbose = bool(debug)

    self.config = get_config(config, config_file)
    self.cookies.update(self.config.get('cookies', {}))

    # Before Python 2.7.9 the default is insecure (plain HTTP) to avoid
    # InsecurePlatformWarning errors; an explicit 'secure' setting wins.
    secure_by_default = sys.version_info >= (2, 7, 9)
    self.secure = self.config.get('general', {}).get('secure', secure_by_default)

    if self.secure:
        self.protocol = 'https:'
    else:
        self.protocol = 'http:'

    s3_settings = self.config.get('s3', {})
    self.access_key = s3_settings.get('access')
    self.secret_key = s3_settings.get('secret')

    self.http_adapter_kwargs = adapter_options

    self.headers = default_headers()
    self.headers.update({'User-Agent': self._get_user_agent_string()})

    self._mount_http_adapter()

    log_settings = self.config.get('logging', {})
    if log_settings.get('level'):
        log_level = log_settings.get('level', 'NOTSET')
        log_file = log_settings.get('file', 'internetarchive.log')
        self.set_file_logger(log_level, log_file)
        # NOTE(review): this check is assumed to sit under the level check
        # above -- confirm against the properly indented original.
        if verbose or (logger.level <= 10):
            self.set_file_logger(log_level, log_file, 'requests.packages.urllib3')
def __init__(self, config=None, config_file=None, debug=None, http_adapter_kwargs=None):
    """Set up a new :class:`ArchiveSession <ArchiveSession>` from configuration.

    :type config: dict
    :param config: (optional) Config dict passed to ``get_config`` to build
                   ``self.config``.

    :type config_file: str
    :param config_file: (optional) Path to a config file; passed to
                        ``get_config`` and stored on ``self.config_file``.

    :type debug: bool
    :param debug: (optional) When truthy, a file logger is also requested
                  for ``urllib3``.

    :type http_adapter_kwargs: dict
    :param http_adapter_kwargs: (optional) Keyword arguments used to
                                initialize the
                                :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                object.

    :returns: :class:`ArchiveSession` object.
    """
    super(ArchiveSession, self).__init__()
    adapter_options = http_adapter_kwargs or {}
    verbose = bool(debug)

    self.config = get_config(config, config_file)
    self.config_file = config_file
    self.cookies.update(self.config.get('cookies', {}))

    self.secure = self.config.get('general', {}).get('secure', True)
    if self.secure:
        self.protocol = 'https:'
    else:
        self.protocol = 'http:'

    s3_settings = self.config.get('s3', {})
    self.access_key = s3_settings.get('access')
    self.secret_key = s3_settings.get('secret')

    self.http_adapter_kwargs = adapter_options

    self.headers = default_headers()
    self.headers.update({
        'User-Agent': self._get_user_agent_string(),
        'Connection': 'close',
    })

    self.mount_http_adapter()

    log_settings = self.config.get('logging', {})
    if log_settings.get('level'):
        log_level = log_settings.get('level', 'NOTSET')
        log_file = log_settings.get('file', 'internetarchive.log')
        self.set_file_logger(log_level, log_file)
        # NOTE(review): this check is assumed to sit under the level check
        # above -- confirm against the properly indented original.
        if verbose or (logger.level <= 10):
            self.set_file_logger(log_level, log_file, 'urllib3')
def __init__(self, config=None, config_file=None, debug=None, http_adapter_kwargs=None):
    """Set up a new :class:`ArchiveSession <ArchiveSession>` from configuration.

    :type config: dict
    :param config: (optional) Config dict passed to ``get_config`` to build
                   ``self.config``.

    :type config_file: str
    :param config_file: (optional) Path to a config file, passed to
                        ``get_config``.

    :type debug: bool
    :param debug: (optional) When truthy, a file logger is also requested
                  for ``requests.packages.urllib3``.

    :type http_adapter_kwargs: dict
    :param http_adapter_kwargs: (optional) Keyword arguments used to
                                initialize the
                                :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                object.

    :returns: :class:`ArchiveSession` object.
    """
    super(ArchiveSession, self).__init__()
    adapter_options = http_adapter_kwargs or {}
    verbose = bool(debug)

    self.config = get_config(config, config_file)
    self.cookies.update(self.config.get("cookies", {}))

    # Before Python 2.7.9 the default is insecure (plain HTTP) to avoid
    # InsecurePlatformWarning errors; an explicit "secure" setting wins.
    secure_by_default = sys.version_info >= (2, 7, 9)
    self.secure = self.config.get("general", {}).get("secure", secure_by_default)

    if self.secure:
        self.protocol = "https:"
    else:
        self.protocol = "http:"

    s3_settings = self.config.get("s3", {})
    self.access_key = s3_settings.get("access")
    self.secret_key = s3_settings.get("secret")

    self.http_adapter_kwargs = adapter_options

    self.headers = default_headers()
    self.headers.update({"User-Agent": self._get_user_agent_string()})

    self._mount_http_adapter()

    log_settings = self.config.get("logging", {})
    if log_settings.get("level"):
        log_level = log_settings.get("level", "NOTSET")
        log_file = log_settings.get("file", "internetarchive.log")
        self.set_file_logger(log_level, log_file)
        # NOTE(review): this check is assumed to sit under the level check
        # above -- confirm against the properly indented original.
        if verbose or (logger.level <= 10):
            self.set_file_logger(log_level, log_file, "requests.packages.urllib3")
def __init__(self, config=None, config_file=None, debug=None, http_adapter_kwargs=None):
    """Set up a new :class:`ArchiveSession <ArchiveSession>` from configuration.

    :type config: dict
    :param config: (optional) Config dict passed to ``get_config`` to build
                   ``self.config``.

    :type config_file: str
    :param config_file: (optional) Path to a config file, passed to
                        ``get_config``.

    :type debug: bool
    :param debug: (optional) When truthy, a file logger is also requested
                  for ``requests.packages.urllib3``.

    :type http_adapter_kwargs: dict
    :param http_adapter_kwargs: (optional) Keyword arguments used to
                                initialize the
                                :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                object.
    """
    super(ArchiveSession, self).__init__()
    adapter_options = http_adapter_kwargs or {}
    verbose = bool(debug)

    self.config = get_config(config, config_file)
    self.cookies.update(self.config.get('cookies', {}))

    # Before Python 2.7.9 the default is insecure (plain HTTP) to avoid
    # InsecurePlatformWarning errors. Here 'secure' is read from the top
    # level of the config.
    secure_by_default = sys.version_info >= (2, 7, 9)
    self.secure = self.config.get('secure', secure_by_default)

    if self.secure:
        self.protocol = 'https:'
    else:
        self.protocol = 'http:'

    s3_settings = self.config.get('s3', {})
    self.access_key = s3_settings.get('access')
    self.secret_key = s3_settings.get('secret')

    self.http_adapter_kwargs = adapter_options

    self.headers = default_headers()
    self.headers.update({'User-Agent': self._get_user_agent_string()})

    self._mount_http_adapter()

    log_settings = self.config.get('logging', {})
    if log_settings.get('level'):
        log_level = log_settings.get('level', 'NOTSET')
        log_file = log_settings.get('file', 'internetarchive.log')
        self.set_file_logger(log_level, log_file)
        # NOTE(review): this check is assumed to sit under the level check
        # above -- confirm against the properly indented original.
        if verbose:
            self.set_file_logger(log_level, log_file, 'requests.packages.urllib3')
def __init__(self,
             config: Mapping | None = None,
             config_file: str = "",
             debug: bool = False,
             http_adapter_kwargs: MutableMapping | None = None):
    """Initialize :class:`ArchiveSession <ArchiveSession>` object with config.

    :param config: A config dict used for initializing the
                   :class:`ArchiveSession <ArchiveSession>` object.

    :param config_file: Path to config file used for initializing the
                        :class:`ArchiveSession <ArchiveSession>` object.
                        Also stored on ``self.config_file``.

    :param debug: When truthy, a file logger is also requested for
                  ``urllib3``.

    :param http_adapter_kwargs: Keyword arguments used to initialize the
                                :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                object.

    :returns: :class:`ArchiveSession` object.
    """
    super().__init__()
    http_adapter_kwargs = http_adapter_kwargs or {}
    debug = bool(debug)

    self.config = get_config(config, config_file)
    self.config_file = config_file

    # Load each configured cookie into the session's cookie jar. The
    # parsed cookie may supply 'domain' and 'path'; otherwise they default
    # to '.archive.org' and '/'. Cookies with no value are skipped.
    for ck, cv in self.config.get('cookies', {}).items():
        raw_cookie = f'{ck}={cv}'
        cookie_dict = parse_dict_cookies(raw_cookie)
        if not cookie_dict.get(ck):
            continue
        cookie = create_cookie(ck, cookie_dict[ck],
                               domain=cookie_dict.get('domain', '.archive.org'),
                               path=cookie_dict.get('path', '/'))
        self.cookies.set_cookie(cookie)

    self.secure: bool = self.config.get('general', {}).get('secure', True)
    self.host: str = self.config.get('general', {}).get('host', 'archive.org')
    # A host that does not mention archive.org gets the domain appended.
    if 'archive.org' not in self.host:
        self.host += '.archive.org'
    self.protocol = 'https:' if self.secure else 'http:'

    # Keep only the part of the 'logged-in-user' cookie before the first
    # ';' and URL-unquote it.
    user_email = self.config.get('cookies', {}).get('logged-in-user')
    if user_email:
        user_email = user_email.split(';')[0]
        user_email = unquote(user_email)
    # These three come from ``.get()`` without a default, so each may be
    # None when the corresponding config entry is absent.
    self.user_email: str | None = user_email
    self.access_key: str | None = self.config.get('s3', {}).get('access')
    self.secret_key: str | None = self.config.get('s3', {}).get('secret')

    # Already normalised to a mapping at the top of this method.
    self.http_adapter_kwargs: MutableMapping = http_adapter_kwargs

    self.headers = default_headers()
    self.headers.update({'User-Agent': self._get_user_agent_string()})
    self.headers.update({'Connection': 'close'})

    self.mount_http_adapter()

    logging_config = self.config.get('logging', {})
    if logging_config.get('level'):
        self.set_file_logger(
            logging_config.get('level', 'NOTSET'),
            logging_config.get('file', 'internetarchive.log'))
        # NOTE(review): this check is assumed to sit under the level check
        # above -- confirm against the properly indented original.
        if debug or (logger.level <= 10):
            self.set_file_logger(
                logging_config.get('level', 'NOTSET'),
                logging_config.get('file', 'internetarchive.log'),
                'urllib3')
such a way that py.test does not automatically run it. """ import internetarchive as ia import datetime import hashlib import os import StringIO import string import tempfile import time import internetarchive.config as ic s3_conf = ic.get_config().get('s3', {}) access = s3_conf.get('access_key') secret = s3_conf.get('secret_key') # get_new_item() #_________________________________________________________________________________________ def get_new_item(): """return an ia item object for an item that does not yet exist""" now = datetime.datetime.utcnow() item_name = 'test_upload_iawrapper_' + now.strftime('%Y_%m_%d_%H%M%S') item = ia.Item(item_name) if item.exists is False: return item raise KeyError, 'Could not find a unique item name after 5 tries'
This test script creates a new archive.org item, and therefore is named in such a way that py.test does not automatically run it. """ import internetarchive as ia import datetime import hashlib import os import StringIO import string import tempfile import time import internetarchive.config as ic s3_conf = ic.get_config().get('s3', {}) access = s3_conf.get('access_key') secret = s3_conf.get('secret_key') # get_new_item() #_________________________________________________________________________________________ def get_new_item(): """return an ia item object for an item that does not yet exist""" now = datetime.datetime.utcnow() item_name = 'test_upload_iawrapper_' + now.strftime('%Y_%m_%d_%H%M%S') item = ia.Item(item_name) if item.exists is False: return item raise KeyError, 'Could not find a unique item name after 5 tries'
This test script creates a new archive.org item, and therefore is named in such a way that py.test does not automatically run it. """ import internetarchive as ia import datetime import hashlib import os import StringIO import tempfile import time import internetarchive.config as ic s3_conf = ic.get_config().get("s3", {}) access = s3_conf.get("access_key") secret = s3_conf.get("secret_key") # get_new_item() # _________________________________________________________________________________________ def get_new_item(): """return an ia item object for an item that does not yet exist""" now = datetime.datetime.utcnow() item_name = "test_upload_iawrapper_" + now.strftime("%Y_%m_%d_%H%M%S") item = ia.Item(item_name) if item.exists is False: return item raise KeyError, "Could not find a unique item name after 5 tries"
from bs4 import BeautifulSoup from internetarchive.config import get_config from bgp.modules.terms import (FulltextProcessor, IsbnExtractorModule, NGramProcessor, ReadingLevelModule, UrlExtractorModule, WordFreqModule, CopyrightPageDetectorModule, PageTypeProcessor) from bgp.utils import STOP_WORDS from subprocess import PIPE, Popen, STDOUT logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S', filename='obgp_errors.log') s3_keys = get_config().get('s3') IA = ia.get_session(s3_keys) ia.get_item = IA.get_item try: nltk.data.find('tokenizers/punkt') except LookupError: print('Downloading Punkt Tokenizer...') nltk.download('punkt') def _memoize_xml(self): if not hasattr(self, '_xml'): _memoize_xml_tic = time.perf_counter() try: