Exemplo n.º 1
 def test_repr(self):
     A = namedtuple('A', 'x')
     self.assertEqual(repr(A(1)), 'A(x=1)')
     # repr should show the name of the subclass
     class B(A):
     self.assertEqual(repr(B(1)), 'B(x=1)')
Exemplo n.º 2
 def test_name_conflicts(self):
     # Some names like "self", "cls", "tuple", "itemgetter", and "property"
     # failed when used as field names.  Test to make sure these now work.
     T = namedtuple('T', 'itemgetter property self cls tuple')
     t = T(1, 2, 3, 4, 5)
     self.assertEqual(t, (1,2,3,4,5))
     newt = t._replace(itemgetter=10, property=20, self=30, cls=40, tuple=50)
     self.assertEqual(newt, (10,20,30,40,50))
Exemplo n.º 3
    def test_factory(self):
        Point = namedtuple('Point', 'x y')
        self.assertEqual(Point.__name__, 'Point')
        self.assertEqual(Point.__slots__, ())
        self.assertEqual(Point.__module__, __name__)
        self.assertEqual(Point.__getitem__, tuple.__getitem__)
        self.assertEqual(Point._fields, ('x', 'y'))

        self.assertRaises(ValueError, namedtuple, 'abc%', 'efg ghi')       # type has non-alpha char
        self.assertRaises(ValueError, namedtuple, 'class', 'efg ghi')      # type has keyword
        self.assertRaises(ValueError, namedtuple, '9abc', 'efg ghi')       # type starts with digit

        self.assertRaises(ValueError, namedtuple, 'abc', 'efg g%hi')       # field with non-alpha char
        self.assertRaises(ValueError, namedtuple, 'abc', 'abc class')      # field has keyword
        self.assertRaises(ValueError, namedtuple, 'abc', '8efg 9ghi')      # field starts with digit
        self.assertRaises(ValueError, namedtuple, 'abc', '_efg ghi')       # field with leading underscore
        self.assertRaises(ValueError, namedtuple, 'abc', 'efg efg ghi')    # duplicate field

        namedtuple('Point0', 'x1 y2')   # Verify that numbers are allowed in names
        namedtuple('_', 'a b c')        # Test leading underscores in a typename

        nt = namedtuple('nt', 'the quick brown fox')                       # check unicode input
        self.assertNotIn("u'", repr(nt._fields))
        nt = namedtuple('nt', ('the', 'quick'))                           # check unicode input
        self.assertNotIn("u'", repr(nt._fields))

        self.assertRaises(TypeError, Point._make, [11])                     # catch too few args
        self.assertRaises(TypeError, Point._make, [11, 22, 33])             # catch too many args
Exemplo n.º 4
 def test_name_fixer(self):
     for spec, renamed in [
         [('efg', 'g%hi'),  ('efg', '_1')],                              # field with non-alpha char
         [('abc', 'class'), ('abc', '_1')],                              # field has keyword
         [('8efg', '9ghi'), ('_0', '_1')],                               # field starts with digit
         [('abc', '_efg'), ('abc', '_1')],                               # field with leading underscore
         [('abc', 'efg', 'efg', 'ghi'), ('abc', 'efg', '_2', 'ghi')],    # duplicate field
         [('abc', '', 'x'), ('abc', '_1', 'x')],                         # fieldname is a space
         self.assertEqual(namedtuple('NT', spec, rename=True)._fields, renamed)
Exemplo n.º 5
    def test_odd_sizes(self):
        Zero = namedtuple('Zero', '')
        self.assertEqual(Zero(), ())
        self.assertEqual(Zero._make([]), ())
        self.assertEqual(repr(Zero()), 'Zero()')
        self.assertEqual(Zero()._asdict(), {})
        self.assertEqual(Zero()._fields, ())

        Dot = namedtuple('Dot', 'd')
        self.assertEqual(Dot(1), (1,))
        self.assertEqual(Dot._make([1]), (1,))
        self.assertEqual(Dot(1).d, 1)
        self.assertEqual(repr(Dot(1)), 'Dot(d=1)')
        self.assertEqual(Dot(1)._asdict(), {'d':1})
        self.assertEqual(Dot(1)._replace(d=999), (999,))
        self.assertEqual(Dot(1)._fields, ('d',))

        # n = 5000
        n = 254 # SyntaxError: more than 255 arguments:
        import string, random
        names = list(set(''.join([random.choice(string.ascii_letters)
                                  for j in range(10)]) for i in range(n)))
        n = len(names)
        Big = namedtuple('Big', names)
        b = Big(*range(n))
        self.assertEqual(b, tuple(range(n)))
        self.assertEqual(Big._make(range(n)), tuple(range(n)))
        for pos, name in enumerate(names):
            self.assertEqual(getattr(b, name), pos)
        repr(b)                                 # make sure repr() doesn't blow-up
        d = b._asdict()
        d_expected = dict(zip(names, range(n)))
        self.assertEqual(d, d_expected)
        b2 = b._replace(**dict([(names[1], 999),(names[-5], 42)]))
        b2_expected = list(range(n))
        b2_expected[1] = 999
        b2_expected[-5] = 42
        self.assertEqual(b2, tuple(b2_expected))
        self.assertEqual(b._fields, tuple(names))
Exemplo n.º 6
    def test_instance(self):
        """Check instance construction, repr and the underscore helper methods.

        Covers positional/keyword construction, argument-count errors,
        ``_make``, ``_fields``, ``_replace``, ``_asdict`` and ``vars()``, and
        the alternative ways of spelling the field specification.
        """
        Point = namedtuple('Point', 'x y')
        p = Point(11, 22)
        self.assertEqual(p, Point(x=11, y=22))
        self.assertEqual(p, Point(11, y=22))
        self.assertEqual(p, Point(y=22, x=11))
        self.assertEqual(p, Point(*(11, 22)))
        self.assertEqual(p, Point(**dict(x=11, y=22)))
        self.assertRaises(TypeError, Point, 1)                              # too few args
        self.assertRaises(TypeError, Point, 1, 2, 3)                        # too many args
        self.assertRaises(TypeError, eval, 'Point(XXX=1, y=2)', locals())   # wrong keyword argument
        self.assertRaises(TypeError, eval, 'Point(x=1)', locals())          # missing keyword argument
        self.assertEqual(repr(p), 'Point(x=11, y=22)')
        self.assertNotIn('__weakref__', dir(p))
        self.assertEqual(p, Point._make([11, 22]))                          # test _make classmethod
        self.assertEqual(p._fields, ('x', 'y'))                             # test _fields attribute
        self.assertEqual(p._replace(x=1), (1, 22))                          # test _replace method
        self.assertEqual(p._asdict(), dict(x=11, y=22))                     # test _asdict method
        self.assertEqual(vars(p), p._asdict())                              # verify that vars() works

        # An unknown field name passed to _replace must raise ValueError; the
        # test fails if no exception is raised.
        with self.assertRaises(ValueError):
            p._replace(x=1, error=2)

        # verify that field string can have commas
        Point = namedtuple('Point', 'x, y')
        p = Point(x=11, y=22)
        self.assertEqual(repr(p), 'Point(x=11, y=22)')

        # verify that fieldspec can be a non-string sequence
        Point = namedtuple('Point', ('x', 'y'))
        p = Point(x=11, y=22)
        self.assertEqual(repr(p), 'Point(x=11, y=22)')
Exemplo n.º 7
    def test_tupleness(self):
        Point = namedtuple('Point', 'x y')
        p = Point(11, 22)

        self.assertIsInstance(p, tuple)
        self.assertEqual(p, (11, 22))                                       # matches a real tuple
        self.assertEqual(tuple(p), (11, 22))                                # coercable to a real tuple
        self.assertEqual(list(p), [11, 22])                                 # coercable to a list
        self.assertEqual(max(p), 22)                                        # iterable
        self.assertEqual(max(*p), 22)                                       # star-able
        x, y = p
        self.assertEqual(p, (x, y))                                         # unpacks like a tuple
        self.assertEqual((p[0], p[1]), (11, 22))                            # indexable like a tuple
        self.assertRaises(IndexError, p.__getitem__, 3)

        self.assertEqual(p.x, x)
        self.assertEqual(p.y, y)
        self.assertRaises(AttributeError, eval, 'p.z', locals())
Exemplo n.º 8
from wpull.scraper.util import urljoin_safe
from wpull.url import parse_url_or_log
from wpull.writer import NullWriter

_logger = logging.getLogger(__name__)
_ = gettext.gettext

GLOB_CHARS = frozenset('[]*?')

FTPProcessorFetchParams = namedlist.namedtuple(
        ('remove_listing', True),
        ('glob', True),
        ('preserve_permissions', False),
        ('retr_symlinks', True),

    remove_listing (bool): Remove `.listing` files after fetching.
    glob (bool): Enable URL globbing.
    preserve_permissions (bool): Preserve file permissions.
    follow_symlinks (bool): Follow symlinks.

FTPProcessorInstances = namedlist.namedtuple(
Exemplo n.º 9
import namedlist
import asyncio

from wpull.driver.process import Process
import wpull.util

_logger = logging.getLogger(__name__)

PhantomJSDriverParams = namedlist.namedtuple('PhantomJSDriverParamsType', [
    ('snapshot_paths', []),
    ('wait_time', 1),
    ('num_scrolls', 10),
    ('smart_scroll', True),
    ('snapshot', True),
    ('viewport_size', (1200, 1920)),
    ('paper_size', (2400, 3840)),
    ('event_log_filename', None),
    ('action_log_filename', None),
    ('custom_headers', {}),
    ('page_settings', {}),
'''PhantomJS Driver parameters

    url (str): URL of page to fetch.
    snapshot_type (list): List of filenames. Accepted extensions are html,
        pdf, png, gif.
    wait_time (float): Time between page scrolls.
    num_scrolls (int): Maximum number of scrolls.
    smart_scroll (bool): Whether to stop scrolling if number of
Exemplo n.º 10
from wpull.protocol.ftp.util import FTPServerError
from wpull.scraper.util import urljoin_safe
from wpull.url import parse_url_or_log, URLInfo
from wpull.writer import NullWriter, BaseFileWriter

_logger = StyleAdapter(logging.getLogger(__name__))
_ = gettext.gettext

GLOB_CHARS = frozenset('[]*?')

FTPProcessorFetchParams = namedlist.namedtuple(
        ('remove_listing', True),
        ('glob', True),
        ('preserve_permissions', False),
        ('retr_symlinks', True),

    remove_listing (bool): Remove `.listing` files after fetching.
    glob (bool): Enable URL globbing.
    preserve_permissions (bool): Preserve file permissions.
    follow_symlinks (bool): Follow symlinks.

class HookPreResponseBreak(ProtocolError):
Exemplo n.º 11
        for session in self._sessions:

    def __exit__(self, *args):
        for context in self._contexts:

WARCRecorderParams = namedlist.namedtuple(
        ('compress', True),
        ('extra_fields', None),
        ('temp_dir', None),
        ('log', True),
        ('appending', False),
        ('digests', True),
        ('cdx', None),
        ('max_size', None),
        ('move_to', None),
        ('url_table', None),
        ('software_string', None)
''':class:`WARCRecorder` parameters.

    compress (bool): If True, files will be compressed with gzip
    extra_fields (list): A list of key-value pairs containing extra
        metadata fields
    temp_dir (str): Directory to use for temporary files
    log (bool): Include the program logging messages in the WARC file
Exemplo n.º 12
from trollius import From, Return

from wpull.backport.logging import BraceMessage as __
from wpull.document.html import HTMLReader
from wpull.body import Body
from wpull.driver.phantomjs import PhantomJSDriverParams
from wpull.namevalue import NameValueRecord
from wpull.warc import WARCRecord
import wpull.url

PhantomJSParams = namedlist.namedtuple('PhantomJSParamsType', [
    ('snapshot_types', ('html', 'pdf')),
    ('wait_time', 1),
    ('num_scrolls', 10),
    ('smart_scroll', True),
    ('snapshot', True),
    ('viewport_size', (1200, 1920)),
    ('paper_size', (2400, 3840)),
    ('load_time', 900),
    ('custom_headers', {}),
    ('page_settings', {}),
'''PhantomJS parameters

    snapshot_type (list): File types. Accepted are html, pdf, png, gif.
    wait_time (float): Time between page scrolls.
    num_scrolls (int): Maximum number of scrolls.
    smart_scroll (bool): Whether to stop scrolling if number of
        requests & responses do not change.
    snapshot (bool): Whether to take snapshot files.
    viewport_size (tuple): Width and height of the page viewport.
Exemplo n.º 13
        It must call one of :meth:`.engine.URLItem.set_status` or

    def close(self):
        '''Run any clean up actions.'''

WebProcessorFetchParams = namedlist.namedtuple(
        ('retry_connrefused', False),
        ('retry_dns_error', False),
        ('post_data', None),
        ('strong_redirects', True),
        ('content_on_error', False),

    retry_connrefused: If True, don't consider a connection refused error
        to be a permanent error.
    retry_dns_error: If True, don't consider a DNS resolution error to be
        permanent error.
    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.
Exemplo n.º 14
 def __init__(self):
     """Attach a ``Result`` namedtuple class (type, text, result) to the instance."""
     result_fields = ['type', 'text', 'result']
     self.Result = namedtuple('Result', result_fields)
Exemplo n.º 15
 def __init__(self, field_names, default=NO_DEFAULT, filters=None):
     """Create the argument tuple type and record its fields and filters.

     Args:
         field_names: Field specification forwarded to ``namedtuple``.
         default: Default value forwarded to ``namedtuple``.
         filters: Optional filters; when truthy they are passed through
             ``_validate_filters`` together with the field names,
             otherwise an empty dict is stored.
     """
     self.args_tuple = namedtuple('_ArgsTuple', field_names, default)
     self.fields = self.args_tuple._fields
     if filters:
         self.filters = _validate_filters(self.fields, filters)
     else:
         self.filters = {}
Exemplo n.º 16
 def __init__(self, field_names, default=NO_DEFAULT, filters=None):
     """Create the argument tuple type and record its fields and filters.

     Args:
         field_names: Field specification forwarded to ``namedtuple``.
         default: Default value forwarded to ``namedtuple``.
         filters: Optional filters; when truthy they are passed through
             ``_validate_filters`` together with the field names,
             otherwise an empty dict is stored.
     """
     self.args_tuple = namedtuple('_ArgsTuple', field_names, default)
     self.fields = self.args_tuple._fields
     if filters:
         self.filters = _validate_filters(self.fields, filters)
     else:
         self.filters = {}
Exemplo n.º 17
    def __init__(self, *args, **kwargs):
        """Initialise the mapping; an optional ``size_limit`` keyword caps its length."""
        # size_limit is removed from kwargs so it is not stored as an item.
        self.size_limit = kwargs.pop("size_limit", None)
        OrderedDict.__init__(self, *args, **kwargs)
        self._check_size_limit()

    def __setitem__(self, key, value):
        """Store ``key`` and then trim the mapping back to ``size_limit``."""
        OrderedDict.__setitem__(self, key, value)
        self._check_size_limit()

    def _check_size_limit(self):
        """Drop entries until the mapping fits ``size_limit`` (no-op when None)."""
        if self.size_limit is not None:
            while len(self) > self.size_limit:
                # NOTE(review): oldest-first eviction is assumed here; the
                # original loop body was missing — confirm the intended order.
                self.popitem(last=False)

EpicComment = namedlist.namedtuple('EpicComment', [('id', ''),
                                                   ('submission', '')])

class EpicValidator(CommentValidator):
    __slots__ = ['_sticky_store', '_comment_store']

    def __init__(self, reddit):
        self._sticky_store = LimitedSizeDict(size_limit=20)
        self._comment_store = deque(maxlen=200)

    def validate(self, comment: Comment) -> Tuple[Action, Rule]:
        css_class = comment.author_flair_css_class
        if not self.has_comment(comment) and css_class and css_class.lower(
        ) in self.config['general']['class']:
Exemplo n.º 18
_ = gettext.gettext
_logger = logging.getLogger(__name__)

'''Default buffer size in bytes.'''
    itertools.chain(range(100, 200),
                    [http.client.NO_CONTENT, http.client.NOT_MODIFIED]))
'''Status codes where a response body is prohibited.'''

ConnectionParams = namedlist.namedtuple('ConnectionParamsType', [
    ('bind_address', None),
    ('keep_alive', True),
    ('ssl_options', None),
    ('connect_timeout', None),
    ('read_timeout', None),
    ('buffer_size', DEFAULT_BUFFER_SIZE),
    ('no_content_codes', DEFAULT_NO_CONTENT_CODES),
    ('ignore_length', False),
'''Parameters for connections.

    bind_address: The IP address to bind the socket. Must match
        :meth:`socket.SocketType.bind`. Use this if your local host has
        multiple IP addresses.
    keep_alive (bool): If True, use HTTP keep-alive.
    ssl_options: A ``dict`` containing options for :func:`ssl.wrap_socket`
    connect_timeout (float): If given, the time in seconds before the
        connection is timed out during connection. Otherwise, depend on the
        underlying libraries for timeout.
Exemplo n.º 19
from wpull.driver.process import Process
import wpull.util

_logger = logging.getLogger(__name__)

PhantomJSDriverParams = namedlist.namedtuple(
    'PhantomJSDriverParamsType', [
        ('snapshot_paths', []),
        ('wait_time', 1),
        ('num_scrolls', 10),
        ('smart_scroll', True),
        ('snapshot', True),
        ('viewport_size', (1200, 1920)),
        ('paper_size', (2400, 3840)),
        ('event_log_filename', None),
        ('action_log_filename', None),
        ('custom_headers', {}),
        ('page_settings', {}),
'''PhantomJS Driver parameters

    url (str): URL of page to fetch.
    snapshot_type (list): List of filenames. Accepted extensions are html,
        pdf, png, gif.
    wait_time (float): Time between page scrolls.
    num_scrolls (int): Maximum number of scrolls.
Exemplo n.º 20

NormalizationParams = namedlist.namedtuple(
        ('sort_query', False),
        ('always_delim_query', False)
'''Parameters for URL normalization.

    sort_query (bool): Whether to sort the query string items.
    always_delim_query: Whether to always deliminate the key-value items where
        value is empty.

class URLInfo(_URLInfoType):
    '''A named tuple containing the parts of the URL.
Exemplo n.º 21
 def test_factory_doc_attr(self):
     Point = namedtuple('Point', 'x y')
     self.assertEqual(Point.__doc__, 'Point(x, y)')
Exemplo n.º 22
        self.assertEqual(Point.__doc__, "Point(dx=FACTORY({0}), dy=FACTORY({0}), dz=11.0)".format(list_repr))

        Point = namedlist('Point', ['dx', 'dy', ('dz', FACTORY(11.0))], default=[])
        self.assertEqual(Point.__doc__, 'Point(dx=[], dy=[], dz=FACTORY(11.0))')

    def test_slice(self):
        """Slicing a namedlist instance must match slicing the equivalent list."""
        values = [3, 5, -12, 'red']
        Point = namedlist('Point', 'x y z color')
        p = Point(*values)
        # Same three windows as before: [0:-1], [:3] and [4:1:-1].
        for window in (slice(0, -1), slice(None, 3), slice(4, 1, -1)):
            self.assertEqual(values[window], p[window])

TestNT = namedtuple('TestNT', 'x y z')    # type used for pickle tests

class TestNamedTuple(unittest.TestCase):

    def test_unicode_identifiers(self):
        Point = namedtuple(u'Point', u'x y')
        p = Point(10, 20)
        self.assertEqual((p.x, p.y), (10, 20))
        self.assertEqual(p._asdict(), {'x':10, 'y':20})

    def test_factory(self):
        Point = namedtuple('Point', 'x y')
        self.assertEqual(Point.__name__, 'Point')
        self.assertEqual(Point.__slots__, ())
        self.assertEqual(Point.__module__, __name__)
        self.assertEqual(Point.__getitem__, tuple.__getitem__)
Exemplo n.º 23
 def test_unicode_identifiers(self):
     Point = namedtuple(u'Point', u'x y')
     p = Point(10, 20)
     self.assertEqual((p.x, p.y), (10, 20))
     self.assertEqual(p._asdict(), {'x':10, 'y':20})
Exemplo n.º 24
"""Listing parser."""
import re

import itertools
import namedlist

from wpull.protocol.ftp.ls.date import parse_datetime
import wpull.protocol.ftp.ls.date

FileEntry = namedlist.namedtuple(
    "FileEntryType", ["name", ("type", None), ("size", None), ("date", None), ("dest", None), ("perm", None)]
)
"""A row in a listing.

Attributes:
    name (str): Filename.
    type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None``
    size (int, None): Size of file.
    date (:class:`datetime.datetime`, None): A datetime object in UTC.
    dest (str, None): Destination filename for symlinks.
    perm (int, None): Unix permissions expressed as an integer.
"""

class ListingError(ValueError):
    """Error during parsing a listing."""

class UnknownListingError(ListingError):
    """Failed to determine type of listing."""
Exemplo n.º 25
'''Base classes'''
import abc
import collections
import io
import namedlist

from wpull.document.base import BaseTextStreamReader, \
    BaseHTMLReader, BaseExtractiveReader
from wpull.scraper.util import urljoin_safe

LinkContext = namedlist.namedtuple('LinkContextType', [
    'link', ('inline', False), ('linked', False), ('link_type', None),
    ('extra', None)
])
'''A named tuple describing a scraped link.

Attributes:
    link (str): The link that was scraped.
    inline (bool): Whether the link is an embedded object.
    linked (bool): Whether the link links to another page.
    link_type: A value from :class:`.item.LinkType`.
    extra: Any extra info.
'''

class ScrapeResult(dict):
    '''Links scraped from a document.

    This class is subclassed from ``dict`` and contains convenience methods.
    def __init__(self, link_contexts, encoding):
Exemplo n.º 26
from wpull.processor.base import BaseProcessor, BaseProcessorSession, \
from wpull.processor.rule import FetchRule, ResultRule, ProcessingRule
from wpull.url import URLInfo
from wpull.writer import BaseFileWriter
import wpull.string
import wpull.util

_logger = StyleAdapter(logging.getLogger(__name__))
_ = gettext.gettext

WebProcessorFetchParams = namedlist.namedtuple(
        ('post_data', None),
        ('strong_redirects', True),
        ('content_on_error', False),

    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.

class HookPreResponseBreak(ProtocolError):
    '''Hook pre-response break.'''
Exemplo n.º 27
from wpull.processor.base import BaseProcessor, BaseProcessorSession, \
from wpull.processor.rule import FetchRule, ResultRule, ProcessingRule
from wpull.url import URLInfo
from wpull.writer import BaseFileWriter
import wpull.string
import wpull.util

_logger = StyleAdapter(logging.getLogger(__name__))
_ = gettext.gettext

WebProcessorFetchParams = namedlist.namedtuple(
        ('post_data', None),
        ('strong_redirects', True),
        ('content_on_error', False),

    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.

class HookPreResponseBreak(ProtocolError):
    '''Hook pre-response break.'''
Exemplo n.º 28
'''Default buffer size in bytes.'''
DEFAULT_NO_CONTENT_CODES = frozenset(itertools.chain(
    range(100, 200),
    [http.client.NO_CONTENT, http.client.NOT_MODIFIED]
'''Status codes where a response body is prohibited.'''

ConnectionParams = namedlist.namedtuple(
        ('bind_address', None),
        ('keep_alive', True),
        ('ssl_options', None),
        ('connect_timeout', None),
        ('read_timeout', None),
        ('buffer_size', DEFAULT_BUFFER_SIZE),
        ('no_content_codes', DEFAULT_NO_CONTENT_CODES),
        ('ignore_length', False),
'''Parameters for connections.

    bind_address: The IP address to bind the socket. Must match
        :meth:`socket.SocketType.bind`. Use this if your local host has
        multiple IP addresses.
    keep_alive (bool): If True, use HTTP keep-alive.
    ssl_options: A ``dict`` containing options for :func:`ssl.wrap_socket`
    connect_timeout (float): If given, the time in seconds before the
Exemplo n.º 29
'''Listing parser.'''
import re

import namedlist

from wpull.ftp.ls.date import parse_datetime

FileEntry = namedlist.namedtuple(
    'FileEntryType', [
        'name',
        ('type', None),
        ('size', None),
        ('date', None),
        ('dest', None),
        ('perm', None)
    ]
)
'''A row in a listing.

Attributes:
    name (str): Filename.
    type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None``
    size (int, None): Size of file.
    date (:class:`datetime.datetime`, None): A datetime object in UTC.
    dest (str, None): Destination filename for symlinks.
    perm (int, None): Unix permissions expressed as an integer.
'''

class ListingError(ValueError):
Exemplo n.º 30
lang_aliases = dict()
max_upload_size = 5 * 1024 * 1024  # 5 MB

# Describes one volume: its name, the path it is mounted at inside the
# container, and the mount mode (e.g. 'ro').
VolumeInfo = namedtuple('VolumeInfo', 'name container_path mode')

# Additional volumes per kernel image name.
# NOTE(review): presumably keyed by the ``lang`` value given to
# get_extra_volumes() — confirm against that function.
_extra_volumes = {
    'python3-tensorflow': [
        VolumeInfo('deeplearning-samples', '/home/work/samples', 'ro'),
    ],
    'python3-tensorflow-gpu': [
        VolumeInfo('deeplearning-samples', '/home/work/samples', 'ro'),
    ],
}

restarting_kernels = {}
blocking_cleans = {}

async def get_extra_volumes(docker, lang):
    avail_volumes = (await docker.volumes.list())['Volumes']
Exemplo n.º 31
'''Listing parser.'''
import re

import namedlist

from wpull.ftp.ls.date import parse_datetime

FileEntry = namedlist.namedtuple('FileEntryType', [
    'name', ('type', None), ('size', None), ('date', None), ('dest', None),
    ('perm', None)
])
'''A row in a listing.

Attributes:
    name (str): Filename.
    type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None``
    size (int, None): Size of file.
    date (:class:`datetime.datetime`, None): A datetime object in UTC.
    dest (str, None): Destination filename for symlinks.
    perm (int, None): Unix permissions expressed as an integer.
'''

class ListingError(ValueError):
    '''Error during parsing a listing.'''

class UnknownListingError(ListingError):
    '''Failed to determine type of listing.'''

Exemplo n.º 32
        It must call one of :meth:`.engine.URLItem.set_status` or

    def close(self):
        '''Run any clean up actions.'''

WebProcessorFetchParams = namedlist.namedtuple(
        ('retry_connrefused', False),
        ('retry_dns_error', False),
        ('post_data', None),
        ('strong_robots', True),
        ('strong_redirects', True),
        ('content_on_error', False),

    retry_connrefused: If True, don't consider a connection refused error
        to be a permanent error.
    retry_dns_error: If True, don't consider a DNS resolution error to be
        permanent error.
    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
Exemplo n.º 33
from wpull.errors import ProtocolError
from wpull.hook import HookableMixin, Actions
from wpull.http.web import LoopType
from wpull.processor.base import BaseProcessor, BaseProcessorSession, \
from wpull.processor.rule import FetchRule, ResultRule
from wpull.stats import Statistics
from wpull.writer import NullWriter
import wpull.string

_logger = logging.getLogger(__name__)
_ = gettext.gettext

WebProcessorFetchParams = namedlist.namedtuple('WebProcessorFetchParamsType', [
    ('post_data', None),
    ('strong_redirects', True),
    ('content_on_error', False),

    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.

WebProcessorInstances = namedlist.namedtuple('WebProcessorInstancesType', [
    ('fetch_rule', FetchRule()),
    ('result_rule', ResultRule()),
    ('processing_rule', None),
Exemplo n.º 34
from wpull.body import Body
from wpull.driver.phantomjs import PhantomJSDriverParams, PhantomJSDriver
from wpull.namevalue import NameValueRecord
from wpull.pipeline.session import ItemSession
from wpull.processor.rule import ProcessingRule
from wpull.warc.format import WARCRecord
import wpull.url

PhantomJSParams = namedlist.namedtuple(
    'PhantomJSParamsType', [
        ('snapshot_types', ('html', 'pdf')),
        ('wait_time', 1),
        ('num_scrolls', 10),
        ('smart_scroll', True),
        ('snapshot', True),
        ('viewport_size', (1200, 1920)),
        ('paper_size', (2400, 3840)),
        ('load_time', 900),
        ('custom_headers', {}),
        ('page_settings', {}),
'''PhantomJS parameters

    snapshot_type (list): File types. Accepted are html, pdf, png, gif.
    wait_time (float): Time between page scrolls.
    num_scrolls (int): Maximum number of scrolls.
    smart_scroll (bool): Whether to stop scrolling if number of
        requests & responses do not change.
    snapshot (bool): Whether to take snapshot files.
Exemplo n.º 35
from wpull.http.web import LoopType
from wpull.processor.base import BaseProcessor, BaseProcessorSession, \
from wpull.processor.rule import FetchRule, ResultRule
from wpull.stats import Statistics
from wpull.writer import NullWriter
import wpull.string

_logger = logging.getLogger(__name__)
_ = gettext.gettext

WebProcessorFetchParams = namedlist.namedtuple(
        ('post_data', None),
        ('strong_redirects', True),
        ('content_on_error', False),

    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.

WebProcessorInstances = namedlist.namedtuple(
Exemplo n.º 36
from os import path
from namedlist import namedtuple
import yaml

Config = namedtuple('Config', ['components', 'component_aliases', 'templates', 'state_codes'])

def load_config(template_path):
    templates = {}
    components = {}
    component_aliases = {}
    state_codes = {}

    if not path.isdir(template_path):
        raise IOError('Address formatting templates path cannot be found.')

    # Parse components and component aliases
    with open(path.join(template_path, 'components.yaml'), 'r') as ymlfile:
        comps = yaml.safe_load_all(ymlfile)

        for comp in comps:
            if 'aliases' in comp:
                component_aliases.update({alias: comp['name'] for alias in comp['aliases']})

            components[comp['name']] = comp.get('aliases')

    # Parse templates
    with open(path.join(template_path, 'countries', 'worldwide.yaml'), 'r') as ymlfile:
        templates = yaml.safe_load(ymlfile)

    # Parse state codes
Exemplo n.º 37

    def response_data(self, data):
        for session in self._sessions:

    def __exit__(self, *args):
        for context in self._contexts:

# Each entry pairs a field name with its default value.
WARCRecorderParams = namedlist.namedtuple(
    'WARCRecorderParamsType',
    [
        ('compress', True),
        ('extra_fields', None),
        ('temp_dir', None),
        ('log', True),
        ('appending', False),
        ('digests', True),
        ('cdx', None),
        ('max_size', None),
        ('url_table', None),
        ('software_string', None),
    ]
)
''':class:`WARCRecorder` parameters.

    compress (bool): If True, files will be compressed with gzip
    extra_fields (list): A list of key-value pairs containing extra
        metadata fields
    temp_dir (str): Directory to use for temporary files
    log (bool): Include the program logging messages in the WARC file
    appending (bool): If True, the file is not overwritten upon opening
    digests (bool): If True, the SHA1 hash digests will be written.
    cdx (bool): If True, a CDX file will be written.
Exemplo n.º 38
NormalizationParams = namedlist.namedtuple(
        ('sort_query', False),
        ('always_delim_query', False)
'''Parameters for URL normalization.

    sort_query (bool): Whether to sort the query string items.
    always_delim_query: Whether to always deliminate the key-value items where
        value is empty.

class URLInfo(_URLInfoType):
    '''A named tuple containing the parts of the URL.
Exemplo n.º 39
"""Base classes"""
import abc
import collections
import io
import namedlist

from wpull.document.base import BaseTextStreamReader, BaseHTMLReader, BaseExtractiveReader
from wpull.scraper.util import urljoin_safe

LinkContext = namedlist.namedtuple(
    "LinkContextType", ["link", ("inline", False), ("linked", False), ("link_type", None), ("extra", None)]
)
"""A named tuple describing a scraped link.

Attributes:
    link (str): The link that was scraped.
    inline (bool): Whether the link is an embedded object.
    linked (bool): Whether the link links to another page.
    link_type: A value from :class:`.item.LinkType`.
    extra: Any extra info.
"""

class ScrapeResult(dict):
    """Links scraped from a document.

    This class is subclassed from ``dict`` and contains convenience methods.

    def __init__(self, link_contexts, encoding):