예제 #1
0
 def test_repr(self):
     A = namedtuple('A', 'x')
     self.assertEqual(repr(A(1)), 'A(x=1)')
     # repr should show the name of the subclass
     class B(A):
         pass
     self.assertEqual(repr(B(1)), 'B(x=1)')
예제 #2
0
 def test_name_conflicts(self):
     # Some names like "self", "cls", "tuple", "itemgetter", and "property"
     # failed when used as field names.  Test to make sure these now work.
     T = namedtuple('T', 'itemgetter property self cls tuple')
     t = T(1, 2, 3, 4, 5)
     self.assertEqual(t, (1,2,3,4,5))
     newt = t._replace(itemgetter=10, property=20, self=30, cls=40, tuple=50)
     self.assertEqual(newt, (10,20,30,40,50))
예제 #3
0
    def test_factory(self):
        Point = namedtuple('Point', 'x y')
        self.assertEqual(Point.__name__, 'Point')
        self.assertEqual(Point.__slots__, ())
        self.assertEqual(Point.__module__, __name__)
        self.assertEqual(Point.__getitem__, tuple.__getitem__)
        self.assertEqual(Point._fields, ('x', 'y'))

        self.assertRaises(ValueError, namedtuple, 'abc%', 'efg ghi')       # type has non-alpha char
        self.assertRaises(ValueError, namedtuple, 'class', 'efg ghi')      # type has keyword
        self.assertRaises(ValueError, namedtuple, '9abc', 'efg ghi')       # type starts with digit

        self.assertRaises(ValueError, namedtuple, 'abc', 'efg g%hi')       # field with non-alpha char
        self.assertRaises(ValueError, namedtuple, 'abc', 'abc class')      # field has keyword
        self.assertRaises(ValueError, namedtuple, 'abc', '8efg 9ghi')      # field starts with digit
        self.assertRaises(ValueError, namedtuple, 'abc', '_efg ghi')       # field with leading underscore
        self.assertRaises(ValueError, namedtuple, 'abc', 'efg efg ghi')    # duplicate field

        namedtuple('Point0', 'x1 y2')   # Verify that numbers are allowed in names
        namedtuple('_', 'a b c')        # Test leading underscores in a typename

        nt = namedtuple('nt', 'the quick brown fox')                       # check unicode input
        self.assertNotIn("u'", repr(nt._fields))
        nt = namedtuple('nt', ('the', 'quick'))                           # check unicode input
        self.assertNotIn("u'", repr(nt._fields))

        self.assertRaises(TypeError, Point._make, [11])                     # catch too few args
        self.assertRaises(TypeError, Point._make, [11, 22, 33])             # catch too many args
예제 #4
0
 def test_name_fixer(self):
     for spec, renamed in [
         [('efg', 'g%hi'),  ('efg', '_1')],                              # field with non-alpha char
         [('abc', 'class'), ('abc', '_1')],                              # field has keyword
         [('8efg', '9ghi'), ('_0', '_1')],                               # field starts with digit
         [('abc', '_efg'), ('abc', '_1')],                               # field with leading underscore
         [('abc', 'efg', 'efg', 'ghi'), ('abc', 'efg', '_2', 'ghi')],    # duplicate field
         [('abc', '', 'x'), ('abc', '_1', 'x')],                         # fieldname is a space
     ]:
         self.assertEqual(namedtuple('NT', spec, rename=True)._fields, renamed)
예제 #5
0
    def test_odd_sizes(self):
        Zero = namedtuple('Zero', '')
        self.assertEqual(Zero(), ())
        self.assertEqual(Zero._make([]), ())
        self.assertEqual(repr(Zero()), 'Zero()')
        self.assertEqual(Zero()._asdict(), {})
        self.assertEqual(Zero()._fields, ())

        Dot = namedtuple('Dot', 'd')
        self.assertEqual(Dot(1), (1,))
        self.assertEqual(Dot._make([1]), (1,))
        self.assertEqual(Dot(1).d, 1)
        self.assertEqual(repr(Dot(1)), 'Dot(d=1)')
        self.assertEqual(Dot(1)._asdict(), {'d':1})
        self.assertEqual(Dot(1)._replace(d=999), (999,))
        self.assertEqual(Dot(1)._fields, ('d',))

        # n = 5000
        n = 254 # SyntaxError: more than 255 arguments:
        import string, random
        names = list(set(''.join([random.choice(string.ascii_letters)
                                  for j in range(10)]) for i in range(n)))
        n = len(names)
        Big = namedtuple('Big', names)
        b = Big(*range(n))
        self.assertEqual(b, tuple(range(n)))
        self.assertEqual(Big._make(range(n)), tuple(range(n)))
        for pos, name in enumerate(names):
            self.assertEqual(getattr(b, name), pos)
        repr(b)                                 # make sure repr() doesn't blow-up
        d = b._asdict()
        d_expected = dict(zip(names, range(n)))
        self.assertEqual(d, d_expected)
        b2 = b._replace(**dict([(names[1], 999),(names[-5], 42)]))
        b2_expected = list(range(n))
        b2_expected[1] = 999
        b2_expected[-5] = 42
        self.assertEqual(b2, tuple(b2_expected))
        self.assertEqual(b._fields, tuple(names))
예제 #6
0
    def test_instance(self):
        # Exercise construction, the generated helper methods and the error
        # handling of a two-field named tuple.
        Point = namedtuple('Point', 'x y')
        p = Point(11, 22)

        # Every supported calling convention builds the same value.
        self.assertEqual(p, Point(x=11, y=22))
        self.assertEqual(p, Point(11, y=22))
        self.assertEqual(p, Point(y=22, x=11))
        self.assertEqual(p, Point(*(11, 22)))
        self.assertEqual(p, Point(**dict(x=11, y=22)))

        # Wrong argument counts or keyword names are rejected.
        self.assertRaises(TypeError, Point, 1)                              # too few args
        self.assertRaises(TypeError, Point, 1, 2, 3)                        # too many args
        self.assertRaises(TypeError, Point, XXX=1, y=2)                     # wrong keyword argument
        self.assertRaises(TypeError, Point, x=1)                            # missing keyword argument

        self.assertEqual(repr(p), 'Point(x=11, y=22)')
        self.assertNotIn('__weakref__', dir(p))
        self.assertEqual(p, Point._make([11, 22]))                          # test _make classmethod
        self.assertEqual(p._fields, ('x', 'y'))                             # test _fields attribute
        self.assertEqual(p._replace(x=1), (1, 22))                          # test _replace method
        self.assertEqual(p._asdict(), dict(x=11, y=22))                     # test _asdict method
        self.assertEqual(vars(p), p._asdict())                              # verify that vars() works

        # _replace() must reject unknown field names.  assertRaises reports
        # an ordinary test failure when nothing is raised.
        with self.assertRaises(ValueError):
            p._replace(x=1, error=2)

        # verify that field string can have commas
        Point = namedtuple('Point', 'x, y')
        p = Point(x=11, y=22)
        self.assertEqual(repr(p), 'Point(x=11, y=22)')

        # verify that fieldspec can be a non-string sequence
        Point = namedtuple('Point', ('x', 'y'))
        p = Point(x=11, y=22)
        self.assertEqual(repr(p), 'Point(x=11, y=22)')
예제 #7
0
    def test_tupleness(self):
        Point = namedtuple('Point', 'x y')
        p = Point(11, 22)

        self.assertIsInstance(p, tuple)
        self.assertEqual(p, (11, 22))                                       # matches a real tuple
        self.assertEqual(tuple(p), (11, 22))                                # coercable to a real tuple
        self.assertEqual(list(p), [11, 22])                                 # coercable to a list
        self.assertEqual(max(p), 22)                                        # iterable
        self.assertEqual(max(*p), 22)                                       # star-able
        x, y = p
        self.assertEqual(p, (x, y))                                         # unpacks like a tuple
        self.assertEqual((p[0], p[1]), (11, 22))                            # indexable like a tuple
        self.assertRaises(IndexError, p.__getitem__, 3)

        self.assertEqual(p.x, x)
        self.assertEqual(p.y, y)
        self.assertRaises(AttributeError, eval, 'p.z', locals())
예제 #8
0
파일: ftp.py 프로젝트: flatron18116/wpull
from wpull.scraper.util import urljoin_safe
from wpull.url import parse_url_or_log
from wpull.writer import NullWriter


_logger = logging.getLogger(__name__)
_ = gettext.gettext

# Characters treated as glob metacharacters (presumably tested against URL
# paths when globbing is enabled -- confirm at the use site).
GLOB_CHARS = frozenset('[]*?')


# Fetch options for the FTP processor; each field is declared as a
# (name, default) pair.
FTPProcessorFetchParams = namedlist.namedtuple(
    'FTPProcessorFetchParamsType',
    [
        ('remove_listing', True),
        ('glob', True),
        ('preserve_permissions', False),
        ('retr_symlinks', True),
    ]
)
'''FTPProcessorFetchParams

Args:
    remove_listing (bool): Remove `.listing` files after fetching.
    glob (bool): Enable URL globbing.
    preserve_permissions (bool): Preserve file permissions.
    retr_symlinks (bool): Follow symlinks.
'''

FTPProcessorInstances = namedlist.namedtuple(
    'FTPProcessorInstancesType',
예제 #9
0
import namedlist
import asyncio

from wpull.driver.process import Process
import wpull.util

_logger = logging.getLogger(__name__)

# Parameter record for the PhantomJS driver.  ``url`` has no default; every
# other field is declared as a (name, default) pair.
# NOTE(review): the ``[]`` and ``{}`` defaults are built once, here, so they
# are presumably shared by every instance that relies on them -- confirm
# that callers never mutate them in place.
PhantomJSDriverParams = namedlist.namedtuple(
    'PhantomJSDriverParamsType', [
        'url',
        ('snapshot_paths', []),
        ('wait_time', 1),
        ('num_scrolls', 10),
        ('smart_scroll', True),
        ('snapshot', True),
        ('viewport_size', (1200, 1920)),
        ('paper_size', (2400, 3840)),
        ('event_log_filename', None),
        ('action_log_filename', None),
        ('custom_headers', {}),
        ('page_settings', {}),
    ]
)
'''PhantomJS Driver parameters

Attributes:
    url (str): URL of page to fetch.
    snapshot_type (list): List of filenames. Accepted extensions are html,
        pdf, png, gif.
    wait_time (float): Time between page scrolls.
    num_scrolls (int): Maximum number of scrolls.
    smart_scroll (bool): Whether to stop scrolling if number of
예제 #10
0
from wpull.protocol.ftp.util import FTPServerError
from wpull.scraper.util import urljoin_safe
from wpull.url import parse_url_or_log, URLInfo
from wpull.writer import NullWriter, BaseFileWriter

_logger = StyleAdapter(logging.getLogger(__name__))
_ = gettext.gettext

# Characters treated as glob metacharacters (presumably tested against URL
# paths when globbing is enabled -- confirm at the use site).
GLOB_CHARS = frozenset('[]*?')


# Fetch options for the FTP processor; each field is declared as a
# (name, default) pair.
FTPProcessorFetchParams = namedlist.namedtuple(
    'FTPProcessorFetchParamsType',
    [
        ('remove_listing', True),
        ('glob', True),
        ('preserve_permissions', False),
        ('retr_symlinks', True),
    ]
)
'''FTPProcessorFetchParams

Args:
    remove_listing (bool): Remove `.listing` files after fetching.
    glob (bool): Enable URL globbing.
    preserve_permissions (bool): Preserve file permissions.
    retr_symlinks (bool): Follow symlinks.
'''


class HookPreResponseBreak(ProtocolError):
예제 #11
0
파일: recorder.py 프로젝트: mback2k/wpull
        for session in self._sessions:
            session.response_data(data)

    def __exit__(self, *args):
        '''Exit every context in ``self._contexts``, forwarding ``args``.

        Nothing is returned, so an in-flight exception is not suppressed.
        '''
        for held_context in self._contexts:
            held_context.__exit__(*args)


# Option record for the WARC recorder; each field is declared as a
# (name, default) pair.
WARCRecorderParams = namedlist.namedtuple('WARCRecorderParamsType', [
    ('compress', True),
    ('extra_fields', None),
    ('temp_dir', None),
    ('log', True),
    ('appending', False),
    ('digests', True),
    ('cdx', None),
    ('max_size', None),
    ('move_to', None),
    ('url_table', None),
    ('software_string', None),
])
''':class:`WARCRecorder` parameters.

Args:
    compress (bool): If True, files will be compressed with gzip
    extra_fields (list): A list of key-value pairs containing extra
        metadata fields
    temp_dir (str): Directory to use for temporary files
    log (bool): Include the program logging messages in the WARC file
예제 #12
0
from trollius import From, Return

from wpull.backport.logging import BraceMessage as __
from wpull.document.html import HTMLReader
from wpull.body import Body
from wpull.driver.phantomjs import PhantomJSDriverParams
from wpull.namevalue import NameValueRecord
from wpull.warc import WARCRecord
import wpull.url

# PhantomJS options; each field is declared as a (name, default) pair.
# NOTE(review): the ``{}`` defaults are built once, here, so they are
# presumably shared by every instance that relies on them -- confirm that
# callers never mutate them in place.
PhantomJSParams = namedlist.namedtuple(
    'PhantomJSParamsType',
    [
        ('snapshot_types', ('html', 'pdf')),
        ('wait_time', 1),
        ('num_scrolls', 10),
        ('smart_scroll', True),
        ('snapshot', True),
        ('viewport_size', (1200, 1920)),
        ('paper_size', (2400, 3840)),
        ('load_time', 900),
        ('custom_headers', {}),
        ('page_settings', {}),
    ]
)
'''PhantomJS parameters

Attributes:
    snapshot_type (list): File types. Accepted are html, pdf, png, gif.
    wait_time (float): Time between page scrolls.
    num_scrolls (int): Maximum number of scrolls.
    smart_scroll (bool): Whether to stop scrolling if number of
        requests & responses do not change.
    snapshot (bool): Whether to take snapshot files.
    viewport_size (tuple): Width and height of the page viewport.
예제 #13
0
파일: processor.py 프로젝트: mback2k/wpull
        It must call one of :meth:`.engine.URLItem.set_status` or
        :meth:`.engine.URLItem.skip`.
        '''
        pass

    def close(self):
        '''Run any clean up actions.

        This implementation does nothing.
        '''


# Fetch options for the web processor; each field is declared as a
# (name, default) pair.
WebProcessorFetchParams = namedlist.namedtuple('WebProcessorFetchParamsType', [
    ('retry_connrefused', False),
    ('retry_dns_error', False),
    ('post_data', None),
    ('strong_redirects', True),
    ('content_on_error', False),
])
'''WebProcessorFetchParams

Args:
    retry_connrefused: If True, don't consider a connection refused error
        to be a permanent error.
    retry_dns_error: If True, don't consider a DNS resolution error to be
        permanent error.
    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.
예제 #14
0
 def __init__(self):
     # Record type with the fields (type, text, result), stored on the
     # instance as ``self.Result``.
     result_fields = ['type', 'text', 'result']
     self.Result = namedtuple('Result', result_fields)
예제 #15
0
 def __init__(self, field_names, default=NO_DEFAULT, filters=None):
     # Build the tuple type first; its field names are what the filters
     # are validated against.
     tuple_type = namedtuple('_ArgsTuple', field_names, default)
     self.args_tuple = tuple_type
     self.fields = tuple_type._fields
     # A falsy ``filters`` (None or empty) yields an empty mapping without
     # any validation.
     if filters:
         self.filters = _validate_filters(self.fields, filters)
     else:
         self.filters = {}
예제 #16
0
 def __init__(self, field_names, default=NO_DEFAULT, filters=None):
     # Build the tuple type first; its field names are what the filters
     # are validated against.
     tuple_type = namedtuple('_ArgsTuple', field_names, default)
     self.args_tuple = tuple_type
     self.fields = tuple_type._fields
     # A falsy ``filters`` (None or empty) yields an empty mapping without
     # any validation.
     if filters:
         self.filters = _validate_filters(self.fields, filters)
     else:
         self.filters = {}
예제 #17
0
    def __init__(self, *args, **kwargs):
        """Create the mapping, honouring an optional ``size_limit`` keyword.

        ``size_limit`` is removed from ``kwargs`` (defaulting to ``None``)
        and the remaining arguments are passed to ``OrderedDict.__init__``.
        """
        limit = kwargs.pop("size_limit", None)
        # Stored before the base initializer runs so that the attribute
        # already exists whenever _check_size_limit() is called.
        self.size_limit = limit
        OrderedDict.__init__(self, *args, **kwargs)
        self._check_size_limit()

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, then re-apply the size limit."""
        OrderedDict.__setitem__(self, key, value)
        # Enforce the limit only after the new entry is in place.
        self._check_size_limit()

    def _check_size_limit(self):
        """Drop entries from the front until at most ``size_limit`` remain.

        Does nothing when ``size_limit`` is ``None``.
        """
        if self.size_limit is None:
            return
        while len(self) > self.size_limit:
            # last=False removes the entry at the front of the ordering.
            self.popitem(last=False)


# Record with ``id`` and ``submission`` fields, both defaulting to ''.
EpicComment = namedlist.namedtuple(
    'EpicComment',
    [
        ('id', ''),
        ('submission', ''),
    ],
)


class EpicValidator(CommentValidator):
    __slots__ = ['_sticky_store', '_comment_store']

    def __init__(self, reddit):
        super().__init__(reddit)
        self._sticky_store = LimitedSizeDict(size_limit=20)
        self._comment_store = deque(maxlen=200)

    def validate(self, comment: Comment) -> Tuple[Action, Rule]:
        css_class = comment.author_flair_css_class
        if not self.has_comment(comment) and css_class and css_class.lower(
        ) in self.config['general']['class']:
            self._comment_store.appendleft(
예제 #18
0
_ = gettext.gettext
_logger = logging.getLogger(__name__)

DEFAULT_BUFFER_SIZE = 1048576
'''Default buffer size in bytes.'''
DEFAULT_NO_CONTENT_CODES = frozenset(itertools.chain(
    range(100, 200),
    [http.client.NO_CONTENT, http.client.NOT_MODIFIED]
))
'''Status codes where a response body is prohibited.'''

# Connection options; each field is declared as a (name, default) pair.
ConnectionParams = namedlist.namedtuple(
    'ConnectionParamsType',
    [
        ('bind_address', None),
        ('keep_alive', True),
        ('ssl_options', None),
        ('connect_timeout', None),
        ('read_timeout', None),
        ('buffer_size', DEFAULT_BUFFER_SIZE),
        ('no_content_codes', DEFAULT_NO_CONTENT_CODES),
        ('ignore_length', False),
    ]
)
'''Parameters for connections.

Args:
    bind_address: The IP address to bind the socket. Must match
        :meth:`socket.SocketType.bind`. Use this if your local host has
        multiple IP addresses.
    keep_alive (bool): If True, use HTTP keep-alive.
    ssl_options: A ``dict`` containing options for :func:`ssl.wrap_socket`
    connect_timeout (float): If given, the time in seconds before the
        connection is timed out during connection. Otherwise, depend on the
        underlying libraries for timeout.
예제 #19
0
from wpull.driver.process import Process
import wpull.util


_logger = logging.getLogger(__name__)


# Parameter record for the PhantomJS driver.  ``url`` has no default; every
# other field is declared as a (name, default) pair.
# NOTE(review): the ``[]`` and ``{}`` defaults are built once, here, so they
# are presumably shared by every instance that relies on them -- confirm
# that callers never mutate them in place.
PhantomJSDriverParams = namedlist.namedtuple('PhantomJSDriverParamsType', [
    'url',
    ('snapshot_paths', []),
    ('wait_time', 1),
    ('num_scrolls', 10),
    ('smart_scroll', True),
    ('snapshot', True),
    ('viewport_size', (1200, 1920)),
    ('paper_size', (2400, 3840)),
    ('event_log_filename', None),
    ('action_log_filename', None),
    ('custom_headers', {}),
    ('page_settings', {}),
])
'''PhantomJS Driver parameters

Attributes:
    url (str): URL of page to fetch.
    snapshot_type (list): List of filenames. Accepted extensions are html,
        pdf, png, gif.
    wait_time (float): Time between page scrolls.
    num_scrolls (int): Maximum number of scrolls.
예제 #20
0
        'query',
        'fragment',
        'username',
        'password',
        'hostname',
        'port',
        'raw',
        'encoding',
    ]
)


# URL normalization options; both fields default to False.
NormalizationParams = namedlist.namedtuple(
    'NormalizationParamsType',
    [
        ('sort_query', False),
        ('always_delim_query', False)
    ]
)
'''Parameters for URL normalization.

Args:
    sort_query (bool): Whether to sort the query string items.
    always_delim_query (bool): Whether to always delimit the key-value
        items where value is empty.
'''


class URLInfo(_URLInfoType):
    '''A named tuple containing the parts of the URL.
예제 #21
0
 def test_factory_doc_attr(self):
     Point = namedtuple('Point', 'x y')
     self.assertEqual(Point.__doc__, 'Point(x, y)')
예제 #22
0
        self.assertEqual(Point.__doc__, "Point(dx=FACTORY({0}), dy=FACTORY({0}), dz=11.0)".format(list_repr))

        Point = namedlist('Point', ['dx', 'dy', ('dz', FACTORY(11.0))], default=[])
        self.assertEqual(Point.__doc__, 'Point(dx=[], dy=[], dz=FACTORY(11.0))')

    def test_slice(self):
        # Slicing an instance must give the same result as slicing a plain
        # list holding the same values.
        values = [3, 5, -12, 'red']
        Point = namedlist('Point', 'x y z color')
        p = Point(*values)
        windows = (slice(0, -1), slice(None, 3), slice(4, 1, -1))
        for window in windows:
            self.assertEqual(values[window], p[window])



# Module-level three-field type used by the pickle tests.
TestNT = namedtuple('TestNT', 'x y z')    # type used for pickle tests

class TestNamedTuple(unittest.TestCase):

    def test_unicode_identifiers(self):
        Point = namedtuple(u'Point', u'x y')
        p = Point(10, 20)
        self.assertEqual((p.x, p.y), (10, 20))
        self.assertEqual(p._asdict(), {'x':10, 'y':20})

    def test_factory(self):
        Point = namedtuple('Point', 'x y')
        self.assertEqual(Point.__name__, 'Point')
        self.assertEqual(Point.__slots__, ())
        self.assertEqual(Point.__module__, __name__)
        self.assertEqual(Point.__getitem__, tuple.__getitem__)
예제 #23
0
 def test_unicode_identifiers(self):
     Point = namedtuple(u'Point', u'x y')
     p = Point(10, 20)
     self.assertEqual((p.x, p.y), (10, 20))
     self.assertEqual(p._asdict(), {'x':10, 'y':20})
예제 #24
0
"""Listing parser."""
import re

import itertools
import namedlist

from wpull.protocol.ftp.ls.date import parse_datetime
import wpull.protocol.ftp.ls.date


# One listing row; every field except ``name`` defaults to None.
FileEntry = namedlist.namedtuple(
    "FileEntryType",
    [
        "name",
        ("type", None),
        ("size", None),
        ("date", None),
        ("dest", None),
        ("perm", None),
    ],
)
"""A row in a listing.

Attributes:
    name (str): Filename.
    type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None``
    size (int, None): Size of file.
    date (:class:`datetime.datetime`, None): A datetime object in UTC.
    dest (str, None): Destination filename for symlinks.
    perm (int, None): Unix permissions expressed as an integer.
"""


class ListingError(ValueError):
    """Error raised while parsing a listing.

    Subclass of :class:`ValueError`.
    """


class UnknownListingError(ListingError):
    """Error raised when the type of a listing cannot be determined.

    Subclass of :class:`ListingError`.
    """
예제 #25
0
'''Base classes'''
import abc
import collections
import io
import namedlist

from wpull.document.base import BaseTextStreamReader, \
    BaseHTMLReader, BaseExtractiveReader
from wpull.scraper.util import urljoin_safe

# Scraped-link record; ``link`` has no default, the other fields are
# declared as (name, default) pairs.
LinkContext = namedlist.namedtuple('LinkContextType', [
    'link', ('inline', False), ('linked', False), ('link_type', None),
    ('extra', None)
])
'''A named tuple describing a scraped link.

Attributes:
    link (str): The link that was scraped.
    inline (bool): Whether the link is an embedded object.
    linked (bool): Whether the link links to another page.
    link_type: A value from :class:`.item.LinkType`.
    extra: Any extra info.
'''


class ScrapeResult(dict):
    '''Links scraped from a document.

    This class is subclassed from ``dict`` and contains convenience methods.
    '''
    def __init__(self, link_contexts, encoding):
예제 #26
0
from wpull.processor.base import BaseProcessor, BaseProcessorSession, \
    REMOTE_ERRORS
from wpull.processor.rule import FetchRule, ResultRule, ProcessingRule
from wpull.url import URLInfo
from wpull.writer import BaseFileWriter
import wpull.string
import wpull.util


_logger = StyleAdapter(logging.getLogger(__name__))
_ = gettext.gettext

# Fetch options for the web processor; each field is a (name, default) pair.
# NOTE(review): ``content_on_error`` is declared here but is not described
# in the attribute docstring below.
WebProcessorFetchParams = namedlist.namedtuple('WebProcessorFetchParamsType', [
    ('post_data', None),
    ('strong_redirects', True),
    ('content_on_error', False),
])
'''WebProcessorFetchParams

Args:
    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.
'''


class HookPreResponseBreak(ProtocolError):
    '''Hook pre-response break.'''
예제 #27
0
파일: web.py 프로젝트: Super-Rad/wpull
from wpull.processor.base import BaseProcessor, BaseProcessorSession, \
    REMOTE_ERRORS
from wpull.processor.rule import FetchRule, ResultRule, ProcessingRule
from wpull.url import URLInfo
from wpull.writer import BaseFileWriter
import wpull.string
import wpull.util


_logger = StyleAdapter(logging.getLogger(__name__))
_ = gettext.gettext

# Fetch options for the web processor; each field is a (name, default) pair.
# NOTE(review): ``content_on_error`` is declared here but is not described
# in the attribute docstring below.
WebProcessorFetchParams = namedlist.namedtuple('WebProcessorFetchParamsType', [
    ('post_data', None),
    ('strong_redirects', True),
    ('content_on_error', False),
])
'''WebProcessorFetchParams

Args:
    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.
'''


class HookPreResponseBreak(ProtocolError):
    '''Hook pre-response break.'''
예제 #28
0
DEFAULT_BUFFER_SIZE = 1048576
'''Default buffer size in bytes.'''
DEFAULT_NO_CONTENT_CODES = frozenset(
    itertools.chain(range(100, 200),
                    [http.client.NO_CONTENT, http.client.NOT_MODIFIED]))
'''Status codes where a response body is prohibited.'''


# Connection options; each field is declared as a (name, default) pair.
ConnectionParams = namedlist.namedtuple('ConnectionParamsType', [
    ('bind_address', None),
    ('keep_alive', True),
    ('ssl_options', None),
    ('connect_timeout', None),
    ('read_timeout', None),
    ('buffer_size', DEFAULT_BUFFER_SIZE),
    ('no_content_codes', DEFAULT_NO_CONTENT_CODES),
    ('ignore_length', False),
])
'''Parameters for connections.

Args:
    bind_address: The IP address to bind the socket. Must match
        :meth:`socket.SocketType.bind`. Use this if your local host has
        multiple IP addresses.
    keep_alive (bool): If True, use HTTP keep-alive.
    ssl_options: A ``dict`` containing options for :func:`ssl.wrap_socket`
    connect_timeout (float): If given, the time in seconds before the
예제 #29
0
파일: listing.py 프로젝트: Willianvdv/wpull
'''Listing parser.'''
import re

import namedlist

from wpull.ftp.ls.date import parse_datetime


# One listing row; every field except ``name`` defaults to None.
FileEntry = namedlist.namedtuple('FileEntryType', [
    'name',
    ('type', None),
    ('size', None),
    ('date', None),
    ('dest', None),
    ('perm', None),
])
'''A row in a listing.

Attributes:
    name (str): Filename.
    type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None``
    size (int, None): Size of file.
    date (:class:`datetime.datetime`, None): A datetime object in UTC.
    dest (str, None): Destination filename for symlinks.
    perm (int, None): Unix permissions expressed as an integer.
'''


class ListingError(ValueError):
예제 #30
0
    'nodejs6',
    'git',
    'julia',
    'lua5',
    'haskell',
    'octave4',
    'cpp',
    'c',
    'java',
    'go',
    'rust',
}
lang_aliases = {}
max_upload_size = 5 * 1024 * 1024  # 5 MB

# Description of one extra volume: name, container path and mode string.
VolumeInfo = namedtuple('VolumeInfo', 'name container_path mode')
# Extra volumes keyed by language name; each language gets its own list.
_extra_volumes = {
    'python3-tensorflow': [
        VolumeInfo(
            name='deeplearning-samples',
            container_path='/home/work/samples',
            mode='ro',
        ),
    ],
    'python3-tensorflow-gpu': [
        VolumeInfo(
            name='deeplearning-samples',
            container_path='/home/work/samples',
            mode='ro',
        ),
    ],
}

restarting_kernels = {}
blocking_cleans = {}


async def get_extra_volumes(docker, lang):
    avail_volumes = (await docker.volumes.list())['Volumes']
예제 #31
0
'''Listing parser.'''
import re

import namedlist

from wpull.ftp.ls.date import parse_datetime

# One listing row; every field except ``name`` defaults to None.
FileEntry = namedlist.namedtuple(
    'FileEntryType',
    [
        'name',
        ('type', None),
        ('size', None),
        ('date', None),
        ('dest', None),
        ('perm', None),
    ]
)
'''A row in a listing.

Attributes:
    name (str): Filename.
    type (str, None): ``file``, ``dir``, ``symlink``, ``other``, ``None``
    size (int, None): Size of file.
    date (:class:`datetime.datetime`, None): A datetime object in UTC.
    dest (str, None): Destination filename for symlinks.
    perm (int, None): Unix permissions expressed as an integer.
'''


class ListingError(ValueError):
    '''Error raised while parsing a listing.

    Subclass of :class:`ValueError`.
    '''


class UnknownListingError(ListingError):
    '''Error raised when the type of a listing cannot be determined.

    Subclass of :class:`ListingError`.
    '''

예제 #32
0
        It must call one of :meth:`.engine.URLItem.set_status` or
        :meth:`.engine.URLItem.skip`.
        '''
        pass

    def close(self):
        '''Run any clean up actions.

        This implementation does nothing.
        '''


# Fetch options for the web processor; each field is declared as a
# (name, default) pair.
WebProcessorFetchParams = namedlist.namedtuple('WebProcessorFetchParamsType', [
    ('retry_connrefused', False),
    ('retry_dns_error', False),
    ('post_data', None),
    ('strong_robots', True),
    ('strong_redirects', True),
    ('content_on_error', False),
])
'''WebProcessorFetchParams

Args:
    retry_connrefused: If True, don't consider a connection refused error
        to be a permanent error.
    retry_dns_error: If True, don't consider a DNS resolution error to be
        permanent error.
    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
예제 #33
0
파일: web.py 프로젝트: flatron18116/wpull
from wpull.errors import ProtocolError
from wpull.hook import HookableMixin, Actions
from wpull.http.web import LoopType
from wpull.processor.base import BaseProcessor, BaseProcessorSession, \
    REMOTE_ERRORS
from wpull.processor.rule import FetchRule, ResultRule
from wpull.stats import Statistics
from wpull.writer import NullWriter
import wpull.string

_logger = logging.getLogger(__name__)
_ = gettext.gettext

# Fetch options for the web processor; each field is a (name, default) pair.
# NOTE(review): ``content_on_error`` is declared here but is not described
# in the attribute docstring below.
WebProcessorFetchParams = namedlist.namedtuple(
    'WebProcessorFetchParamsType',
    [
        ('post_data', None),
        ('strong_redirects', True),
        ('content_on_error', False),
    ]
)
'''WebProcessorFetchParams

Args:
    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.
'''

WebProcessorInstances = namedlist.namedtuple('WebProcessorInstancesType', [
    ('fetch_rule', FetchRule()),
    ('result_rule', ResultRule()),
    ('processing_rule', None),
예제 #34
0
from wpull.body import Body
from wpull.driver.phantomjs import PhantomJSDriverParams, PhantomJSDriver
from wpull.namevalue import NameValueRecord
from wpull.pipeline.session import ItemSession
from wpull.processor.rule import ProcessingRule
from wpull.warc.format import WARCRecord
import wpull.url


# PhantomJS options; each field is declared as a (name, default) pair.
# NOTE(review): the ``{}`` defaults are built once, here, so they are
# presumably shared by every instance that relies on them -- confirm that
# callers never mutate them in place.
PhantomJSParams = namedlist.namedtuple('PhantomJSParamsType', [
    ('snapshot_types', ('html', 'pdf')),
    ('wait_time', 1),
    ('num_scrolls', 10),
    ('smart_scroll', True),
    ('snapshot', True),
    ('viewport_size', (1200, 1920)),
    ('paper_size', (2400, 3840)),
    ('load_time', 900),
    ('custom_headers', {}),
    ('page_settings', {}),
])
'''PhantomJS parameters

Attributes:
    snapshot_type (list): File types. Accepted are html, pdf, png, gif.
    wait_time (float): Time between page scrolls.
    num_scrolls (int): Maximum number of scrolls.
    smart_scroll (bool): Whether to stop scrolling if number of
        requests & responses do not change.
    snapshot (bool): Whether to take snapshot files.
예제 #35
0
파일: web.py 프로젝트: Willianvdv/wpull
from wpull.http.web import LoopType
from wpull.processor.base import BaseProcessor, BaseProcessorSession, \
    REMOTE_ERRORS
from wpull.processor.rule import FetchRule, ResultRule
from wpull.stats import Statistics
from wpull.writer import NullWriter
import wpull.string


_logger = logging.getLogger(__name__)
_ = gettext.gettext

# Fetch options for the web processor; each field is a (name, default) pair.
# NOTE(review): ``content_on_error`` is declared here but is not described
# in the attribute docstring below.
WebProcessorFetchParams = namedlist.namedtuple('WebProcessorFetchParamsType', [
    ('post_data', None),
    ('strong_redirects', True),
    ('content_on_error', False),
])
'''WebProcessorFetchParams

Args:
    post_data (str): If provided, all requests will be POSTed with the
        given `post_data`. `post_data` must be in percent-encoded
        query format ("application/x-www-form-urlencoded").
    strong_redirects (bool): If True, redirects are allowed to span hosts.
'''

WebProcessorInstances = namedlist.namedtuple(
    'WebProcessorInstancesType',
    [
예제 #36
0
from os import path
from namedlist import namedtuple
import yaml

# Container for the four pieces of loaded configuration.
Config = namedtuple(
    'Config',
    [
        'components',
        'component_aliases',
        'templates',
        'state_codes',
    ],
)


def load_config(template_path):
    templates = {}
    components = {}
    component_aliases = {}
    state_codes = {}

    if not path.isdir(template_path):
        raise IOError('Address formatting templates path cannot be found.')

    # Parse components and component aliases
    with open(path.join(template_path, 'components.yaml'), 'r') as ymlfile:
        comps = yaml.safe_load_all(ymlfile)

        for comp in comps:
            if 'aliases' in comp:
                component_aliases.update({alias: comp['name'] for alias in comp['aliases']})

            components[comp['name']] = comp.get('aliases')

    # Parse templates
    with open(path.join(template_path, 'countries', 'worldwide.yaml'), 'r') as ymlfile:
        templates = yaml.safe_load(ymlfile)

    # Parse state codes
예제 #37
0
            session.response(response)

    def response_data(self, data):
        '''Pass ``data`` to ``response_data`` of every session in ``self._sessions``.'''
        for active_session in self._sessions:
            active_session.response_data(data)

    def __exit__(self, *args):
        '''Exit every context in ``self._contexts``, forwarding ``args``.

        Nothing is returned, so an in-flight exception is not suppressed.
        '''
        for held_context in self._contexts:
            held_context.__exit__(*args)


# Option record for the WARC recorder; each field is declared as a
# (name, default) pair.
WARCRecorderParams = namedlist.namedtuple(
    'WARCRecorderParamsType',
    [
        ('compress', True),
        ('extra_fields', None),
        ('temp_dir', None),
        ('log', True),
        ('appending', False),
        ('digests', True),
        ('cdx', None),
        ('max_size', None),
        ('url_table', None),
        ('software_string', None),
    ]
)
''':class:`WARCRecorder` parameters.

Args:
    compress (bool): If True, files will be compressed with gzip
    extra_fields (list): A list of key-value pairs containing extra
        metadata fields
    temp_dir (str): Directory to use for temporary files
    log (bool): Include the program logging messages in the WARC file
    appending (bool): If True, the file is not overwritten upon opening
    digests (bool): If True, the SHA1 hash digests will be written.
    cdx (bool): If True, a CDX file will be written.
예제 #38
0
파일: url.py 프로젝트: lowks/wpull
        'query',
        'fragment',
        'username',
        'password',
        'hostname',
        'port',
        'raw',
        'encoding',
    ]
)


# URL normalization options; both fields default to False.
NormalizationParams = namedlist.namedtuple(
    'NormalizationParamsType',
    [
        ('sort_query', False),
        ('always_delim_query', False)
    ]
)
'''Parameters for URL normalization.

Args:
    sort_query (bool): Whether to sort the query string items.
    always_delim_query (bool): Whether to always delimit the key-value
        items where value is empty.
'''


class URLInfo(_URLInfoType):
    '''A named tuple containing the parts of the URL.
예제 #39
0
파일: base.py 프로젝트: charygao/wpull
"""Base classes"""
import abc
import collections
import io
import namedlist

from wpull.document.base import BaseTextStreamReader, BaseHTMLReader, BaseExtractiveReader
from wpull.scraper.util import urljoin_safe


# Scraped-link record; ``link`` has no default, the other fields are
# declared as (name, default) pairs.
LinkContext = namedlist.namedtuple(
    "LinkContextType", ["link", ("inline", False), ("linked", False), ("link_type", None), ("extra", None)]
)
"""A named tuple describing a scraped link.

Attributes:
    link (str): The link that was scraped.
    inline (bool): Whether the link is an embedded object.
    linked (bool): Whether the link links to another page.
    link_type: A value from :class:`.item.LinkType`.
    extra: Any extra info.
"""


class ScrapeResult(dict):
    """Links scraped from a document.

    This class is subclassed from ``dict`` and contains convenience methods.
    """

    def __init__(self, link_contexts, encoding):