def test_copy_pickle(self):
    """Check pickling, caching and shallow copying of compiled patterns."""

    # The selector is chosen so the compiled pattern contains all custom types:
    # `Selector`, `NullSelector`, `SelectorTag`, `SelectorAttribute`,
    # `SelectorNth`, `SelectorLang`, `SelectorList`, and `Namespaces`.
    selector = 'p.class#id[id]:nth-child(2):lang(en):focus'

    # A pickle round trip must give back an equal pattern.
    original = sv.compile(selector, {'html': 'http://www.w3.org/TR/html4/'})
    restored = pickle.loads(pickle.dumps(original))
    self.assertEqual(restored, original)

    # Compiling the same input again must hand back the cached object.
    cached = sv.compile(selector, {'html': 'http://www.w3.org/TR/html4/'})
    self.assertIs(original, cached)

    # Different flags must produce a new, unequal pattern.
    flagged = sv.compile(selector, {'html': 'http://www.w3.org/TR/html4/'}, flags=0x10)
    self.assertIsNot(original, flagged)
    self.assertNotEqual(original, flagged)

    # Copies are equal to their source, but are distinct objects.
    original_copy = copy.copy(original)
    self.assertIsNot(original_copy, original)
    self.assertEqual(original_copy, original)

    flagged_copy = copy.copy(flagged)
    self.assertIsNot(flagged_copy, flagged)
    self.assertEqual(flagged_copy, flagged)
    self.assertIsNot(flagged_copy, original_copy)
def test_pseudo_element(self):
    """Check that pseudo-elements raise `NotImplementedError`."""

    # Both the single-colon and the double-colon spellings are exercised.
    for selector in (':first-line', '::first-line'):
        with self.assertRaises(NotImplementedError):
            sv.compile(selector)
def test_syntax_error_on_third_line(self):
    """Test that multiline selector errors have the right position."""

    # The attribute selectors are indented by two spaces so that the
    # offending `[type=42]` starts at column 3 of line 3, which is the
    # position asserted below.
    selector = (
        'input:is(\n'
        '  [name=foo]\n'
        '  [type=42]\n'
        ')\n'
    )

    with self.assertRaises(sv.SelectorSyntaxError) as cm:
        sv.compile(selector)

    error = cm.exception
    self.assertEqual(error.line, 3)
    self.assertEqual(error.col, 3)
def test_invalid_incomplete_has(self):
    """Check that incomplete `:has()` selectors raise `SyntaxError`."""

    # A dangling combinator and an empty argument list.
    for selector in (':has(>)', ':has()'):
        with self.assertRaises(SyntaxError):
            sv.compile(selector)
def test_invalid_pseudo_close(self):
    """Check that bad pseudo-class closings raise `SyntaxError`."""

    # An unmatched `)` and a trailing comma before the closing `)`.
    for selector in ('div)', ':is(div,)'):
        with self.assertRaises(SyntaxError):
            sv.compile(selector)
def test_invalid_pseudo(self):
    """Check the errors raised for invalid pseudo-classes."""

    # Each entry pairs the expected exception with the selector that triggers it.
    cases = (
        (NotImplementedError, ':before'),
        (SyntaxError, ':nth-child(a)'),
    )
    for expected, selector in cases:
        with self.assertRaises(expected):
            sv.compile(selector)
def test_invalid_tag(self):
    """
    Check that a misplaced tag raises `SyntaxError`.

    Tag must come first.
    """

    self.assertRaises(SyntaxError, sv.compile, ':is(div)p')
def test_syntax_error_has_text_and_position(self):
    """Test that selector syntax errors contain the position."""

    selector = 'input.field[type=42]'

    with self.assertRaises(sv.SelectorSyntaxError) as cm:
        sv.compile(selector)

    error = cm.exception

    # The asserted column is 12 (the `[` of the attribute selector), so the
    # caret on the second context line is expected beneath that character,
    # i.e. after 11 padding spaces.
    expected_context = selector + '\n' + ' ' * 11 + '^'

    self.assertEqual(error.context, expected_context)
    self.assertEqual(error.line, 1)
    self.assertEqual(error.col, 12)
def test_syntax_error_with_multiple_lines(self):
    """Test that multiline selector errors have the right position."""

    with self.assertRaises(sv.SelectorSyntaxError) as cm:
        sv.compile(
            'input\n'
            '.field[type=42]'
        )

    error = cm.exception

    # The failing line is marked with a four character `--> ` prefix, so other
    # lines are expected to be padded by four spaces to stay aligned. The
    # asserted column is 7 (the `[`), which places the caret after
    # 4 + 6 = 10 padding spaces.
    expected_context = (
        '    input\n'
        '--> .field[type=42]\n'
        + ' ' * 10 + '^'
    )

    self.assertEqual(error.context, expected_context)
    self.assertEqual(error.line, 2)
    self.assertEqual(error.col, 7)
def setup(self):
    """Initialise the XML filter state from `self.config`."""

    config = self.config

    def compile_selectors(key):
        """Join the selector list stored under `key` and compile it with the configured namespaces."""
        return sv.compile(','.join(config[key]), config['namespaces'])

    self.user_break_tags = set(config['break_tags'])
    self.comments = config['comments']
    self.attributes = set(config['attributes'])

    # Both the parser and the document type are fixed to XML.
    self.parser = self.type = 'xml'

    self.ignores = compile_selectors('ignores')
    self.captures = compile_selectors('captures')
def compile(selector: str) -> "Selector":
    """
    Parse an extended selector expression into a `Selector`.

    The expression is split on `|` characters that are neither preceded by a
    backslash nor enclosed in parentheses. The first segment has the form
    `<css selector>@<attribute>` (the attribute is optional); each following
    segment may carry one of the options `regx:`, `fmt:`, `cvt:` or `defval:`.

    Raises `ValueError` when the CSS part is rejected by `soupsieve`.
    """

    # --- split on unescaped, top level `|` ------------------------------
    segments = []
    buffer = []
    depth = 0
    previous = None
    for ch in selector:
        escaped = previous == '\\'
        if ch == '|' and not escaped and depth == 0:
            # Segment boundary; `previous` is intentionally left untouched.
            segments.append(''.join(buffer))
            buffer = []
            continue
        if not escaped:
            if ch == '(':
                depth += 1
            elif ch == ')':
                depth -= 1
        previous = ch
        buffer.append(ch)
    segments.append(''.join(buffer))

    head, *modifiers = [segment.strip() for segment in segments]

    # --- `<css selector>@<attribute>` -----------------------------------
    head_fields = [field.strip() for field in head.split('@')]
    pattern = head_fields[0]
    attr = head_fields[1] if len(head_fields) > 1 else None

    # --- optional modifiers ---------------------------------------------
    options = {'regx': None, 'fmt': None, 'cvt': None, 'defval': None}
    extractors = (
        ('regx', r"regx:\s*(.*)\s*"),
        ('fmt', r"fmt:\s*(.*)\s*"),
        ('cvt', r"cvt:\s*(\w+)\s*"),
        ('defval', r"defval:(.*)\s*"),
    )
    for modifier in modifiers:
        for key, regex in extractors:
            if not modifier.startswith(key + ':'):
                continue
            found = match(regex, modifier)
            if found:
                options[key] = found.group(1).strip()
            # Only the first matching prefix is considered.
            break

    # --- validate the CSS part ------------------------------------------
    if pattern:
        try:
            soupsieve.compile(pattern)
        except soupsieve.SelectorSyntaxError as e:
            raise ValueError("Syntax error: {}".format(e)) from e

    return Selector(
        pattern,
        attr,
        options['regx'],
        options['fmt'],
        options['cvt'],
        options['defval'],
    )
def test_quirks_warn_attribute_unquoted(self):
    """Test that quirks mode raises a warning with attribute values that normally should be quoted."""

    # Start from an empty pattern cache so the selector below is really
    # compiled here; presumably a cached pattern is returned without parsing
    # and would therefore emit no warning, making the test order dependent.
    sv.purge()

    with warnings.catch_warnings(record=True) as w:
        # Cause all warnings to always be triggered.
        warnings.simplefilter("always")
        # Trigger a warning.
        sv.compile('[data={}]', flags=sv._QUIRKS)
        # Verify some things
        self.assertEqual(len(w), 1)
        self.assertTrue(issubclass(w[-1].category, sv_util.QuirksWarning))
def test_quirks_warn_relative_combinator(self):
    """Check that quirks mode warns about a leading relative combinator."""

    # Empty the pattern cache before compiling.
    sv.purge()

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning, even ones that were already emitted once.
        warnings.simplefilter("always")

        # This selector should emit exactly one quirks warning.
        sv.compile('> p', flags=sv._QUIRKS)

        self.assertEqual(len(caught), 1)
        latest = caught[-1]
        self.assertTrue(issubclass(latest.category, sv_util.QuirksWarning))
def __init__(
    self,
    client,
    *,
    initial_paths: Optional[Iterable[str]] = None,
    rules: Optional[Iterable[Rule]] = None,
    path_attrs: Iterable[str] = (HREF, ),
    ignore_css_selectors: Optional[Iterable] = None,
    ignore_form_fields: Optional[Iterable[str]] = None,
    max_requests: Optional[int] = None,
    capture_exceptions: bool = True,
    output_summary: bool = True,
    should_process_handlers: Optional[Iterable[Callable]] = None,
    check_response_handlers: Optional[Iterable[Callable]] = None,
):
    """
    Store the configuration and prepare the internal data structures.

    Every iterable argument is copied into a list (or tuple), and `None`
    is treated as "empty". Each entry of `ignore_css_selectors` is compiled
    with `soupsieve` up front so invalid selectors fail early.

    Raises `ValueError` (chained to the `soupsieve.SelectorSyntaxError`)
    when one of the CSS selectors is invalid.
    """

    # params
    self._client = client
    self.initial_paths = list(initial_paths or [])
    self.rules = list(rules or [])
    self.path_attrs = tuple(path_attrs)
    self.ignore_css_selectors = list(ignore_css_selectors or [])
    self.ignore_form_fields = list(ignore_form_fields or [])
    self.max_requests = max_requests
    self.capture_exceptions = capture_exceptions
    self.output_summary = output_summary

    # data structures
    self.queue: Queue = Queue()
    self.graph = DirectedGraph()
    self.tracebacks: List = []

    # handler lists
    self.should_process_handlers = list(should_process_handlers or [])
    self.check_response_handlers = list(check_response_handlers or [])

    # check css selectors
    for selector in self.ignore_css_selectors:
        try:
            soupsieve.compile(selector)
        except soupsieve.SelectorSyntaxError as e:
            msg = f"Invalid CSS selector '{selector}' (see parent exception)"
            raise ValueError(msg) from e

    # detect client and construct wrapper
    # Hand over the normalised list rather than the raw argument: the raw
    # value may be None, or a one-shot iterator that was already consumed
    # when the list above was built.
    self.client = detect_and_wrap_client(self._client, self.ignore_css_selectors)

    # get logger
    self.logger = logging.getLogger(LOGGER_NAME)
def setup(self):
    """Initialise the filter state, its XML namespaces and its selectors."""

    self.additional_context = ''
    self.comments = False
    self.attributes = []
    self.parser = 'xml'
    self.type = None
    self.filepattern = 'content.xml'

    # Namespace prefixes made available to the capture selectors.
    self.namespaces = dict(
        text='urn:oasis:names:tc:opendocument:xmlns:text:1.0',
        draw='urn:oasis:names:tc:opendocument:xmlns:drawing:1.0',
    )

    # The ignore pattern is compiled from an empty selector.
    self.ignores = sv.compile('', {})

    capture_selector = ','.join(self.default_capture)
    self.captures = sv.compile(capture_selector, self.namespaces)
def test_compiled_icomments(self):
    """Check the comment iterator of a compiled pattern with a limit of two."""

    markup = """ <!-- before header --> <html> <head> </head> <body> <!-- comment --> <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p> <pre id="4"></pre> <p><span id="5" class="some-class"></span><span id="some-id"></span></p> <pre id="6" class='ignore'> <!-- don't ignore --> </pre> </body> </html> """

    soup = self.soup(markup, 'html5lib')
    pattern = sv.compile('div', None, 0)

    found = []
    for comment in pattern.icomments(soup, limit=2):
        found.append(sv_util.ustr(comment).strip())

    expected = ['before header', 'comment']
    self.assertEqual(sorted(found), sorted(expected))
def test_compiled_comments(self):
    """Check that a compiled pattern returns all comments in the document."""

    markup = """ <!-- before header --> <html> <head> </head> <body> <!-- comment --> <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p> <pre id="4"></pre> <p><span id="5" class="some-class"></span><span id="some-id"></span></p> <pre id="6" class='ignore'> <!-- don't ignore --> </pre> </body> </html> """

    soup = self.soup(markup, 'html.parser')

    # Check that comments on compiled object work just like `sv.comments`
    pattern = sv.compile('div', None, 0)

    found = []
    for comment in pattern.comments(soup):
        found.append(sv_util.ustr(comment).strip())

    expected = ['before header', 'comment', "don't ignore"]
    self.assertEqual(sorted(found), sorted(expected))
def setup(self):
    """Initialise the filter state from `self.config`."""

    config = self.config

    self.user_break_tags = set(config['break_tags'])
    # Only a literal `True` (or a missing key) enables comments.
    self.comments = config.get('comments', True) is True
    self.attributes = set(config['attributes'])

    # Unknown modes fall back to HTML.
    mode = config['mode']
    self.type = mode if mode in MODE else 'html'
    self.parser = MODE[self.type]

    # Selector lists that join to nothing but whitespace leave the pattern unset.
    ignores = ','.join(config['ignores'])
    if ignores.strip():
        self.ignores = sv.compile(ignores, config['namespaces'])
    else:
        self.ignores = None

    captures = ','.join(config['captures'])
    if captures.strip():
        self.captures = sv.compile(captures, config['namespaces'])
    else:
        self.captures = None
def compile_pattern(self, selectors, namespaces=None, flags=0):
    """
    Compile `selectors` with debugging enabled.

    The selector is printed first. `sv.DEBUG` is always added to `flags`,
    and `sv._QUIRKS` is added as well when `self.quirks` is truthy.
    """

    print('PATTERN: ', selectors)

    extra_flags = sv.DEBUG | (sv._QUIRKS if self.quirks else 0)
    return sv.compile(selectors, namespaces=namespaces, flags=flags | extra_flags)
def parse(selector: str) -> "Selector":
    """
    Parse an expression of the form

    <css selector>@<attribute> | re: <matcher> | fmt: <formatter>

    The expression is split on `|` characters that are neither preceded by a
    backslash nor enclosed in parentheses. The CSS part, when not empty, is
    compiled with `soupsieve` so syntax errors surface here.
    """

    # Collect the characters of each `|` separated piece.
    pieces = [[]]
    depth = 0
    prev = None
    for ch in selector:
        unescaped = prev != '\\'
        if unescaped and ch == '|' and depth == 0:
            # Start a new piece; `prev` is intentionally left untouched.
            pieces.append([])
            continue
        if unescaped and ch == '(':
            depth += 1
        elif unescaped and ch == ')':
            depth -= 1
        prev = ch
        pieces[-1].append(ch)

    head, *filters = (''.join(piece).strip() for piece in pieces)

    # `<css selector>@<attribute>`; the attribute is optional.
    fields = [field.strip() for field in head.split('@')]
    css = fields[0]
    attr = fields[1] if len(fields) > 1 else None

    regex = fmt = None
    for item in filters:
        if item.startswith('re:'):
            found = match(r"re:\s*(.*)\s*", item)
            if found:
                regex = found.group(1).strip()
        elif item.startswith('fmt:'):
            found = match(r"fmt:\s*(.*)\s*", item)
            if found:
                fmt = found.group(1).strip()

    # Validate the CSS part; the compiled pattern itself is discarded.
    if css:
        soupsieve.compile(css)

    return Selector(css, attr, regex, fmt)
def setup(self):
    """Initialise the filter state with no file pattern and no capture pattern."""

    self.additional_context = ''
    self.comments = False
    self.attributes = []

    # Content is parsed as XML; the document type starts out unset.
    self.parser = 'xml'
    self.type = None
    self.filepattern = ''

    # The ignore pattern is compiled from an empty selector.
    empty_pattern = sv.compile('', {})
    self.ignores = empty_pattern
    self.captures = None
def test_invalid_combination(self):
    """
    Check that invalid combinator usage raises `SyntaxError`.

    Selectors cannot start with relational symbols unless in `:has()`.
    `:has()` cannot start with `,`.
    """

    invalid_selectors = (
        '> p',
        ', p',
        ':has(, p)',
        'div >> p',
        'div >',
    )
    for selector in invalid_selectors:
        with self.assertRaises(SyntaxError):
            sv.compile(selector)
def test_cache(self):
    """Check that the compile cache fills to 500 entries and can be purged."""

    def cached_count():
        """Return the current number of entries in the compile cache."""
        return sv.cp._cached_css_compile.cache_info().currsize

    sv.purge()
    self.assertEqual(cached_count(), 0)

    # Compile many random selectors to fill the cache.
    for _ in range(1000):
        number = random.randint(1, 10000)
        value = '[value="{}"]'.format(sv_util.ustr(number))
        compiled = sv.compile(value)
        self.assertEqual(compiled.pattern, value)
        self.assertGreater(cached_count(), 0)

    self.assertEqual(cached_count(), 500)

    sv.purge()
    self.assertEqual(cached_count(), 0)
def test_recompile(self):
    """Check that a compiled pattern passes through `compile` unless parameters are given."""

    compiled = sv.compile('p[id]')
    self.assertIs(compiled, sv.compile(compiled))

    # Supplying any of these parameters with a compiled pattern is an error.
    rejected_parameters = (
        {'flags': sv.DEBUG},
        {'namespaces': {"": ""}},
        {'custom': {":--header": 'h1, h2, h3, h4, h5, h6'}},
    )
    for parameters in rejected_parameters:
        with pytest.raises(ValueError):
            sv.compile(compiled, **parameters)
def test_icomment_compilied(self):
    """Check that calling `icomments` on a compiled pattern records one `DeprecationWarning`."""

    html = '<div><!-- comments -->text</div>'
    soup = self.soup(html, 'html.parser')

    with warnings.catch_warnings(record=True) as caught:
        # Cause all warnings to always be triggered.
        warnings.simplefilter("always")

        sv.compile('div').icomments(soup)

        self.assertEqual(len(caught), 1)
        latest = caught[-1]
        self.assertTrue(issubclass(latest.category, DeprecationWarning))
def test_recompile(self):
    """Check that a compiled pattern passes through `compile` unless flags or namespaces are given."""

    compiled = sv.compile('p[id]')
    self.assertIs(compiled, sv.compile(compiled))

    # Supplying either parameter with a compiled pattern is an error.
    for parameters in ({'flags': 0x10}, {'namespaces': {"": ""}}):
        with pytest.raises(ValueError):
            sv.compile(compiled, **parameters)
def determine_file_type(self, z):
    """
    Determine file type.

    Reads `[Content_Types].xml` from `z`, looks for the first `Override`
    element whose `PartName` starts with `/<key>/` for a key in `MIMEMAP`,
    and stores the mapped value in `self.type`. The file pattern,
    namespaces and capture pattern are then taken from `DOC_PARAMS`.

    Raises `KeyError` from the `DOC_PARAMS` lookup when `self.type` is not
    one of its keys (for example when no `Override` matched).
    """

    content = z.read('[Content_Types].xml')
    with io.BytesIO(content) as b:
        encoding = self._analyze_file(b)
        if encoding is None:
            encoding = 'utf-8'
        # Rewind before reading the content for decoding.
        b.seek(0)
        text = b.read().decode(encoding)
        soup = bs4.BeautifulSoup(text, 'xml')
        for o in soup.find_all('Override'):
            name = o.attrs.get('PartName')
            if name is None:
                # An entry without a part name has nothing to match
                # against; skip it instead of failing on `None`.
                continue
            for k, v in MIMEMAP.items():
                if name.startswith('/{}/'.format(k)):
                    self.type = v
                    break
            if self.type:
                break

    params = DOC_PARAMS[self.type]
    self.filepattern = params['filepattern']
    self.namespaces = params['namespaces']
    self.captures = sv.compile(params['captures'], params['namespaces'])
def __init__(self, css_selector, attr, string_pattern):
    """
    Store the compiled CSS selector together with `attr` and `string_pattern`.

    `css_selector` is compiled immediately, so an error raised by
    `sv.compile` propagates from the constructor.
    """

    compiled_selector = sv.compile(css_selector)
    self.css_selector = compiled_selector
    self.attr = attr
    self.string_pattern = string_pattern
def test_invalid_syntax(self):
    """Check that an unrecognised character raises `SyntaxError`."""

    self.assertRaises(SyntaxError, sv.compile, 'div?')
def test_at_rule(self):
    """Check that at-rules (not supported) raise `NotImplementedError`."""

    self.assertRaises(NotImplementedError, sv.compile, '@page :left')