def test_join(self):
    """Check the `join` directive, both at the top level and inside `fields`."""
    # Top-level: every iterated cell is joined into a single string.
    flat_scraper = {'iterator': 'td', 'join': '|'}
    assert scrape(flat_scraper, TABLE_TH_HTML) == 'John|Mayall|Mary|Susan'

    # Nested: each row exposes its first cell and its joined cells.
    row_scraper = {
        'iterator': 'tbody > tr',
        'fields': {
            'name': {'sel': 'td'},
            'joined': {'iterator': 'td', 'join': '|'},
        },
    }
    expected_rows = [
        {'name': 'John', 'joined': 'John|Mayall'},
        {'name': 'Mary', 'joined': 'Mary|Susan'},
    ]
    assert scrape(row_scraper, TABLE_TH_HTML) == expected_rows
def test_eval(self):
    """Check `eval` with the `root` and `element` variables and multi-line code."""
    # `root` is usable from within an iterated item.
    root_scraper = {
        'iterator': 'li',
        'fields': {
            'root_id': {'eval': 'root.select_one("#ok").get("id")'},
        },
    }
    root_ids = scrape(root_scraper, META_HTML)
    assert root_ids == [{'root_id': 'ok'}, {'root_id': 'ok'}]

    # Multi-line code returning a computed value.
    arithmetic_scraper = {
        'sel': 'li',
        'item': {'eval': 'a = 45\nreturn a + 10'},
    }
    assert scrape(arithmetic_scraper, BASIC_HTML) == 55

    # Multi-line code working from `element`.
    element_scraper = {
        'item': {
            'eval': 'el = element.select_one("li")\nreturn el.get_text().strip()'
        }
    }
    assert scrape(element_scraper, BASIC_HTML) == 'One'
def test_leaf(self):
    """Check scrapers that define no `item`: with a selector, then empty."""
    selected = scrape({'sel': 'li'}, BASIC_HTML)
    assert selected == 'One'

    whole_document = scrape({}, BASIC_HTML)
    assert whole_document == 'One\nTwo'
def worker(payload):
    """Scrape one document described by `payload`.

    `payload` is unpacked as the 6-tuple
    `(row, headers, path, encoding, content, scraper)`. When `content` is
    None, it is read from `path` (gunzipped first if the path ends with
    '.gz') and decoded with `encoding`.

    Returns a `ScrapeWorkerResult(error, items)`: `(None, items)` on
    success, or `(exception, None)` if a UnicodeDecodeError is raised while
    reading the file.
    """
    row, headers, path, encoding, content, scraper = payload

    # No in-memory content was given: load it from disk
    if content is None:
        try:
            if not path.endswith('.gz'):
                with codecs.open(path, 'r', encoding=encoding, errors='replace') as f:
                    content = f.read()
            else:
                with open(path, 'rb') as f:
                    raw = gzip.decompress(f.read())
                content = raw.decode(encoding, errors='replace')
        except UnicodeDecodeError as e:
            return ScrapeWorkerResult(e, None)

    # Context handed to the scraper
    context = {}

    if row:
        context['line'] = LazyLineDict(headers, row)

    if path:
        context.update(path=path, basename=basename(path))

    # Running the scraper
    return ScrapeWorkerResult(None, scrape(scraper, content, context=context))
def test_conditional_eval(self):
    """Check that `iterator_eval` can choose its selection per parent element."""
    html = ''' <main> <div id="colors"> <p>Red</p> <p>Blue</p> </div> <div id="animals"> <ul> <li>Tiger</li> <li>Dog</li> </ul> </div> </main> '''

    # Paragraphs are picked under #colors, list items everywhere else.
    pick_children = 'element.select("p") if element.get("id") == "colors" else element.select("li")'

    scraper = {
        'iterator': 'div',
        'fields': {
            'kind': 'id',
            'items': {'iterator_eval': pick_children},
        },
    }

    expected = [
        {'kind': 'colors', 'items': ['Red', 'Blue']},
        {'kind': 'animals', 'items': ['Tiger', 'Dog']},
    ]

    assert scrape(scraper, html) == expected
def test_eval_errors(self):
    """Check the exceptions raised by failing or ill-typed evaluations."""

    def hellraiser(**kwargs):
        raise RuntimeError

    def iterator(element, **kwargs):
        return [element.select_one('li'), 45]

    # Code referencing an undefined name.
    undefined_name_scraper = {'iterator': 'li', 'item': {'eval': 'item.split()'}}
    with pytest.raises(ScraperEvalError) as info:
        scrape(undefined_name_scraper, BASIC_HTML)

    assert isinstance(info.value.reason, NameError)
    assert info.value.path == ['item', 'eval']

    # Callable raising on its own.
    raising_scraper = {'iterator': 'li', 'item': {'eval': hellraiser}}
    with pytest.raises(ScraperEvalError) as info:
        scrape(raising_scraper, BASIC_HTML)

    assert isinstance(info.value.reason, RuntimeError)
    assert info.value.path == ['item', 'eval']

    # `sel_eval` returning a value of the wrong type.
    with pytest.raises(ScraperEvalTypeError) as info:
        scrape({'sel_eval': '45'}, BASIC_HTML)

    assert info.value.expected == (Tag, str)
    assert info.value.got == 45
    assert info.value.path == ['sel_eval']

    # `iterator_eval` returning None.
    with pytest.raises(ScraperEvalNoneError) as info:
        scrape({'iterator_eval': 'None'}, BASIC_HTML)

    assert info.value.path == ['iterator_eval']

    # `iterator_eval` returning a list that mixes an element and an int.
    with pytest.raises(ScraperEvalTypeError) as info:
        scrape({'iterator_eval': iterator}, BASIC_HTML)

    assert info.value.path == ['iterator_eval']
def test_recursive(self):
    """Check an iterator nested inside another iterator's `item`."""
    scraper = {
        'iterator': 'li',
        'item': {'iterator': 'span'},
    }

    spans_per_item = scrape(scraper, NESTED_HTML)

    assert spans_per_item == [['One', '1'], ['Two', '2']]
def test_inexistent_selection(self):
    """Check that a selector matching nothing produces None for that field."""
    expected = [{'id': 'li1', 'empty': None}, {'id': 'li2', 'empty': None}]

    # Bare selection
    bare_fields = {'id': 'id', 'empty': {'sel': 'blockquote'}}
    assert scrape({'iterator': 'li', 'fields': bare_fields}, BASIC_HTML) == expected

    # Selection followed by an extraction
    text_fields = {'id': 'id', 'empty': {'sel': 'blockquote', 'item': 'text'}}
    assert scrape({'iterator': 'li', 'fields': text_fields}, BASIC_HTML) == expected

    # Same fields, but reached through `sel` rather than `iterator`
    single_fields = {'id': 'id', 'empty': {'sel': 'blockquote', 'item': 'text'}}
    single = scrape({'sel': 'li', 'fields': single_fields}, BASIC_HTML)
    assert single == expected[0]
def test_selection_eval(self):
    """Check `sel_eval` / `iterator_eval` returning elements or selector strings."""
    # `sel_eval` returning an element
    element_sel = {
        'iterator': 'li',
        'item': {'sel_eval': 'element.select_one("span")'},
    }
    assert scrape(element_sel, NESTED_HTML) == ['One', 'Two']

    # `iterator_eval` returning a list of elements
    element_iter = {
        'iterator_eval': 'element.select("li") + element.select("span")',
        'item': {'attr': 'class'},
    }
    assert scrape(element_iter, NESTED_HTML) == [
        ['li'], ['li'], ['first'], ['second'], ['first'], ['second']
    ]

    # `sel_eval` returning a selector string
    string_sel = {
        'iterator': 'li',
        'item': {'sel_eval': '"span"'},
    }
    assert scrape(string_sel, NESTED_HTML) == ['One', 'Two']

    # `iterator_eval` returning a selector string
    string_iter = {
        'iterator_eval': '"li, span"',
        'item': {'attr': 'class'},
    }
    assert scrape(string_iter, NESTED_HTML) == [
        ['li'], ['first'], ['second'], ['li'], ['first'], ['second']
    ]
def test_uniq(self):
    """Check `uniq` on plain values (True) and on a named field."""
    by_value = {'iterator': 'li', 'item': 'id', 'uniq': True}
    assert scrape(by_value, REPETITIVE_HTML) == ['li1', 'li3']

    by_field = {'iterator': 'li', 'fields': {'id': 'id'}, 'uniq': 'id'}
    assert scrape(by_field, REPETITIVE_HTML) == [{'id': 'li1'}, {'id': 'li3'}]
def test_scope(self):
    """Check that `scope` keeps state across the evaluations of one scrape.

    The evaluated code increments `scope.x` for each iterated `li` and
    returns the new value, so the results should count up from 1.
    """
    result = scrape(
        {
            'iterator': 'li',
            'item': {
                'eval': 'x = scope.x or 0\nscope.x = x + 1\nreturn scope.x'
            }
        },
        REPETITIVE_HTML
    )

    # The original `assert [1, 2, 3]` only asserted a non-empty list literal,
    # which is always true; compare against the scraped result instead.
    # NOTE(review): assumes REPETITIVE_HTML holds exactly three `li` elements,
    # as the literal in the original assertion suggests — confirm.
    assert result == [1, 2, 3]
def test_filter(self):
    """Check `filter` / `filter_eval` on plain values and on fields."""
    # NOTE(review): another `test_filter` is defined further down in this
    # source; if both live in the same class, the later one replaces this
    # definition and this body never runs — confirm.

    # Without any filter, the missing id comes through as None.
    unfiltered = {'iterator': 'li', 'item': 'id'}
    assert scrape(unfiltered, HOLEY_HTML) == ['li1', None, 'li3']

    # Filtering through evaluated code.
    evaluated = {'iterator': 'li', 'item': 'id', 'filter_eval': 'bool(value)'}
    assert scrape(evaluated, HOLEY_HTML) == ['li1', 'li3']

    # Filtering with `True`.
    truthy = {'iterator': 'li', 'item': 'id', 'filter': True}
    assert scrape(truthy, HOLEY_HTML) == ['li1', 'li3']

    # With `fields`, `filter: True` keeps every item, including the None id.
    truthy_fields = {'$$': 'li', 'fields': {'id': 'id'}, 'filter': True}
    assert scrape(truthy_fields, HOLEY_HTML) == [
        {'id': 'li1'}, {'id': None}, {'id': 'li3'}
    ]

    # Filtering on a named field.
    named_field = {'iterator': 'li', 'fields': {'id': 'id'}, 'filter': 'id'}
    assert scrape(named_field, HOLEY_HTML) == [{'id': 'li1'}, {'id': 'li3'}]
def test_callable_eval(self):
    """Check that `eval` accepts a Python callable receiving `value`."""

    def process(value, **kwargs):
        return value.upper()

    scraper = {
        'iterator': 'li',
        'item': {'eval': process},
    }

    assert scrape(scraper, BASIC_HTML) == ['ONE', 'TWO']
def test_dumb_recursive(self):
    """Check chained `sel` directives nested through `item`."""
    innermost = {'sel': 'span'}
    middle = {'sel': 'li', 'item': innermost}
    scraper = {'sel': 'ul', 'item': middle}

    assert scrape(scraper, NESTED_HTML) == 'One'
def test_transform(self):
    """Check `transform` given as a single name and as a list of names."""
    single = {
        'iterator': 'li',
        'item': {'extract': 'text', 'transform': 'upper'},
    }
    assert scrape(single, BASIC_HTML) == ['ONE', 'TWO']

    # Expected output is lowercase for the ['upper', 'lower'] list.
    chained = {
        'iterator': 'li',
        'item': {'extract': 'text', 'transform': ['upper', 'lower']},
    }
    assert scrape(chained, BASIC_HTML) == ['one', 'two']
def test_nested_local_context(self):
    """Check two ways of attaching a parent's attribute to nested items.

    First through `set_context` / `get_context` in a nested iterator, then
    through a `sel_eval` climbing back to the parent `div`.
    """
    html = ''' <div data-topic="science"> <ul> <li> <p> Post n°<strong>1</strong> by <em>Allan</em> </p> </li> <li> <p> Post n°<strong>2</strong> by <em>Susan</em> </p> </li> </ul> </div> <div data-topic="arts"> <ul> <li> <p> Post n°<strong>3</strong> by <em>Josephine</em> </p> </li> <li> <p> Post n°<strong>4</strong> by <em>Peter</em> </p> </li> </ul> </div> '''

    science_posts = [
        {'topic': 'science', 'post': '1', 'author': 'Allan'},
        {'topic': 'science', 'post': '2', 'author': 'Susan'},
    ]
    arts_posts = [
        {'topic': 'arts', 'post': '3', 'author': 'Josephine'},
        {'topic': 'arts', 'post': '4', 'author': 'Peter'},
    ]

    # Local context set per div: one list of posts per topic.
    context_scraper = {
        'iterator': 'div',
        'item': {
            'set_context': {'topic': 'data-topic'},
            'iterator': 'li > p',
            'fields': {
                'topic': {'get_context': 'topic'},
                'post': {'sel': 'strong'},
                'author': {'sel': 'em'},
            },
        },
    }
    assert scrape(context_scraper, html) == [science_posts, arts_posts]

    # Parent lookup from each li: a single flat list of posts.
    parent_scraper = {
        'iterator': 'li',
        'fields': {
            'topic': {
                'sel_eval': 'element.find_parent("div")',
                'attr': 'data-topic',
            },
            'post': {'sel': 'strong'},
            'author': {'sel': 'em'},
        },
    }
    assert scrape(parent_scraper, html) == science_posts + arts_posts
def test_absent_tail_call(self):
    """Check that `fields` under a selector matching nothing yields None."""
    scraper = {
        'sel': 'quote',
        'fields': {'url': 'href'},
    }

    assert scrape(scraper, BASIC_HTML) is None
def test_stripped_extraction(self):
    """Check that extracted text has its surrounding whitespace removed."""
    padded_html = '<div> Hello world </div>'

    assert scrape({'sel': 'div'}, padded_html) == 'Hello world'
def test_filter(self):
    """Check `filter` / `filter_eval` on values, fields and nested field paths."""
    # Without any filter, the missing id comes through as None.
    unfiltered = {'iterator': 'li', 'item': 'id'}
    assert scrape(unfiltered, HOLEY_HTML) == ['li1', None, 'li3']

    # Filtering through evaluated code.
    evaluated = {'iterator': 'li', 'item': 'id', 'filter_eval': 'bool(value)'}
    assert scrape(evaluated, HOLEY_HTML) == ['li1', 'li3']

    # Filtering with `True`.
    truthy = {'iterator': 'li', 'item': 'id', 'filter': True}
    assert scrape(truthy, HOLEY_HTML) == ['li1', 'li3']

    # With `fields`, `filter: True` keeps every item, including the None id.
    truthy_fields = {'$$': 'li', 'fields': {'id': 'id'}, 'filter': True}
    assert scrape(truthy_fields, HOLEY_HTML) == [
        {'id': 'li1'}, {'id': None}, {'id': 'li3'}
    ]

    # Filtering on a named field.
    named_field = {'iterator': 'li', 'fields': {'id': 'id'}, 'filter': 'id'}
    assert scrape(named_field, HOLEY_HTML) == [{'id': 'li1'}, {'id': 'li3'}]

    # Filtering on a dotted path into nested fields.
    target_html = ''' <ul> <li color="blue" age="34">John</li> <li age="45">Mary</li> <li color="purple" age="23">Susan </li> </ul> '''

    nested_path = {
        'iterator': 'li',
        'fields': {
            'name': 'text',
            'attributes': {
                'fields': {'color': 'color', 'age': 'age'},
            },
        },
        'filter': 'attributes.color',
    }

    # Mary has no `color` attribute and is absent from the expected output.
    assert scrape(nested_path, target_html) == [
        {'name': 'John', 'attributes': {'color': 'blue', 'age': '34'}},
        {'name': 'Susan', 'attributes': {'color': 'purple', 'age': '23'}},
    ]
def test_context(self):
    """Check reading the context via `eval` / `get_context` and writing it via `set_context`."""
    # A caller-provided context read through `eval`, then through `get_context`.
    expected_from_caller = [
        {'text': 'One', 'context': 1},
        {'text': 'Two', 'context': 1},
    ]

    for context_reader in ({'eval': 'context["value"]'}, {'get_context': 'value'}):
        scraper = {
            'iterator': 'li',
            'fields': {
                'text': {'method': 'text'},
                'context': context_reader,
            },
        }
        result = scrape(scraper, BASIC_HTML, context={'value': 1})
        assert list(result) == expected_from_caller

    # Context set by the scraper itself from a selection.
    self_set_scraper = {
        'set_context': {
            'divid': {'$': '#ok', 'attr': 'id'},
        },
        'iterator': 'li',
        'fields': {
            'context': {'get_context': 'divid'},
            'value': 'text',
        },
    }
    assert scrape(self_set_scraper, META_HTML) == [
        {'context': 'ok', 'value': 'One'},
        {'context': 'ok', 'value': 'Two'},
    ]

    # A `set_context` inside one field: the sibling `global` field is still
    # expected to read the caller-provided 'notok'.
    layered_scraper = {
        'set_context': {
            'title': {'default': 'Scrape'},
        },
        'iterator': 'li',
        'fields': {
            'local': {
                'set_context': {
                    'divid': {'eval': 'root.select_one("#ok").get("id")'},
                },
                'get_context': 'divid',
            },
            'global': {'get_context': 'divid'},
            'title': {'get_context': 'title'},
        },
    }
    layered = scrape(layered_scraper, META_HTML, context={'divid': 'notok'})
    assert layered == [
        {'local': 'ok', 'global': 'notok', 'title': 'Scrape'},
        {'local': 'ok', 'global': 'notok', 'title': 'Scrape'},
    ]
def test_basics(self):
    """Check the core scraping directives.

    Covers `iterator`, `sel`, `item` given as a string or a dict, `attr`,
    `eval`, `fields`, the `extract` modes and `default`.
    """
    # Iterating, with the default extraction
    result = scrape({'iterator': 'li'}, BASIC_HTML)
    assert result == ['One', 'Two']

    # Attribute extraction, shorthand then explicit
    result = scrape({'iterator': 'li', 'item': 'id'}, BASIC_HTML)
    assert result == ['li1', 'li2']

    result = scrape({'iterator': 'li', 'item': {'attr': 'id'}}, BASIC_HTML)
    assert result == ['li1', 'li2']

    # Single selection, alone then combined with an iterator
    result = scrape({'sel': '#ok', 'item': 'id'}, META_HTML)
    assert result == 'ok'

    result = scrape({'sel': '#ok', 'iterator': 'li', 'item': 'id'}, META_HTML)
    assert result == ['li1', 'li2']

    # Evaluation working from `element`
    result = scrape(
        {
            'iterator': 'li',
            'item': {
                'eval': 'element.get("id") + "-ok"'
            }
        },
        BASIC_HTML
    )
    assert result == ['li1-ok', 'li2-ok']

    # Evaluation working from the previously extracted `value`
    result = scrape(
        {
            'iterator': 'li',
            'item': {
                'attr': 'id',
                'eval': 'value + "-test"'
            }
        },
        BASIC_HTML
    )
    # The original line was a bare `result == [...]` comparison whose outcome
    # was discarded, so this case was never actually checked.
    assert result == ['li1-test', 'li2-test']

    # Fields
    result = scrape(
        {
            'iterator': 'li',
            'fields': {
                'id': 'id',
                'text': 'text'
            }
        },
        BASIC_HTML
    )
    assert result == [
        {'id': 'li1', 'text': 'One'},
        {'id': 'li2', 'text': 'Two'}
    ]

    # Fields with their own sub-selection
    result = scrape(
        {
            'iterator': 'li',
            'fields': {
                'label': {'sel': '.first'},
                'number': {'sel': '.second'}
            }
        },
        NESTED_HTML
    )
    assert result == [
        {'number': '1', 'label': 'One'},
        {'number': '2', 'label': 'Two'}
    ]

    # HTML extraction modes
    result = scrape(
        {
            'iterator': 'li',
            'fields': {
                'inner': {'extract': 'inner_html'},
                'outer': {'extract': 'outer_html'}
            }
        },
        NESTED_HTML
    )
    assert result == [
        {
            'inner': '<span class="first">One</span> <span class="second">1</span>',
            'outer': '<li class="li" id="li1"><span class="first">One</span> <span class="second">1</span></li>'
        },
        {
            'inner': '<span class="first">Two</span> <span class="second">2</span>',
            'outer': '<li class="li" id="li2"><span class="first">Two</span> <span class="second">2</span></li>'
        }
    ]

    # Display text extraction
    result = scrape(
        {'item': {'extract': 'display_text'}},
        '<p>Hello</p><p>World</p>'
    )
    assert result == 'Hello\n\nWorld'

    # Constant value through `default`
    result = scrape(
        {
            'iterator': 'li',
            'fields': {
                'value': 'text',
                'constant': {'default': 'Same'}
            }
        },
        BASIC_HTML
    )
    assert result == [
        {'value': 'One', 'constant': 'Same'},
        {'value': 'Two', 'constant': 'Same'}
    ]

    # `default` standing in for an attribute that is not found
    result = scrape(
        {
            'iterator': 'li',
            'item': {
                'attr': 'class',
                'default': 'no-class'
            }
        },
        BASIC_HTML
    )
    assert result == ['no-class', 'no-class']