def test_null(Html, freezetime):
    """A missing element raises SelectorError unless .null() is in the chain."""
    # Freeze time so the 'created' timestamp embedded in the error text is stable.
    freezetime('2017-05-18T14:43:40.876642')
    # NOTE: the markup is intentionally malformed ('/div>' instead of '</div>');
    # the expected Context below reproduces it verbatim.
    row = Html(['<div><p id="this"> p1 </p>/div>'])

    # Without .null(), '#wrong' matches nothing, so .strip() is applied to None
    # and the selector raises with the full row and page context in the message.
    selector = html.Select(select('#wrong:text?').strip())
    with pytest.raises(html.SelectorError) as e:
        selector(row)
    assert str(e.value) == dedent('''\
        Expression error while evaluating None. Error: error while processing expression: this.
        strip()
        evaluated with:
        Row({
            'compression': None,
            'created': datetime.datetime(2017, 5, 18, 14, 43, 40, 876642),
            'id': 1,
            'key': 'http://exemple.com',
            'value': {'content': b'<div><p id="this"> p1 </p>/div>'},
        }).
        Context:
        <html>
        <body>
        <div><p id="this"> p1 </p>/div></div>
        </body>
        </html>
    ''')

    # .null() short-circuits the rest of the chain to None when nothing matched.
    selector = html.Select(select('#wrong:text?').null().strip())
    assert selector(row) is None

    # When the element exists, .null() is a no-op and the chain runs normally.
    selector = html.Select(select('#this:text?').null().strip())
    assert selector(row) == 'p1'
def test_expresion_str():
    """str() of a chained expression renders one step per line."""
    expr = (
        task('a', 'b')
        .select(['query', ('key', {'foo': select('query')})])
        .download(this.key, check=select('query'))
        .urlparse()
        .query.key
        .cast(int)
    )
    # Note cast(int) is rendered as cast('int') — the type is shown by name.
    expected = dedent('''
        task('a', 'b').
        select(['query', ('key', {'foo': select('query')})]).
        download(this.key, check=select('query')).
        urlparse().query.key.
        cast('int')
    ''').strip()
    assert str(expr) == expected
def test_empty_result(Html): row = Html([ '<div>', ' <h1>Test</h1>', ' <p>1</p>', ' <p>2</p>', ' <p>3</p>', ' <h2></h2>', '</div>', ]) # Raise error, if selector did not found anything selector = html.Select(['p.new:text']) with pytest.raises(html.SelectorError) as e: selector(row) assert str(e.value) == ( "Select query did not returned any results. Row key: 'http://exemple.com'. Query: ['p.new:text']" ) # Allow empty result from selector. selector = html.Select(['p.new:text'], check=False) assert selector(row) == [] # Allow empty result from selector, but check if we still looking at the right page. selector = html.Select(['p.new:text'], check='xpath://h1[text() = "Test"]') assert selector(row) == [] # Existing element without content should not be threated as emtpy. selector = html.Select(select('xpath://h2').text()) assert selector(row) == ''
def test_apply_with_select_arg(Html):
    """apply() resolves extra expression arguments against the row."""
    def callback(value, link):
        return value, link

    page = Html(['<div><p>1</p></div>'])
    query = html.Select(select('p:text').cast(int).apply(callback, this.key))
    assert query(page) == (1, 'http://exemple.com')
def test_oneof(Html):
    """oneof() picks the first matching sub-selector per list item."""
    # Single markup string assembled from adjacent literals (no commas).
    page = Html([
        '<div>'
        ' <p><a>1</a></p>'
        ' <p><b>2</b></p>'
        ' <p><i>3</i></p>'
        '</div>'
    ])
    first_match = oneof(
        select('a:text'),
        select('b:text'),
        select('i:text'),
    )
    query = html.Select(['div p', first_match])
    assert query(page) == ['1', '2', '3']
def test_expresion_str():
    """str() of a chained expression renders one step per line."""
    # NOTE(review): this looks like a duplicate of an identical
    # test_expresion_str elsewhere; if both end up in one module the later
    # definition shadows the earlier one — confirm and drop one. ("expresion"
    # is also a typo for "expression".)
    expr = (
        task('a', 'b').
        select(['query', ('key', {
            'foo': select('query'),
        })]).
        download(this.key, check=select('query')).
        urlparse().
        query.key.cast(int)
    )
    # Note cast(int) is rendered as cast('int') — the type is shown by name.
    assert str(expr) == dedent('''
        task('a', 'b').
        select(['query', ('key', {'foo': select('query')})]).
        download(this.key, check=select('query')).
        urlparse().query.key.
        cast('int')
    ''').strip()
def test_url(Html):
    """url(query=...) keeps links carrying the query parameter, else None."""
    markup = [
        '<div>',
        '<a href="http://example.com/?id=1">a</a>',
        '<a href="http://example.com/">b</a>',
        '</div>',
    ]
    page = Html(markup)
    query = html.Select(['a', select('@href').url(query='id')])
    assert query(page) == ['http://example.com/?id=1', None]
def test_text(Html):
    """Break boundaries become blank-line separators in extracted text."""
    page = Html(['<div><p>p1</p>p2<br>p3<br>p4</div>'])
    query = html.Select(select('div').text())
    expected = ['p1', '', 'p2', '', 'p3', '', 'p4']
    assert query(page).splitlines() == expected
def test_select_method(bot):
    """An expression can drill into a row value and select from raw markup."""
    markup = (
        '<div>'
        ' <p>1</p>'
        ' <p>2</p>'
        ' <p>3</p>'
        '</div>'
    )
    row = Row({'key': 1, 'value': {'xml': markup}})
    query = html.Select(this.value.xml.select([select('div p:text').cast(int)]))
    assert query(row) == [1, 2, 3]
task('paieška').daily().clean().download(search_url.format(page=1), check='select[name=page]'), task('paieška', 'paieškos-puslapių-numeriai').daily(). select(['select[name=page] option @value']). dedup(), task('paieškos-puslapių-numeriai', 'paieškos-puslapiai'). download(strformat(search_url, page=this.key), check='#exhibitListBlockId'), task('paieškos-puslapiai', 'eksponatų-nuorodos').select([ '#exhibitListBlockId .thumbs-with-title > li span.vertical-scroller > a', ('@href', ':text'), ]), task('eksponatų-nuorodos', 'eksponatų-puslapiai').download(check='#exhibit_block_main_info'), task('eksponatų-puslapiai', 'eksponatų-duomenys').select(this.key, { 'info': select(['#exhibit_block_main_info > tr', ( select('xpath:./td[1]').text().strip(), select('xpath:./td[2]').text().strip(), )]).cast(dict), 'aprašymas': select('xpath://td[text() = "Eksponato aprašymas"]/following-sibling::td?').null().text(), 'pateikė': select('xpath://td[text() = "Duomenis pateikė"]/following-sibling::td?').null().text(), 'savininkas': select('xpath://td[text() = "Savininkas"]/following-sibling::td?').null().text(), }), ], } if __name__ == '__main__': botlib.runbot(pipeline)
task('dokumentų-sąrašas', 'dokumentų-puslapiai').download( cookies=cookies, check= '#page-content div.default b xpath:a[text()="dokumento tekstas"]'), task('dokumentų-puslapiai', 'susijusių-dokumentų-sąrašas').select([ '#page-content div.default b xpath:a[text()="susiję dokumentai"]/@href' ]).dedup(), task('susijusių-dokumentų-puslapiai').download( cookies=cookies, check= '#page-content div.default b xpath:a[text()="susiję dokumentai"]'), task('dokumentų-puslapiai', 'metadata').select( this.key, call(dict, [ '.basic .ltb', (select(':text').strip(), select('b:text?').strip()) ])).dedup(), task('dokumentų-puslapiai', 'texts').select(this.key, 'body > div:content').dedup(), task('metadata').export('data/lrs/dokumentai/metadata.csv', include=[ 'key', 'Data:', 'Rūšis:', 'Kalba:', 'Numeris:', 'Statusas:', ]), task('texts').export('data/lrs/dokumentai/texts.csv', include=[ 'key',
def test_text_text(Html):
    """Text extraction over the sibling nodes following the first <p>."""
    page = Html(['<div><p>1</p>2<p>3</p>4</div>'])
    query = html.Select(select(['xpath://p[1]/following-sibling::node()']).text())
    assert query(page) == '2 3\n\n4'
def test_text_processing_instructions(Html):
    """Processing instructions (e.g. MS-Office xml:namespace) are dropped."""
    page = Html(['<div><p>text <?xml:namespace prefix="o" /></p></div>'])
    query = html.Select(select('div').text())
    assert query(page) == 'text'
def test_text_comments(Html):
    """HTML comments and empty namespaced elements contribute no text."""
    page = Html(['<div><p>text <!-- comment --><o:p></o:p></p></div>'])
    query = html.Select(select('div').text())
    assert query(page) == 'text'
define('dataset data'), define('datasets'), ], 'tasks': [ task('index urls').daily().append( 'http://opendata.gov.lt/index.php?vars=/public/public/search'), task('index urls', 'index pages', watch=True).download(), task('index pages', 'index urls', watch=True).select(['td > a.path@href']).dedup(), task('index pages', 'dataset urls').select( ['form[name=frm] > table > tr > td[3] > a@href']), task('dataset urls').clean(timedelta(days=7)).dedup(), task('dataset urls', 'dataset pages').download(), task('dataset pages', 'dataset data').select(this.key, [ 'table xpath:tr[count(td)=2]', ( 'td[1]:content', select('td[2]:content').strip(), ) ]), task('dataset data').clean(timedelta(days=7)).dedup(), task('dataset data', 'datasets').call(lambda x: [(x.key, dict(x.value))]), task('datasets').export('data/ivpk/opendata-gov-lt/datasets.jsonl'), task().compact(), ], } if __name__ == '__main__': botlib.runbot(pipeline)
def test_select_outside_list(Html):
    """apply() after a list-select operates on the whole result list."""
    page = Html(['<div><a name="1">a</a><a name="2"></a></div>'])
    query = html.Select(select(['a@name']).apply(len))
    assert query(page) == 2
'tasks': [ task('vidurkiai-zip').monthly().download('http://sodra.is.lt/Failai/Vidurkiai.zip'), task('vidurkiai-zip', 'vidurkiai'). call(partial(read_csv, 'VIDURKIAI.CSV', 'kodas', ['regnr', 'kodas', 'alga', 'autorine', 'viso'])). dedup(), task('skaiciai-zip').monthly().download('http://sodra.is.lt/Failai/Apdraustuju_skaicius.zip'), task('skaiciai-zip', 'skaiciai'). call(partial(read_csv, 'APDRAUSTUJU_SKAICIUS.CSV', 'kodas', ['regnr', 'kodas', 'skaicius'])). dedup(), task('vidurkiai', 'imones-puslapis').download( 'https://draudejai.sodra.lt/draudeju_viesi_duomenys/', method='POST', data={ 'formType': 'NEW', 'year': '2017', 'month': '1', 'declarantCode2': this.value.kodas.cast(int).cast(str), 'actionName': 'MEAN', }, check='xpath://td[text() = "Draudėjo pavadinimas"]', ), task('imones-puslapis', 'imones').select(this.value.request.data.declarantCode2, { 'pavadinimas': select('xpath://td[text() = "Draudėjo pavadinimas"]/following-sibling::td[1]/text()'), }) ], } if __name__ == '__main__': botlib.runbot(pipeline)
def test_call_getitem(Html):
    """Indexing a list-select picks one element to continue the chain on."""
    page = Html(['<div><a name="1">a</a><a name="2">b</a></div>'])
    query = html.Select(select(['div > a'])[0].text().upper())
    assert query(page) == 'A'
def test_inline_call(Html):
    """A tuple inside a list query yields (key, value) pairs per element."""
    page = Html(['<div><a name="1">a</a><a name="2">b</a></div>'])
    query = html.Select(['div > a', ('@name', select(':text').upper())])
    assert query(page) == [('1', 'A'), ('2', 'B')]
task('raidės-puslapiai', 'raidės-nuorodos', watch=True).select(['#alphabet li a@href']).dedup(), task('raidės-nuorodos', 'raidės-puslapiai', watch=True).download(), # Sąrašo puslapiavimas task('raidės-puslapiai', 'sąrašas-nuorodos', watch=True).select(['.pagination li a@href']).dedup(), task('sąrašas-puslapiai', 'sąrašas-nuorodos', watch=True).select(['.pagination li a@href']).dedup(), task('sąrašas-nuorodos', 'sąrašas-puslapiai', watch=True).download(), # Vardų puslapiai task('sąrašas-puslapiai', 'vardai-nuorodos').select([ '.name-list li', ('a@href', { 'name': select('a').text(), 'class': select('a@class'), }) ]).dedup(), task('vardai-nuorodos', 'vardai-puslapiai').download(), # Vardai task('vardai-puslapiai', 'vardai').select( this.key.urlparse().path, { 'lytis': select('#page-left xpath:.//h1[1]/@class'), 'vardas': select('#page-left xpath:.//h1[1]/strong/text()'), 'kilmė': select( '#name-info xpath:./p[strong/text() = "Vardo kilmė:"]/text()?'
pipeline = { 'pipes': [ define('paieškos-nuorodos'), define('paieškos-puslapiai', compress=True), define('knygos-duomenys'), ], 'tasks': [ # task('paieškos-nuorodos').once().append(extract_index_urls(), # progress='paieškos-nuorodos').dedup(), task('paieškos-nuorodos', 'paieškos-puslapiai').download(), task('paieškos-puslapiai', 'knygos-duomenys').select( this.key, { 'url': this.value.url, 'antraštė': select('.authorTitle').text(), 'd1': select([ '.entryTable tr', ( select('th:content'), select('td:content').strip(), ), ]).apply(dict), 'd2': select('.authorTitle').text().apply(parse_title), }), task('knygos-duomenys').export( 'data/epaveldas/metrikai/knygos.csv', update=lambda row: { 'url': row.value['url'],
def test_select_outside_nested_list(Html):
    """A bare select() refers to each already-selected value in the list."""
    page = Html(['<div><a name="1">a</a><a name="2"></a></div>'])
    query = html.Select(['a@name', select().cast(int)])
    assert query(page) == [1, 2]
def test_select_inside_list(Html):
    """A select() expression may appear directly inside a list query."""
    page = Html(['<div><a name="1">a</a><a name="2"></a></div>'])
    query = html.Select([select('a@name').cast(int)])
    assert query(page) == [1, 2]
'tasks': [ # Darbotvarkės klausimas (balsavimai) task('klausimų-puslapiai', 'balsavimų-sąrašas').select( [ '.sale_svarst_eiga tr td[2] xpath:a[text()="balsavimas"]', '@href' ], check= 'xpath://h1[contains(text(), "Darbotvarkės klausimas")]/text()', ).dedup(), task('balsavimų-sąrašas', 'balsavimų-puslapiai').download( cookies=cookies, check='#page-content h1.page-title'), task('balsavimų-puslapiai', 'balsavimų-duomenys').select( this.key, { 'data': select('h1.page-title:text').re(r'\d{4}-\d{2}-\d{2}'), 'posėdis': select('h1.page-title:text').re(r'(\w+) posėdis\)'), 'klausimai': [ 'xpath://b/a[contains(@class, "link") and text()="dokumento tekstas"]', { 'pavadinimas': select( 'xpath:./../preceding-sibling::b[contains(a/@class, "link")][1]/a/text()[1]' ), 'rūšis': select( 'xpath:./../preceding-sibling::b[contains(a/@class, "link")][1]/following-sibling::text()[1]' ), 'klausimo-nuoroda': select(
# Scrape archive inventory pages into fund/inventory/data records.
pipeline = {
    'pipes': [
        define('index pages'),
        define('data'),
    ],
    'tasks': [
        # Seed archive index pages in 5-year steps over 1812–1920; clean and
        # reset state so each run starts fresh.
        task('index pages').call(extract_archive_pages, range(1812, 1921, 5)).clean().reset(),
        task('index pages', 'data').select(
            '.inventoryLabel:text', {
                'fondas': '.upperHierarchyTreeInner xpath:a[1]/text()',
                'apyrašas': '.upperHierarchyTreeInner xpath:a[2]/text()',
                # Two-column metadata table collapsed into a dict.
                'data': select([
                    '.inventoryBaseDataTable tr',
                    (
                        'td[1]:content',
                        'td[2]:content',
                    )
                ]).cast(dict),
            }),
    ],
}


if __name__ == '__main__':
    # Fix: was botlib.runbot(define, run) — `run` is undefined here and every
    # sibling pipeline script invokes runbot with the pipeline dict.
    botlib.runbot(pipeline)
return '/'.join([ normtime(select.render(row, node, q, many, single)) for q in self.queries ]) pipeline = { 'pipes': [ define('pages', compress=True), define('data'), ], 'tasks': [ task('pages').freq(minutes=5).download('http://www.meteo.lt/lt_LT/miestas?placeCode=Vilnius'), task('pages', 'data').select([ '.forecast-hours', ( key(select(['xpath://body css:.forecast-hours .forecastTime:text']).min(), select('.forecastTime:text')), { 'base': select(['xpath://body css:.forecast-hours .forecastTime:text']).min().apply(normtime), # precision=hours base time 'time': select('.forecastTime:text').apply(normtime), # precision=hours prediction time 'temperature': select('.temperature:text').cast(int), # °C 'wind_direction': select('.windDirectionGroundDegree:text').cast(int), # degrees 'wind_speed': select('.windSpeedGround:text').cast(int), # m/s 'gust_speed': select('.windGustGround:text').cast(int), # m/s 'precipitation': select('.precipitation:text').cast(float), # mm/h 'pressure': select('.pressureMeanSea:text').cast(int), # hPa 'humidity': select('.humidityGround:text').cast(int), # % 'feels_like': select('.feelLike:text').cast(int), # °C } ) ]).compact(),