def main():
    """Run the OSM places pipeline: refresh place rows daily, export to CSV."""
    botlib.runbot({
        'pipes': [
            define('places'),
        ],
        'tasks': [
            task('places').daily().clean().append(query_places(), progress='places'),
            task('places').export('data/osm/places.csv', include=[
                'osm_id',
                'type',
                'place',
                'population',
                'wikipedia_title',
                'wikipedia_lang',
                'lon',
                'lat',
                'admin_level_6_osm_id',
                'admin_level_6',
                'admin_level_5_osm_id',
                'admin_level_5',
                'admin_level_4_osm_id',
                'admin_level_4',
            ]),
        ],
    })
def test_run_limits_and_fail():
    """With zero error tolerance, a failing row aborts the run mid-pipeline."""
    def handler(row):
        # Fails on 'b'; every other key is uppercased.
        if row.key == 'b':
            raise ValueError('b')
        else:
            yield row.key.upper()

    pipeline = {
        'tasks': [
            task('p1').once().append(['a', 'b', 'c']),
            task('p1', 'p2').call(handler),
        ],
    }

    bot = Bot()
    p1 = bot.define('p1')
    p2 = bot.define('p2')

    with pytest.raises(ExpressionError):
        bot.main(pipeline, ['run', '-l', '1,1,0'])

    # Source pipe fully populated; target stopped after the first row.
    assert list(p1.keys()) == ['a', 'b', 'c']
    assert list(p2.keys()) == ['A']
    assert pipeline['tasks'][0]._evals == 2
    assert pipeline['tasks'][1]._evals == 2
def test_download_post(bot, requests):
    """POST downloads send expression-built form data and record the request."""
    def callback(request, context):
        # Echo the posted body back wrapped in a <div> for the select step.
        context.status_code = 200
        return ('<div>%s</div>' % request.text).encode('utf-8')

    url = 'http://example.com/'
    requests.post(url, content=callback)

    bot.define('source').append([
        (1, {'num': '1'}),
        (2, {'num': '2'}),
    ])
    t1 = bot.define('t1')
    t2 = bot.define('t2')

    tasks = [
        task('source', 't1').download(url, method='POST', data={'value': this.value.num}),
        task('t1', 't2').select('div:text'),
    ]
    bot.commands.run(tasks, limits=(0,), error_limit=0)

    assert list(t1.keys()) == [url, url]
    assert list(t2.keys()) == ['value=1', 'value=2']
    assert t1.last()['value']['request'] == {
        'method': 'POST',
        'data': {'value': '2'},
    }
def test_run_error_limit_n(bot, capsys):
    """Run stops once the error count reaches the `-f` limit."""
    def handler(row):
        # Every key above 1 raises, so errors accumulate quickly.
        if row.key > 1:
            raise ValueError('Error.')
        else:
            yield row.key, row.value.upper()

    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]),
            task('p1', 'p2').call(handler),
        ]
    }

    with pytest.raises(ExpressionError):
        bot.main(pipeline, argv=['run', '-f', '2', '-l', '0'])

    assert bot.output.output.getvalue() == textwrap.dedent('''\
    Validating pipeline.
    Run pipeline (limit=0).
    - key: 3
      value: 'c'
    ''')

    assert list(bot.pipe('p2').items()) == [(1, 'A')]
    assert capsys.readouterr()[0] == 'Interrupting bot because error limit of 2 was reached.\n'
    assert task('p1', 'p2').errors.count()._eval(bot) == 2
def test_download_post(bot, requests):
    """POST downloads send expression-built form data and record the request."""
    def callback(request, context):
        # Echo the posted form body back wrapped in a <div>.
        context.status_code = 200
        return ('<div>%s</div>' % request.text).encode('utf-8')

    url = 'http://example.com/'
    requests.post(url, content=callback)

    bot.define('source').append([
        (1, {'num': '1'}),
        (2, {'num': '2'}),
    ])
    t1 = bot.define('t1')
    t2 = bot.define('t2')

    tasks = [
        task('source', 't1').download(url, method='POST', data={'value': this.value.num}),
        task('t1', 't2').select('div:text'),
    ]
    bot.commands.run(tasks, limits=(0,), error_limit=0)

    assert list(t1.keys()) == [url, url]
    assert list(t2.keys()) == ['value=1', 'value=2']
    assert t1.last()['value']['request'] == {
        'method': 'POST',
        'data': {'value': '2'},
    }
def test_run_limits_and_fail_smaller():
    """With an error limit of 2, a single failing row is tolerated."""
    def handler(row):
        # Fails on 'b'; every other key is uppercased.
        if row.key == 'b':
            raise ValueError('b')
        else:
            yield row.key.upper()

    pipeline = {
        'tasks': [
            task('p1').once().append(['a', 'b', 'c']),
            task('p1', 'p2').call(handler),
        ],
    }

    bot = Bot()
    p1 = bot.define('p1')
    p2 = bot.define('p2')

    bot.main(pipeline, ['run', '-l', '1,1,0', '-f', '2'])

    # The run completes; the failing key lands in the error queue.
    assert list(p1.keys()) == ['a', 'b', 'c']
    assert list(p2.keys()) == ['A', 'C']
    assert list(p2(p1).errors.keys()) == ['b']
    assert pipeline['tasks'][0]._evals == 3
    assert pipeline['tasks'][1]._evals == 3
def test_run_error_limit_n(bot, capsys):
    """Run stops once the error count reaches the `-f` limit."""
    def handler(row):
        # Every key above 1 raises, so errors accumulate quickly.
        if row.key > 1:
            raise ValueError('Error.')
        else:
            yield row.key, row.value.upper()

    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]),
            task('p1', 'p2').call(handler),
        ]
    }

    with pytest.raises(ExpressionError):
        bot.main(pipeline, argv=['run', '-f', '2', '-l', '0'])

    assert bot.output.output.getvalue() == textwrap.dedent('''\
    Validating pipeline.
    Run pipeline (limit=0).
    - key: 3
      value: 'c'
    ''')

    assert list(bot.pipe('p2').items()) == [(1, 'A')]
    assert capsys.readouterr()[0] == 'Interrupting bot because error limit of 2 was reached.\n'
    assert task('p1', 'p2').errors.count()._eval(bot) == 2
def test_run_once():
    """`once()` tasks append a single time; plain tasks append every pass."""
    tasks = [
        task('p1').once().append(1),
        task('p1').once().append(2),
        task('p1').append(3),
    ]
    bot = Bot()
    p1 = bot.define('p1')

    bot.commands.run(tasks, limits=(1, 1, 0))

    # 1 and 2 appear once each; 3 is appended on each of the three passes.
    assert list(p1.keys()) == [1, 2, 3, 3, 3]
def test_download_update(bot, requests):
    """`download(update=...)` merges extra fields into the stored response."""
    url = 'http://example.com/1'
    requests.get(url, content=b'<div></div>')

    bot.define('source').append([(url, {'extra': 42})])
    target = bot.define('target')

    tasks = [
        task('source', 'target').download(update={
            'extra': this.value.extra,
            'request.foo': 'bar',
        })
    ]
    bot.commands.run(tasks, limits=(0,), error_limit=0)

    assert list(target.items()) == [('http://example.com/1', {
        'content': b'<div></div>',
        'cookies': {},
        'encoding': None,
        'extra': 42,
        'headers': {},
        'history': [],
        'status_code': 200,
        'url': 'http://example.com/1',
        'request': {
            'method': 'GET',
            'foo': 'bar',
        },
    })]
def test_download_check(bot, requests):
    """A failing `check` expression routes the row to errors, not the target."""
    url = 'http://example.com/1'
    requests.get(url, content=b'''
    <div>
        <h1>Test</h1>
        <p>1</p>
        <p>2</p>
        <p>3</p>
        <h2></h2>
    </div>
    ''')

    source = bot.define('source').append(url)
    target = bot.define('target')
    pipe = target(source)

    tasks = [
        task('source', 'target').download(check='xpath://h1[text() = "None"]')
    ]
    bot.commands.run(tasks, limits=(0,))

    # Fixed: was `target.count() is 0`, an identity comparison against an int
    # literal that only works via CPython's small-int cache (and raises a
    # SyntaxWarning on Python 3.8+). Use value equality instead.
    assert target.count() == 0
    assert pipe.errors.count() == 1
    assert list(pipe.errors.keys()) == [url]
def test_download_update(bot, requests):
    """`download(update=...)` merges extra fields into the stored response."""
    url = 'http://example.com/1'
    requests.get(url, content=b'<div></div>')

    bot.define('source').append([(url, {'extra': 42})])
    target = bot.define('target')

    tasks = [
        task('source', 'target').download(update={
            'extra': this.value.extra,
            'request.foo': 'bar',
        })
    ]
    bot.commands.run(tasks, limits=(0,), error_limit=0)

    expected_value = {
        'content': b'<div></div>',
        'cookies': {},
        'encoding': None,
        'extra': 42,
        'headers': {},
        'history': [],
        'status_code': 200,
        'url': 'http://example.com/1',
        'request': {
            'method': 'GET',
            'foo': 'bar',
        },
    }
    assert list(target.items()) == [('http://example.com/1', expected_value)]
def test_main(db):
    """End-to-end run through Bot.main with a simple two-pipe pipeline."""
    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([('1', 'a'), ('2', 'b'), ('3', 'c')]),
            task('p1', 'p2').select(this.key, this.value.upper()),
        ],
    }

    bot = db.Bot().main(pipeline, argv=['-v0', 'run'])

    assert list(bot.pipe('p1').items()) == [('1', 'a'), ('2', 'b'), ('3', 'c')]
    assert list(bot.pipe('p2').items()) == [('1', 'A'), ('2', 'B'), ('3', 'C')]
def test_run_target():
    """`run <targets>` only evaluates tasks that feed the named pipes."""
    pipeline = {
        'pipes': [],
        'tasks': [
            task('a').once().append(['a']),
            task('a', 'b').select(this.key.upper()),
            task('b', 'c').select(this.key.lower()),
            task().compact(),
        ],
    }
    bot = Bot()
    bot.define('a')
    bot.define('b')
    bot.define('c')

    # Only the 'a' task runs.
    bot.main(pipeline, ['run', 'a', '-f'])
    assert list(bot.pipe('a').keys()) == ['a']
    assert list(bot.pipe('b').keys()) == []
    assert list(bot.pipe('c').keys()) == []

    # Targeting 'b' pulls data through the a -> b task.
    bot.main(pipeline, ['run', 'b', '-f'])
    assert list(bot.pipe('a').keys()) == ['a']
    assert list(bot.pipe('b').keys()) == ['A']
    assert list(bot.pipe('c').keys()) == []

    bot.pipe('a').append('b')
    bot.main(pipeline, ['run', 'a', 'b', '-f'])
    assert list(bot.pipe('a').keys()) == ['a', 'b']
    assert list(bot.pipe('b').keys()) == ['A', 'B']
    assert list(bot.pipe('c').keys()) == []

    bot.main(pipeline, ['run', 'b', 'c', '-f'])
    assert list(bot.pipe('a').keys()) == ['a', 'b']
    assert list(bot.pipe('b').keys()) == ['A', 'B']
    assert list(bot.pipe('c').keys()) == ['a', 'b']

    bot.pipe('b').append('C')
    bot.main(pipeline, ['run', 'c', '-f'])
    assert list(bot.pipe('a').keys()) == ['a', 'b']
    assert list(bot.pipe('b').keys()) == ['A', 'B', 'C']
    assert list(bot.pipe('c').keys()) == ['a', 'b', 'c']

    # A full run (with compact) reorders keys by last write.
    bot.main(pipeline, ['run', '-f'])
    assert list(bot.pipe('a').keys()) == ['b', 'a']
    assert list(bot.pipe('b').keys()) == ['B', 'C', 'A']
    assert list(bot.pipe('c').keys()) == ['b', 'c', 'a']
def test_run_limits():
    """Limits '1,1,0' run the pipeline three times, finishing all rows."""
    pipeline = {
        'tasks': [
            task('p1').once().append(['a', 'b', 'c']),
            task('p1', 'p2').select(this.key.upper()),
        ],
    }

    bot = Bot()
    p1 = bot.define('p1')
    p2 = bot.define('p2')

    bot.main(pipeline, ['run', '-l', '1,1,0'])

    assert list(p1.keys()) == ['a', 'b', 'c']
    assert list(p2.keys()) == ['A', 'B', 'C']
    assert pipeline['tasks'][0]._evals == 3
    assert pipeline['tasks'][1]._evals == 3
def test_watch(bot):
    """Watch tasks re-fire as their source pipes grow, until the handler stops."""
    a = bot.define('a')
    b = bot.define('b')

    def handler(row):
        # Doubles the key until it reaches 16, then stops the chain.
        if row.key < 16:
            yield row.key + row.key

    bot.commands.run([
        task('a').once().append(1),
        task('a', 'b', watch=True).call(handler),
        task('b', 'a', watch=True).call(handler),
        task('b').once().append(1),
    ])

    assert list(a.keys()) == [1, 4, 16, 2, 8]
    assert list(b.keys()) == [2, 8, 1, 4, 16]
def test_run():
    """Full run appends, transforms, and compacts duplicate keys."""
    pipeline = {
        'pipes': [
            define('a'),
            define('b'),
        ],
        'tasks': [
            task('a').append(['a', 'A', 'b']),
            task('a', 'b').select(this.key.upper()),
            task().compact(),
        ],
    }

    bot = Bot()
    bot.main(pipeline, ['run', '-f'])

    # 'a' and 'A' both map to 'A'; compact keeps one copy in 'b'.
    assert list(bot.pipe('a').keys()) == ['a', 'A', 'b']
    assert list(bot.pipe('b').keys()) == ['A', 'B']
def test_download_expr(bot, requests):
    """`download` accepts an expression that extracts the URL from the value."""
    url = 'http://example.com/1'
    requests.get(url, content=b'<div></div>')

    bot.define('source').append([(1, {'link': url})])
    target = bot.define('target')

    tasks = [
        task('source', 'target').download(this.value.link),
    ]
    bot.commands.run(tasks, limits=(0,), error_limit=0)

    assert list(target.keys()) == [url]
def test_watch_limits(bot):
    """With bot.limit set, watch tasks advance one row per pass."""
    def handler(row):
        # Increment keys until 16; then the ping-pong between pipes stops.
        if row.key < 16:
            yield row.key + 1

    tasks = [
        task('a').once().append(1),
        task('a', 'b', watch=True).call(handler),
        task('b', 'a', watch=True).call(handler),
    ]
    a = bot.define('a')
    b = bot.define('b')

    # a             | b         | run
    # --------------+-----------+----------------------------------------------
    # [1]           | []        | task('a').once().append(1)
    #               |           | watch:
    # [1]           | [2]       | task('a', 'b', watch=True).call(handler)
    # [1, 3]        | [2]       | task('b', 'a', watch=True).call(handler)
    # [1, 3]        | [2, 4]    | task('a', 'b', watch=True).call(handler)
    #               |           | watch:
    # [1, 3, 5]     | [2, 4]    | task('b', 'a', watch=True).call(handler)
    # [1, 3, 5]     | [2, 4]    | task('b', 'a', watch=True).call(handler)
    #               |           | watch:
    # [1, 3, 5]     | [2, 4, 6] | task('a', 'b', watch=True).call(handler)
    # [1, 3, 5, 7]  | [2, 4, 6] | task('b', 'a', watch=True).call(handler)
    bot.limit = 1
    run_all_tasks(bot, tasks)
    assert list(a.keys()) == [1, 3, 5, 7]
    assert list(b.keys()) == [2, 4, 6]

    bot.limit = 1
    run_all_tasks(bot, tasks)
    assert list(a.keys()) == [1, 3, 5, 7, 9, 11, 13]
    assert list(b.keys()) == [2, 4, 6, 8, 10, 12]

    # No limit: run to completion.
    bot.limit = 0
    run_all_tasks(bot, tasks)
    assert list(a.keys()) == [1, 3, 5, 7, 9, 11, 13, 15]
    assert list(b.keys()) == [2, 4, 6, 8, 10, 12, 14, 16]
def test_run(bot):
    """`run -f` validates, does a limited pass, then a full pass."""
    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b')]),
            task('p1', 'p2').select(this.key, this.value.upper()),
        ]
    }

    bot.main(pipeline, argv=['run', '-f'])

    assert bot.output.output.getvalue() == textwrap.dedent('''\
    Validating pipeline.
    Run pipeline (limit=1).
    Run pipeline (limit=0).
    ''')
    assert list(bot.pipe('p2').items()) == [(1, 'A'), (2, 'B')]
def test_run(bot):
    """`run -f` validates, does a limited pass, then a full pass."""
    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b')]),
            task('p1', 'p2').select(this.key, this.value.upper()),
        ]
    }

    bot.main(pipeline, argv=['run', '-f'])

    expected_output = textwrap.dedent('''\
    Validating pipeline.
    Run pipeline (limit=1).
    Run pipeline (limit=0).
    ''')
    assert bot.output.output.getvalue() == expected_output
    assert list(bot.pipe('p2').items()) == [(1, 'A'), (2, 'B')]
def test_expresion_str():
    """str() of a chained task expression renders a readable dotted form."""
    expr = (
        task('a', 'b')
        .select(['query', ('key', {'foo': select('query')})])
        .download(this.key, check=select('query'))
        .urlparse().query.key
        .cast(int)
    )
    assert str(expr) == dedent('''
        task('a', 'b').
        select(['query', ('key', {'foo': select('query')})]).
        download(this.key, check=select('query')).
        urlparse().query.key.
        cast('int')
    ''').strip()
def test_download_expr(bot, requests):
    """`download` accepts an expression that extracts the URL from the value."""
    url = 'http://example.com/1'
    requests.get(url, content=b'<div></div>')

    bot.define('source').append([(1, {'link': url})])
    target = bot.define('target')

    download_task = task('source', 'target').download(this.value.link)
    bot.commands.run([download_task], limits=(0,), error_limit=0)

    assert list(target.keys()) == [url]
def test_run_freq():
    """`freq(days=3)` re-runs the append task only after the interval passes."""
    tasks = [
        task('p1').freq(days=3).append(['a']),
        task('p1', 'p2').select(this.key.upper()),
    ]
    bot = Bot()
    p1 = bot.define('p1')
    p2 = bot.define('p2')

    # Second run is only one day later: still within the frequency window.
    with freezegun.freeze_time('2017-01-01 00:00:00'):
        bot.commands.run(tasks)
    with freezegun.freeze_time('2017-01-02 00:00:00'):
        bot.commands.run(tasks)
    assert list(p1.keys()) == ['a']
    assert list(p2.keys()) == ['A']

    # Three days after the first run the task fires again.
    with freezegun.freeze_time('2017-01-04 00:00:00'):
        bot.commands.run(tasks)
    assert list(p1.keys()) == ['a', 'a']
    assert list(p2.keys()) == ['A', 'A']
def test_expresion_str():
    """str() of a chained task expression renders a readable dotted form."""
    expr = (
        task('a', 'b')
        .select(['query', ('key', {'foo': select('query')})])
        .download(this.key, check=select('query'))
        .urlparse()
        .query.key.cast(int)
    )
    expected = dedent('''
        task('a', 'b').
        select(['query', ('key', {'foo': select('query')})]).
        download(this.key, check=select('query')).
        urlparse().query.key.
        cast('int')
    ''').strip()
    assert str(expr) == expected
def test_download_check_multiple(bot, requests):
    """A `check` selector matching several nodes still counts as success."""
    url = 'http://example.com/1'
    requests.get(url, content=b'''
    <div>
        <h1>Test</h1>
        <p>1</p>
        <p>2</p>
        <p>3</p>
        <h2></h2>
    </div>
    ''')

    source = bot.define('source').append(url)
    target = bot.define('target')
    pipe = target(source)

    tasks = [
        task('source', 'target').download(check='p'),
    ]
    bot.commands.run(tasks, limits=(0,), error_limit=0)

    assert target.count() == 1
    assert pipe.errors.count() == 0
    assert list(target.keys()) == [url]
self.queries = queries def __call__(self, select, row, node, many=False, single=True): return '/'.join([ normtime(select.render(row, node, q, many, single)) for q in self.queries ]) pipeline = { 'pipes': [ define('pages', compress=True), define('data'), ], 'tasks': [ task('pages').freq(minutes=5).download('http://www.meteo.lt/lt_LT/miestas?placeCode=Vilnius'), task('pages', 'data').select([ '.forecast-hours', ( key(select(['xpath://body css:.forecast-hours .forecastTime:text']).min(), select('.forecastTime:text')), { 'base': select(['xpath://body css:.forecast-hours .forecastTime:text']).min().apply(normtime), # precision=hours base time 'time': select('.forecastTime:text').apply(normtime), # precision=hours prediction time 'temperature': select('.temperature:text').cast(int), # °C 'wind_direction': select('.windDirectionGroundDegree:text').cast(int), # degrees 'wind_speed': select('.windSpeedGround:text').cast(int), # m/s 'gust_speed': select('.windGustGround:text').cast(int), # m/s 'precipitation': select('.precipitation:text').cast(float), # mm/h 'pressure': select('.pressureMeanSea:text').cast(int), # hPa 'humidity': select('.humidityGround:text').cast(int), # % 'feels_like': select('.feelLike:text').cast(int), # °C
#!/usr/bin/env python3
"""Scrape reddit's front page into /tmp/reddit.jsonl."""

from databot import Bot, define, task, first

pipeline = {
    'pipes': [
        define('index'),
        define('news'),
    ],
    'tasks': [
        task('index').once().download('https://www.reddit.com/'),
        task('index', 'news').select([
            '.thing.link', (
                '.entry .title > a@href', {
                    'title': '.entry .title > a:text',
                    'score': '.midcol .score.likes@title',
                    'time': first(['.tagline time@datetime']),
                    'comments': '.entry a.comments:text',
                }
            )
        ]),
        task('news').export('/tmp/reddit.jsonl'),
        task().compact(),
    ],
}

if __name__ == '__main__':
    Bot('/tmp/reddit.db').main(pipeline)
break try: browser.wait.until( attribute_has_changed(By.CSS_SELECTOR, '.searchResultDescription a', 'href', first_item_link)) except TimeoutException: browser.get_screenshot_as_file( '/tmp/epaveldas_attribute_has_changed.png') except: browser.get_screenshot_as_file('/tmp/epaveldas_error.png') raise finally: browser.quit() pipeline = { 'pipes': [ define('paieškos-nuorodos'), ], 'tasks': [ task('paieškos-nuorodos').once().clean().append( extract_index_urls(), progress='paieškos-nuorodos').dedup(), ], } if __name__ == '__main__': botlib.runbot(pipeline)
#!/usr/bin/env python3
"""Scrape reddit's front page into /tmp/reddit.jsonl."""

from databot import Bot, define, task, first

pipeline = {
    'pipes': [
        define('index'),
        define('news'),
    ],
    'tasks': [
        task('index').once().download('https://www.reddit.com/'),
        task('index', 'news').select([
            '.thing.link',
            (
                '.entry .title > a@href',
                {
                    'title': '.entry .title > a:text',
                    'score': '.midcol .score.likes@title',
                    'time': first(['.tagline time@datetime']),
                    'comments': '.entry a.comments:text',
                },
            ),
        ]),
        task('news').export('/tmp/reddit.jsonl'),
        task().compact(),
    ],
}

if __name__ == '__main__':
    Bot('/tmp/reddit.db').main(pipeline)
'pradžia': start, 'pabaiga': end, 'trukmė': total, } pipeline = { 'pipes': [ define('paieškos-nuorodos'), define('paieškos-puslapiai', compress=True), define('knygos-duomenys'), ], 'tasks': [ # task('paieškos-nuorodos').once().append(extract_index_urls(), # progress='paieškos-nuorodos').dedup(), task('paieškos-nuorodos', 'paieškos-puslapiai').download(), task('paieškos-puslapiai', 'knygos-duomenys').select( this.key, { 'url': this.value.url, 'antraštė': select('.authorTitle').text(), 'd1': select([ '.entryTable tr', ( select('th:content'), select('td:content').strip(), ), ]).apply(dict), 'd2':
) yield from ((str(x[key]), json.loads(x.to_json())) for _, x in data.iterrows()) pipeline = { 'pipes': [ define('vidurkiai-zip'), define('skaiciai-zip'), define('vidurkiai'), define('skaiciai'), define('imones-puslapis', compress=True), define('imones'), ], 'tasks': [ task('vidurkiai-zip').monthly().download('http://sodra.is.lt/Failai/Vidurkiai.zip'), task('vidurkiai-zip', 'vidurkiai'). call(partial(read_csv, 'VIDURKIAI.CSV', 'kodas', ['regnr', 'kodas', 'alga', 'autorine', 'viso'])). dedup(), task('skaiciai-zip').monthly().download('http://sodra.is.lt/Failai/Apdraustuju_skaicius.zip'), task('skaiciai-zip', 'skaiciai'). call(partial(read_csv, 'APDRAUSTUJU_SKAICIUS.CSV', 'kodas', ['regnr', 'kodas', 'skaicius'])). dedup(), task('vidurkiai', 'imones-puslapis').download( 'https://draudejai.sodra.lt/draudeju_viesi_duomenys/', method='POST', data={ 'formType': 'NEW', 'year': '2017', 'month': '1', 'declarantCode2': this.value.kodas.cast(int).cast(str),
from databot import define, task, this, strformat, select search_url = 'https://www.limis.lt/greita-paieska/rezultatai/-/exhibitList/form?searchOnlySimpleMetadata=false&searchOnlyWithImages=false&searchInExhibits=true&searchInArchives=true&searchInLibraries=true&searchInAudioVideo=true&searchInPhotos=true&s_tab=&s_id=2duvdg1N5K4dHB0W&backUrl=https%3a%2f%2fwww.limis.lt%2fpradinis%2f-%2fexhibitSearchFast%2fform&listDisplayMode=simple&_exhibitListportlet_WAR_limiskportlet_searchType=&page={page}&rowsOnPage=48' pipeline = { 'pipes': [ define('paieška'), define('paieškos-puslapių-numeriai'), define('paieškos-puslapiai', compress=True), define('eksponatų-nuorodos'), define('eksponatų-puslapiai', compress=True), define('eksponatų-duomenys'), ], 'tasks': [ task('paieška').daily().clean().download(search_url.format(page=1), check='select[name=page]'), task('paieška', 'paieškos-puslapių-numeriai').daily(). select(['select[name=page] option @value']). dedup(), task('paieškos-puslapių-numeriai', 'paieškos-puslapiai'). download(strformat(search_url, page=this.key), check='#exhibitListBlockId'), task('paieškos-puslapiai', 'eksponatų-nuorodos').select([ '#exhibitListBlockId .thumbs-with-title > li span.vertical-scroller > a', ('@href', ':text'), ]), task('eksponatų-nuorodos', 'eksponatų-puslapiai').download(check='#exhibit_block_main_info'), task('eksponatų-puslapiai', 'eksponatų-duomenys').select(this.key, {
'.inventoryBaseDataTable tr', ( 'td[1]:content', 'td[2]:content', ) ]), }) pipeline = { 'pipes': [ define('index pages'), define('data'), ], 'tasks': [ task('index pages').call(extract_archive_pages, range(1812, 1921, 5)).clean().reset(), task('index pages', 'data').select( '.inventoryLabel:text', { 'fondas': '.upperHierarchyTreeInner xpath:a[1]/text()', 'apyrašas': '.upperHierarchyTreeInner xpath:a[2]/text()', 'data': select([ '.inventoryBaseDataTable tr', ( 'td[1]:content', 'td[2]:content', ) ]).cast(dict), }),
'osm_id': row.osm_id, 'name': row.name, 'lon': row.lon, 'lat': row.lat, 'religion': row.religion, 'denomination': row.denomination, 'place': find_closes_place(conn, point, row), } pipeline = { 'pipes': [ define('baznycios'), ], 'tasks': [ task('baznycios').once().clean().append( query(), progress='baznycios').compact(), task('baznycios').once().export('data/osm/baznycios.csv', include=[ 'osm_id', 'name', 'religion', 'denomination', 'lon', 'lat', 'place.osm_id', 'place.name', 'place.distance', 'place.population', 'place.lon', 'place.lat', ])
cookies = settings['cookies']['www.lrs.lt'] pipeline = { 'pipes': [ define('klausimų-puslapiai', botlib.dburi('lrs/posedziai')), define('dokumentų-sąrašas'), define('dokumentų-puslapiai', compress=True), define('susijusių-dokumentų-sąrašas'), define('susijusių-dokumentų-puslapiai', compress=True), define('metadata'), define('texts'), ], 'tasks': [ task('klausimų-puslapiai', 'dokumentų-sąrašas').select([ '#page-content div.default b xpath:a[text()="dokumento tekstas"]/@href' ]).dedup(), task('dokumentų-sąrašas', 'dokumentų-puslapiai').download( cookies=cookies, check= '#page-content div.default b xpath:a[text()="dokumento tekstas"]'), task('dokumentų-puslapiai', 'susijusių-dokumentų-sąrašas').select([ '#page-content div.default b xpath:a[text()="susiję dokumentai"]/@href' ]).dedup(), task('susijusių-dokumentų-puslapiai').download( cookies=cookies, check= '#page-content div.default b xpath:a[text()="susiję dokumentai"]'), task('dokumentų-puslapiai', 'metadata').select( this.key, call(dict, [
} pipeline = { 'pipes': [ define('raidės-nuorodos'), define('raidės-puslapiai', compress=True), define('sąrašas-nuorodos'), define('sąrašas-puslapiai', compress=True), define('vardai-nuorodos'), define('vardai-puslapiai', compress=True), define('vardai'), ], 'tasks': [ # Vardo pirmos raidės sąrašas task('raidės-nuorodos').monthly().append( 'https://www.tevu-darzelis.lt/vaiku-vardai/A/'), task('raidės-puslapiai', 'raidės-nuorodos', watch=True).select(['#alphabet li a@href']).dedup(), task('raidės-nuorodos', 'raidės-puslapiai', watch=True).download(), # Sąrašo puslapiavimas task('raidės-puslapiai', 'sąrašas-nuorodos', watch=True).select(['.pagination li a@href']).dedup(), task('sąrašas-puslapiai', 'sąrašas-nuorodos', watch=True).select(['.pagination li a@href']).dedup(), task('sąrašas-nuorodos', 'sąrašas-puslapiai', watch=True).download(), # Vardų puslapiai task('sąrašas-puslapiai', 'vardai-nuorodos').select([ '.name-list li', ('a@href', {
import botlib from datetime import timedelta from databot import define, task, this, select pipeline = { 'pipes': [ define('index urls'), define('index pages'), define('dataset urls'), define('dataset pages'), define('dataset data'), define('datasets'), ], 'tasks': [ task('index urls').daily().append( 'http://opendata.gov.lt/index.php?vars=/public/public/search'), task('index urls', 'index pages', watch=True).download(), task('index pages', 'index urls', watch=True).select(['td > a.path@href']).dedup(), task('index pages', 'dataset urls').select( ['form[name=frm] > table > tr > td[3] > a@href']), task('dataset urls').clean(timedelta(days=7)).dedup(), task('dataset urls', 'dataset pages').download(), task('dataset pages', 'dataset data').select(this.key, [ 'table xpath:tr[count(td)=2]', ( 'td[1]:content', select('td[2]:content').strip(), ) ]), task('dataset data').clean(timedelta(days=7)).dedup(),
pipeline = { 'pipes': [ define('klausimų-puslapiai', botlib.dburi('lrs/posedziai')), define('balsavimų-sąrašas'), define('balsavimų-puslapiai', compress=True), define('balsavimų-duomenys'), define('registracijos-sąrašas'), define('registracijos-puslapiai', compress=True), ], 'tasks': [ # Darbotvarkės klausimas (balsavimai) task('klausimų-puslapiai', 'balsavimų-sąrašas').select( [ '.sale_svarst_eiga tr td[2] xpath:a[text()="balsavimas"]', '@href' ], check= 'xpath://h1[contains(text(), "Darbotvarkės klausimas")]/text()', ).dedup(), task('balsavimų-sąrašas', 'balsavimų-puslapiai').download( cookies=cookies, check='#page-content h1.page-title'), task('balsavimų-puslapiai', 'balsavimų-duomenys').select( this.key, { 'data': select('h1.page-title:text').re(r'\d{4}-\d{2}-\d{2}'), 'posėdis': select('h1.page-title:text').re(r'(\w+) posėdis\)'), 'klausimai': [ 'xpath://b/a[contains(@class, "link") and text()="dokumento tekstas"]', { 'pavadinimas':