Exemplo n.º 1
0
def main():
    """Run the places pipeline: re-query OSM places daily and export them as CSV."""
    exported_fields = [
        'osm_id',
        'type',
        'place',
        'population',
        'wikipedia_title',
        'wikipedia_lang',
        'lon',
        'lat',
        'admin_level_6_osm_id',
        'admin_level_6',
        'admin_level_5_osm_id',
        'admin_level_5',
        'admin_level_4_osm_id',
        'admin_level_4',
    ]
    botlib.runbot({
        'pipes': [
            define('places'),
        ],
        'tasks': [
            # Once a day, wipe the pipe and re-append fresh query results.
            task('places').daily().clean().append(query_places(), progress='places'),
            task('places').export('data/osm/places.csv', include=exported_fields),
        ],
    })
Exemplo n.º 2
0
def test_run_limits_and_fail():
    """Without an error limit, the first handler error aborts the run."""

    def upper_or_fail(row):
        # 'b' deliberately fails; every other key is uppercased.
        if row.key != 'b':
            yield row.key.upper()
        else:
            raise ValueError('b')

    pipeline = {
        'tasks': [
            task('p1').once().append(['a', 'b', 'c']),
            task('p1', 'p2').call(upper_or_fail),
        ],
    }

    bot = Bot()
    p1 = bot.define('p1')
    p2 = bot.define('p2')

    # No '-f' flag: the error on 'b' interrupts the run mid-pass.
    with pytest.raises(ExpressionError):
        bot.main(pipeline, ['run', '-l', '1,1,0'])

    assert list(p1.keys()) == ['a', 'b', 'c']
    assert list(p2.keys()) == ['A']
    assert pipeline['tasks'][0]._evals == 2
    assert pipeline['tasks'][1]._evals == 2
Exemplo n.º 3
0
def test_download_post(bot, requests):
    """POST downloads pass expression-built form data and record the request."""

    def callback(request, context):
        # Echo the posted body back, wrapped in a div.
        context.status_code = 200
        body = '<div>%s</div>' % request.text
        return body.encode('utf-8')

    url = 'http://example.com/'
    requests.post(url, content=callback)

    bot.define('source').append([
        (1, {'num': '1'}),
        (2, {'num': '2'}),
    ])
    t1 = bot.define('t1')
    t2 = bot.define('t2')

    tasks = [
        task('source', 't1').download(
            url, method='POST', data={'value': this.value.num}),
        task('t1', 't2').select('div:text'),
    ]

    bot.commands.run(tasks, limits=(0,), error_limit=0)

    assert list(t1.keys()) == [url, url]
    assert list(t2.keys()) == ['value=1', 'value=2']
    assert t1.last()['value']['request'] == {
        'method': 'POST',
        'data': {'value': '2'},
    }
Exemplo n.º 4
0
def test_run_error_limit_n(bot, capsys):
    """With '-f 2' the bot stops after the second error and reports it."""

    def maybe_upper(row):
        # Only key 1 succeeds; every other key raises.
        if row.key <= 1:
            yield row.key, row.value.upper()
        else:
            raise ValueError('Error.')

    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]),
            task('p1', 'p2').call(maybe_upper),
        ]
    }

    with pytest.raises(ExpressionError):
        bot.main(pipeline, argv=['run', '-f', '2', '-l', '0'])

    expected = textwrap.dedent('''\
    Validating pipeline.

    Run pipeline (limit=0).
    - key: 3
      value: 'c'
    ''')
    assert bot.output.output.getvalue() == expected
    assert list(bot.pipe('p2').items()) == [(1, 'A')]
    out = capsys.readouterr()[0]
    assert out == 'Interrupting bot because error limit of 2 was reached.\n'
    assert task('p1', 'p2').errors.count()._eval(bot) == 2
Exemplo n.º 5
0
def test_download_post(bot, requests):
    """POST downloads build form data from expressions and store request details."""

    def echo(request, context):
        # Respond with the posted body wrapped in a div.
        context.status_code = 200
        return ('<div>%s</div>' % request.text).encode('utf-8')

    url = 'http://example.com/'
    requests.post(url, content=echo)

    bot.define('source').append([(1, {'num': '1'}), (2, {'num': '2'})])
    t1 = bot.define('t1')
    t2 = bot.define('t2')

    tasks = [
        task('source', 't1').download(url, method='POST', data={'value': this.value.num}),
        task('t1', 't2').select('div:text'),
    ]

    bot.commands.run(tasks, limits=(0,), error_limit=0)

    assert list(t1.keys()) == [url, url]
    assert list(t2.keys()) == ['value=1', 'value=2']
    assert t1.last()['value']['request'] == {'method': 'POST', 'data': {'value': '2'}}
Exemplo n.º 6
0
def test_run_limits_and_fail_smaller():
    """With error limit 2 the failing row is recorded and the run completes."""

    def upper_unless_b(row):
        # 'b' always fails; other keys are uppercased.
        if row.key != 'b':
            yield row.key.upper()
        else:
            raise ValueError('b')

    pipeline = {
        'tasks': [
            task('p1').once().append(['a', 'b', 'c']),
            task('p1', 'p2').call(upper_unless_b),
        ],
    }

    bot = Bot()
    p1 = bot.define('p1')
    p2 = bot.define('p2')

    # '-f 2' tolerates up to two errors, so the run is not interrupted.
    bot.main(pipeline, ['run', '-l', '1,1,0', '-f', '2'])

    assert list(p1.keys()) == ['a', 'b', 'c']
    assert list(p2.keys()) == ['A', 'C']
    assert list(p2(p1).errors.keys()) == ['b']
    assert pipeline['tasks'][0]._evals == 3
    assert pipeline['tasks'][1]._evals == 3
Exemplo n.º 7
0
def test_run_error_limit_n(bot, capsys):
    """The bot interrupts once the configured error limit ('-f 2') is hit."""

    def handler(row):
        # Any key above 1 fails.
        if row.key > 1:
            raise ValueError('Error.')
        yield row.key, row.value.upper()

    pipeline = {
        'pipes': [define('p1'), define('p2')],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]),
            task('p1', 'p2').call(handler),
        ]
    }

    with pytest.raises(ExpressionError):
        bot.main(pipeline, argv=['run', '-f', '2', '-l', '0'])

    assert bot.output.output.getvalue() == textwrap.dedent('''\
    Validating pipeline.

    Run pipeline (limit=0).
    - key: 3
      value: 'c'
    ''')
    assert list(bot.pipe('p2').items()) == [(1, 'A')]
    captured = capsys.readouterr()
    assert captured[0] == 'Interrupting bot because error limit of 2 was reached.\n'
    assert task('p1', 'p2').errors.count()._eval(bot) == 2
Exemplo n.º 8
0
def test_run_once():
    """once() tasks run a single time; plain tasks run on every pass."""
    tasks = [
        task('p1').once().append(1),
        task('p1').once().append(2),
        task('p1').append(3),
    ]

    bot = Bot()
    pipe = bot.define('p1')

    # Three passes (limits 1, 1, 0): the once() appends fire only on the
    # first pass, while the plain append contributes on each of the three.
    bot.commands.run(tasks, limits=(1, 1, 0))

    assert list(pipe.keys()) == [1, 2, 3, 3, 3]
Exemplo n.º 9
0
def test_download_update(bot, requests):
    """download(update=...) merges extra fields into the stored response value."""
    url = 'http://example.com/1'
    requests.get(url, content=b'<div></div>')

    bot.define('source').append([(url, {'extra': 42})])
    target = bot.define('target')

    update = {
        'extra': this.value.extra,
        'request.foo': 'bar',  # dotted key writes into the nested request dict
    }
    tasks = [task('source', 'target').download(update=update)]

    bot.commands.run(tasks, limits=(0,), error_limit=0)

    expected_value = {
        'content': b'<div></div>',
        'cookies': {},
        'encoding': None,
        'extra': 42,
        'headers': {},
        'history': [],
        'status_code': 200,
        'url': 'http://example.com/1',
        'request': {'method': 'GET', 'foo': 'bar'},
    }
    assert list(target.items()) == [('http://example.com/1', expected_value)]
Exemplo n.º 10
0
def test_download_check(bot, requests):
    """A failing check expression stores the row as an error, not as data.

    The downloaded page has no <h1> with text "None", so the xpath check
    fails and the row lands in the pipe's errors instead of the target.
    """
    url = 'http://example.com/1'
    requests.get(url,
                 content=b'''
        <div>
            <h1>Test</h1>
            <p>1</p>
            <p>2</p>
            <p>3</p>
            <h2></h2>
        </div>
    ''')

    source = bot.define('source').append(url)
    target = bot.define('target')
    pipe = target(source)

    tasks = [
        task('source', 'target').download(check='xpath://h1[text() = "None"]')
    ]

    bot.commands.run(tasks, limits=(0, ))
    # Fixed: compare with ``==`` — ``is`` on an int tests object identity and
    # only passed by accident of CPython small-int caching (SyntaxWarning on 3.8+).
    assert target.count() == 0
    assert pipe.errors.count() == 1
    assert list(pipe.errors.keys()) == [url]
Exemplo n.º 11
0
def test_download_update(bot, requests):
    """Extra fields given via download(update=...) end up in the stored value."""
    url = 'http://example.com/1'
    requests.get(url, content=b'<div></div>')

    bot.define('source').append([(url, {'extra': 42})])
    target = bot.define('target')

    tasks = [
        task('source', 'target').download(update={
            # Copy 'extra' from the source value; inject 'foo' into request.
            'extra': this.value.extra,
            'request.foo': 'bar',
        })
    ]

    bot.commands.run(tasks, limits=(0,), error_limit=0)

    items = list(target.items())
    assert items == [('http://example.com/1', {
        'content': b'<div></div>',
        'cookies': {},
        'encoding': None,
        'extra': 42,
        'headers': {},
        'history': [],
        'status_code': 200,
        'url': 'http://example.com/1',
        'request': {'method': 'GET', 'foo': 'bar'},
    })]
Exemplo n.º 12
0
def test_main(db):
    """bot.main() runs the whole pipeline and fills the target pipe."""
    pipeline = {
        'pipes': [define('p1'), define('p2')],
        'tasks': [
            task('p1').once().append([('1', 'a'), ('2', 'b'), ('3', 'c')]),
            task('p1', 'p2').select(this.key, this.value.upper()),
        ],
    }

    # '-v0' silences output; 'run' executes all tasks.
    bot = db.Bot().main(pipeline, argv=['-v0', 'run'])

    assert list(bot.pipe('p1').items()) == [('1', 'a'), ('2', 'b'), ('3', 'c')]
    assert list(bot.pipe('p2').items()) == [('1', 'A'), ('2', 'B'), ('3', 'C')]
Exemplo n.º 13
0
def test_run_target():
    """Naming pipes on the 'run' command restricts which tasks execute."""
    pipeline = {
        'pipes': [],
        'tasks': [
            task('a').once().append(['a']),
            task('a', 'b').select(this.key.upper()),
            task('b', 'c').select(this.key.lower()),
            task().compact(),
        ],
    }

    bot = Bot()
    bot.define('a')
    bot.define('b')
    bot.define('c')

    def keys(name):
        # Shorthand: current keys of the named pipe.
        return list(bot.pipe(name).keys())

    # Only 'a' targeted: downstream pipes stay empty.
    bot.main(pipeline, ['run', 'a', '-f'])
    assert keys('a') == ['a']
    assert keys('b') == []
    assert keys('c') == []

    # Targeting 'b' pulls data from 'a' into 'b'.
    bot.main(pipeline, ['run', 'b', '-f'])
    assert keys('a') == ['a']
    assert keys('b') == ['A']
    assert keys('c') == []

    # A new source row, then run both 'a' and 'b'.
    bot.pipe('a').append('b')
    bot.main(pipeline, ['run', 'a', 'b', '-f'])
    assert keys('a') == ['a', 'b']
    assert keys('b') == ['A', 'B']
    assert keys('c') == []

    bot.main(pipeline, ['run', 'b', 'c', '-f'])
    assert keys('a') == ['a', 'b']
    assert keys('b') == ['A', 'B']
    assert keys('c') == ['a', 'b']

    bot.pipe('b').append('C')
    bot.main(pipeline, ['run', 'c', '-f'])
    assert keys('a') == ['a', 'b']
    assert keys('b') == ['A', 'B', 'C']
    assert keys('c') == ['a', 'b', 'c']

    # A full untargeted run also compacts, which reorders the keys.
    bot.main(pipeline, ['run', '-f'])
    assert keys('a') == ['b', 'a']
    assert keys('b') == ['B', 'C', 'A']
    assert keys('c') == ['b', 'c', 'a']
Exemplo n.º 14
0
def test_run_limits():
    """Limits '1,1,0' make three passes; each task is evaluated once per pass."""
    pipeline = {
        'tasks': [
            task('p1').once().append(['a', 'b', 'c']),
            task('p1', 'p2').select(this.key.upper()),
        ],
    }

    bot = Bot()
    source = bot.define('p1')
    target = bot.define('p2')

    bot.main(pipeline, ['run', '-l', '1,1,0'])

    assert list(source.keys()) == ['a', 'b', 'c']
    assert list(target.keys()) == ['A', 'B', 'C']
    # Three limit values mean three passes, hence three evaluations per task.
    assert pipeline['tasks'][0]._evals == 3
    assert pipeline['tasks'][1]._evals == 3
Exemplo n.º 15
0
def test_watch(bot):
    """watch=True tasks re-run whenever their source pipe receives new rows."""
    a = bot.define('a')
    b = bot.define('b')

    def double(row):
        # Keep doubling until the value reaches 16.
        if row.key < 16:
            yield row.key + row.key

    bot.commands.run([
        task('a').once().append(1),
        task('a', 'b', watch=True).call(double),
        task('b', 'a', watch=True).call(double),
        task('b').once().append(1),
    ])

    assert list(a.keys()) == [1, 4, 16, 2, 8]
    assert list(b.keys()) == [2, 8, 1, 4, 16]
Exemplo n.º 16
0
def test_run():
    """compact() deduplicates keys in the pipes after the tasks run."""
    pipeline = {
        'pipes': [define('a'), define('b')],
        'tasks': [
            task('a').append(['a', 'A', 'b']),
            task('a', 'b').select(this.key.upper()),
            task().compact(),
        ],
    }

    bot = Bot()
    bot.main(pipeline, ['run', '-f'])

    # 'a' and 'A' both map to 'A' in pipe 'b', which compact() collapses.
    assert list(bot.pipe('a').keys()) == ['a', 'A', 'b']
    assert list(bot.pipe('b').keys()) == ['A', 'B']
Exemplo n.º 17
0
def test_download_expr(bot, requests):
    """download() accepts an expression that resolves the URL per row."""
    url = 'http://example.com/1'
    requests.get(url, content=b'<div></div>')

    bot.define('source').append([(1, {'link': url})])
    target = bot.define('target')

    tasks = [
        # The URL comes from each row's value, not from its key.
        task('source', 'target').download(this.value.link),
    ]

    bot.commands.run(tasks, limits=(0,), error_limit=0)

    assert list(target.keys()) == [url]
Exemplo n.º 18
0
def test_watch_limits(bot):
    """Watched tasks respect bot.limit and resume on subsequent runs."""

    def increment(row):
        # Stop producing once the key reaches 16.
        if row.key < 16:
            yield row.key + 1

    tasks = [
        task('a').once().append(1),
        task('a', 'b', watch=True).call(increment),
        task('b', 'a', watch=True).call(increment),
    ]

    a = bot.define('a')
    b = bot.define('b')

    # With limit=1 each task handles one row per run, the watched tasks
    # ping-ponging incremented values between 'a' and 'b':
    #   a: [1] -> [1, 3] -> [1, 3, 5] -> [1, 3, 5, 7]
    #   b: []  -> [2]    -> [2, 4]    -> [2, 4, 6]
    bot.limit = 1
    run_all_tasks(bot, tasks)
    assert list(a.keys()) == [1, 3, 5, 7]
    assert list(b.keys()) == [2, 4, 6]

    # A second limited run continues from where the first stopped.
    bot.limit = 1
    run_all_tasks(bot, tasks)
    assert list(a.keys()) == [1, 3, 5, 7, 9, 11, 13]
    assert list(b.keys()) == [2, 4, 6, 8, 10, 12]

    # An unlimited run drains everything up to the 16 cut-off.
    bot.limit = 0
    run_all_tasks(bot, tasks)
    assert list(a.keys()) == [1, 3, 5, 7, 9, 11, 13, 15]
    assert list(b.keys()) == [2, 4, 6, 8, 10, 12, 14, 16]
Exemplo n.º 19
0
def test_run(bot):
    """'run -f' first validates, then runs with limit=1 and limit=0."""
    pipeline = {
        'pipes': [define('p1'), define('p2')],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b')]),
            task('p1', 'p2').select(this.key, this.value.upper()),
        ]
    }

    bot.main(pipeline, argv=['run', '-f'])

    expected = textwrap.dedent('''\
    Validating pipeline.

    Run pipeline (limit=1).

    Run pipeline (limit=0).
    ''')
    assert bot.output.output.getvalue() == expected
    assert list(bot.pipe('p2').items()) == [(1, 'A'), (2, 'B')]
Exemplo n.º 20
0
def test_run(bot):
    """Running a pipeline logs both passes and populates the target pipe."""
    source_rows = [(1, 'a'), (2, 'b')]
    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append(source_rows),
            task('p1', 'p2').select(this.key, this.value.upper()),
        ]
    }

    bot.main(pipeline, argv=['run', '-f'])

    assert bot.output.output.getvalue() == textwrap.dedent('''\
    Validating pipeline.

    Run pipeline (limit=1).

    Run pipeline (limit=0).
    ''')
    assert list(bot.pipe('p2').items()) == [(1, 'A'), (2, 'B')]
Exemplo n.º 21
0
def test_expresion_str():
    """str() of a task expression reproduces its canonical chained form."""
    expr = (
        task('a', 'b').
        select(['query', ('key', {
            'foo': select('query'),
        })]).
        download(this.key, check=select('query')).
        urlparse().
        query.key.cast(int)
    )
    assert str(expr) == dedent('''
        task('a', 'b').
        select(['query', ('key', {'foo': select('query')})]).
        download(this.key, check=select('query')).
        urlparse().query.key.
        cast('int')
    ''').strip()
Exemplo n.º 22
0
def test_download_expr(bot, requests):
    """A databot expression may be used as the download URL."""
    url = 'http://example.com/1'
    requests.get(url, content=b'<div></div>')

    bot.define('source').append([(1, {'link': url})])
    target = bot.define('target')

    # this.value.link resolves to the row's stored URL at run time.
    tasks = [task('source', 'target').download(this.value.link)]

    bot.commands.run(tasks, limits=(0,), error_limit=0)
    assert list(target.keys()) == [url]
Exemplo n.º 23
0
def test_run_freq():
    """freq(days=3) skips runs until enough time has passed."""
    tasks = [
        task('p1').freq(days=3).append(['a']),
        task('p1', 'p2').select(this.key.upper()),
    ]

    bot = Bot()
    p1 = bot.define('p1')
    p2 = bot.define('p2')

    # First run appends; the second is only one day later, so it is skipped.
    with freezegun.freeze_time('2017-01-01 00:00:00'):
        bot.commands.run(tasks)
    with freezegun.freeze_time('2017-01-02 00:00:00'):
        bot.commands.run(tasks)

    assert list(p1.keys()) == ['a']
    assert list(p2.keys()) == ['A']

    # Three days after the first run the task fires again.
    with freezegun.freeze_time('2017-01-04 00:00:00'):
        bot.commands.run(tasks)

    assert list(p1.keys()) == ['a', 'a']
    assert list(p2.keys()) == ['A', 'A']
Exemplo n.º 24
0
def test_expresion_str():
    """Stringifying a chained expression yields its canonical source form."""
    expr = (task('a', 'b').select(
        ['query', ('key', {
            'foo': select('query'),
        })]).download(this.key,
                      check=select('query')).urlparse().query.key.cast(int))
    assert str(expr) == dedent('''
        task('a', 'b').
        select(['query', ('key', {'foo': select('query')})]).
        download(this.key, check=select('query')).
        urlparse().query.key.
        cast('int')
    ''').strip()
Exemplo n.º 25
0
def test_download_check_multiple(bot, requests):
    """A check selector matching several nodes still counts as success."""
    url = 'http://example.com/1'
    requests.get(url, content=b'''
        <div>
            <h1>Test</h1>
            <p>1</p>
            <p>2</p>
            <p>3</p>
            <h2></h2>
        </div>
    ''')

    source = bot.define('source').append(url)
    target = bot.define('target')
    pipe = target(source)

    # 'p' matches three elements; the check passes and no error is logged.
    tasks = [task('source', 'target').download(check='p')]

    bot.commands.run(tasks, limits=(0,), error_limit=0)

    assert target.count() == 1
    assert pipe.errors.count() == 0
    assert list(target.keys()) == [url]
Exemplo n.º 26
0
        self.queries = queries

    def __call__(self, select, row, node, many=False, single=True):
        """Render each stored query for this row/node and join the
        normalized time values with '/' into one composite string.

        Note: ``normtime`` and the semantics of ``select.render`` are
        defined elsewhere in the project — presumably time normalization
        of selected text; confirm against their definitions.
        """
        return '/'.join([
            normtime(select.render(row, node, q, many, single))
            for q in self.queries
        ])


pipeline = {
    'pipes': [
        define('pages', compress=True),
        define('data'),
    ],
    'tasks': [
        task('pages').freq(minutes=5).download('http://www.meteo.lt/lt_LT/miestas?placeCode=Vilnius'),
        task('pages', 'data').select([
            '.forecast-hours', (
                key(select(['xpath://body css:.forecast-hours .forecastTime:text']).min(),
                    select('.forecastTime:text')),
                {
                    'base': select(['xpath://body css:.forecast-hours .forecastTime:text']).min().apply(normtime),  # precision=hours base time
                    'time': select('.forecastTime:text').apply(normtime),  # precision=hours prediction time
                    'temperature': select('.temperature:text').cast(int),  # °C
                    'wind_direction': select('.windDirectionGroundDegree:text').cast(int),  # degrees
                    'wind_speed': select('.windSpeedGround:text').cast(int),  # m/s
                    'gust_speed': select('.windGustGround:text').cast(int),  # m/s
                    'precipitation': select('.precipitation:text').cast(float),  # mm/h
                    'pressure': select('.pressureMeanSea:text').cast(int),  # hPa
                    'humidity': select('.humidityGround:text').cast(int),  # %
                    'feels_like': select('.feelLike:text').cast(int),  # °C
Exemplo n.º 27
0
#!/usr/bin/env python3

from databot import Bot, define, task, first

# Reddit front-page scraper: download the index once, extract story
# metadata, export to JSON lines, then compact the pipes.
pipeline = {
    'pipes': [
        define('index'),
        define('news'),
    ],
    'tasks': [
        task('index').once().download('https://www.reddit.com/'),
        task('index', 'news').select([
            '.thing.link',
            ('.entry .title > a@href', {
                'title': '.entry .title > a:text',
                'score': '.midcol .score.likes@title',
                'time': first(['.tagline time@datetime']),
                'comments': '.entry a.comments:text',
            })
        ]),
        task('news').export('/tmp/reddit.jsonl'),
        task().compact(),
    ],
}

if __name__ == '__main__':
    Bot('/tmp/reddit.db').main(pipeline)
Exemplo n.º 28
0
                break

            try:
                browser.wait.until(
                    attribute_has_changed(By.CSS_SELECTOR,
                                          '.searchResultDescription a', 'href',
                                          first_item_link))
            except TimeoutException:
                browser.get_screenshot_as_file(
                    '/tmp/epaveldas_attribute_has_changed.png')

    except:
        browser.get_screenshot_as_file('/tmp/epaveldas_error.png')
        raise
    finally:
        browser.quit()


# Pipeline: collect search-index URLs once, deduplicating repeated links.
pipeline = {
    'pipes': [
        define('paieškos-nuorodos'),
    ],
    'tasks': [
        task('paieškos-nuorodos').once().clean().append(
            extract_index_urls(),
            progress='paieškos-nuorodos',
        ).dedup(),
    ],
}

if __name__ == '__main__':
    botlib.runbot(pipeline)
Exemplo n.º 29
0
#!/usr/bin/env python3

from databot import Bot, define, task, first

# Scrape the reddit front page: one-time index download, story extraction,
# JSONL export, and finally pipe compaction.
pipeline = {
    'pipes': [
        define('index'),
        define('news'),
    ],
    'tasks': [
        task('index').once().download('https://www.reddit.com/'),
        task('index', 'news').select([
            '.thing.link', (
                '.entry .title > a@href', {
                    'title': '.entry .title > a:text',
                    'score': '.midcol .score.likes@title',
                    'time': first(['.tagline time@datetime']),
                    'comments': '.entry a.comments:text',
                }
            )
        ]),
        task('news').export('/tmp/reddit.jsonl'),
        task().compact(),
    ],
}

if __name__ == '__main__':
    Bot('/tmp/reddit.db').main(pipeline)
Exemplo n.º 30
0
        'pradžia': start,
        'pabaiga': end,
        'trukmė': total,
    }


pipeline = {
    'pipes': [
        define('paieškos-nuorodos'),
        define('paieškos-puslapiai', compress=True),
        define('knygos-duomenys'),
    ],
    'tasks': [
        # task('paieškos-nuorodos').once().append(extract_index_urls(),
        #                                         progress='paieškos-nuorodos').dedup(),
        task('paieškos-nuorodos', 'paieškos-puslapiai').download(),
        task('paieškos-puslapiai', 'knygos-duomenys').select(
            this.key, {
                'url':
                this.value.url,
                'antraštė':
                select('.authorTitle').text(),
                'd1':
                select([
                    '.entryTable tr',
                    (
                        select('th:content'),
                        select('td:content').strip(),
                    ),
                ]).apply(dict),
                'd2':
Exemplo n.º 31
0
        )

    yield from ((str(x[key]), json.loads(x.to_json())) for _, x in data.iterrows())


pipeline = {
    'pipes': [
        define('vidurkiai-zip'),
        define('skaiciai-zip'),
        define('vidurkiai'),
        define('skaiciai'),
        define('imones-puslapis', compress=True),
        define('imones'),
    ],
    'tasks': [
        task('vidurkiai-zip').monthly().download('http://sodra.is.lt/Failai/Vidurkiai.zip'),
        task('vidurkiai-zip', 'vidurkiai').
            call(partial(read_csv, 'VIDURKIAI.CSV', 'kodas', ['regnr', 'kodas', 'alga', 'autorine', 'viso'])).
            dedup(),
        task('skaiciai-zip').monthly().download('http://sodra.is.lt/Failai/Apdraustuju_skaicius.zip'),
        task('skaiciai-zip', 'skaiciai').
            call(partial(read_csv, 'APDRAUSTUJU_SKAICIUS.CSV', 'kodas', ['regnr', 'kodas', 'skaicius'])).
            dedup(),
        task('vidurkiai', 'imones-puslapis').download(
            'https://draudejai.sodra.lt/draudeju_viesi_duomenys/',
            method='POST',
            data={
                'formType': 'NEW',
                'year': '2017',
                'month': '1',
                'declarantCode2': this.value.kodas.cast(int).cast(str),
Exemplo n.º 32
0
from databot import define, task, this, strformat, select


search_url = 'https://www.limis.lt/greita-paieska/rezultatai/-/exhibitList/form?searchOnlySimpleMetadata=false&searchOnlyWithImages=false&searchInExhibits=true&searchInArchives=true&searchInLibraries=true&searchInAudioVideo=true&searchInPhotos=true&s_tab=&s_id=2duvdg1N5K4dHB0W&backUrl=https%3a%2f%2fwww.limis.lt%2fpradinis%2f-%2fexhibitSearchFast%2fform&listDisplayMode=simple&_exhibitListportlet_WAR_limiskportlet_searchType=&page={page}&rowsOnPage=48'

pipeline = {
    'pipes': [
        define('paieška'),
        define('paieškos-puslapių-numeriai'),
        define('paieškos-puslapiai', compress=True),
        define('eksponatų-nuorodos'),
        define('eksponatų-puslapiai', compress=True),
        define('eksponatų-duomenys'),
    ],
    'tasks': [
        task('paieška').daily().clean().download(search_url.format(page=1), check='select[name=page]'),

        task('paieška', 'paieškos-puslapių-numeriai').daily().
        select(['select[name=page] option @value']).
        dedup(),

        task('paieškos-puslapių-numeriai', 'paieškos-puslapiai').
        download(strformat(search_url, page=this.key), check='#exhibitListBlockId'),

        task('paieškos-puslapiai', 'eksponatų-nuorodos').select([
            '#exhibitListBlockId .thumbs-with-title > li span.vertical-scroller > a', ('@href', ':text'),
        ]),

        task('eksponatų-nuorodos', 'eksponatų-puslapiai').download(check='#exhibit_block_main_info'),

        task('eksponatų-puslapiai', 'eksponatų-duomenys').select(this.key, {
Exemplo n.º 33
0
                    '.inventoryBaseDataTable tr',
                    (
                        'td[1]:content',
                        'td[2]:content',
                    )
                ]),
            })


pipeline = {
    'pipes': [
        define('index pages'),
        define('data'),
    ],
    'tasks': [
        task('index pages').call(extract_archive_pages,
                                 range(1812, 1921, 5)).clean().reset(),
        task('index pages', 'data').select(
            '.inventoryLabel:text', {
                'fondas':
                '.upperHierarchyTreeInner xpath:a[1]/text()',
                'apyrašas':
                '.upperHierarchyTreeInner xpath:a[2]/text()',
                'data':
                select([
                    '.inventoryBaseDataTable tr',
                    (
                        'td[1]:content',
                        'td[2]:content',
                    )
                ]).cast(dict),
            }),
Exemplo n.º 34
0
                'osm_id': row.osm_id,
                'name': row.name,
                'lon': row.lon,
                'lat': row.lat,
                'religion': row.religion,
                'denomination': row.denomination,
                'place': find_closes_place(conn, point, row),
            }


pipeline = {
    'pipes': [
        define('baznycios'),
    ],
    'tasks': [
        task('baznycios').once().clean().append(
            query(), progress='baznycios').compact(),
        task('baznycios').once().export('data/osm/baznycios.csv',
                                        include=[
                                            'osm_id',
                                            'name',
                                            'religion',
                                            'denomination',
                                            'lon',
                                            'lat',
                                            'place.osm_id',
                                            'place.name',
                                            'place.distance',
                                            'place.population',
                                            'place.lon',
                                            'place.lat',
                                        ])
Exemplo n.º 35
0
cookies = settings['cookies']['www.lrs.lt']

pipeline = {
    'pipes': [
        define('klausimų-puslapiai', botlib.dburi('lrs/posedziai')),
        define('dokumentų-sąrašas'),
        define('dokumentų-puslapiai', compress=True),
        define('susijusių-dokumentų-sąrašas'),
        define('susijusių-dokumentų-puslapiai', compress=True),
        define('metadata'),
        define('texts'),
    ],
    'tasks': [
        task('klausimų-puslapiai', 'dokumentų-sąrašas').select([
            '#page-content div.default b xpath:a[text()="dokumento tekstas"]/@href'
        ]).dedup(),
        task('dokumentų-sąrašas', 'dokumentų-puslapiai').download(
            cookies=cookies,
            check=
            '#page-content div.default b xpath:a[text()="dokumento tekstas"]'),
        task('dokumentų-puslapiai', 'susijusių-dokumentų-sąrašas').select([
            '#page-content div.default b xpath:a[text()="susiję dokumentai"]/@href'
        ]).dedup(),
        task('susijusių-dokumentų-puslapiai').download(
            cookies=cookies,
            check=
            '#page-content div.default b xpath:a[text()="susiję dokumentai"]'),
        task('dokumentų-puslapiai', 'metadata').select(
            this.key,
            call(dict, [
Exemplo n.º 36
0
    }


pipeline = {
    'pipes': [
        define('raidės-nuorodos'),
        define('raidės-puslapiai', compress=True),
        define('sąrašas-nuorodos'),
        define('sąrašas-puslapiai', compress=True),
        define('vardai-nuorodos'),
        define('vardai-puslapiai', compress=True),
        define('vardai'),
    ],
    'tasks': [
        # Vardo pirmos raidės sąrašas
        task('raidės-nuorodos').monthly().append(
            'https://www.tevu-darzelis.lt/vaiku-vardai/A/'),
        task('raidės-puslapiai', 'raidės-nuorodos',
             watch=True).select(['#alphabet li a@href']).dedup(),
        task('raidės-nuorodos', 'raidės-puslapiai', watch=True).download(),

        # Sąrašo puslapiavimas
        task('raidės-puslapiai', 'sąrašas-nuorodos',
             watch=True).select(['.pagination li a@href']).dedup(),
        task('sąrašas-puslapiai', 'sąrašas-nuorodos',
             watch=True).select(['.pagination li a@href']).dedup(),
        task('sąrašas-nuorodos', 'sąrašas-puslapiai', watch=True).download(),

        # Vardų puslapiai
        task('sąrašas-puslapiai', 'vardai-nuorodos').select([
            '.name-list li',
            ('a@href', {
Exemplo n.º 37
0
import botlib

from datetime import timedelta
from databot import define, task, this, select

pipeline = {
    'pipes': [
        define('index urls'),
        define('index pages'),
        define('dataset urls'),
        define('dataset pages'),
        define('dataset data'),
        define('datasets'),
    ],
    'tasks': [
        task('index urls').daily().append(
            'http://opendata.gov.lt/index.php?vars=/public/public/search'),
        task('index urls', 'index pages', watch=True).download(),
        task('index pages', 'index urls',
             watch=True).select(['td > a.path@href']).dedup(),
        task('index pages', 'dataset urls').select(
            ['form[name=frm] > table > tr > td[3] > a@href']),
        task('dataset urls').clean(timedelta(days=7)).dedup(),
        task('dataset urls', 'dataset pages').download(),
        task('dataset pages', 'dataset data').select(this.key, [
            'table xpath:tr[count(td)=2]',
            (
                'td[1]:content',
                select('td[2]:content').strip(),
            )
        ]),
        task('dataset data').clean(timedelta(days=7)).dedup(),
Exemplo n.º 38
0
pipeline = {
    'pipes': [
        define('klausimų-puslapiai', botlib.dburi('lrs/posedziai')),
        define('balsavimų-sąrašas'),
        define('balsavimų-puslapiai', compress=True),
        define('balsavimų-duomenys'),
        define('registracijos-sąrašas'),
        define('registracijos-puslapiai', compress=True),
    ],
    'tasks': [
        # Darbotvarkės klausimas (balsavimai)
        task('klausimų-puslapiai', 'balsavimų-sąrašas').select(
            [
                '.sale_svarst_eiga tr td[2] xpath:a[text()="balsavimas"]',
                '@href'
            ],
            check=
            'xpath://h1[contains(text(), "Darbotvarkės klausimas")]/text()',
        ).dedup(),
        task('balsavimų-sąrašas', 'balsavimų-puslapiai').download(
            cookies=cookies, check='#page-content h1.page-title'),
        task('balsavimų-puslapiai', 'balsavimų-duomenys').select(
            this.key, {
                'data':
                select('h1.page-title:text').re(r'\d{4}-\d{2}-\d{2}'),
                'posėdis':
                select('h1.page-title:text').re(r'(\w+) posėdis\)'),
                'klausimai': [
                    'xpath://b/a[contains(@class, "link") and text()="dokumento tekstas"]',
                    {
                        'pavadinimas':