Example 1
def test_run_error_limit_n(bot, capsys):
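    # The handler below raises for every row with key > 1, so the error
    # limit of 2 (apparently set via '-f 2') is hit after two failed rows.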
    def handler(row):
        if row.key > 1:
            raise ValueError('Error.')
        else:
            yield row.key, row.value.upper()

    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]),
            task('p1', 'p2').call(handler),
        ]
    }

    with pytest.raises(ExpressionError):
        bot.main(pipeline, argv=['run', '-f', '2', '-l', '0'])

    assert bot.output.output.getvalue() == textwrap.dedent('''\
    Validating pipeline.

    Run pipeline (limit=0).
    - key: 3
      value: 'c'
    ''')
    assert list(bot.pipe('p2').items()) == [(1, 'A')]
    assert capsys.readouterr()[0] == 'Interrupting bot because error limit of 2 was reached.\n'
    assert task('p1', 'p2').errors.count()._eval(bot) == 2
Example 2
def test_main(db):
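    # select(this.key, this.value.upper()) copies each row from p1 into p2,
    # upper-casing the value; '-v0' presumably silences console output.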
    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([('1', 'a'), ('2', 'b'), ('3', 'c')]),
            task('p1', 'p2').select(this.key, this.value.upper()),
        ],
    }

    bot = db.Bot().main(pipeline, argv=['-v0', 'run'])

    assert list(bot.pipe('p1').items()) == [('1', 'a'), ('2', 'b'), ('3', 'c')]
    assert list(bot.pipe('p2').items()) == [('1', 'A'), ('2', 'B'), ('3', 'C')]
Example 3
def main():
    botlib.runbot({
        'pipes': [
            define('places'),
        ],
        'tasks': [
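        # Rebuild the 'places' pipe daily (clean() then append()) and export
        # the listed columns to CSV.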
            task('places').daily().clean().append(query_places(), progress='places'),
            task('places').export('data/osm/places.csv', include=[
                'osm_id',
                'type',
                'place',
                'population',
                'wikipedia_title',
                'wikipedia_lang',
                'lon',
                'lat',
                'admin_level_6_osm_id',
                'admin_level_6',
                'admin_level_5_osm_id',
                'admin_level_5',
                'admin_level_4_osm_id',
                'admin_level_4',
            ])
        ],
    })
Example 4
def test_run():
    pipeline = {
        'pipes': [
            define('a'),
            define('b'),
        ],
        'tasks': [
            task('a').append(['a', 'A', 'b']),
            task('a', 'b').select(this.key.upper()),
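            # 'a' and 'A' both map to key 'A' in pipe 'b'; compact() below
            # removes the duplicate (see the final assertion).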
            task().compact(),
        ],
    }

    bot = Bot()
    bot.main(pipeline, ['run', '-f'])

    assert list(bot.pipe('a').keys()) == ['a', 'A', 'b']
    assert list(bot.pipe('b').keys()) == ['A', 'B']
Example 5
def test_run(bot):
    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b')]),
            task('p1', 'p2').select(this.key, this.value.upper()),
        ]
    }

    bot.main(pipeline, argv=['run', '-f'])
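    # The expected output records two passes: a trial run with limit=1,
    # then the full run with limit=0.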
    assert bot.output.output.getvalue() == textwrap.dedent('''\
    Validating pipeline.

    Run pipeline (limit=1).

    Run pipeline (limit=0).
    ''')
    assert list(bot.pipe('p2').items()) == [(1, 'A'), (2, 'B')]
Example 6
def main(argv=None, output=sys.stdout):
    argv = argv or sys.argv[1:]

    parser = argparse.ArgumentParser()
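    # Only the first CLI argument (the database) is parsed here; the rest of
    # argv is forwarded to bot.main() below.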
    parser.add_argument('db', help='path to sqlite database or database connection string')
    args = parser.parse_args(argv[:1])
    bot = databot.Bot(args.db, output=output)

    pipeline = {
        'pipes': [databot.define(pipe.pipe) for pipe in get_pipe_tables(bot)],
        'tasks': [],
    }

    bot.main(pipeline, argv=argv[1:])
Example 7
class key(Call):

    def __init__(self, *queries):
        self.queries = queries

    def __call__(self, select, row, node, many=False, single=True):
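        # Render each query against the current row/node and join the
        # normtime-normalized results with '/' into a composite key.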
        return '/'.join([
            normtime(select.render(row, node, q, many, single))
            for q in self.queries
        ])


pipeline = {
    'pipes': [
        define('pages', compress=True),
        define('data'),
    ],
    'tasks': [
        task('pages').freq(minutes=5).download('http://www.meteo.lt/lt_LT/miestas?placeCode=Vilnius'),
        task('pages', 'data').select([
            '.forecast-hours', (
                key(select(['xpath://body css:.forecast-hours .forecastTime:text']).min(),
                    select('.forecastTime:text')),
                {
                    'base': select(['xpath://body css:.forecast-hours .forecastTime:text']).min().apply(normtime),  # precision=hours base time
                    'time': select('.forecastTime:text').apply(normtime),  # precision=hours prediction time
                    'temperature': select('.temperature:text').cast(int),  # °C
                    'wind_direction': select('.windDirectionGroundDegree:text').cast(int),  # degrees
                    'wind_speed': select('.windSpeedGround:text').cast(int),  # m/s
                    'gust_speed': select('.windGustGround:text').cast(int),  # m/s
Example 8
#!/usr/bin/env python3

from databot import Bot, define, task, first

pipeline = {
    'pipes': [
        define('index'),
        define('news'),
    ],
    'tasks': [
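        # Each '.thing.link' element becomes one row: the key is the post
        # URL, the value a dict with title, score, time and comment count.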
        task('index').once().download('https://www.reddit.com/'),
        task('index', 'news').select([
            '.thing.link',
            ('.entry .title > a@href', {
                'title': '.entry .title > a:text',
                'score': '.midcol .score.likes@title',
                'time': first(['.tagline time@datetime']),
                'comments': '.entry a.comments:text',
            })
        ]),
        task('news').export('/tmp/reddit.jsonl'),
        task().compact(),
    ],
}

if __name__ == '__main__':
    Bot('/tmp/reddit.db').main(pipeline)
Example 9
        for k, v in json.loads(tops.group(1)).items()
    }
    data = json.loads(data.group(1))
    assert len(tops) == len(data)
    data = dict(zip(sorted(tops.keys()), data))
    return {
        'year': data,
        'mean': sum(data.values()) / len(data),
        'max': max(data.values()),
        'top': min(tops.values())
    }


pipeline = {
    'pipes': [
        define('raidės-nuorodos'),
        define('raidės-puslapiai', compress=True),
        define('sąrašas-nuorodos'),
        define('sąrašas-puslapiai', compress=True),
        define('vardai-nuorodos'),
        define('vardai-puslapiai', compress=True),
        define('vardai'),
    ],
    'tasks': [
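        # Pipe names are Lithuanian: raidės = letters, sąrašas = list,
        # vardai = names, nuorodos = links, puslapiai = pages.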
        # List of first letters of names
        task('raidės-nuorodos').monthly().append(
            'https://www.tevu-darzelis.lt/vaiku-vardai/A/'),
        task('raidės-puslapiai', 'raidės-nuorodos',
             watch=True).select(['#alphabet li a@href']).dedup(),
        task('raidės-nuorodos', 'raidės-puslapiai', watch=True).download(),
Example 10
                break

            try:
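                # Wait for the first result's href to change (presumably the
                # next page of results loading); on timeout, save a screenshot.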
                browser.wait.until(
                    attribute_has_changed(By.CSS_SELECTOR,
                                          '.searchResultDescription a', 'href',
                                          first_item_link))
            except TimeoutException:
                browser.get_screenshot_as_file(
                    '/tmp/epaveldas_attribute_has_changed.png')

    except:
        browser.get_screenshot_as_file('/tmp/epaveldas_error.png')
        raise
    finally:
        browser.quit()


pipeline = {
    'pipes': [
        define('paieškos-nuorodos'),
    ],
    'tasks': [
        task('paieškos-nuorodos').once().clean().append(
            extract_index_urls(), progress='paieškos-nuorodos').dedup(),
    ],
}

if __name__ == '__main__':
    botlib.runbot(pipeline)
Example 11
#!/usr/bin/env python3

import yaml
import botlib

from databot import define, task, this, select

with open('settings.yml') as f:
    settings = yaml.safe_load(f)

cookies = settings['cookies']['www.lrs.lt']

pipeline = {
    'pipes': [
        define('klausimų-puslapiai', botlib.dburi('lrs/posedziai')),
        define('balsavimų-sąrašas'),
        define('balsavimų-puslapiai', compress=True),
        define('balsavimų-duomenys'),
        define('registracijos-sąrašas'),
        define('registracijos-puslapiai', compress=True),
    ],
    'tasks': [
        # Agenda item (votes)
        task('klausimų-puslapiai', 'balsavimų-sąrašas').select(
            [
                '.sale_svarst_eiga tr td[2] xpath:a[text()="balsavimas"]',
                '@href'
            ],
            check='xpath://h1[contains(text(), "Darbotvarkės klausimas")]/text()',
        ).dedup(),
Example 12
#!/usr/bin/env python3

import botlib

from datetime import timedelta
from databot import define, task, this, select

pipeline = {
    'pipes': [
        define('index urls'),
        define('index pages'),
        define('dataset urls'),
        define('dataset pages'),
        define('dataset data'),
        define('datasets'),
    ],
    'tasks': [
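        # 'index urls' and 'index pages' form a crawl loop: each downloaded
        # page feeds newly found pagination links back into 'index urls',
        # and dedup() keeps the loop from revisiting pages.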
        task('index urls').daily().append(
            'http://opendata.gov.lt/index.php?vars=/public/public/search'),
        task('index urls', 'index pages', watch=True).download(),
        task('index pages', 'index urls',
             watch=True).select(['td > a.path@href']).dedup(),
        task('index pages', 'dataset urls').select(
            ['form[name=frm] > table > tr > td[3] > a@href']),
        task('dataset urls').clean(timedelta(days=7)).dedup(),
        task('dataset urls', 'dataset pages').download(),
        task('dataset pages', 'dataset data').select(this.key, [
            'table xpath:tr[count(td)=2]',
            (
                'td[1]:content',
                select('td[2]:content').strip(),
Example 13
#!/usr/bin/env python3

import yaml
import botlib

from databot import define, task, this, select, call

with open('settings.yml') as f:
    settings = yaml.safe_load(f)

cookies = settings['cookies']['www.lrs.lt']

pipeline = {
    'pipes': [
        define('klausimų-puslapiai', botlib.dburi('lrs/posedziai')),
        define('dokumentų-sąrašas'),
        define('dokumentų-puslapiai', compress=True),
        define('susijusių-dokumentų-sąrašas'),
        define('susijusių-dokumentų-puslapiai', compress=True),
        define('metadata'),
        define('texts'),
    ],
    'tasks': [
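        # check= presumably verifies that each fetched page contains the
        # expected element, guarding against error pages.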
        task('klausimų-puslapiai', 'dokumentų-sąrašas').select([
            '#page-content div.default b xpath:a[text()="dokumento tekstas"]/@href'
        ]).dedup(),
        task('dokumentų-sąrašas', 'dokumentų-puslapiai').download(
            cookies=cookies,
            check='#page-content div.default b xpath:a[text()="dokumento tekstas"]'),
        task('dokumentų-puslapiai', 'susijusių-dokumentų-sąrašas').select([
Example 14
            sep=r';\s*',
            engine='python',
            encoding='iso-8859-4',
            decimal=',',
            skiprows=12,
            comment=';',
            header=None,
            names=names,
        )

    # Emit (key, row) pairs: the key column as a string, the full row
    # converted to a plain dict via a JSON round-trip.
    yield from ((str(x[key]), json.loads(x.to_json())) for _, x in data.iterrows())


pipeline = {
    'pipes': [
        define('vidurkiai-zip'),
        define('skaiciai-zip'),
        define('vidurkiai'),
        define('skaiciai'),
        define('imones-puslapis', compress=True),
        define('imones'),
    ],
    'tasks': [
        task('vidurkiai-zip').monthly().download('http://sodra.is.lt/Failai/Vidurkiai.zip'),
        task('vidurkiai-zip', 'vidurkiai').
            call(partial(read_csv, 'VIDURKIAI.CSV', 'kodas', ['regnr', 'kodas', 'alga', 'autorine', 'viso'])).
            dedup(),
        task('skaiciai-zip').monthly().download('http://sodra.is.lt/Failai/Apdraustuju_skaicius.zip'),
        task('skaiciai-zip', 'skaiciai').
            call(partial(read_csv, 'APDRAUSTUJU_SKAICIUS.CSV', 'kodas', ['regnr', 'kodas', 'skaicius'])).
            dedup(),
Example 15
    for qry in queries:
        for row in conn.execute(qry):
            yield row.osm_id, {
                'osm_id': row.osm_id,
                'name': row.name,
                'lon': row.lon,
                'lat': row.lat,
                'religion': row.religion,
                'denomination': row.denomination,
                'place': find_closes_place(conn, point, row),
            }


pipeline = {
    'pipes': [
        define('baznycios'),
    ],
    'tasks': [
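        # clean() empties the 'baznycios' (churches) pipe before append()
        # refills it from query(); compact() keeps one row per osm_id key.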
        task('baznycios').once().clean().append(
            query(), progress='baznycios').compact(),
        task('baznycios').once().export('data/osm/baznycios.csv',
                                        include=[
                                            'osm_id',
                                            'name',
                                            'religion',
                                            'denomination',
                                            'lon',
                                            'lat',
                                            'place.osm_id',
                                            'place.name',
                                            'place.distance',
Example 16
#!/usr/bin/env python3

import botlib

from databot import define, task, this, strformat, select


search_url = 'https://www.limis.lt/greita-paieska/rezultatai/-/exhibitList/form?searchOnlySimpleMetadata=false&searchOnlyWithImages=false&searchInExhibits=true&searchInArchives=true&searchInLibraries=true&searchInAudioVideo=true&searchInPhotos=true&s_tab=&s_id=2duvdg1N5K4dHB0W&backUrl=https%3a%2f%2fwww.limis.lt%2fpradinis%2f-%2fexhibitSearchFast%2fform&listDisplayMode=simple&_exhibitListportlet_WAR_limiskportlet_searchType=&page={page}&rowsOnPage=48'

pipeline = {
    'pipes': [
        define('paieška'),
        define('paieškos-puslapių-numeriai'),
        define('paieškos-puslapiai', compress=True),
        define('eksponatų-nuorodos'),
        define('eksponatų-puslapiai', compress=True),
        define('eksponatų-duomenys'),
    ],
    'tasks': [
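        # Page numbers are read from the search form's own <select> element
        # and substituted back into search_url to fetch every results page.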
        task('paieška').daily().clean().download(search_url.format(page=1), check='select[name=page]'),

        task('paieška', 'paieškos-puslapių-numeriai').daily().
        select(['select[name=page] option @value']).
        dedup(),

        task('paieškos-puslapių-numeriai', 'paieškos-puslapiai').
        download(strformat(search_url, page=this.key), check='#exhibitListBlockId'),

        task('paieškos-puslapiai', 'eksponatų-nuorodos').select([
            '#exhibitListBlockId .thumbs-with-title > li span.vertical-scroller > a', ('@href', ':text'),
        ]),
Example 17
        start = min(start)
        end = max(end)
        total = sum(b - a for a, b in years)
    else:
        start = end = total = 0
    return {
        'parapija': parapija,
        'pradžia': start,
        'pabaiga': end,
        'trukmė': total,
    }


pipeline = {
    'pipes': [
        define('paieškos-nuorodos'),
        define('paieškos-puslapiai', compress=True),
        define('knygos-duomenys'),
    ],
    'tasks': [
        # task('paieškos-nuorodos').once().append(extract_index_urls(),
        #                                         progress='paieškos-nuorodos').dedup(),
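        # The append task above is commented out, so the pipeline resumes
        # from previously collected 'paieškos-nuorodos'.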
        task('paieškos-nuorodos', 'paieškos-puslapiai').download(),
        task('paieškos-puslapiai', 'knygos-duomenys').select(
            this.key, {
                'url': this.value.url,
                'antraštė': select('.authorTitle').text(),
                'd1': select([