예제 #1
0
def test_null(Html, freezetime):
    """Check how ``null()`` affects an optional selection that finds nothing.

    The frozen time matches the ``created`` timestamp that appears in the
    expected error message below.
    """
    freezetime('2017-05-18T14:43:40.876642')

    row = Html(['<div><p id="this"> p1 </p>/div>'])

    # No null() in the chain: evaluating the expression raises SelectorError.
    failing = html.Select(select('#wrong:text?').strip())
    with pytest.raises(html.SelectorError) as excinfo:
        failing(row)
    expected_message = dedent('''\
        Expression error while evaluating None. Error: error while processing expression:
          this.
          strip()
        evaluated with:
          Row({
              'compression': None,
              'created': datetime.datetime(2017, 5, 18, 14, 43, 40, 876642),
              'id': 1,
              'key': 'http://exemple.com',
              'value': {'content': b'<div><p id="this"> p1 </p>/div>'},
          }). Context:

        <html>
          <body>
            <div><p id="this"> p1 </p>/div&gt;</div>
          </body>
        </html>
    ''')
    assert str(excinfo.value) == expected_message

    # With null() before strip(), the missing selection evaluates to None.
    missing = html.Select(select('#wrong:text?').null().strip())
    assert missing(row) is None

    # With null() in the chain, an existing element is still stripped.
    present = html.Select(select('#this:text?').null().strip())
    assert present(row) == 'p1'
예제 #2
0
def test_expresion_str():
    """Check the multi-line ``str()`` rendering of a chained task expression."""
    expr = (
        task('a', 'b')
        .select(['query', ('key', {'foo': select('query')})])
        .download(this.key, check=select('query'))
        .urlparse()
        .query
        .key
        .cast(int)
    )
    expected = dedent('''
        task('a', 'b').
        select(['query', ('key', {'foo': select('query')})]).
        download(this.key, check=select('query')).
        urlparse().query.key.
        cast('int')
    ''').strip()
    assert str(expr) == expected
예제 #3
0
def test_empty_result(Html):
    """Exercise the ``check`` argument of ``html.Select`` for empty selections."""
    markup = [
        '<div>',
        '  <h1>Test</h1>',
        '  <p>1</p>',
        '  <p>2</p>',
        '  <p>3</p>',
        '  <h2></h2>',
        '</div>',
    ]
    row = Html(markup)

    # An error is raised if the selector does not find anything.
    strict = html.Select(['p.new:text'])
    with pytest.raises(html.SelectorError) as excinfo:
        strict(row)
    expected_error = (
        "Select query did not returned any results. Row key: 'http://exemple.com'. Query: ['p.new:text']"
    )
    assert str(excinfo.value) == expected_error

    # An empty result is allowed when the check is switched off.
    unchecked = html.Select(['p.new:text'], check=False)
    assert unchecked(row) == []

    # An empty result is allowed, but a check query confirms we are still
    # looking at the right page.
    page_checked = html.Select(['p.new:text'], check='xpath://h1[text() = "Test"]')
    assert page_checked(row) == []

    # An existing element without content should not be treated as empty.
    empty_heading = html.Select(select('xpath://h2').text())
    assert empty_heading(row) == ''
예제 #4
0
def test_apply_with_select_arg(Html):
    """Check that ``apply()`` passes an extra expression argument to the callable."""
    def pair_up(value, link):
        # Return both arguments unchanged so the assertion can inspect them.
        return value, link

    row = Html(['<div><p>1</p></div>'])
    number = select('p:text').cast(int)
    selector = html.Select(number.apply(pair_up, this.key))
    result = selector(row)
    assert result == (1, 'http://exemple.com')
예제 #5
0
def test_oneof(Html):
    """Check that ``oneof()`` yields one text per paragraph from the listed alternatives."""
    # The markup is deliberately a single string (adjacent literals are joined).
    markup = (
        '<div>'
        '  <p><a>1</a></p>'
        '  <p><b>2</b></p>'
        '  <p><i>3</i></p>'
        '</div>'
    )
    row = Html([markup])
    alternatives = oneof(
        select('a:text'),
        select('b:text'),
        select('i:text'),
    )
    selector = html.Select(['div p', alternatives])
    assert selector(row) == ['1', '2', '3']
예제 #6
0
def test_expresion_str():
    """Check the multi-line ``str()`` rendering of a chained task expression."""
    source = task('a', 'b')
    selected = source.select(['query', ('key', {'foo': select('query')})])
    downloaded = selected.download(this.key, check=select('query'))
    expr = downloaded.urlparse().query.key.cast(int)

    # One entry per rendered line of the expression.
    expected_lines = [
        "task('a', 'b').",
        "select(['query', ('key', {'foo': select('query')})]).",
        "download(this.key, check=select('query')).",
        "urlparse().query.key.",
        "cast('int')",
    ]
    assert str(expr) == '\n'.join(expected_lines)
예제 #7
0
def test_url(Html):
    """Check ``url(query='id')``: the link without an ``id`` parameter gives None."""
    markup = [
        '<div>',
        '<a href="http://example.com/?id=1">a</a>',
        '<a href="http://example.com/">b</a>',
        '</div>',
    ]
    row = Html(markup)
    href_with_id = select('@href').url(query='id')
    selector = html.Select(['a', href_with_id])
    result = selector(row)
    assert result == ['http://example.com/?id=1', None]
예제 #8
0
def test_text(Html):
    """Check that ``text()`` separates block and ``<br>`` content with blank lines."""
    row = Html(['<div><p>p1</p>p2<br>p3<br>p4</div>'])
    selector = html.Select(select('div').text())
    lines = selector(row).splitlines()
    assert lines == ['p1', '', 'p2', '', 'p3', '', 'p4']
예제 #9
0
def test_select_method(bot):
    """Check that ``select()`` can be applied to markup stored in the row value."""
    # Adjacent literals form one markup string.
    xml = (
        '<div>'
        '  <p>1</p>'
        '  <p>2</p>'
        '  <p>3</p>'
        '</div>'
    )
    row = Row({'key': 1, 'value': {'xml': xml}})

    source = this.value.xml
    selector = html.Select(source.select([select('div p:text').cast(int)]))
    result = selector(row)
    assert result == [1, 2, 3]
예제 #10
0
        task('paieška').daily().clean().download(search_url.format(page=1), check='select[name=page]'),

        task('paieška', 'paieškos-puslapių-numeriai').daily().
        select(['select[name=page] option @value']).
        dedup(),

        task('paieškos-puslapių-numeriai', 'paieškos-puslapiai').
        download(strformat(search_url, page=this.key), check='#exhibitListBlockId'),

        task('paieškos-puslapiai', 'eksponatų-nuorodos').select([
            '#exhibitListBlockId .thumbs-with-title > li span.vertical-scroller > a', ('@href', ':text'),
        ]),

        task('eksponatų-nuorodos', 'eksponatų-puslapiai').download(check='#exhibit_block_main_info'),

        task('eksponatų-puslapiai', 'eksponatų-duomenys').select(this.key, {
            'info': select(['#exhibit_block_main_info > tr', (
                select('xpath:./td[1]').text().strip(),
                select('xpath:./td[2]').text().strip(),
            )]).cast(dict),
            'aprašymas': select('xpath://td[text() = "Eksponato aprašymas"]/following-sibling::td?').null().text(),
            'pateikė': select('xpath://td[text() = "Duomenis pateikė"]/following-sibling::td?').null().text(),
            'savininkas': select('xpath://td[text() = "Savininkas"]/following-sibling::td?').null().text(),
        }),
    ],
}


# Hand the ``pipeline`` definition to ``botlib.runbot`` only when this file is
# executed as a script, not when it is imported.
if __name__ == '__main__':
    botlib.runbot(pipeline)
예제 #11
0
 task('dokumentų-sąrašas', 'dokumentų-puslapiai').download(
     cookies=cookies,
     check=
     '#page-content div.default b xpath:a[text()="dokumento tekstas"]'),
 task('dokumentų-puslapiai', 'susijusių-dokumentų-sąrašas').select([
     '#page-content div.default b xpath:a[text()="susiję dokumentai"]/@href'
 ]).dedup(),
 task('susijusių-dokumentų-puslapiai').download(
     cookies=cookies,
     check=
     '#page-content div.default b xpath:a[text()="susiję dokumentai"]'),
 task('dokumentų-puslapiai', 'metadata').select(
     this.key,
     call(dict, [
         '.basic .ltb',
         (select(':text').strip(), select('b:text?').strip())
     ])).dedup(),
 task('dokumentų-puslapiai',
      'texts').select(this.key, 'body > div:content').dedup(),
 task('metadata').export('data/lrs/dokumentai/metadata.csv',
                         include=[
                             'key',
                             'Data:',
                             'Rūšis:',
                             'Kalba:',
                             'Numeris:',
                             'Statusas:',
                         ]),
 task('texts').export('data/lrs/dokumentai/texts.csv',
                      include=[
                          'key',
예제 #12
0
def test_text_text(Html):
    """Check ``text()`` over a list of sibling nodes mixing text and elements."""
    row = Html(['<div><p>1</p>2<p>3</p>4</div>'])
    siblings = select(['xpath://p[1]/following-sibling::node()'])
    selector = html.Select(siblings.text())
    result = selector(row)
    assert result == '2 3\n\n4'
예제 #13
0
def test_text_processing_instructions(Html):
    """Check that ``text()`` leaves processing instructions out of the result."""
    markup = '<div><p>text <?xml:namespace prefix="o" /></p></div>'
    row = Html([markup])
    extract_text = html.Select(select('div').text())
    result = extract_text(row)
    assert result == 'text'
예제 #14
0
def test_text_comments(Html):
    """Check that ``text()`` leaves comments and empty namespaced tags out of the result."""
    markup = '<div><p>text <!-- comment --><o:p></o:p></p></div>'
    row = Html([markup])
    extract_text = html.Select(select('div').text())
    result = extract_text(row)
    assert result == 'text'
예제 #15
0
        define('dataset data'),
        define('datasets'),
    ],
    'tasks': [
        task('index urls').daily().append(
            'http://opendata.gov.lt/index.php?vars=/public/public/search'),
        task('index urls', 'index pages', watch=True).download(),
        task('index pages', 'index urls',
             watch=True).select(['td > a.path@href']).dedup(),
        task('index pages', 'dataset urls').select(
            ['form[name=frm] > table > tr > td[3] > a@href']),
        task('dataset urls').clean(timedelta(days=7)).dedup(),
        task('dataset urls', 'dataset pages').download(),
        task('dataset pages', 'dataset data').select(this.key, [
            'table xpath:tr[count(td)=2]',
            (
                'td[1]:content',
                select('td[2]:content').strip(),
            )
        ]),
        task('dataset data').clean(timedelta(days=7)).dedup(),
        task('dataset data',
             'datasets').call(lambda x: [(x.key, dict(x.value))]),
        task('datasets').export('data/ivpk/opendata-gov-lt/datasets.jsonl'),
        task().compact(),
    ],
}

# Script entry point: pass the ``pipeline`` definition to ``botlib.runbot``
# only on direct execution, so importing this module starts nothing.
if __name__ == '__main__':
    botlib.runbot(pipeline)
예제 #16
0
def test_select_outside_list(Html):
    """Check that a call chained after a list ``select`` receives the whole list."""
    row = Html(['<div><a name="1">a</a><a name="2"></a></div>'])
    names = select(['a@name'])
    count_names = html.Select(names.apply(len))
    result = count_names(row)
    assert result == 2
예제 #17
0
    'tasks': [
        task('vidurkiai-zip').monthly().download('http://sodra.is.lt/Failai/Vidurkiai.zip'),
        task('vidurkiai-zip', 'vidurkiai').
            call(partial(read_csv, 'VIDURKIAI.CSV', 'kodas', ['regnr', 'kodas', 'alga', 'autorine', 'viso'])).
            dedup(),
        task('skaiciai-zip').monthly().download('http://sodra.is.lt/Failai/Apdraustuju_skaicius.zip'),
        task('skaiciai-zip', 'skaiciai').
            call(partial(read_csv, 'APDRAUSTUJU_SKAICIUS.CSV', 'kodas', ['regnr', 'kodas', 'skaicius'])).
            dedup(),
        task('vidurkiai', 'imones-puslapis').download(
            'https://draudejai.sodra.lt/draudeju_viesi_duomenys/',
            method='POST',
            data={
                'formType': 'NEW',
                'year': '2017',
                'month': '1',
                'declarantCode2': this.value.kodas.cast(int).cast(str),
                'actionName': 'MEAN',
            },
            check='xpath://td[text() = "Draudėjo pavadinimas"]',
        ),
        task('imones-puslapis', 'imones').select(this.value.request.data.declarantCode2, {
            'pavadinimas': select('xpath://td[text() = "Draudėjo pavadinimas"]/following-sibling::td[1]/text()'),
        })
    ],
}


# Script entry point: ``botlib.runbot`` is called with the ``pipeline``
# definition only when the file is run directly.
if __name__ == '__main__':
    botlib.runbot(pipeline)
예제 #18
0
def test_call_getitem(Html):
    """Check that indexing a list ``select`` picks one element for further calls."""
    row = Html(['<div><a name="1">a</a><a name="2">b</a></div>'])
    first_link = select(['div > a'])[0]
    qry = html.Select(first_link.text().upper())
    result = qry(row)
    assert result == 'A'
예제 #19
0
def test_inline_call(Html):
    """Check that a call can be chained on a ``select`` nested inside a tuple query."""
    row = Html(['<div><a name="1">a</a><a name="2">b</a></div>'])
    upper_text = select(':text').upper()
    qry = html.Select(['div > a', ('@name', upper_text)])
    result = qry(row)
    assert result == [('1', 'A'), ('2', 'B')]
예제 #20
0
        task('raidės-puslapiai', 'raidės-nuorodos',
             watch=True).select(['#alphabet li a@href']).dedup(),
        task('raidės-nuorodos', 'raidės-puslapiai', watch=True).download(),

        # Sąrašo puslapiavimas
        task('raidės-puslapiai', 'sąrašas-nuorodos',
             watch=True).select(['.pagination li a@href']).dedup(),
        task('sąrašas-puslapiai', 'sąrašas-nuorodos',
             watch=True).select(['.pagination li a@href']).dedup(),
        task('sąrašas-nuorodos', 'sąrašas-puslapiai', watch=True).download(),

        # Vardų puslapiai
        task('sąrašas-puslapiai', 'vardai-nuorodos').select([
            '.name-list li',
            ('a@href', {
                'name': select('a').text(),
                'class': select('a@class'),
            })
        ]).dedup(),
        task('vardai-nuorodos', 'vardai-puslapiai').download(),

        # Vardai
        task('vardai-puslapiai', 'vardai').select(
            this.key.urlparse().path, {
                'lytis':
                select('#page-left xpath:.//h1[1]/@class'),
                'vardas':
                select('#page-left xpath:.//h1[1]/strong/text()'),
                'kilmė':
                select(
                    '#name-info xpath:./p[strong/text() = "Vardo kilmė:"]/text()?'
예제 #21
0
pipeline = {
    'pipes': [
        define('paieškos-nuorodos'),
        define('paieškos-puslapiai', compress=True),
        define('knygos-duomenys'),
    ],
    'tasks': [
        # task('paieškos-nuorodos').once().append(extract_index_urls(),
        #                                         progress='paieškos-nuorodos').dedup(),
        task('paieškos-nuorodos', 'paieškos-puslapiai').download(),
        task('paieškos-puslapiai', 'knygos-duomenys').select(
            this.key, {
                'url':
                this.value.url,
                'antraštė':
                select('.authorTitle').text(),
                'd1':
                select([
                    '.entryTable tr',
                    (
                        select('th:content'),
                        select('td:content').strip(),
                    ),
                ]).apply(dict),
                'd2':
                select('.authorTitle').text().apply(parse_title),
            }),
        task('knygos-duomenys').export(
            'data/epaveldas/metrikai/knygos.csv',
            update=lambda row: {
                'url': row.value['url'],
예제 #22
0
def test_select_outside_nested_list(Html):
    """Check that an argument-less ``select()`` casts each value found by the outer query."""
    row = Html(['<div><a name="1">a</a><a name="2"></a></div>'])
    as_int = select().cast(int)
    selector = html.Select(['a@name', as_int])
    result = selector(row)
    assert result == [1, 2]
예제 #23
0
def test_select_inside_list(Html):
    """Check that a ``select`` expression wrapped in a list is cast element by element."""
    row = Html(['<div><a name="1">a</a><a name="2"></a></div>'])
    name_as_int = select('a@name').cast(int)
    selector = html.Select([name_as_int])
    result = selector(row)
    assert result == [1, 2]
예제 #24
0
 'tasks': [
     # Darbotvarkės klausimas (balsavimai)
     task('klausimų-puslapiai', 'balsavimų-sąrašas').select(
         [
             '.sale_svarst_eiga tr td[2] xpath:a[text()="balsavimas"]',
             '@href'
         ],
         check=
         'xpath://h1[contains(text(), "Darbotvarkės klausimas")]/text()',
     ).dedup(),
     task('balsavimų-sąrašas', 'balsavimų-puslapiai').download(
         cookies=cookies, check='#page-content h1.page-title'),
     task('balsavimų-puslapiai', 'balsavimų-duomenys').select(
         this.key, {
             'data':
             select('h1.page-title:text').re(r'\d{4}-\d{2}-\d{2}'),
             'posėdis':
             select('h1.page-title:text').re(r'(\w+) posėdis\)'),
             'klausimai': [
                 'xpath://b/a[contains(@class, "link") and text()="dokumento tekstas"]',
                 {
                     'pavadinimas':
                     select(
                         'xpath:./../preceding-sibling::b[contains(a/@class, "link")][1]/a/text()[1]'
                     ),
                     'rūšis':
                     select(
                         'xpath:./../preceding-sibling::b[contains(a/@class, "link")][1]/following-sibling::text()[1]'
                     ),
                     'klausimo-nuoroda':
                     select(
예제 #25
0

# Pipeline definition: the pipes it declares and the tasks that operate on them.
pipeline = {
    'pipes': [
        define('index pages'),
        define('data'),
    ],
    'tasks': [
        # Task on the 'index pages' pipe: call extract_archive_pages with
        # range(1812, 1921, 5), then clean() and reset().
        task('index pages')
        .call(extract_archive_pages, range(1812, 1921, 5))
        .clean()
        .reset(),

        # Task from 'index pages' to 'data'. The first select() argument is
        # presumably the row key and the dict the row value -- TODO confirm.
        task('index pages', 'data').select('.inventoryLabel:text', {
            'fondas': '.upperHierarchyTreeInner xpath:a[1]/text()',
            'apyrašas': '.upperHierarchyTreeInner xpath:a[2]/text()',
            # Two-column table rows cast to a dict.
            'data': select([
                '.inventoryBaseDataTable tr',
                ('td[1]:content', 'td[2]:content'),
            ]).cast(dict),
        }),
    ],
}

# Script entry point: start the bot only when this file is executed directly.
if __name__ == '__main__':
    # Pass the ``pipeline`` dict declared in this module. ``define`` is the
    # helper used to declare pipes inside that dict, not a bot definition, so
    # calling runbot(define, run) left ``pipeline`` unused.
    botlib.runbot(pipeline)
예제 #26
0
        return '/'.join([
            normtime(select.render(row, node, q, many, single))
            for q in self.queries
        ])


pipeline = {
    'pipes': [
        define('pages', compress=True),
        define('data'),
    ],
    'tasks': [
        task('pages').freq(minutes=5).download('http://www.meteo.lt/lt_LT/miestas?placeCode=Vilnius'),
        task('pages', 'data').select([
            '.forecast-hours', (
                key(select(['xpath://body css:.forecast-hours .forecastTime:text']).min(),
                    select('.forecastTime:text')),
                {
                    'base': select(['xpath://body css:.forecast-hours .forecastTime:text']).min().apply(normtime),  # precision=hours base time
                    'time': select('.forecastTime:text').apply(normtime),  # precision=hours prediction time
                    'temperature': select('.temperature:text').cast(int),  # °C
                    'wind_direction': select('.windDirectionGroundDegree:text').cast(int),  # degrees
                    'wind_speed': select('.windSpeedGround:text').cast(int),  # m/s
                    'gust_speed': select('.windGustGround:text').cast(int),  # m/s
                    'precipitation': select('.precipitation:text').cast(float),  # mm/h
                    'pressure': select('.pressureMeanSea:text').cast(int),  # hPa
                    'humidity': select('.humidityGround:text').cast(int),  # %
                    'feels_like': select('.feelLike:text').cast(int),  # °C
                }
            )
        ]).compact(),