Exemplo n.º 1
0
                            selector='a',
                            func='sel_attr',
                            kws={'attr': 'href'},
                            source={'active': False}),
                   ])
      ]),

# URLs of all articles already stored for the categories of interest.
parsed = [
    stored['url']
    for category in categories
    for stored in cl.nos_nl.articles.find({'category': category})
]

# One Source per collected article URL that is not yet in `parsed`; the
# category travels along as an attribute of the source.  The membership
# test is a scan of the `parsed` list for every candidate URL.
nos_sources = [
    Source(url=entry['url'],
           attrs=[Attr(name='category', value=entry['category'])])
    for entry in cl.nos_nl.article_urls.find()
    if entry['url'] not in parsed
]
# Scrape model for nos.nl: a single phase that works through `nos_sources`.
nos = ScrapeModel(name='nos.nl',
                  domain='http://nos.nl',
                  num_getters=10,
                  phases=[
                      Phase(n_workers=5,
                            sources=nos_sources,
                            # `article` is defined outside this excerpt; it is
                            # called here with the storage settings.
                            templates=(article(db_type='mongo_db',
                                               db='nos_nl',
                                               table='articles'), ))
                  ])

# Register the model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(nos)
disp.run()
Exemplo n.º 2
0
                selector='.chart-row__current-week',
                func='sel_text',
                kws={'numbers': True})

# Template matched on '.chart-row__main-display' and stored in
# billboard.songs2.  `name`, `artist` and `position` are attribute objects
# defined outside this excerpt.
song = Template(name='song',
                selector='.chart-row__main-display',
                db_type='MongoDB',
                db='billboard',
                table='songs2',
                required=True,
                attrs=[name, artist, position])

# Template that reads the "Next Week" link in the chart navigation twice:
# once as a URL and once reduced by a regex to the part after a slash.
# NOTE(review): presumably 'next_week' becomes the next source and copies the
# 'year' attribute along (`source={'copy_attrs': 'year'}`) -- confirm against
# the modelscraper Attr/Source API.
next_date = Template(name='next_date',
                     selector='#chart-nav',
                     required=True,
                     attrs=[
                         Attr(name='next_week',
                              func='sel_url',
                              selector='a[title="Next Week"]',
                              source={'copy_attrs': 'year'}),
                         Attr(name='year',
                              func='sel_url',
                              selector='a[title="Next Week"]',
                              # Raw string: same pattern text as before, but
                              # without relying on invalid escape sequences
                              # ('\/' and '\-') in a normal string literal.
                              kws={'regex': r'\/([0-9\-]+)'})
                     ])

# Single-phase model applying both templates to every source in `start`
# (`start` is defined outside this excerpt).
charts = ScrapeModel(
    name='Hot 100',
    domain='https://www.billboard.com/',
    phases=[Phase(n_workers=5, sources=start, templates=[song, next_date])])
Exemplo n.º 3
0
col = db.episode

# Generator yielding one Source per archive page, for pages 1 through 49.
sources = (
    Source(url="http://www.luckytv.nl/afleveringen/page/{}/".format(page))
    for page in range(1, 50)
)

# Single-phase model: each archive page in `sources` is parsed with the
# 'episode' template, which is stored in lucky_tv.episodes.
LuckyTV = ScrapeModel(name='Lucky TV',
                      domain='http://www.luckytv.nl/',
                      num_getters=2,
                      phases=[
                          Phase(source_worker=WebSource,
                                parser=HTMLParser,
                                sources=sources,
                                templates=(Template(
                                    name='episode',
                                    # One object per 'article.video' element.
                                    selector='article.video',
                                    db_type='mongo_db',
                                    db='lucky_tv',
                                    table='episodes',
                                    attrs=(
                                        Attr(name='url',
                                             selector='a:nth-of-type(1)',
                                             func='sel_url'),
                                        Attr(name='title',
                                             selector='.video__title',
                                             func='sel_text'),
                                        Attr(name='date',
                                             selector='.video__date',
                                             func='sel_text'),
                                    )), )),
                      ])
Exemplo n.º 4
0
# Command-line sources.  Each command string contains a '{}' placeholder.
# NOTE(review): presumably ProgramSource fills the placeholder with a source
# value and runs the result through a shell (the first command uses a pipe).
# If that value can come from scraped data this is a command-injection risk;
# confirm how ProgramSource substitutes and executes the string.
wordpress_source = ProgramSource(
    function='yes y | wpscan {} --batch --follow-redirection')

# Absolute path into one user's home directory: not portable to other hosts.
robot_source = ProgramSource(
    function='/home/jim/git/robot-detect/robot-detect --csv -q {}')

passwords_source = ProgramSource(
    function='grep -ra @{}: ~/ideas/BreachCompilation/data')

# Four-phase model.  `ip_phase`, `ip_template`, `port_phase`, `git_template`,
# `ds_store_template`, `jacco`, `jacco_git` and `jacco_ds` are defined outside
# this excerpt.  `ip_phase` and `ip_template` are called with overrides, while
# `port_phase` is passed as-is.
defcon_1 = ScrapeModel(name='nmap_test',
                       domain='',
                       num_getters=1,
                       phases=[
                           ip_phase(n_workers=10,
                                    templates=[
                                        ip_template(db='defcon',
                                                    table='companies',
                                                    func='update',
                                                    kws={'key': '_id'},
                                                    source=['ip'])
                                    ],
                                    sources=jacco), port_phase,
                           # The last two phases parse their sources as text.
                           Phase(templates=[git_template],
                                 sources=jacco_git,
                                 parser=TextParser),
                           Phase(templates=[ds_store_template],
                                 sources=jacco_ds,
                                 parser=TextParser)
                       ])
Exemplo n.º 5
0
# Attribute holding the text of a matched element (no selector of its own).
sub_page_name = Attr(name='pagename', func='sel_text')

# One 'sub_page' object per '.sections a' element, stored in
# startpagina.subpages.  `sub_page_url` is defined outside this excerpt.
category_temp = Template(name='sub_page',
                         selector='.sections a',
                         db='startpagina',
                         table='subpages',
                         db_type='mongo_db',
                         attrs=(sub_page_url,
                                sub_page_name))

# URL and text of a matched link.
website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')

# One 'website' object per '#columns a' element, stored in
# startpagina.websites.
website_temp = Template(name='website',
                        selector='#columns a',
                        db='startpagina',
                        table='websites',
                        db_type='mongo_db',
                        attrs=(website_url, website_name))

# Two-phase model: the first phase starts from `start_url` and applies the
# category template; the second phase has no explicit sources and applies
# the website template.
model = ScrapeModel(
    name='startpagina',
    domain='http://www.startpagina.nl',
    phases=[
        Phase(n_workers=3, sources=[start_url], templates=[category_temp]),
        Phase(n_workers=3, templates=[website_temp]),
    ])

# Register the model with a dispatcher and start it.
d = Dispatcher()
d.add_scraper(model)
d.run()
Exemplo n.º 6
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from scrape_models.objects import occassions

# Single entry point: one autoscout24 search-results URL.
sources = [
    Source(url='http://ww4.autoscout24.nl/?atype=B&mmvco=0&cy=NL&ustate=N%2CU&fromhome=1&intcidm=HP-Searchmask-Button&dtr=s&results=20'),
]

# One phase applying the shared autoscout template from
# `scrape_models.objects.occassions` to that page.
autoscout = ScrapeModel(
    name='autoscout24',
    domain='autoscout24.nl',
    phases=[
        Phase(sources=sources, templates=[occassions.autoscout_template]),
    ])
Exemplo n.º 7
0
                 func='sel_text')

# Article template matched on '.col__inner'; the attribute objects are
# defined earlier in the file (outside this excerpt).
article = Template(name='article',
                   selector='.col__inner',
                   attrs=(title_attr, text_attr, date_attr, author_attr,
                          tags_attr))

# Three-phase model: archive page -> article URLs (+ pagination) -> articles.
# NOTE(review): the model is named 'rtl' with an rtlnieuws.nl domain, yet the
# start URL is on parool.nl and results are stored in the 'parool' database.
# This looks like a copy of a parool model that was only partly adapted --
# confirm which site is intended.
rtl = ScrapeModel(
    name='rtl',
    domain='http://www.rtlnieuws.nl/',
    num_getters=1,
    phases=[
        Phase(sources=[Source(url="http://www.parool.nl/archief/2012")],
              templates=(calendar, year)),
        Phase(templates=(article_url(db_type='mongo_db',
                                     db='parool',
                                     table='article_urls'), pagination)),
        Phase(templates=(article(db_type='mongo_db',
                                 db='parool',
                                 table='articles'), )),
    ])
Exemplo n.º 8
0
                Attr(name='url', selector='a.shadow-block',
                        func='sel_attr', kws={'attr': 'href'},
                        source=Source(active=False, copy_attrs='category')),
                Attr(name='title', selector='h3', func='sel_text'),
                Attr(name='excerpt', selector='div > p', func='sel_text'),
                Attr(name='date', selector='div.wrapper.small div:nth-of-type(1)',
                            func='sel_text'),
                Attr(name='num_reactions', selector='div.amount', func='sel_text'),
            ]
        )
    ])

# Two-phase model: `headline_phase` (defined above this statement) followed by
# an article phase over `sources`.  `article`, `title_attr`, `text_attr`,
# `author_attr` and `tags_attr` come from outside this excerpt; the called
# ones are given site-specific selectors here.
metro = ScrapeModel(
    name='metronieuws.nl', domain='http://metronieuws.nl', num_getters=5,
    phases=[
        headline_phase,
        Phase(sources=sources, n_workers=3, templates=[
            article(
                selector='.artikel', db='metronieuws',
                table='articles', db_type='MongoDB', attrs=[
                    title_attr,
                    text_attr(selector='.content .field-items .field-item > p'),
                    author_attr(selector='.username'),
                    # The date is read from the element's 'content' attribute.
                    Attr(name='date', func='sel_attr', kws={'attr':'content'},
                         selector='.small span[datatype="xsd:dateTime"]'),
                    tags_attr,
                ])
        ])
    ])

Exemplo n.º 9
0
                                   selector='h3 a',
                                   func='sel_url',
                                   source={'active': False}), ))
# Pagination template: selects the URLs of the links inside '.pagers'.
# NOTE(review): `source=True` presumably queues those URLs as new sources --
# confirm against the modelscraper Attr API.
pagination = Template(name='pagination',
                      selector='.pagers',
                      attrs=(Attr(name='page',
                                  selector='a',
                                  func='sel_url',
                                  source=True), ))

# Three-phase model: home page (category menu) -> result lists with
# pagination -> company pages.  Only the first phase has explicit sources.
# `category_menu`, `result_list` and `company` are defined outside this
# excerpt.
bedrijven_pagina = ScrapeModel(
    name='Bedrijven Pagina',
    domain='https://www.bedrijvenpagina.nl/',
    num_getters=2,
    phases=[
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              sources=[Source(url='https://www.bedrijvenpagina.nl/')],
              templates=(category_menu, )),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              templates=(result_list, pagination)),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              templates=(company, ))
    ])

# Register the model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(bedrijven_pagina)
disp.run()
Exemplo n.º 10
0
    name='motorcycle', selector='section.search-results-table article',
    table='marktplaats',
    attrs=[
        make(selector='h2 a', kws={'regex': '(\w+) -,'}),
        price(selector='.price', kws={'regex': '(\d+),'}),
        year(selector='.listing-priority-product-container',
             kws={'regex': '(\d{4})'}),
        mileage(selector='.listing-priority-product-container',
                    kws={'regex': '(.*) km'}),
        city(selector='.location-name'),
        url(selector='h2 > a'),
    ]
)
# Model skeleton for autoscout24.
# NOTE(review): the only phase is given an empty `sources` list, so no start
# URL is configured here -- confirm whether sources are supplied elsewhere.
autoscout = ScrapeModel(
    name='autoscout24',
    domain='autoscout24.nl',
    phases=[
        Phase(sources=[], templates=[autoscout_template]),
])

autotrader = ScrapeModel(name='autotrader', domain='http://autotrader.nl', num_sources=1, cookies={'CookieOptIn': 'true'},
                          phases=[
    Phase(
        sources=[
            Source(url='http://www.autotrader.nl/motor/zoeken/'),
        ],
        templates=[
            Template(name='motorcycle', selector='.result',
                            store=StoreObject(func=store_mongo,
                                                     kws={'db': 'moto', 'collection': 'autotrader'}),
                       attrs=[
                       ]),
Exemplo n.º 11
0
# Two-phase model for dabangasudan.org.
# Phase 1 starts at the all-news page, stores article URLs in
# dabanga.article_urls and also selects the pager links.
# Phase 2 has no explicit sources and stores parsed articles in
# dabanga.article.
# NOTE(review): the `source=...` arguments on the URL attributes presumably
# control which selected URLs become sources of a later phase/run -- confirm
# against the modelscraper Attr API.
dabanga = ScrapeModel(
    name='dabanga',
    domain='https://www.dabangasudan.org/en',
    num_getters=2,
    phases=[
        Phase(
            sources=(Source(url="https://www.dabangasudan.org/en/all-news"), ),
            templates=(
                Template(name='article_url',
                         selector='.list-item.news-item-small',
                         db_type='mongo_db',
                         db='dabanga',
                         table='article_urls',
                         attrs=[
                             Attr(name='url',
                                  selector='a:nth-of-type(1)',
                                  func='sel_url',
                                  source={'active': False}),
                         ]),
                Template(name='pagination',
                         selector='.pager',
                         attrs=[
                             Attr(name='url',
                                  selector='a',
                                  func='sel_url',
                                  source=True),
                         ]),
            )),
        Phase(synchronize=True,
              templates=[
                  Template(name='article',
                           selector='#content',
                           db_type='mongo_db',
                           db='dabanga',
                           table='article',
                           attrs=[
                               Attr(name='title',
                                    selector='h1',
                                    func='sel_text'),
                               Attr(name='text',
                                    selector='.article .body-text',
                                    func='sel_text'),
                               Attr(name='date',
                                    selector='.article .time',
                                    func='sel_text'),
                               Attr(name='place',
                                    selector='.article .place',
                                    func='sel_text'),
                               # Image location taken from the 'src' attribute.
                               Attr(name='img',
                                    selector='.article img',
                                    func='sel_attr',
                                    kws={'attr': 'src'}),
                           ]),
              ])
    ])
Exemplo n.º 12
0
start = (Source(url='https://www.manageengine.com/products.html?MEtab'),)

# Template matched on '.all_prod_over' and stored in test.test.  Its single
# attribute selects link URLs and applies a regex whose capture group is the
# host part between the scheme and the first '/' or '?'.
demo_template = Template(
    name='demo',
    selector='.all_prod_over',
    db='test',
    db_type='MongoDB',
    table='test',
    attrs=[
        Attr(
            name='url',
            selector='a',
            func='sel_url',
            # Raw string: identical pattern text, but no longer relies on
            # invalid escape sequences ('\.', '\/', '\?') in a normal literal.
            kws={'regex': r'http[s]?://([\.a-z]*)[\/\?]',
                 'index': 0},
            source={'active': False}
        )
    ]
)

# Three-phase model: the demo template over `start`, then the shared
# `ip_phase` and `port_phase` objects (both defined outside this excerpt).
# The original had an unterminated `port_phase(` call, which is a syntax
# error; the phase is now passed as-is, the same way `ip_phase` is.
# NOTE(review): if `port_phase` was meant to be called with overrides, add
# the call with its arguments back.
manageengine = ScrapeModel(
    name='manageengine',
    domain='',
    num_getters=1,
    phases=[
        Phase(sources=start, templates=[demo_template]),
        ip_phase,
        port_phase,
    ]
)
Exemplo n.º 13
0
# Single-phase model: one source per (season, episode) pair for seasons 1-9
# and episodes 1-29, built by formatting `extended_url` (defined outside this
# excerpt).  Results of the 'episode' template go to a 'ShellCommand' store.
theoffice = ScrapeModel(
    name='theoffice',
    domain='http://watchtheofficeonline.com',
    num_getters=2,
    phases=[
        Phase(sources=[
            Source(url=extended_url.format(season, episode))
            for season in range(1, 10) for episode in range(1, 30)
        ],
              templates=(Template(
                  name='episode',
                  selector='#Rapidvideo',
                  db_type='ShellCommand',
                  db='theoffice',
                  table='season',
                  # NOTE(review): if this string is run by a shell, a single
                  # '&' starts `create_dir` in the background instead of
                  # waiting for it; '&&' may have been intended -- confirm.
                  kws={'command': create_dir + ' & ' + youtube_dl},
                  attrs=(
                      # Two functions are applied, each with its own kwargs
                      # dict (the lists are parallel).
                      Attr(name='url',
                           selector='a',
                           func=['sel_url', 'sel_text'],
                           kws=[{}, {
                               'needle': r'.*(s\d+e\d+)'
                           }]),
                      Attr(name='episode',
                           selector='.textwidget',
                           func='sel_text',
                           kws={
                               'index': 3,
                               'substitute': '_',
                               'replacers': ' '
                           }),
                      Attr(name='season',
                           selector='.textwidget',
                           func='sel_text',
                           kws={
                               'index': 1,
                               'replacers': ' '
                           }),
                  )), )),
    ])
Exemplo n.º 14
0
from modelscraper.sources import ProgramSource

# One 'ports' object per 'port' element, stored in ports.ports.  The port
# number is read from the 'portid' attribute of the matched element itself
# (no selector); state and service name come from attributes of the nested
# 'state' and 'service' elements.
port_template = Template(name='ports',
                         selector='port',
                         db_type='mongo_db',
                         db='ports',
                         table='ports',
                         attrs=(Attr(name='portnumber',
                                     func='sel_attr',
                                     kws={'attr': 'portid'}),
                                Attr(name='state',
                                     selector='state',
                                     func='sel_attr',
                                     kws={'attr': 'state'}),
                                Attr(name='service',
                                     selector='service',
                                     func='sel_attr',
                                     kws={'attr': 'name'})))
# Single-phase model whose source "URL" is an nmap command line
# ('-oX -' writes XML to stdout); ProgramSource is used as the source worker.
nmap = ScrapeModel(
    name='nmap_test',
    domain='',
    phases=[
        Phase(sources=(Source(url='nmap -oX - duwo.multiposs.nl'), ),
              templates=[port_template],
              source_worker=ProgramSource)
    ])

# Register the model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(nmap)
disp.run()
Exemplo n.º 15
0
# Base song template stored in midi.songs; `title`, `artist` and `midi_url`
# are attribute objects defined outside this excerpt.
song = Template(name='song',
                db_type='MongoDB',
                db='midi',
                table='songs',
                attrs=[title, artist, midi_url])

# Site-specific variant created by calling the base template with overrides:
# a different table, a selector, and title/artist attributes given concrete
# selectors.  `midi_url` is not listed in this `attrs` override.
freemidi_template = song(
    table='freemidi',
    selector='#mainContent div.col-xs-12:nth-of-type(1)',
    attrs=[
        title(selector=
              'li.active:nth-child(3) > a:nth-child(1) > span:nth-child(1)'),
        artist(
            selector=
            'ol.breadcrumb:nth-child(1) > li:nth-child(2) > a:nth-child(1) > span:nth-child(1)'
        ),
    ])

# Generator yielding one Source per numeric id 0..25802: the download page is
# the source URL and the matching getter URL is attached as a `midi_url`
# attribute.
freemidi_sources = (
    Source(url='https://freemidi.org/download-{}'.format(song_id),
           attrs=[midi_url(value='https://freemidi.org/getter-{}'.format(song_id))])
    for song_id in range(25803)
)

# Single-phase model over the generated freemidi sources.
# NOTE(review): no `name=` is passed to ScrapeModel here -- confirm that the
# argument is optional.
freemidi = ScrapeModel(domain='http://freemidi.org',
                       phases=[
                           Phase(n_workers=3,
                                 sources=freemidi_sources,
                                 templates=[freemidi_template])
                       ])
Exemplo n.º 16
0
# Two-phase model for npo.nl.
# Phase 1 visits 242 listing pages (`series_url` formatted with 0..241;
# defined outside this excerpt) and stores programs in
# npo_tv_programs.programs.
# Phase 2 has no explicit sources; it stores episodes in
# npo_tv_programs.episodes and reads paging data from the search results.
npo_tv_programs = ScrapeModel(
    name='npo_tv_programs',
    domain='http://npo.nl',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url=series_url.format(i)) for i in range(0, 242)],
            templates=(
                Template(
                    name='program',
                    selector='.content-column.quarter',
                    db_type='mongo_db',
                    db='npo_tv_programs',
                    table='programs',
                    attrs=(
                        Attr(name='title', selector='h3', func='sel_text'),
                        Attr(name='url',
                             selector='a.full-link',
                             func='sel_url',
                             source=Source(
                                 active=False)),  # source is for next run
                    )),
                # Template with only a name: no selector, attrs or storage.
                Template(name='next_url'),
            )),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                Template(
                    name='episodes',
                    selector='.item-list.item-container div.item',
                    db_type='mongo_db',
                    db='npo_tv_programs',
                    table='episodes',
                    attrs=(
                        Attr(name='episode_url',
                             selector='a',
                             func='sel_url',
                             source=Source(
                                 active=False)),  # source is for next run
                        Attr(name='episode_title',
                             selector='h3',
                             func='sel_text'),
                        Attr(name='episode_text',
                             selector='p',
                             func='sel_text'),
                        # NOTE(review): <meta> elements normally carry their
                        # value in the 'content' attribute rather than as
                        # text; confirm 'sel_text' yields the title here.
                        Attr(name='program',
                             selector='meta[property="og:title"]',
                             func='sel_text'),
                        Attr(name='date',
                             selector='h4',
                             func='sel_text',
                             kws={'all_text': False}))),
                # Paging data: three data-* attributes of 'div.search-results'
                # plus the search form.  No storage settings are given.
                Template(name='more_episodes',
                         selector='',
                         attrs=[
                             Attr(name='num_result',
                                  selector='div.search-results',
                                  func='sel_attr',
                                  kws={'attr': 'data-num-found'}),
                             Attr(name='pagesize',
                                  selector='div.search-results',
                                  func='sel_attr',
                                  kws={'attr': 'data-rows'}),
                             Attr(name='start',
                                  selector='div.search-results',
                                  func='sel_attr',
                                  kws={'attr': 'data-start'}),
                             Attr(name='result_form',
                                  selector='#program-search-form',
                                  func='fill_form')
                         ]),
            )),
    ])
Exemplo n.º 17
0
# Article template matched on '.col__inner'; the attribute objects are
# defined outside this excerpt.
article = Template(name='article',
                   selector='.col__inner',
                   attrs=(title_attr, text_attr, date_attr, author_attr,
                          tags_attr))

# Three-phase model for parool.nl: 2012 archive page -> article URLs with
# pagination -> articles.  Only the first phase has explicit sources.
# `cookie`, `calendar`, `year`, `article_url` and `pagination` are defined
# outside this excerpt; `article_url` and `article` are called here with
# their storage settings.
parool = ScrapeModel(
    name='parool',
    domain='http://www.parool.nl/',
    cookies=cookie,
    num_getters=1,
    phases=[
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              sources=[Source(url="http://www.parool.nl/archief/2012")],
              templates=(calendar, year)),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              templates=(article_url(db_type='mongo_db',
                                     db='parool',
                                     table='article_urls'), pagination)),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              templates=(article(db_type='mongo_db',
                                 db='parool',
                                 table='articles'), )),
    ])

volkskrant = ScrapeModel(
    name='volkskrant',
    domain='http://www.volkskrant.nl/',
    cookies={'nl_cookiewall_version': '1'},
Exemplo n.º 18
0
                                attrs=[
                                    Attr(name='submenu_item',
                                         selector='.c-product-tile__meta > a',
                                         func='sel_url',
                                         source={'active': False}),
                                    Attr(name='pagination_item',
                                         selector='li.is-nexy > a',
                                         source=True)
                                ])

# Each name is rebound to the result of calling the object it previously
# referred to with a site-specific selector; after these lines the original
# objects are no longer reachable under these names.
product_name = product_name(selector='.c-offer__title')
price = price(selector='div.c-offer__price')
nutrition = nutrition(selector='.c-offer__nutrition table td')

# Product template stored in foods.spar2; no selector is given.
product = Template(name='product',
                   db='foods',
                   table='spar2',
                   db_type='MongoDB',
                   attrs=[product_name, price, nutrition])

# Three-phase model for spar.nl: groceries landing page -> product menus ->
# products.  Only the first phase has explicit sources.
# NOTE(review): the keyword here is `cookie=`; confirm whether ScrapeModel
# expects `cookie` or `cookies`.
spar = ScrapeModel(
    name='spar.nl',
    cookie=cookie,
    domain='https://spar.nl',
    phases=[
        Phase(sources=(Source(url='https://spar.nl/boodschappen/'), ),
              templates=[menu_template]),
        Phase(templates=[productmenu_template]),
        Phase(templates=[product])
    ])
Exemplo n.º 19
0
search_url = 'http://vkplusmobilebackend.persgroep.net/rest/content/articles?query=&metadataNeeded=true&limit=10000'
next_page_url = 'http://vkplusmobilebackend.persgroep.net/rest/content/articles?query=&metadataNeeded=true&limit=10000&offset={}'

# Template over the ('results', 'previews') path of the search response,
# stored in volkskrant.article_urls.  Selectors here are tuples of keys
# rather than CSS strings.
# NOTE(review): `src_template` presumably formats the selected value into the
# given URL pattern to build a new source -- confirm.  `article_url` is
# defined outside this excerpt.
search_result = Template(
    name='search_result', db='volkskrant', table='article_urls',
    func='create',
    db_type='mongo_db', selector=('results', 'previews'),
    attrs=(
        Attr(name='id', selector=('content_link', 'id'), func='sel_text',
             source={'src_template': article_url, 'active': False}),
    ),
)

# Template that reads ('results', 'next_offset') and pairs it with
# `next_page_url`, which ends in 'offset={}'.  No storage settings are given.
next_search = Template(
    name='next_result',
    attrs=(
        Attr(name='next_limit', selector=('results', 'next_offset'),
             func='sel_text', source={'src_template': next_page_url}),
    )
)

# Single start source: the first search-results request.
sources = (Source(url=search_url),)

# Two-phase JSON model: search results (+ next offset) -> articles.
# NOTE(review): the second phase is given the same `sources` tuple (the
# search URL) as the first although it applies the `article` template --
# confirm this is intended rather than leaving its sources to be supplied by
# the first phase.  `article` is defined outside this excerpt.
volkskrant = ScrapeModel(
    name='volkskrant', domain='volkskrant',
    phases=[
        Phase(parser=JSONParser, n_workers=5, sources=sources,
            templates=(search_result, next_search)),
        Phase(parser=JSONParser, n_workers=5, sources=sources, templates=(article,)),
    ])
Exemplo n.º 20
0
# Two-phase model: the Euro 2016 teams index (team URLs) -> squad pages,
# whose players are stored in uefa.players.
# NOTE(review): the model name is spelled 'eufa' while the variable, domain
# and database say 'uefa' -- likely a typo; confirm nothing depends on the
# current spelling before changing it.
uefa = ScrapeModel(
    name='eufa',
    domain='http://uefa.com',
    num_getters=2,
    phases=[
        Phase(sources=(Source(
            url="http://www.uefa.com/uefaeuro/season=2016/teams/index.html"),
                       ),
              templates=[
                  Template(name='team',
                           selector='.teams--qualified',
                           attrs=[
                               Attr(name='url',
                                    selector='a',
                                    func='sel_url',
                                    source={'active': False}),
                           ])
              ]),
        Phase(templates=[
            Template(name='player',
                     selector='.squad--team-player',
                     db_type='MongoDB',
                     db='uefa',
                     table='players',
                     attrs=[
                         Attr(name='name',
                              selector='.squad--player-name',
                              func='sel_text'),
                         Attr(name='player_url',
                              selector='.squad--player-name a',
                              func='sel_url'),
                         Attr(name='img',
                              selector='.squad--player-img img',
                              func='sel_attr',
                              kws={'attr': 'src'}),
                     ]),
            # Disabled template that would update players with the team name:
            # Template(
            #     name='team', selector='',
            #     db_type='MongoDB', func='update', db='uefa', table='players',
            #     attrs=[
            #         Attr(name='team', selector='h1.team-name', func='sel_text'),
            #     ]
            # )
        ])
    ])
Exemplo n.º 21
0
# Single-phase model for the Landmark Global tracking site.  Sources are a
# lazily evaluated generator of POST requests for ids 5000..49,999,999, each
# parsed with a 'shipment' template (shipments.shipment) and an 'event'
# template (shipments.events).
# NOTE(review): confirm `num_get` is the keyword ScrapeModel expects for the
# number of getters.
metro = ScrapeModel(
    name='landmark',
    domain='https://mercury.landmarkglobal.com/',
    num_get=2,
    phases=[
        Phase(sources=(Source(
            url=
            "https://mercury.landmarkglobal.com/tracking/track.php?trck=LTN{}N1&Submit=Track"
            .format(i),
            method='post',
            # A list of pairs is used because 'options[]' appears twice.
            data=[('sid', str(i)), ('options[]', 'display_full_history'),
                  ('options[]', 'use_cached_data_only'),
                  ('action', 'View+Complete+Tracking+History')])
                       for i in range(5000, 50000000)),
              templates=[
                  Template(
                      name='shipment',
                      selector=None,
                      db='shipments',
                      db_type='MongoDB',
                      table='shipment',
                      attrs=[
                          Attr(
                              name='carrier',
                              selector=
                              '#large_shipment_info_box > div:nth-child(2) > div:nth-child(1)',
                              func='sel_text',
                              # Raw string: same pattern text, but without
                              # invalid escape sequences ('\s', '\w') in a
                              # normal string literal.
                              kws={'regex': r'Carrier:\s(\w+)'}),
                          Attr(
                              name='shipped_to',
                              selector=
                              '#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(1) .align_left',
                              func='sel_text'),
                          Attr(
                              name='shipped_from',
                              selector=
                              '#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(2) .align_left',
                              func='sel_text'),
                      ]),
                  # One event per table row except the first row.
                  Template(name='event',
                           selector='table tr:not(:nth-child(1))',
                           db_type='MongoDB',
                           db='shipments',
                           table='events',
                           attrs=[
                               Attr(name='description',
                                    selector='td:nth-of-type(1)',
                                    func='sel_text'),
                               Attr(name='date',
                                    selector='td:nth-of-type(2)',
                                    func='sel_text'),
                               Attr(name='location',
                                    selector='td:nth-of-type(3)',
                                    func='sel_text'),
                           ]),
              ])
    ])
Exemplo n.º 22
0
# Five-phase model for 2wheelpros.com OEM parts:
#   1. brand links from the parts landing page,
#   2. year links,
#   3. model links,
#   4. part categories and the motorcycle record,
#   5. individual parts.
# Only the first phase has explicit sources.  The templates of phases 1-3
# have no storage settings and only select URLs.
motorparts = ScrapeModel(
    name='motorparts', domain='http://www.2wheelpros.com', num_sources=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser,
            sources=(Source(url='http://www.2wheelpros.com/oem-parts/'),),
            templates=(
                Template(name='brand',
                         selector='#nav > ul > li:nth-of-type(1) > a', attrs=(
                             Attr(name='url', func='sel_url',
                                  source={'active': False}),
                         ),),)
            ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='year', selector='.yearlink', attrs=(
                Attr(name='url', func='sel_url', source={'active': False}),)),),
          ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='model', selector='.modellink', attrs=(
                Attr(name='url', func='sel_url', source={'active': False}),
            )
            ),
        ),
        ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            # Here `source` is set on the template rather than on an Attr.
            Template(name='partCategory', db='motorparts', db_type='MongoDB',
                     table='part_categories', source={'active':False,
                                                      'parent':True},
                     selector='.category',
                     attrs=(
                         Attr(name='url', func='sel_url',
                              selector='a:last-of-type'),
                         Attr(name='name', func='sel_text',
                              selector='.description'),
                            )
            ),
            # No template-level selector; each Attr has its own selector.
            Template(name='motorcycle', db='motorparts', db_type='MongoDB',
                            table='motorcycles', attrs=(
                Attr(name='make', func='sel_text',
                            selector='#ctl00_cphMain_hHeadMake'),
                Attr(name='year', func='sel_text',
                            selector='#ctl00_cphMain_hHeadYear'),
                Attr(name='model', func='sel_text',
                    selector='.breadcrumbs a:last-of-type'),
                Attr(name='part_category_urls',
                    selector='.category a:last-of-type',
                    func='sel_url'),
            )),
            )
        ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            # One part per row of the cart table; stored with func='update'.
            Template(name='part', selector='.scrollable-area-2 .cart-table tr',
                     db='motorparts', table='parts', func='update',
                     db_type='MongoDB',
                     attrs=(
                         Attr(name='part_number', func='sel_text',
                              selector='h4 + span'),
                         Attr(name='amount', func='sel_text',
                              selector='.col-2 span:last-of-type'),
                         Attr(name='drawing_number', func='sel_text',
                              selector='.col-1 span'),
                     )),
        ))
])
Exemplo n.º 23
0
                        func='sel_text')
# Text attributes for the author byline and the tag list.
author_attr = Attr(name='author', selector='span.author', func='sel_text')
tags_attr = Attr(name='tags', selector='.article.tags a span', func='sel_text')

# Article template matched on '.column-content-background' and stored in
# nu_nl.articles; `title_attr`, `text_attr` and `date_attr` are defined
# earlier in the file.
article = Template(name='article',
                   selector='.column-content-background',
                   db='nu_nl',
                   db_type='mongo_db',
                   table='articles',
                   attrs=(title_attr, text_attr, date_attr, author_attr,
                          tags_attr))

next_page = Template(
    name='next_page',
    selector='paging',
    attrs=(
        Attr(name='paging',

ad = ScrapeModel(
    name='ad.nl', domain='mobileapi.ad.nl', phases=[
        Phase(n_workers=2, sources=sources, templates=(

Exemplo n.º 24
0
                                       selector='.span4 a',
                                       func='sel_url',
                                       source=Source(active=False)), )), )),

# Single-phase model that updates records in dwdd.episodes (func='update')
# with the date and description found on each page in `sources`.
# `sources` is defined outside this excerpt.
npo_tv_programs = ScrapeModel(
    name='npo_tv_programs',
    domain='http://npo.nl',
    num_getters=2,
    phases=[
        Phase(n_workers=10,
              sources=sources,
              templates=(Template(
                  name='episode',
                  selector='.column-player-info',
                  db='dwdd',
                  func='update',
                  table='episodes',
                  db_type='mongo_db',
                  attrs=(
                      Attr(name='date',
                           selector='ul.the-player-meta-block__date-tags',
                           func='sel_text'),
                      Attr(name='description',
                           selector='.overflow-description',
                           func='sel_text'),
                  )), )),
    ])

# Register the model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(npo_tv_programs)
disp.run()
Exemplo n.º 25
0
                     json_key='CVE_Items') for year in years)

# Template stored in defcon.cve_meta with one attribute: the text after
# 'lastModifiedDate:' in the response.
# NOTE(review): `source=cve_source` presumably ties this attribute to the CVE
# feed sources defined above -- confirm.
meta_template = Template(
    name='meta', db='defcon', table='cve_meta', db_type='MongoDB', attrs=[
    Attr(name='last_modified', func='sel_text',
         kws={'regex': 'lastModifiedDate:(.*)'},
         source=cve_source)])

# Template for one CVE entry, stored in defcon.cve.  Selectors are key paths
# into the JSON document rather than CSS strings.
# NOTE(review): this template reuses name='meta' from `meta_template`; it
# looks like a copy-paste leftover -- confirm whether 'cve' was intended.
cve_template = Template(
    name='meta', db='defcon', table='cve', db_type='MongoDB', #func='update',
    #kws={'key': 'id'},
    attrs=[
        Attr(name='id', func='sel_text',
             selector=['cve', 'CVE_data_meta', 'ID']),
        Attr(name='cpes', func='sel_text',
             selector=['configurations', 'nodes', 'cpe', 'cpe23Uri']),
        Attr(name='affects', func='sel_dict',
             selector=['cve', 'affects']),
        Attr(name='problem_type', func='sel_text',
             selector=['cve', 'problemtype', 'problemtype_data', 'description',
                       'value']),
        Attr(name='description', func='sel_dict',
             selector=['cve', 'description', 'description_data', 'value']),
        Attr(name='impact', func='sel_dict',
             selector=['impact'])
    ])

# Single-phase JSON model applying `cve_template` to the CVE feed sources.
# `meta_template` is not part of this model.
cve = ScrapeModel(name='CVE', domain='static.nvd.nist.gov', phases=[
    Phase(sources=cve_source, parser=JSONParser, templates=[cve_template])
])
Exemplo n.º 26
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient


# Mongo client used to read the stored episode documents.
cl = MongoClient()

# Lazily yield one Source per episode document. The subtitle URL is the
# fixed prefix plus the last path segment of the episode's first 'url'
# entry; the full 'url' value is carried along as an attribute.
# This must stay a generator expression: its outermost iterable (the find()
# call) is evaluated right here, so the later `del cl` does not break it.
urls = (
    Source(
        url=('https://tt888.omroep.nl/tt888/'
             + episode['url'][0].rsplit('/', 1)[-1]),
        attrs=(Attr(name='url', value=episode['url']), ),
    )
    for episode in cl.nos_journaal.episodes.find()
)

# Scrape model 'subtitles': a single phase with 5 workers that parses every
# source from `urls` with TextParser.
subtitles = ScrapeModel(
    name='subtitles',
    domain='https://tt888.omroep.nl/',
    phases=[
        Phase(
            n_workers=5,
            sources=urls,
            parser=TextParser,
            templates=(
                # Targets nos_journaal.episodes with func='update' keyed on
                # 'url'; presumably this adds the subtitles to the matching
                # episode document -- confirm against the database layer.
                Template(
                    name='subtitle',
                    db_type='mongo_db',
                    db='nos_journaal',
                    table='episodes',
                    func='update',
                    kws={'key': 'url'},
                    attrs=(
                        Attr(name='subtitles', func='sel_text'),
                    ),
                ),
            ),
        ),
    ],
)
# Drop the module-level client name before dispatching.
# NOTE(review): `del` only removes the name; the cursor already created for
# `urls` may still reference the client -- confirm this is the intent.
del cl
# Register the model with a dispatcher and start it (order matters).
d = Dispatcher()
d.add_scraper(subtitles)
d.run()
Exemplo n.º 27
0
                        Attr(name='title', selector='.title', func='sel_text'),
                        Attr(name='excerpt',
                             selector='.excerpt',
                             func='sel_text')
                    ])

# Field extractors used by the 'article' template; each one takes the text
# (sel_text) of the elements matched by its CSS selector.
title_attr = Attr(
    name='title',
    selector='h1',
    func='sel_text',
)
text_attr = Attr(
    name='text',
    selector='p',
    func='sel_text',
)
date_attr = Attr(
    name='date',
    selector='.published span.small',
    func='sel_text',
)
author_attr = Attr(
    name='author',
    selector='span.author',
    func='sel_text',
)
tags_attr = Attr(
    name='tags',
    selector='.article.tags a span',
    func='sel_text',
)

# Template 'article': selects '.column-content-background' and applies the
# five article attrs; targets db 'nu_nl' / table 'articles'.
article = Template(
    name='article',
    selector='.column-content-background',
    db='nu_nl',
    db_type='MongoDB',
    table='articles',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr,
    ),
)

# Scrape model 'nu.nl' with two phases of 2 workers each: the first applies
# the headline template to `sources`, the second applies the article
# template. The second phase has no sources of its own; presumably it is
# fed by URLs emitted in the first phase -- confirm against `headline`.
nu = ScrapeModel(
    name='nu.nl',
    domain='http://nu.nl',
    phases=[
        Phase(
            n_workers=2,
            sources=sources,
            templates=(headline, ),
        ),
        Phase(
            n_workers=2,
            templates=(article, ),
        ),
    ],
)
Exemplo n.º 28
0
# Phase that applies the 'product' template to every entry in `sources`;
# the template targets db 'makro' / table 'products'.
# NOTE: stripping the '€ ' prefix from both prices
# (kws={'replacers': '€ '}) is currently disabled.
product = Phase(
    sources=sources,
    templates=[
        Template(
            name='product',
            db_type='mongo_db',
            db='makro',
            table='products',
            attrs=[
                Attr(
                    name='name',
                    selector='h1',
                    func='sel_text',
                ),
                Attr(
                    name='price_gross',
                    selector='.price-gross',
                    func='sel_text',
                ),
                Attr(
                    name='price_net',
                    selector='.price-net',
                    func='sel_text',
                ),
                Attr(
                    name='sku',
                    selector='.articlenumber',
                    func='sel_text',
                ),
                Attr(
                    name='description',
                    selector='.tab-1',
                    func='sel_text',
                ),
                Attr(
                    name='category',
                    selector='li.normal',
                    func='sel_text',
                ),
            ],
        ),
    ],
)

# Scrape model 'makro': the single product phase, with num_getters=1.
makro = ScrapeModel(
    name='makro',
    domain='https://www.makro.nl/',
    num_getters=1,
    phases=[product],
)

# Register the makro model with a fresh dispatcher and start it.
# add_scraper must come before run, so the order of these calls matters.
d = Dispatcher()
d.add_scraper(makro)
d.run()
Exemplo n.º 29
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser

# Candidate usernames: every line of the wordlist reduced to its alphabetic
# characters, de-duplicated through a set.
with open('/usr/share/wordlists/nmap.lst', encoding='utf8') as fle:
    words = {''.join(c for c in line if c.isalpha()) for line in fle}
# Lines without any letters (blank lines, purely numeric entries) collapse
# to the empty string; drop it so no request is made for the bare address
# '@mailinator.com'.
words.discard('')

# Availability endpoint; '{}' is replaced by the candidate username.
base = 'https://www.spotify.com/nl/xhr/json/isEmailAvailable.php?email={}@mailinator.com'

# Lazily yield one Source per candidate, carrying the username along as an
# attribute next to the formatted URL.
sources = (Source(url=base.format(word),
                  attrs=(Attr(name='username', value=word), ))
           for word in words)

# Template 'username': a single 'exists' attribute taken with sel_text;
# targets db 'spotify' / table 'users'.
result = Template(
    name='username',
    db='spotify',
    table='users',
    db_type='MongoDB',
    attrs=[
        Attr(name='exists', func='sel_text'),
    ],
)

# Scrape model 'spotify': one phase with 10 workers that applies the
# `result` template to every generated source.
user = ScrapeModel(
    name='spotify',
    phases=[
        Phase(
            sources=sources,
            n_workers=10,
            templates=[result],
        ),
    ],
)