def main(model, dummy):
    """Load the named scrape model(s) from the scrape_models package and run them.

    model: a one-element list holding the model-module name (unpacked below),
        as produced by an argument parser.
    dummy: forwarded to Dispatcher.add_scraper; presumably enables a dry
        run without writing results — TODO confirm against Dispatcher docs.
    """
    if len(model) == 1:
        model = model[0]
    dispatcher = Dispatcher()
    if model not in available_models:
        print('Model', model, 'is not in the folder "scrape_models".')
        print('These models are available:')
        pprint.pprint(available_models, compact=True)
        return
    # Collect every module-level object from the requested model module.
    imported = vars(importlib.import_module(f'scrape_models.{model}')).values()
    # Keep only the ScrapeModel instances; 'candidate' avoids shadowing the
    # 'model' argument, and isinstance() replaces the non-idiomatic
    # type(...) == ScrapeModel check.
    scrape_models = [candidate for candidate in imported
                     if isinstance(candidate, ScrapeModel)]
    dispatcher.add_scraper(scrape_models, dummy=dummy)
    dispatcher.run()
from modelscraper.dispatcher import Dispatcher

# Site-specific scrape-model modules. Only volkskrant is dispatched below;
# the others are kept imported so they can be swapped in quickly.
import nu_nl
import metronieuws
import parool
import volkskrant

disp = Dispatcher()
disp.add_scraper([volkskrant.volkskrant])
disp.run()
# Phase that parses a Makro product page into the 'products' table.
# 'sources' is defined elsewhere in the original file.
product = Phase(
    sources=sources,
    templates=[
        Template(
            name='product', db_type='mongo_db', db='makro', table='products',
            attrs=[
                Attr(name='name', selector='h1', func='sel_text'),
                Attr(name='price_gross', selector='.price-gross',
                     func='sel_text'),  # kws={'replacers': '€ '}),
                Attr(name='price_net', selector='.price-net',
                     func='sel_text'),  # kws={'replacers': '€ '}),
                Attr(name='sku', selector='.articlenumber', func='sel_text'),
                Attr(name='description', selector='.tab-1', func='sel_text'),
                Attr(name='category', selector='li.normal', func='sel_text')
            ])
    ])

makro = ScrapeModel(name='makro', domain='https://www.makro.nl/',
                    num_getters=1, phases=[product])

d = Dispatcher()
d.add_scraper(makro)
d.run()
# NOTE(review): this chunk begins mid-expression — the opening of the
# enclosing Attr/Template/ScrapeModel definition (and the 'categories',
# 'cl', and 'article' names it uses) lies outside this view, and the
# original newlines/indentation were stripped. Left byte-identical;
# reconstruct against the original nos.nl scraper source before editing.
selector='a', func='sel_attr', kws={'attr': 'href'}, source={'active': False}), ]) ]), parsed = [ a['url'] for cat in categories for a in cl.nos_nl.articles.find({'category': cat}) ] nos_sources = [ Source(url=url['url'], attrs=[Attr(name='category', value=url['category'])]) for url in cl.nos_nl.article_urls.find() if url['url'] not in parsed ] nos = ScrapeModel(name='nos.nl', domain='http://nos.nl', num_getters=10, phases=[ Phase(n_workers=5, sources=nos_sources, templates=(article(db_type='mongo_db', db='nos_nl', table='articles'), )) ]) disp = Dispatcher() disp.add_scraper(nos) disp.run()
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient

cl = MongoClient()

# One tt888 subtitle URL per stored episode: the subtitle filename is the
# last path segment of the episode URL. The original episode URL is carried
# along so the 'update' template below can match documents on their 'url' key.
urls = (Source(url='https://tt888.omroep.nl/tt888/' + a['url'][0].split('/')[-1],
               attrs=(Attr(name='url', value=a['url']),))
        for a in cl.nos_journaal.episodes.find())

subtitles = ScrapeModel(
    name='subtitles', domain='https://tt888.omroep.nl/', phases=[
        Phase(n_workers=5, sources=urls, parser=TextParser, templates=(
            Template(
                name='subtitle',
                db_type='mongo_db',
                db='nos_journaal',
                table='episodes',
                func='update',          # update existing episode documents
                kws={'key': 'url'},     # ...matched on the 'url' attribute
                attrs=(
                    Attr(name='subtitles', func='sel_text'),
                )
            ),
        ))
    ])

# The generator above evaluated cl...find() eagerly, so the cursor is already
# captured; only the client *name* is dropped here.
del cl

d = Dispatcher()
d.add_scraper(subtitles)
d.run()
# NOTE(review): this chunk begins mid-expression — the opening of the
# enclosing Attr/Template/Phase (and the 'sources' name used below) lies
# outside this view, and the original newlines/indentation were stripped.
# Left byte-identical; reconstruct against the original npo scraper source
# before editing.
func='sel_url', source=Source(active=False)), )), )), npo_tv_programs = ScrapeModel( name='npo_tv_programs', domain='http://npo.nl', num_getters=2, phases=[ Phase(n_workers=10, sources=sources, templates=(Template( name='episode', selector='.column-player-info', db='dwdd', func='update', table='episodes', db_type='mongo_db', attrs=( Attr(name='date', selector='ul.the-player-meta-block__date-tags', func='sel_text'), Attr(name='description', selector='.overflow-description', func='sel_text'), )), )), ]) disp = Dispatcher() disp.add_scraper(npo_tv_programs) disp.run()
# NOTE(review): this chunk begins mid-expression — the opening of the
# enclosing Template/Phase/ScrapeModel ('motorparts') lies outside this view,
# and the original newlines/indentation were stripped. Note also the
# db_type='MongoDB' here versus 'mongo_db' elsewhere in these scripts —
# verify against the original before editing. Left byte-identical.
Attr(name='make', func='sel_text', selector='#ctl00_cphMain_hHeadMake'), Attr(name='year', func='sel_text', selector='#ctl00_cphMain_hHeadYear'), Attr(name='model', func='sel_text', selector='.breadcrumbs a:last-of-type'), Attr(name='part_category_urls', selector='.category a:last-of-type', func='sel_url'), )), ) ), Phase(source_worker=WebSource, parser=HTMLParser, templates=( Template(name='part', selector='.scrollable-area-2 .cart-table tr', db='motorparts', table='parts', func='update', db_type='MongoDB', attrs=( Attr(name='part_number', func='sel_text', selector='h4 + span'), Attr(name='amount', func='sel_text', selector='.col-2 span:last-of-type'), Attr(name='drawing_number', func='sel_text', selector='.col-1 span'), )), )) ]) disp = Dispatcher() disp.add_scraper(motorparts) disp.run()
# Anchor text of a sub-page link ('sub_page_url' and 'start_url' are defined
# elsewhere in the original file).
sub_page_name = Attr(name='pagename', func='sel_text')

# Category pages linked from the start page.
category_temp = Template(name='sub_page', selector='.sections a',
                         db='startpagina', table='subpages',
                         db_type='mongo_db',
                         attrs=(sub_page_url, sub_page_name))

website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')

# Outbound website links listed on each category page.
website_temp = Template(name='website', selector='#columns a',
                        db='startpagina', table='websites',
                        db_type='mongo_db',
                        attrs=(website_url, website_name))

model = ScrapeModel(
    name='startpagina', domain='http://www.startpagina.nl', phases=[
        Phase(n_workers=3, sources=[start_url], templates=[category_temp]),
        # Second phase has no explicit sources: it consumes what the first
        # phase produced.
        Phase(n_workers=3, templates=[website_temp])
    ])

d = Dispatcher()
d.add_scraper(model)
d.run()
# NOTE(review): this chunk begins mid-expression — the opening of the
# enclosing ScrapeModel ('uefa') lies outside this view. The original
# newlines were stripped, which also collapsed a commented-out 'team'
# Template onto this line: everything after the first '#' below is now one
# long comment. Left byte-identical; restore line structure from the
# original source before editing.
] ),) ), Phase(synchronize=False,templates=[ Template( name='player', selector='.squad--team-player', db_type='mongo_db', db='uefa', table='players', attrs=[ Attr(name='name', selector='.squad--player-name', func='sel_text'), Attr(name='player_url', selector='.squad--player-name a', func='sel_url'), Attr(name='img', selector='.squad--player-img img', func='sel_attr', kws={'attr': 'src'}), ] ), # Template( # name='team', selector='', # db_type='mongo_db', func='update', db='uefa', table='players', # attrs=[ # Attr(name='team', selector='h1.team-name', func='sel_text'), # ] # ) ] )] ) disp = Dispatcher() disp.add_scraper(uefa) disp.run()
from modelscraper.sources import ProgramSource

# Parses nmap's XML output (-oX -): one document per <port> element.
port_template = Template(
    name='ports', selector='port', db_type='mongo_db', db='ports',
    table='ports',
    attrs=(
        Attr(name='portnumber', func='sel_attr', kws={'attr': 'portid'}),
        Attr(name='state', selector='state', func='sel_attr',
             kws={'attr': 'state'}),
        Attr(name='service', selector='service', func='sel_attr',
             kws={'attr': 'name'})
    ))

nmap = ScrapeModel(
    name='nmap_test', domain='', phases=[
        # ProgramSource runs the 'url' as a command line instead of
        # fetching it over HTTP.
        Phase(sources=(Source(url='nmap -oX - duwo.multiposs.nl'), ),
              templates=[port_template], source_worker=ProgramSource)
    ])

disp = Dispatcher()
disp.add_scraper(nmap)
disp.run()
# NOTE(review): this chunk begins mid-expression — the opening of the
# enclosing Attr/Template (and the 'category_menu', 'result_list' and
# 'company' templates referenced below) lies outside this view, and the
# original newlines/indentation were stripped. Left byte-identical;
# reconstruct against the original source before editing.
selector='h3 a', func='sel_url', source={'active': False}), )) pagination = Template(name='pagination', selector='.pagers', attrs=(Attr(name='page', selector='a', func='sel_url', source=True), )) bedrijven_pagina = ScrapeModel( name='Bedrijven Pagina', domain='https://www.bedrijvenpagina.nl/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[Source(url='https://www.bedrijvenpagina.nl/')], templates=(category_menu, )), Phase(source_worker=WebSource, parser=HTMLParser, templates=(result_list, pagination)), Phase(source_worker=WebSource, parser=HTMLParser, templates=(company, )) ]) disp = Dispatcher() disp.add_scraper(bedrijven_pagina) disp.run()
# NOTE(review): this chunk begins mid-expression — the opening of the
# enclosing Phase/ScrapeModel ('dabanga') lies outside this view, and the
# original newlines/indentation were stripped. Left byte-identical;
# reconstruct against the original source before editing.
templates=[ Template(name='article', selector='#content', db_type='mongo_db', db='dabanga', table='article', attrs=[ Attr(name='title', selector='h1', func='sel_text'), Attr(name='text', selector='.article .body-text', func='sel_text'), Attr(name='date', selector='.article .time', func='sel_text'), Attr(name='place', selector='.article .place', func='sel_text'), Attr(name='img', selector='.article img', func='sel_attr', kws={'attr': 'src'}), ]), ]) ]) d = Dispatcher() d.add_scraper(dabanga) d.run()