                selector='.chart-row__current-week', func='sel_text',
                kws={'numbers': True})

song = Template(
    name='song',
    selector='.chart-row__main-display',
    db_type='MongoDB',
    db='billboard',
    table='songs2',
    required=True,
    attrs=[name, artist, position])

next_date = Template(
    name='next_date',
    selector='#chart-nav',
    required=True,
    attrs=[
        Attr(name='next_week', func='sel_url',
             selector='a[title="Next Week"]',
             source={'copy_attrs': 'year'}),
        Attr(name='year', func='sel_url',
             selector='a[title="Next Week"]',
             kws={'regex': r'\/([0-9\-]+)'})
    ])

charts = ScrapeModel(
    name='Hot 100',
    domain='https://www.billboard.com/',
    phases=[Phase(n_workers=5, sources=start, templates=[song, next_date])])
wordpress_source = ProgramSource(
    function='yes y | wpscan {} --batch --follow-redirection')
robot_source = ProgramSource(
    function='/home/jim/git/robot-detect/robot-detect --csv -q {}')
passwords_source = ProgramSource(
    function='grep -ra @{}: ~/ideas/BreachCompilation/data')

defcon_1 = ScrapeModel(
    name='nmap_test',
    domain='',
    num_getters=1,
    phases=[
        ip_phase(
            n_workers=10,
            templates=[
                ip_template(db='defcon', table='companies', func='update',
                            kws={'key': '_id'}, source=['ip'])
            ],
            sources=jacco),
        port_phase,
        Phase(templates=[git_template], sources=jacco_git, parser=TextParser),
        Phase(templates=[ds_store_template], sources=jacco_ds,
              parser=TextParser)
    ])
tags_attr = Attr(name='tags', selector='.ib.space-right a.link-grey',
                 func='sel_text')

article = Template(name='article',
                   attrs=(title_attr, text_attr, date_attr, author_attr,
                          tags_attr))

Phase(source_worker=WebSource, parser=HTMLParser, sources=nos_sources,
      templates=[
          Template(name='article_url',
                   selector='#archief li',
                   db_type='mongo_db',
                   db='nos_nl',
                   table='article_urls',
                   attrs=[
                       Attr(name='url', selector='a', func='sel_attr',
                            kws={'attr': 'href'}, source={'active': False}),
                   ])
      ]),

parsed = [
    a['url'] for cat in categories
    for a in cl.nos_nl.articles.find({'category': cat})
]

nos_sources = [
    Source(url=url['url'],
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from scrape_models.objects import occassions

sources = [
    Source(url='http://ww4.autoscout24.nl/?atype=B&mmvco=0&cy=NL&ustate=N%2CU&fromhome=1&intcidm=HP-Searchmask-Button&dtr=s&results=20')
]

autoscout = ScrapeModel(name='autoscout24', domain='autoscout24.nl', phases=[
    Phase(sources=sources, templates=[occassions.autoscout_template]),
])
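# Not part of the original file: a minimal run sketch, assuming the same
# Dispatcher pattern used by the other models in this collection (see the
# startpagina and nmap examples). It registers the model and starts scraping.
from modelscraper.dispatcher import Dispatcher

d = Dispatcher()
d.add_scraper(autoscout)
d.run()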
col = db.episode

sources = (Source(url="http://www.luckytv.nl/afleveringen/page/{}/".format(i))
           for i in range(1, 50))

LuckyTV = ScrapeModel(
    name='Lucky TV',
    domain='http://www.luckytv.nl/',
    num_getters=2,
    phases=[
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              sources=sources,
              templates=(Template(
                  name='episode',
                  selector='article.video',
                  db_type='mongo_db',
                  db='lucky_tv',
                  table='episodes',
                  attrs=(
                      Attr(name='url', selector='a:nth-of-type(1)',
                           func='sel_url'),
                      Attr(name='title', selector='.video__title',
                           func='sel_text'),
                      Attr(name='date', selector='.video__date',
                           func='sel_text'),
                  )), )),
    ])
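# Hypothetical sanity check, not part of the original model: preview the
# pagination URLs that the generator above wraps in Source objects, using
# plain Python only.
page_urls = ["http://www.luckytv.nl/afleveringen/page/{}/".format(i)
             for i in range(1, 50)]
print(page_urls[0], page_urls[-1])  # first and last archive page requested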
        title_attr, text_attr, date_attr, author_attr, tags_attr
    )
)

headline_phase = Phase(
    sources=sources,
    n_workers=5,
    templates=[
        Template(
            name='headline',
            selector='.row',
            db='metronieuws',
            db_type='MongoDB',
            kws={'key': 'url'},
            table='article_urls',
            attrs=[
                Attr(name='url', selector='a.shadow-block', func='sel_attr',
                     kws={'attr': 'href'},
                     source=Source(active=False, copy_attrs='category')),
                Attr(name='title', selector='h3', func='sel_text'),
                Attr(name='excerpt', selector='div > p', func='sel_text'),
                Attr(name='date',
                     selector='div.wrapper.small div:nth-of-type(1)',
                     func='sel_text'),
                Attr(name='num_reactions', selector='div.amount',
                     func='sel_text'),
            ]
        )
    ])

metro = ScrapeModel(
    name='metronieuws.nl',
    domain='http://metronieuws.nl',
    num_getters=5,
    phases=[
        headline_phase,
        Phase(sources=sources, n_workers=3, templates=[
            article(
sub_page_name = Attr(name='pagename', func='sel_text')
category_temp = Template(name='sub_page', selector='.sections a',
                         db='startpagina', table='subpages',
                         db_type='mongo_db',
                         attrs=(sub_page_url, sub_page_name))

website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')
website_temp = Template(name='website', selector='#columns a',
                        db='startpagina', table='websites',
                        db_type='mongo_db',
                        attrs=(website_url, website_name))

model = ScrapeModel(name='startpagina', domain='http://www.startpagina.nl',
                    phases=[
                        Phase(n_workers=3, sources=[start_url],
                              templates=[category_temp]),
                        Phase(n_workers=3, templates=[website_temp])
                    ])

d = Dispatcher()
d.add_scraper(model)
d.run()
                 func='sel_text')

article = Template(
    name='article',
    selector='.col__inner',
    attrs=(
        title_attr, text_attr, date_attr, author_attr, tags_attr
    )
)

rtl = ScrapeModel(
    name='rtl',
    domain='http://www.rtlnieuws.nl/',
    num_getters=1,
    phases=[
        Phase(sources=[Source(url="http://www.parool.nl/archief/2012")],
              templates=(calendar, year)),
        Phase(templates=(
            article_url(db_type='mongo_db', db='parool',
                        table='article_urls'),
            pagination)),
        Phase(templates=(
            article(db_type='mongo_db', db='parool', table='articles'), )),
    ])
song = Template(name='song', db_type='MongoDB', db='midi', table='songs',
                attrs=[title, artist, midi_url])

freemidi_template = song(
    table='freemidi',
    selector='#mainContent div.col-xs-12:nth-of-type(1)',
    attrs=[
        title(selector='li.active:nth-child(3) > a:nth-child(1) > span:nth-child(1)'),
        artist(selector='ol.breadcrumb:nth-child(1) > li:nth-child(2) > a:nth-child(1) > span:nth-child(1)'),
    ])

freemidi_sources = (
    Source(url='https://freemidi.org/download-{}'.format(i),
           attrs=[midi_url(value='https://freemidi.org/getter-{}'.format(i))])
    for i in range(25803))

freemidi = ScrapeModel(domain='http://freemidi.org', phases=[
    Phase(n_workers=3, sources=freemidi_sources,
          templates=[freemidi_template])
])
             selector='h3 a', func='sel_url', source={'active': False}), ))

pagination = Template(name='pagination', selector='.pagers',
                      attrs=(Attr(name='page', selector='a', func='sel_url',
                                  source=True), ))

bedrijven_pagina = ScrapeModel(
    name='Bedrijven Pagina',
    domain='https://www.bedrijvenpagina.nl/',
    num_getters=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser,
              sources=[Source(url='https://www.bedrijvenpagina.nl/')],
              templates=(category_menu, )),
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(result_list, pagination)),
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(company, ))
    ])

disp = Dispatcher()
disp.add_scraper(bedrijven_pagina)
disp.run()
    attrs=[
        make(selector='h2 a', kws={'regex': r'(\w+) -,'}),
        price(selector='.price', kws={'regex': r'(\d+),'}),
        year(selector='.listing-priority-product-container',
             kws={'regex': r'(\d{4})'}),
        mileage(selector='.listing-priority-product-container',
                kws={'regex': '(.*) km'}),
        city(selector='.location-name'),
        url(selector='h2 > a'),
    ]
)

autoscout = ScrapeModel(name='autoscout24', domain='autoscout24.nl', phases=[
    Phase(sources=[], templates=[autoscout_template]),
])

autotrader = ScrapeModel(
    name='autotrader',
    domain='http://autotrader.nl',
    num_sources=1,
    cookies={'CookieOptIn': 'true'},
    phases=[
        Phase(
            sources=[
                Source(url='http://www.autotrader.nl/motor/zoeken/'),
            ],
            templates=[
                Template(name='motorcycle',
                         selector='.result',
                         store=StoreObject(func=store_mongo,
                                           kws={'db': 'moto',
                                                'collection': 'autotrader'}),
                         attrs=[
                         ]),
                Template(name='next_page', selector='#pager', attrs=[
    name='dabanga',
    domain='https://www.dabangasudan.org/en',
    num_getters=2,
    phases=[
        Phase(
            sources=(Source(url="https://www.dabangasudan.org/en/all-news"), ),
            templates=(
                Template(name='article_url',
                         selector='.list-item.news-item-small',
                         db_type='mongo_db',
                         db='dabanga',
                         table='article_urls',
                         attrs=[
                             Attr(name='url', selector='a:nth-of-type(1)',
                                  func='sel_url', source={'active': False}),
                         ]),
                Template(name='pagination', selector='.pager', attrs=[
                    Attr(name='url', selector='a', func='sel_url',
                         source=True),
                ]),
            )),
        Phase(synchronize=True, templates=[
            Template(name='article', selector='#content',
start = (Source(url='https://www.manageengine.com/products.html?MEtab'),)

demo_template = Template(
    name='demo',
    selector='.all_prod_over',
    db='test',
    db_type='MongoDB',
    table='test',
    attrs=[
        Attr(
            name='url',
            selector='a',
            func='sel_url',
            kws={'regex': r'http[s]?://([\.a-z]*)[\/\?]', 'index': 0},
            source={'active': False}
        )
    ]
)

manageengine = ScrapeModel(
    name='manageengine',
    domain='',
    num_getters=1,
    phases=[
        Phase(sources=start, templates=[demo_template]),
        ip_phase,
        port_phase
    ]
)
        Phase(sources=[
            Source(url=extended_url.format(season, episode))
            for season in range(1, 10) for episode in range(1, 30)
        ],
              templates=(Template(
                  name='episode',
                  selector='#Rapidvideo',
                  db_type='ShellCommand',
                  db='theoffice',
                  table='season',
                  kws={'command': create_dir + ' & ' + youtube_dl},
                  attrs=(
                      Attr(name='url',
                           selector='a',
                           func=['sel_url', 'sel_text'],
                           kws=[{}, {'needle': r'.*(s\d+e\d+)'}]),
                      Attr(name='episode',
                           selector='.textwidget',
                           func='sel_text',
                           kws={'index': 3,
                                'substitute': '_',
                                'replacers': ' '}),
                      Attr(name='season',
                           selector='.textwidget',
                           func='sel_text',
                           kws={'index': 1,
                                'replacers': ' '}),
                  )), )),
from modelscraper.sources import ProgramSource

port_template = Template(name='ports', selector='port', db_type='mongo_db',
                         db='ports', table='ports',
                         attrs=(Attr(name='portnumber', func='sel_attr',
                                     kws={'attr': 'portid'}),
                                Attr(name='state', selector='state',
                                     func='sel_attr', kws={'attr': 'state'}),
                                Attr(name='service', selector='service',
                                     func='sel_attr', kws={'attr': 'name'})))

nmap = ScrapeModel(
    name='nmap_test',
    domain='',
    phases=[
        Phase(sources=(Source(url='nmap -oX - duwo.multiposs.nl'), ),
              templates=[port_template],
              source_worker=ProgramSource)
    ])

disp = Dispatcher()
disp.add_scraper(nmap)
disp.run()
                 func='sel_text')
tags_attr = Attr(name='tags', selector='.tag-list a.cta', func='sel_text')

article = Template(name='article', selector='.col__inner',
                   attrs=(title_attr, text_attr, date_attr, author_attr,
                          tags_attr))

parool = ScrapeModel(
    name='parool',
    domain='http://www.parool.nl/',
    cookies=cookie,
    num_getters=1,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser,
              sources=[Source(url="http://www.parool.nl/archief/2012")],
              templates=(calendar, year)),
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(article_url(db_type='mongo_db', db='parool',
                                     table='article_urls'),
                         pagination)),
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(article(db_type='mongo_db', db='parool',
                                 table='articles'), )),
    ])

volkskrant = ScrapeModel(
    name='volkskrant',
npo_tv_programs = ScrapeModel(
    name='npo_tv_programs',
    domain='http://npo.nl',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url=series_url.format(i)) for i in range(0, 242)],
            templates=(
                Template(
                    name='program',
                    selector='.content-column.quarter',
                    db_type='mongo_db',
                    db='npo_tv_programs',
                    table='programs',
                    attrs=(
                        Attr(name='title', selector='h3', func='sel_text'),
                        Attr(name='url', selector='a.full-link',
                             func='sel_url',
                             source=Source(active=False)),  # source is for next run
                    )),
                Template(name='next_url'),
            )),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                Template(
search_url = 'http://vkplusmobilebackend.persgroep.net/rest/content/articles?query=&metadataNeeded=true&limit=10000'
next_page_url = 'http://vkplusmobilebackend.persgroep.net/rest/content/articles?query=&metadataNeeded=true&limit=10000&offset={}'

search_result = Template(
    name='search_result',
    db='volkskrant',
    table='article_urls',
    func='create',
    db_type='mongo_db',
    selector=('results', 'previews'),
    attrs=(
        Attr(name='id', selector=('content_link', 'id'), func='sel_text',
             source={'src_template': article_url, 'active': False}),
    ),
)

next_search = Template(
    name='next_result',
    attrs=(
        Attr(name='next_limit', selector=('results', 'next_offset'),
             func='sel_text', source={'src_template': next_page_url}),
    )
)

sources = (Source(url=search_url),)

volkskrant = ScrapeModel(
    name='volkskrant',
    domain='volkskrant',
    phases=[
        Phase(parser=JSONParser, n_workers=5, sources=sources,
              templates=(search_result, next_search)),
        Phase(parser=JSONParser, n_workers=5, sources=sources,
              templates=(article,)),
    ])
    attrs=[
        Attr(name='submenu_item', selector='.c-product-tile__meta > a',
             func='sel_url', source={'active': False}),
        Attr(name='pagination_item', selector='li.is-nexy > a', source=True)
    ])

product_name = product_name(selector='.c-offer__title')
price = price(selector='div.c-offer__price')
nutrition = nutrition(selector='.c-offer__nutrition table td')

product = Template(name='product', db='foods', table='spar2',
                   db_type='MongoDB',
                   attrs=[product_name, price, nutrition])

spar = ScrapeModel(
    name='spar.nl',
    cookie=cookie,
    domain='https://spar.nl',
    phases=[
        Phase(sources=(Source(url='https://spar.nl/boodschappen/'), ),
              templates=[menu_template]),
        Phase(templates=[productmenu_template]),
        Phase(templates=[product])
    ])
Phase(sources=(Source(
          url="https://mercury.landmarkglobal.com/tracking/track.php?trck=LTN{}N1&Submit=Track".format(i),
          method='post',
          data=[('sid', str(i)),
                ('options[]', 'display_full_history'),
                ('options[]', 'use_cached_data_only'),
                ('action', 'View+Complete+Tracking+History')])
          for i in range(5000, 50000000)),
      templates=[
          Template(
              name='shipment',
              selector=None,
              db='shipments',
              db_type='MongoDB',
              table='shipment',
              attrs=[
                  Attr(name='carrier',
                       selector='#large_shipment_info_box > div:nth-child(2) > div:nth-child(1)',
                       func='sel_text',
                       kws={'regex': r'Carrier:\s(\w+)'}),
                  Attr(name='shipped_to',
                       selector='#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(1) .align_left',
                       func='sel_text'),
                  Attr(name='shipped_from',
                       selector='#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(2) .align_left',
                       func='sel_text'),
              ]),
          Template(name='event',
                   selector='table tr:not(:nth-child(1))',
                   db_type='MongoDB',
                   db='shipments',
                   table='events',
                   attrs=[
                       Attr(name='description', selector='td:nth-of-type(1)',
                            func='sel_text'),
                       Attr(name='date', selector='td:nth-of-type(2)',
                            func='sel_text'),
                       Attr(name='location', selector='td:nth-of-type(3)',
                            func='sel_text'),
                   ]),
      ])
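# Hypothetical debugging aid, not part of the original model: reproduce one of
# the POST requests above with the well-known `requests` library to inspect
# the raw HTML before wiring selectors into the templates. The URL and form
# fields are copied from the Source definition above; `requests` must be
# installed for this sketch to run.
import requests

def fetch_tracking_page(i):
    url = ("https://mercury.landmarkglobal.com/tracking/track.php"
           "?trck=LTN{}N1&Submit=Track".format(i))
    data = [('sid', str(i)),
            ('options[]', 'display_full_history'),
            ('options[]', 'use_cached_data_only'),
            ('action', 'View+Complete+Tracking+History')]
    return requests.post(url, data=data).text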
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.workers import WebSource
from modelscraper.parsers import HTMLParser

uefa = ScrapeModel(
    name='uefa',
    domain='http://uefa.com',
    num_getters=2,
    phases=[
        Phase(sources=(Source(
            url="http://www.uefa.com/uefaeuro/season=2016/teams/index.html"), ),
              templates=[
                  Template(name='team',
                           selector='.teams--qualified',
                           attrs=[
                               Attr(name='url', selector='a', func='sel_url',
                                    source={'active': False}),
                           ])
              ]),
        Phase(templates=[
            Template(name='player',
                     selector='.squad--team-player',
                     db_type='MongoDB',
                     db='uefa',
                     table='players',
                     attrs=[
                         Attr(name='name', selector='.squad--player-name',
import dns.query
import dns.zone

ip_template = Template(name='ip', db_type='MongoDB', db='', table='',
                       parser=TextParser,
                       attrs=(Attr(
                           name='ip',
                           func='sel_text',
                           kws={'regex': r'(\d+\.\d+\.\d+\.\d+)'},
                       ), ))

ip_phase = Phase(n_workers=10, templates=[ip_template],
                 source_worker=ProgramSource(function='host {}'))

port_template = Template(name='ports', selector='port', db_type='MongoDB',
                         db='monog', table='ports',
                         attrs=(Attr(name='portnumber', func='sel_attr',
                                     kws={'attr': 'portid'}),
                                Attr(name='state', selector='state',
                                     func='sel_attr', kws={'attr': 'state'}),
                                Attr(name='service',
from modelscraper.dispatcher import Dispatcher
from modelscraper.sources import WebSource
from modelscraper.parsers import HTMLParser
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

motorparts = ScrapeModel(
    name='motorparts',
    domain='http://www.2wheelpros.com',
    num_sources=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser,
              sources=(Source(url='http://www.2wheelpros.com/oem-parts/'),),
              templates=(
                  Template(name='brand',
                           selector='#nav > ul > li:nth-of-type(1) > a',
                           attrs=(
                               Attr(name='url', func='sel_url',
                                    source={'active': False}),
                           )),
              )),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='year', selector='.yearlink', attrs=(
                Attr(name='url', func='sel_url',
                     source={'active': False}),)),
        )),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='model', selector='.modellink', attrs=(
                Attr(name='url', func='sel_url', source={'active': False}),
            )),
        )),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='partCategory',
                     db='motorparts',
                     db_type='MongoDB',
                     table='part_categories',
                     source={'active': False,
                 func='sel_text')
author_attr = Attr(name='author', selector='span.author', func='sel_text')
tags_attr = Attr(name='tags', selector='.article.tags a span',
                 func='sel_text')

article = Template(
    name='article',
    selector='.column-content-background',
    db='nu_nl',
    db_type='mongo_db',
    table='articles',
    attrs=(
        title_attr, text_attr, date_attr, author_attr, tags_attr
    )
)

next_page = Template(
    name='next_page',
    selector='paging',
    attrs=(
        Attr(name='paging',

ad = ScrapeModel(
    name='ad.nl',
    domain='mobileapi.ad.nl',
    phases=[
        Phase(n_workers=2, sources=sources, templates=(
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient

cl = MongoClient()
sources = (Source(url=a['url'][0]) for a in cl.dwdd.episode_urls.find())

programs_az = Phase(
    sources=[
        Source(url="http://www.npo.nl/programmas/a-z", params={'page': i})
        for i in range(0, 1)
    ],
    templates=(
        Template(
            name='program',
            selector='.content-column.quarter',
            db_type='mongo_db',
            db='npo_tv_programs',
            table='programs',
            attrs=(
                Attr(name='title', selector='h3', func='sel_text'),
                Attr(name='url', selector='a.full-link', func='sel_url',
                     source=Source(active=False)),  # source is for next run
            )),
    ))

nos_search = 'https://www.npo.nl/de-wereld-draait-door/VARA_101377717/search?media_type=broadcast&start_date=&end_date=&start={}&rows=100'

episodes_phase = Phase(
    n_workers=5,
    sources=(Source(url=nos_search.format(start))
             for start in range(0, 2194, 100)),
    templates=(Template(
                             json_key='CVE_Items') for year in years)

meta_template = Template(
    name='meta', db='defcon', table='cve_meta', db_type='MongoDB',
    attrs=[
        Attr(name='last_modified', func='sel_text',
             kws={'regex': 'lastModifiedDate:(.*)'},
             source=cve_source)
    ])

cve_template = Template(
    name='meta', db='defcon', table='cve', db_type='MongoDB',
    # func='update',
    # kws={'key': 'id'},
    attrs=[
        Attr(name='id', func='sel_text',
             selector=['cve', 'CVE_data_meta', 'ID']),
        Attr(name='cpes', func='sel_text',
             selector=['configurations', 'nodes', 'cpe', 'cpe23Uri']),
        Attr(name='affects', func='sel_dict', selector=['cve', 'affects']),
        Attr(name='problem_type', func='sel_text',
             selector=['cve', 'problemtype', 'problemtype_data',
                       'description', 'value']),
        Attr(name='description', func='sel_dict',
             selector=['cve', 'description', 'description_data', 'value']),
        Attr(name='impact', func='sel_dict', selector=['impact'])
    ])

cve = ScrapeModel(name='CVE', domain='static.nvd.nist.gov', phases=[
    Phase(sources=cve_source, parser=JSONParser, templates=[cve_template])
])
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient

cl = MongoClient()
urls = (Source(url='https://tt888.omroep.nl/tt888/' + a['url'][0].split('/')[-1],
               attrs=(Attr(name='url', value=a['url']),))
        for a in cl.nos_journaal.episodes.find())

subtitles = ScrapeModel(
    name='subtitles',
    domain='https://tt888.omroep.nl/',
    phases=[
        Phase(n_workers=5, sources=urls, parser=TextParser, templates=(
            Template(
                name='subtitle',
                db_type='mongo_db',
                db='nos_journaal',
                table='episodes',
                func='update',
                kws={'key': 'url'},
                attrs=(
                    Attr(name='subtitles', func='sel_text'),
                )
            ),
        ))
    ])

del cl

d = Dispatcher()
d.add_scraper(subtitles)
d.run()
            Attr(name='title', selector='.title', func='sel_text'),
            Attr(name='excerpt', selector='.excerpt', func='sel_text')
        ])

title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='p', func='sel_text')
date_attr = Attr(name='date', selector='.published span.small',
                 func='sel_text')
author_attr = Attr(name='author', selector='span.author', func='sel_text')
tags_attr = Attr(name='tags', selector='.article.tags a span',
                 func='sel_text')

article = Template(name='article', selector='.column-content-background',
                   db='nu_nl', db_type='MongoDB', table='articles',
                   attrs=(title_attr, text_attr, date_attr, author_attr,
                          tags_attr))

nu = ScrapeModel(name='nu.nl', domain='http://nu.nl', phases=[
    Phase(n_workers=2, sources=sources, templates=(headline, )),
    Phase(n_workers=2, templates=(article, ))
])
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient

cl = MongoClient()
product_sources = cl.makro.product_urls.find()
sources = [Source(url=p['url'][0]) for p in product_sources]

categories = Phase(
    sources=(Source(url="https://www.makro.nl/cat/nl/products"), ),
    templates=(Template(
        name='product_category',
        selector='#left-navigation-container ul.vertical > li > a',
        db_type='mongo_db',
        db='makro',
        table='product_categories',
        attrs=[
            Attr(name='url', func='sel_url',
                 source={'active': False},
                 kws={'replacers': r'pageSize=(\d+)',
                      'substitute': 'pageSize=96'}),
        ]), ))

product_lists = Phase(templates=[
    Template(name='product_urls',
             selector='.product-list .product-tiles',
             db_type='mongo_db',
             db='makro',
             table='product_urls',
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser

with open('/usr/share/wordlists/nmap.lst', encoding='utf8') as fle:
    words = set(
        ''.join(c for c in w if c.isalpha()) for w in fle.readlines())

base = 'https://www.spotify.com/nl/xhr/json/isEmailAvailable.php?email={}@mailinator.com'

sources = (Source(url=base.format(word),
                  attrs=(Attr(name='username', value=word), ))
           for word in words)

result = Template(name='username', db='spotify', table='users',
                  db_type='MongoDB',
                  attrs=[Attr(name='exists', func='sel_text')])

user = ScrapeModel(
    name='spotify',
    phases=[Phase(sources=sources, n_workers=10, templates=[result])])
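# Quick illustration (not part of the original file) of the wordlist cleanup
# above, using plain Python only: non-alphabetic characters are stripped and
# duplicates collapse in the set.
sample = ['admin!\n', 'ad-min\n', 'test123\n']
print(set(''.join(c for c in w if c.isalpha()) for w in sample))  # {'admin', 'test'}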