# Phase that loads the Erowid experience listing (Start=0, Max=24777 in the
# query string) and describes which table cells feed each attribute.
Phase(
    source_worker=WebSource,
    parser=HTMLParser,
    sources=[
        Source(
            url="https://www.erowid.org/experiences/exp.cgi?ShowViews=1&Cellar=0&Start=0&Max=24777"
        )
    ],
    templates=(
        Template(
            name='report_url',
            selector='.exp-list-table tr',
            source={'active': False, 'copy_attrs': True},
            attrs=(
                Attr(name='url', selector='td:nth-of-type(2) a', func='sel_url'),
                Attr(name='title', selector='td:nth-of-type(2) a', func='sel_text'),
                # The rating is read from the image's alt text.
                Attr(name='rating', selector='td:nth-of-type(1) img',
                     func='sel_attr', kws={'attr': 'alt'}),
                Attr(name='author', selector='td:nth-of-type(3)', func='sel_text'),
                Attr(name='substances', selector='td:nth-of-type(4)',
                     func='sel_text',
                     kws={
                         'replacers': '&',
                         'substitute': ',',
                         # Raw string: `\-` and `\s` are invalid escape
                         # sequences in a normal string literal and trigger a
                         # DeprecationWarning/SyntaxWarning on newer Pythons.
                         # The pattern value itself is unchanged.
                         # NOTE(review): `A-z` also matches `[ \ ] ^ _` and the
                         # backtick; confirm whether `A-Za-z` was intended.
                         'regex': r'([A-z0-9\-]+\s*[A-z0-9\-*\s]*)',
                     }),
                Attr(name='date', selector='td:nth-of-type(5)', func='sel_text'),
                Attr(name='views', selector='td:nth-of-type(6)', func='sel_text'),
            ),
        ),
    ),
),
# Phase that starts from the ghostlyhaks ROM/EEPROM/BIOS/EFI/UEFI forum page.
Phase(
    source_worker=WebSource,
    parser=HTMLParser,
    sources=[
        Source(url="https://ghostlyhaks.com/forum/rom-eeprom-bios-efi-uefi"),
    ],
    templates=(
        # One template for the rows under `.kbody`, configured with the
        # 'efi_dumps' db and 'forum_post' table.
        Template(
            name="forum_post",
            selector=".kbody tr",
            db_type="mongo_db",
            db="efi_dumps",
            table="forum_post",
            attrs=(
                # source is for next run
                Attr(
                    name="url",
                    selector="a.ktopic-title",
                    func="sel_url",
                    source=Source(active=False),
                ),
                Attr(name="user", selector=".kwho-user", func="sel_text"),
                Attr(name="user_url", selector=".kwho-user", func="sel_url"),
            ),
        ),
        # Pagination links; no db settings are given for this template.
        Template(
            name="next_page",
            selector=".kpagination",
            attrs=[
                # source is for next run
                Attr(name="url", selector="a", func="sel_url", source=Source()),
            ],
        ),
    ),
),
cl = MongoClient() db = cl.nytimes col = db.menu nytimes = ScrapeModel(name='nytimes', domain='https://www.nytimes.com/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="https://www.nytimes.com/")], templates=( Template( name='menu', selector='#site-index-navigation li', db_type='MongoDB', db='nytimes', table='menu', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source(active=False)), # source is for next run ) ), ) ), Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="https://www.nytimes.com/")], templates=( Template( name='articlelist', selector='', db_type='MongoDB', db='nytimes', table='articles', attrs=( Attr(name='title', selector='h1', func='sel_text'),
belgian_parlement_roles = ScrapeModel( name='belgian_parlement_roles', domain='https://fr.wikipedia.org/', num_getters=2, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[ Source( url= "https://fr.wikipedia.org/wiki/Liste_des_gouvernements_de_la_Belgique" ) ], templates=( Template( name='government', selector='.wikitable tr td:nth-of-type(2)', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source( active=False)), # source is for next run )), )), Phase(source_worker=WebSource, parser=HTMLParser, templates=(Template(name='government', selector='table:nth-of-type(1) tr', db_type='mongo_db', db='belgian_politics',
# Phase with one listing URL per advert category; each Source carries a fixed
# 'sex' attribute naming that category.
Phase(
    source_worker=WebSource,
    parser=HTMLParser,
    sources=[
        Source(
            url="http://www.kinky.nl/sex-afspraken/mannen/default.aspx?pagesize=5000",
            attrs=[Attr(name="sex", value="man")],
        ),
        Source(
            url="http://www.kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=5000",
            attrs=[Attr(name="sex", value="vrouw")],
        ),
        Source(
            url="http://www.kinky.nl/sex-afspraken/transsexuelen/default.aspx?pagesize=5000",
            attrs=[Attr(name="sex", value="trans")],
        ),
        Source(
            url="http://www.kinky.nl/sex-afspraken/stellen/default.aspx?pagesize=5000",
            attrs=[Attr(name="sex", value="stellen")],
        ),
        Source(
            url="http://www.kinky.nl/sex-afspraken/gay/default.aspx?pagesize=5000",
            attrs=[Attr(name="sex", value="gay")],
        ),
    ],
    templates=[
        Template(
            name="advert",
            selector="#advertenties > div",
            db_type="mongo_db",
            db="kinky",
            table="adds",
            attrs=[
                # The phone number is taken from the text following the
                # "Mijn telefoonnummer: " label; the 'debug' flag is set.
                Attr(
                    name="phone",
                    selector=".quickinfo > span",
                    func="sel_text",
                    kws={
                        "children": True,
                        "debug": True,
                        "regex": "Mijn telefoonnummer: (.*)",
                    },
                ),
                Attr(
                    name="city",
                    selector=".quickinfo span.country",
                    func="sel_text",
                ),
                Attr(
                    name="url",
                    selector=".advertentie_kop a",
                    func="sel_attr",
                    kws={"attr": "href"},
                ),
            ],
        ),
    ],
),
southpark = ScrapeModel( name='southpark', domain='http://southpark.cc.com/', num_getters=2, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[Source(url="http://southpark.cc.com/")], templates=( Template( name='video', selector='', db_type='mongo_db', db='southpark', table='video', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source( active=False)), # source is for next run Attr(name='title', selector='h1', func='sel_text'), Attr(name='text', selector='p', func='sel_text'), )), )), Phase( source_worker=WebSource, parser=HTMLParser, sources=[Source(url="http://southpark.cc.com/")], templates=( Template(
# Phase listing the "videos" pages of several YouTube users/channels.
Phase(
    source_worker=WebSource,
    parser=HTMLParser,
    sources=[
        Source(url="https://www.youtube.com/user/ozzymanreviews/videos"),
        Source(url="https://www.youtube.com/user/Draadstal/videos"),
        Source(url="https://www.youtube.com/channel/UCQMs9pijXYAdqvkEMJyCM4g/videos"),
        Source(url="https://www.youtube.com/channel/UCi1LpRIlG1tDY5Z54VTel2w/videos"),
        Source(url="https://www.youtube.com/user/vpro/videos"),
        Source(url="https://www.youtube.com/user/nprmusic/videos"),
    ],
    templates=(
        Template(
            name="channel_videos",
            selector="li.channels-content-item",
            db_type="mongo_db",
            db="youtube_channel",
            table="channel_videos",
            attrs=[
                Attr(name="url", selector="h3.yt-lockup-title a", func="sel_url"),
                Attr(name="title", selector="h3", func="sel_text"),
                # View count: the part before the Dutch word "weergaven",
                # with the 'numbers' option enabled.
                Attr(
                    name="views",
                    selector=".yt-lockup-meta-info",
                    func="sel_text",
                    kws={"regex": "(.*) weergaven", "numbers": True},
                ),
            ],
        ),
        # "Load more" button: its data-uix-load-more-href value is wrapped in
        # the youtube.com template and used as a new Source with JSON keys.
        Template(
            name="next_videos",
            selector=".browse-items-load-more-button",
            attrs=[
                Attr(
                    name="url",
                    func="sel_attr",
                    kws={"attr": "data-uix-load-more-href"},
                    source=Source(
                        src_template="http://youtube.com{}",
                        json_key=["content_html", "load_more_widget_html"],
                    ),
                ),
            ],
        ),
    ),
),
cl = MongoClient() db = cl.headlines col = db.category headlines = ScrapeModel(name='headlines', domain='http://www.headlines24.nl/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://www.headlines24.nl/")], templates=( Template( name='category', selector='', db_type='mongo_db', db='headlines', table='category', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source(active=False)), # source is for next run Attr(name='title', selector='h1', func='sel_text'), Attr(name='text', selector='p', func='sel_text'), ) ), ) ), Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://www.headlines24.nl/")], templates=( Template( name='category', selector='', db_type='mongo_db', db='headlines', table='category',
from workers import WebSource from parsers import HTMLParser import string from components import ScrapeModel, Source, Phase, Attr, Template paradiso = ScrapeModel( name='paradiso', domain='https://paradiso.nl', phases=[ Phase(source_worker=WebSource, sources=[Source(url='https://paradiso.nl/web/Agenda.htm')], parser=HTMLParser, templates=[ Template(name='event_link', selector='a.event-link', attrs=[ Attr(name='url', func='sel_attr', kws={'attr': 'href'}, source={'active': False}) ]) ]), Phase(templates=[ Template(name='event', db_type='MongoDB', db='paradiso', table='events', attrs=[ Attr(name='name', selector='meta[name=evenementts]', func='sel_attr',
today = datetime.datetime.now().year print(today) volkskrant = ScrapeModel( name='volkskrant', domain='http://www.volkskrant.nl/', num_getters=2, cookies={'nl_cookiewall_version': '1'}, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://www.volkskrant.nl/archief/{}".format(year)) for year in range(1987, today) ], templates=(Template(name='day_url', selector='td', attrs=(Attr( name='url', selector='a', func='sel_url', source=Source(active=False)), )), )), Phase( source_worker=WebSource, parser=HTMLParser, templates=( Template(name='article_url', selector='article', attrs=(Attr(name='url', selector='a', func='sel_url',
name='namen', domain='http://www.meertens.knaw.nl/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=( Source(url="http://www.meertens.knaw.nl/nvb/naam/begintmet/" + l) for l in ['Aad']), # string.ascii_lowercase), templates=[ Template( name='name', selector='tr.data', db_type='mongo_db', db='names', table='name_count_test', attrs=[ Attr(name='name', selector='td:nth-of-type(1)', func='sel_text'), Attr(name='men', selector='td:nth-of-type(2)', func='sel_text', kws={'numbers': True}), Attr(name='women', selector='td:nth-of-type(3)', func='sel_text', kws={'numbers': True}), Attr(name='url', selector='td:nth-of-type(1) a', func='sel_attr', kws={'attr': 'href'}, source={'active': False}, source_condition={'women': '> 50', 'men': '> 50'}), ] ), Template( name='next_url', selector='.right', attrs=[ Attr(name='next', selector='abc', func='sel_attr', kws={'attr': 'href'}, source={'active': True}), ]) ] ), Phase(source_worker=WebSource, parser=HTMLParser, templates=[
from components import ScrapeModel, Phase, Template, Attr, Source from workers import WebSource from parsers import HTMLParser pornstars = ScrapeModel( name='pornhub_pornstars', domain='http://pornhub.com', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[Source(url='http://www.pornhub.com/pornstars?o=a')], templates=[ Template(name='alphabet', selector='.alphabetFilter .dropdownWrapper li', attrs=[ Attr(name='url', selector='a', func='sel_url', source=Source(active=False)) ]) ]), Phase(source_worker=WebSource, parser=HTMLParser, templates=[ Template(name='pornstar', selector='.pornstarIndex li', db_type='MongoDB', db='pornstars', collection='ranking', attrs=[
from pymongo import MongoClient

# Scrape model for the signature list of a single petities.nl petition.
petitions = ScrapeModel(
    name="petitions",
    domain="https://petities.nl/",
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=(
                Source(url="https://petities.nl/petitions/borstkankeronderzoek-vervroegen/signatures?locale=nl"),
            ),
            templates=[
                # Links inside the navigation bar; the attribute is marked
                # with source=True.
                Template(
                    name="next_page",
                    selector=".navigation-bar .navigation-bar",
                    attrs=[
                        Attr(
                            name="url",
                            selector="a",
                            func="sel_url",
                            kws={"attr": "href"},
                            source=True,
                        ),
                    ],
                ),
                # Signature fields, configured with the 'petitions' db and
                # 'borstkanker' table.
                Template(
                    name="signature",
                    selector=".petition-signature-list",
                    db_type="mongo_db",
                    db="petitions",
                    table="borstkanker",
                    attrs=[
                        Attr(
                            name="name",
                            selector=".petition-signature-name",
                            func="sel_text",
                        ),
                        Attr(
                            name="time",
                            selector=".signature-time",
                            func="sel_text",
                        ),
                        Attr(
                            name="location",
                            selector=".petition-signature-location",
                            func="sel_text",
                        ),
                        Attr(
                            name="occupation",
                            selector=".petition-signature-occupation",
                            func="sel_text",
                        ),
                    ],
                ),
            ],
        ),
    ],
)

# Register the model with a dispatcher and run it at import time.
ds = Dispatcher()
ds.add_scraper(petitions)
ds.run()
# Phase that builds one source URL per (season, episode) pair for seasons 1-9
# and episodes 1-29, and hands each parsed episode to a shell command.
Phase(
    source_worker=WebSource,
    parser=HTMLParser,
    sources=[
        Source(url="http://watchtheofficeonline.com/s{}e{}".format(
            season, episode))
        for season in range(1, 10)
        for episode in range(1, 30)
    ],
    templates=(
        Template(
            name='episode',
            selector='.so-panel.widget.widget_siteorigin-panels-builder',
            db_type='shell_command',
            db='theoffice',
            table='season',
            kws={
                # `&&` instead of `&`: a single `&` puts `mkdir` in the
                # background and starts the download immediately, racing
                # against directory creation and ignoring a failed mkdir.
                # NOTE(review): the download target is hard-coded to
                # /mnt/Movies while mkdir uses `filepath`; confirm they are
                # meant to be the same directory.
                # NOTE(security): {season}, {episode} and {url} are scraped
                # values placed into a sudo shell command; they should be
                # shell-quoted wherever the command is formatted.
                'command':
                    'sudo mkdir -p ' + filepath + '/{season}/ &&' +
                    ' sudo youtube-dl -o /mnt/Movies/{season}/{episode} {url}'
            },
            attrs=(
                # Two functions are applied to the same selector, each with
                # its own kwargs entry.
                Attr(name='url', selector='a',
                     func=['sel_url', 'sel_text'],
                     kws=[{}, {'needle': r'.*(s\d+e\d+)'}]),
                Attr(name='episode', selector='.textwidget', func='sel_text',
                     kws={
                         'index': 3,
                         'substitute': '_',
                         'replacers': ' '
                     }),
                Attr(name='season', selector='.textwidget', func='sel_text',
                     kws={
                         'index': 1,
                         'replacers': ' '
                     }),
            ),
        ),
    ),
),
# Handles to the MongoDB database and collection named after this scraper.
cl = MongoClient()
db = cl.gsmhelpdesk_nummerreeksen
col = db.number_range

# Scrape model for the number-range table on gsmhelpdesk.nl.
gsmhelpdesk_nummerreeksen = ScrapeModel(
    name="gsmhelpdesk_nummerreeksen",
    domain="http://www.gsmhelpdesk.nl/",
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(url="http://www.gsmhelpdesk.nl/helpdesk/30/nummerreeksen"),
            ],
            templates=(
                # The first two cells of each row, both with the 'numbers'
                # option enabled.
                Template(
                    name="number_range",
                    selector="tr",
                    db_type="mongo_db",
                    db="gsmhelpdesk_nummerreeksen",
                    table="number_range",
                    attrs=(
                        Attr(
                            name="start",
                            selector="td:nth-of-type(1)",
                            func="sel_text",
                            kws={"numbers": True},
                        ),
                        Attr(
                            name="end",
                            selector="td:nth-of-type(2)",
                            func="sel_text",
                            kws={"numbers": True},
                        ),
                    ),
                ),
            ),
        ),
    ],
)

# Register the model with a dispatcher and run it at import time.
disp = Dispatcher()
disp.add_scraper(gsmhelpdesk_nummerreeksen)
disp.run()
# Phase that reads Funda search-result pages for Amsterdam (rent and buy,
# houses and apartments) and extracts listing details.
Phase(
    parser=HTMLParser,
    source_worker=WebSource,
    sources=[
        Source(url='http://funda.nl/huur/amsterdam/woonhuis/'),
        Source(url='http://funda.nl/huur/amsterdam/appartement/'),
        Source(url='http://funda.nl/koop/amsterdam/woonhuis/'),
        # NOTE(review): unlike the other three URLs this one has no trailing
        # slash; confirm whether that is intentional.
        Source(url='http://funda.nl/koop/amsterdam/appartement'),
    ],
    templates=[
        Template(
            name='house',
            selector='.search-result',
            db_type='mongo_db',
            db='funda',
            table='for_hire',
            attrs=[
                Attr(name='price', selector='.search-result-price',
                     func='sel_text', kws={'numbers': True}),
                Attr(name='street', selector='.search-result-title',
                     func='sel_text'),
                Attr(name='realtor', selector='.realtor', func='sel_text'),
                # All regexes below are raw strings: `\d` and `\w` are invalid
                # escape sequences in normal string literals and raise a
                # DeprecationWarning/SyntaxWarning on newer Pythons. The
                # pattern values are unchanged.
                Attr(name='rooms', selector='.search-result-info',
                     func='sel_text',
                     kws={
                         'regex': r'(\d+) kamers',
                         'numbers': True
                     }),
                # Four digits, a space and two word characters.
                Attr(name='zip', selector='.search-result-subtitle',
                     func='sel_text', kws={'regex': r'(\d{4} \w{2})'}),
                # The word following the zip-code pattern.
                Attr(name='city', func='sel_text',
                     selector='.search-result-subtitle',
                     kws={'regex': r'\d{4} \w{2} (\w+)'}),
                Attr(name='living_area', func='sel_text',
                     selector='.search-result-info span[title="Woonoppervlakte"]',
                     kws={
                         'regex': r'(\d+)',
                         'numbers': True
                     }),
                # The listing link gets 'bezichtiging/' appended through the
                # source template; the resulting source is created inactive.
                Attr(name='meeting_url', selector='.search-result-header a',
                     func='sel_attr', kws={'attr': 'href'},
                     source={
                         'src_template': '{}bezichtiging/',
                         'active': False
                     }),
            ]),
        # Unnamed template for pagination links; its `source` argument is
        # commented out below.
        Template(
            selector='.pagination',
            attrs=[
                Attr(
                    name='url',
                    selector='a',
                    func='sel_attr',
                    kws={'attr': 'href'},
                    # source=Source()
                )
            ])
    ]),
# Phase that starts at the thuisbezorgd.nl home page.
Phase(
    source_worker=WebSource,
    parser=HTMLParser,
    sources=[
        Source(url="https://www.thuisbezorgd.nl/"),
    ],
    templates=(
        # Links whose href contains "eten-bestellen-"; no db settings are
        # given for this template.
        Template(
            name="sections",
            selector="",
            attrs=(
                # source is for next run
                Attr(
                    name="url",
                    selector='a[href*="eten-bestellen-"]',
                    func="sel_url",
                    source=Source(),
                ),
            ),
        ),
        # Restaurant entries, configured with the 'thuisbezorgd' db and
        # 'restaurants' table.
        Template(
            name="restaurant",
            selector=".restaurant",
            db_type="MongoDB",
            db="thuisbezorgd",
            table="restaurants",
            attrs=(
                # source is for next run
                Attr(
                    name="url",
                    selector="a.restaurantname",
                    func="sel_url",
                    source=Source(active=False, src_template="{}"),
                ),
                Attr(name="name", selector="a.restaurantname", func="sel_text"),
            ),
        ),
    ),
),
# TODO Set the right classes for the websites. from dispatcher import Dispatcher from components import Attr, Source, HTMLObject, Phase, ScrapeModel, Follow kinky =ScrapeModel(name='kinky', domain='kinky.nl', phases=[ Phase(source=[Source(url='http://kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=4500')], templates=[Template(name='advert', selector='.advertentie_kop > a', attrs=[Attr(name='url', source={'active': False})]) ]), Phase(templates=[ Template(name='advert', selector='.advertentie_kop > a', db='kinky.nl', table='adverts', Attr(name='add_text', func= 'sel_text', selector='description p'), Attr(name='possibilities', func= 'sel_text', selector= '.possibilities li'), Attr(name='update', func= 'sel_text', selector='update'), Attr(name='town', func= 'sel_text', selector= '.naw div', kws={'regex': '.*Plaats: ([A-z]*);'}), Attr(name='work_area', func= 'sel_text', selector='naw div', kws={'regex': '.*Werkgebied: '}), Attr(name='prices', func= 'sel_text', selector='.prizes td:nth-of-type(2)'), Attr(name='pictures', func= 'sel_attr', selector='.galleryRow img', kws={'attr': 'src'}), Attr(name='phone', selector='.mainprofile .webbutton span', func='sel_text'), Attr(name='name', selector='h1.title', func='sel_text'), '__reg__age': ['.naw div', '.*Leeftijd: (\d\d)'], '__reg__length': ['.naw div', '.*Lengte: (\d\d\d)'], '__reg__hair_color': ['.naw div', '.*Kleur haar: ([a-z]*);'], '__reg__build': ['.naw div', '.*Lichaamsbouw: ([A-z]*);'], '__reg__looks': ['.naw div', '.*Uiterlijk: ([A-z]*);'], , , , 'start': [ 'meta': { 'sex': 'female',