示例#1
0
                selector='.chart-row__current-week',
                func='sel_text',
                kws={'numbers': True})

# Template 'song': matches '.chart-row__main-display' and carries the
# name, artist and position attribute objects defined elsewhere.
song = Template(
    name='song',
    selector='.chart-row__main-display',
    required=True,
    attrs=[name, artist, position],
    # storage settings
    db_type='MongoDB',
    db='billboard',
    table='songs2',
)

# Template 'next_date': both attributes read the URL of the "Next Week"
# link found inside '#chart-nav'.
next_date = Template(
    name='next_date',
    selector='#chart-nav',
    required=True,
    attrs=[
        Attr(name='next_week',
             func='sel_url',
             selector='a[title="Next Week"]',
             source={'copy_attrs': 'year'}),
        Attr(name='year',
             func='sel_url',
             selector='a[title="Next Week"]',
             # Raw string: '\/' and '\-' are invalid escape sequences in a
             # normal literal (warning today, error in a future Python).
             # The pattern text itself is unchanged.
             kws={'regex': r'\/([0-9\-]+)'}),
    ])

# Scrape model 'Hot 100': one phase with n_workers=5 that applies the
# song and next_date templates to the `start` sources.
charts = ScrapeModel(
    name='Hot 100',
    domain='https://www.billboard.com/',
    phases=[
        Phase(
            n_workers=5,
            sources=start,
            templates=[song, next_date],
        ),
    ],
)
示例#2
0
# Command templates handed to ProgramSource. The '{}' placeholder is
# presumably filled with the target of the incoming source -- TODO confirm.
wordpress_source = ProgramSource(
    function='yes y | wpscan {} --batch --follow-redirection',
)

robot_source = ProgramSource(
    function='/home/jim/git/robot-detect/robot-detect --csv -q {}',
)

passwords_source = ProgramSource(
    function='grep -ra @{}: ~/ideas/BreachCompilation/data',
)

# Scrape model with four phases: a customised ip_phase, the shared
# port_phase, and two TextParser phases for the git and .DS_Store templates.
defcon_1 = ScrapeModel(
    name='nmap_test',
    domain='',
    num_getters=1,
    phases=[
        # ip_phase / ip_template are called with overrides here; presumably
        # this yields adjusted copies of the shared objects -- TODO confirm.
        ip_phase(
            n_workers=10,
            templates=[
                ip_template(
                    db='defcon',
                    table='companies',
                    func='update',
                    kws={'key': '_id'},
                    source=['ip'],
                ),
            ],
            sources=jacco,
        ),
        port_phase,
        Phase(
            templates=[git_template],
            sources=jacco_git,
            parser=TextParser,
        ),
        Phase(
            templates=[ds_store_template],
            sources=jacco_ds,
            parser=TextParser,
        ),
    ],
)
示例#3
0
# Attribute 'tags': text of the links matching the selector below.
tags_attr = Attr(
    name='tags',
    selector='.ib.space-right a.link-grey',
    func='sel_text',
)

# Template 'article' assembled from the module-level attribute objects.
article = Template(
    name='article',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr,
    ),
)
# Phase over `nos_sources` using WebSource + HTMLParser with a single
# 'article_url' template that reads the href of the 'a' inside each
# '#archief li' match.
# NOTE(review): the call ends in a trailing comma and `nos_sources` is only
# assigned further down in this excerpt, so this looks like an element
# lifted out of a larger list -- confirm against the full file.
Phase(source_worker=WebSource,
      parser=HTMLParser,
      sources=nos_sources,
      templates=[
          Template(name='article_url',
                   selector='#archief li',
                   db_type='mongo_db',
                   db='nos_nl',
                   table='article_urls',
                   attrs=[
                       Attr(name='url',
                            selector='a',
                            func='sel_attr',
                            kws={'attr': 'href'},
                            source={'active': False}),
                   ])
      ]),

# Flat list of the 'url' field of every document found in
# cl.nos_nl.articles for each category in `categories`.
parsed = [
    doc['url']
    for category in categories
    for doc in cl.nos_nl.articles.find({'category': category})
]
nos_sources = [
    Source(url=url['url'],
示例#4
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from scrape_models.objects import occassions

# Single start page; the URL is split over adjacent literals for
# readability only (the joined value is unchanged).
sources = [
    Source(url=('http://ww4.autoscout24.nl/'
                '?atype=B&mmvco=0&cy=NL&ustate=N%2CU&fromhome=1'
                '&intcidm=HP-Searchmask-Button&dtr=s&results=20')),
]
# Scrape model with one phase applying the shared autoscout template.
autoscout = ScrapeModel(
    name='autoscout24',
    domain='autoscout24.nl',
    phases=[
        Phase(
            sources=sources,
            templates=[occassions.autoscout_template],
        ),
    ],
)
示例#5
0
# Handle on the `episode` attribute of `db`.
col = db.episode

# Lazily yields one Source per archive page number 1..49.
sources = (
    Source(url="http://www.luckytv.nl/afleveringen/page/{}/".format(page_no))
    for page_no in range(1, 50)
)

# Scrape model with a single WebSource/HTMLParser phase; the 'episode'
# template reads url, title and date from each 'article.video' match.
LuckyTV = ScrapeModel(
    name='Lucky TV',
    domain='http://www.luckytv.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=sources,
            templates=(
                Template(
                    name='episode',
                    selector='article.video',
                    db_type='mongo_db',
                    db='lucky_tv',
                    table='episodes',
                    attrs=(
                        Attr(name='url', selector='a:nth-of-type(1)',
                             func='sel_url'),
                        Attr(name='title', selector='.video__title',
                             func='sel_text'),
                        Attr(name='date', selector='.video__date',
                             func='sel_text'),
                    ),
                ),
            ),
        ),
    ],
)
示例#6
0
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr
    )
)
# Phase with n_workers=5 over `sources`; the 'headline' template reads
# url, title, excerpt, date and reaction count from each '.row' match.
headline_phase = Phase(
    sources=sources,
    n_workers=5,
    templates=[
        Template(
            name='headline',
            selector='.row',
            db='metronieuws',
            db_type='MongoDB',
            kws={'key': 'url'},
            table='article_urls',
            attrs=[
                Attr(
                    name='url',
                    selector='a.shadow-block',
                    func='sel_attr',
                    kws={'attr': 'href'},
                    source=Source(active=False, copy_attrs='category'),
                ),
                Attr(name='title', selector='h3', func='sel_text'),
                Attr(name='excerpt', selector='div > p', func='sel_text'),
                Attr(
                    name='date',
                    selector='div.wrapper.small div:nth-of-type(1)',
                    func='sel_text',
                ),
                Attr(
                    name='num_reactions',
                    selector='div.amount',
                    func='sel_text',
                ),
            ],
        ),
    ],
)

metro = ScrapeModel(
    name='metronieuws.nl', domain='http://metronieuws.nl', num_getters=5,
    phases=[
        headline_phase,
        Phase(sources=sources, n_workers=3, templates=[
            article(
示例#7
0
# Attribute 'pagename' extracted with 'sel_text'.
sub_page_name = Attr(name='pagename', func='sel_text')

# Template 'sub_page' over '.sections a', configured with
# db='startpagina' and table='subpages'.
category_temp = Template(
    name='sub_page',
    selector='.sections a',
    db='startpagina',
    table='subpages',
    db_type='mongo_db',
    attrs=(sub_page_url, sub_page_name),
)

# Attributes of a website link: its URL and its text.
website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')

# Template 'website' over '#columns a', configured with
# db='startpagina' and table='websites'.
website_temp = Template(
    name='website',
    selector='#columns a',
    db='startpagina',
    table='websites',
    db_type='mongo_db',
    attrs=(website_url, website_name),
)

# Two-phase model: the first phase starts from `start_url` with the
# category template, the second has no sources of its own and applies
# the website template.
model = ScrapeModel(
    name='startpagina',
    domain='http://www.startpagina.nl',
    phases=[
        Phase(
            n_workers=3,
            sources=[start_url],
            templates=[category_temp],
        ),
        Phase(
            n_workers=3,
            templates=[website_temp],
        ),
    ],
)

# Register the model with a dispatcher and run it at import time.
d = Dispatcher()
d.add_scraper(model)
d.run()
示例#8
0
                 func='sel_text')

# Template 'article' over '.col__inner', assembled from the module-level
# attribute objects.
article = Template(
    name='article',
    selector='.col__inner',
    attrs=(title_attr, text_attr, date_attr, author_attr, tags_attr),
)

# Three-phase model: archive calendar, then article URLs + pagination,
# then the articles themselves.
# NOTE(review): the model is named 'rtl' with an rtlnieuws.nl domain, yet
# the start URL and the db settings all say 'parool' -- confirm intent.
rtl = ScrapeModel(
    name='rtl',
    domain='http://www.rtlnieuws.nl/',
    num_getters=1,
    phases=[
        Phase(
            sources=[Source(url="http://www.parool.nl/archief/2012")],
            templates=(calendar, year),
        ),
        Phase(
            templates=(
                article_url(
                    db_type='mongo_db',
                    db='parool',
                    table='article_urls',
                ),
                pagination,
            ),
        ),
        Phase(
            templates=(
                article(
                    db_type='mongo_db',
                    db='parool',
                    table='articles',
                ),
            ),
        ),
    ],
)
示例#9
0
# Base template 'song' (db='midi', table='songs'); it is called further
# down to derive a site-specific variant.
song = Template(
    name='song',
    db_type='MongoDB',
    db='midi',
    table='songs',
    attrs=[title, artist, midi_url],
)

# Variant of `song` for freemidi: overrides table, selector and the
# title/artist selectors. The long selector is split over adjacent
# literals; the joined value is unchanged.
freemidi_template = song(
    table='freemidi',
    selector='#mainContent div.col-xs-12:nth-of-type(1)',
    attrs=[
        title(
            selector=('li.active:nth-child(3) > a:nth-child(1) > '
                      'span:nth-child(1)'),
        ),
        artist(
            selector=('ol.breadcrumb:nth-child(1) > li:nth-child(2) > '
                      'a:nth-child(1) > span:nth-child(1)'),
        ),
    ],
)

# Lazily yields one Source per id in range(25803); each carries a
# midi_url attribute pointing at the matching 'getter-<id>' URL.
freemidi_sources = (
    Source(
        url='https://freemidi.org/download-{}'.format(song_id),
        attrs=[
            midi_url(value='https://freemidi.org/getter-{}'.format(song_id)),
        ],
    )
    for song_id in range(25803)
)

# Single-phase model applying the freemidi template to those sources.
freemidi = ScrapeModel(
    domain='http://freemidi.org',
    phases=[
        Phase(
            n_workers=3,
            sources=freemidi_sources,
            templates=[freemidi_template],
        ),
    ],
)
示例#10
0
                                   selector='h3 a',
                                   func='sel_url',
                                   source={'active': False}), ))
# Template 'pagination': reads the URL of every 'a' inside '.pagers';
# the attribute is flagged with source=True.
pagination = Template(
    name='pagination',
    selector='.pagers',
    attrs=(
        Attr(
            name='page',
            selector='a',
            func='sel_url',
            source=True,
        ),
    ),
)

# Three WebSource/HTMLParser phases: category menu from the home page,
# then result lists + pagination, then the company template.
bedrijven_pagina = ScrapeModel(
    name='Bedrijven Pagina',
    domain='https://www.bedrijvenpagina.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url='https://www.bedrijvenpagina.nl/')],
            templates=(category_menu, ),
        ),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(result_list, pagination),
        ),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(company, ),
        ),
    ],
)

# Register the model with a dispatcher and run it at import time.
disp = Dispatcher()
disp.add_scraper(bedrijven_pagina)
disp.run()
示例#11
0
    attrs=[
        make(selector='h2 a', kws={'regex': '(\w+) -,'}),
        price(selector='.price', kws={'regex': '(\d+),'}),
        year(selector='.listing-priority-product-container',
             kws={'regex': '(\d{4})'}),
        mileage(selector='.listing-priority-product-container',
                    kws={'regex': '(.*) km'}),
        city(selector='.location-name'),
        url(selector='h2 > a'),
    ]
)
# Single-phase model; note the phase is declared with an empty source list.
autoscout = ScrapeModel(
    name='autoscout24',
    domain='autoscout24.nl',
    phases=[
        Phase(
            sources=[],
            templates=[autoscout_template],
        ),
    ],
)

autotrader = ScrapeModel(name='autotrader', domain='http://autotrader.nl', num_sources=1, cookies={'CookieOptIn': 'true'},
                          phases=[
    Phase(
        sources=[
            Source(url='http://www.autotrader.nl/motor/zoeken/'),
        ],
        templates=[
            Template(name='motorcycle', selector='.result',
                            store=StoreObject(func=store_mongo,
                                                     kws={'db': 'moto', 'collection': 'autotrader'}),
                       attrs=[
                       ]),
            Template(name='next_page', selector='#pager', attrs=[
示例#12
0
 name='dabanga',
 domain='https://www.dabangasudan.org/en',
 num_getters=2,
 phases=[
     Phase(
         sources=(Source(url="https://www.dabangasudan.org/en/all-news"), ),
         templates=(
             Template(name='article_url',
                      selector='.list-item.news-item-small',
                      db_type='mongo_db',
                      db='dabanga',
                      table='article_urls',
                      attrs=[
                          Attr(name='url',
                               selector='a:nth-of-type(1)',
                               func='sel_url',
                               source={'active': False}),
                      ]),
             Template(name='pagination',
                      selector='.pager',
                      attrs=[
                          Attr(name='url',
                               selector='a',
                               func='sel_url',
                               source=True),
                      ]),
         )),
     Phase(synchronize=True,
           templates=[
               Template(name='article',
                        selector='#content',
示例#13
0
# Single start page, kept in a one-element tuple.
start = (Source(url='https://www.manageengine.com/products.html?MEtab'),)

# Template 'demo': for each '.all_prod_over' match, read the URL of the
# 'a' element and pass it through the regex / index keyword arguments.
demo_template = Template(
    name='demo',
    selector='.all_prod_over',
    db='test',
    db_type='MongoDB',
    table='test',
    attrs=[
        Attr(
            name='url',
            selector='a',
            func='sel_url',
            # Raw string: '\.', '\/' and '\?' are invalid escape sequences
            # in a normal literal (warning today, error in a future
            # Python). The pattern text itself is unchanged.
            kws={'regex': r'http[s]?://([\.a-z]*)[\/\?]',
                 'index': 0},
            source={'active': False}
        )
    ]
)

# Scrape model: the first phase applies demo_template to `start`, after
# which the shared ip_phase and port_phase objects take over.
manageengine = ScrapeModel(
    name='manageengine',
    domain='',
    num_getters=1,
    phases=[
        Phase(sources=start, templates=[demo_template]),
        ip_phase,
        # The original read `port_phase(` with no closing parenthesis,
        # which made the whole module a SyntaxError. Pass the phase object
        # as-is, the same way ip_phase is passed on the line above.
        port_phase,
    ]
)
示例#14
0
 # One source per (season, episode) pair: seasons 1-9, episodes 1-29.
 Phase(
     sources=[
         Source(url=extended_url.format(season_no, episode_no))
         for season_no in range(1, 10)
         for episode_no in range(1, 30)
     ],
     templates=(
         Template(
             name='episode',
             selector='#Rapidvideo',
             db_type='ShellCommand',
             db='theoffice',
             table='season',
             # The command string joins create_dir and youtube_dl with ' & '.
             kws={'command': create_dir + ' & ' + youtube_dl},
             attrs=(
                 # Two functions with one kwargs dict each, paired by position.
                 Attr(
                     name='url',
                     selector='a',
                     func=['sel_url', 'sel_text'],
                     kws=[{}, {'needle': r'.*(s\d+e\d+)'}],
                 ),
                 Attr(
                     name='episode',
                     selector='.textwidget',
                     func='sel_text',
                     kws={'index': 3, 'substitute': '_', 'replacers': ' '},
                 ),
                 Attr(
                     name='season',
                     selector='.textwidget',
                     func='sel_text',
                     kws={'index': 1, 'replacers': ' '},
                 ),
             ),
         ),
     ),
 ),
示例#15
0
from modelscraper.sources import ProgramSource

# Template 'ports': for each 'port' element read the 'portid' attribute,
# plus the 'state' and 'name' attributes of its 'state' / 'service'
# children.
port_template = Template(
    name='ports',
    selector='port',
    db_type='mongo_db',
    db='ports',
    table='ports',
    attrs=(
        Attr(
            name='portnumber',
            func='sel_attr',
            kws={'attr': 'portid'},
        ),
        Attr(
            name='state',
            selector='state',
            func='sel_attr',
            kws={'attr': 'state'},
        ),
        Attr(
            name='service',
            selector='service',
            func='sel_attr',
            kws={'attr': 'name'},
        ),
    ),
)
# Single-phase model whose source "url" is an nmap command line, handled
# by ProgramSource and parsed with port_template.
nmap = ScrapeModel(
    name='nmap_test',
    domain='',
    phases=[
        Phase(
            sources=(Source(url='nmap -oX - duwo.multiposs.nl'), ),
            templates=[port_template],
            source_worker=ProgramSource,
        ),
    ],
)

# Register the model with a dispatcher and run it at import time.
disp = Dispatcher()
disp.add_scraper(nmap)
disp.run()
示例#16
0
                   func='sel_text')
# Attribute 'tags': text of the '.tag-list a.cta' links.
tags_attr = Attr(
    name='tags',
    selector='.tag-list a.cta',
    func='sel_text',
)

# Template 'article' over '.col__inner', assembled from the module-level
# attribute objects.
article = Template(
    name='article',
    selector='.col__inner',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr,
    ),
)

# Three WebSource/HTMLParser phases: archive calendar, then article URLs
# + pagination, then the articles themselves.
parool = ScrapeModel(
    name='parool',
    domain='http://www.parool.nl/',
    cookies=cookie,
    num_getters=1,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="http://www.parool.nl/archief/2012")],
            templates=(calendar, year),
        ),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                article_url(
                    db_type='mongo_db',
                    db='parool',
                    table='article_urls',
                ),
                pagination,
            ),
        ),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                article(
                    db_type='mongo_db',
                    db='parool',
                    table='articles',
                ),
            ),
        ),
    ],
)

volkskrant = ScrapeModel(
    name='volkskrant',
示例#17
0
npo_tv_programs = ScrapeModel(
    name='npo_tv_programs',
    domain='http://npo.nl',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url=series_url.format(i)) for i in range(0, 242)],
            templates=(
                Template(
                    name='program',
                    selector='.content-column.quarter',
                    db_type='mongo_db',
                    db='npo_tv_programs',
                    table='programs',
                    attrs=(
                        Attr(name='title', selector='h3', func='sel_text'),
                        Attr(name='url',
                             selector='a.full-link',
                             func='sel_url',
                             source=Source(
                                 active=False)),  # source is for next run
                    )),
                Template(name='next_url'),
            )),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                Template(
示例#18
0
# Article search endpoint (first page), split over adjacent literals.
search_url = ('http://vkplusmobilebackend.persgroep.net/rest/content/articles'
              '?query=&metadataNeeded=true&limit=10000')
# Same endpoint with an '{}' placeholder for the offset parameter.
next_page_url = search_url + '&offset={}'

# Template 'search_result': walks the ('results', 'previews') key path and
# reads the ('content_link', 'id') value of each entry.
search_result = Template(
    name='search_result',
    db='volkskrant',
    table='article_urls',
    func='create',
    db_type='mongo_db',
    selector=('results', 'previews'),
    attrs=(
        Attr(
            name='id',
            selector=('content_link', 'id'),
            func='sel_text',
            source={'src_template': article_url, 'active': False},
        ),
    ),
)

# Template 'next_result': reads ('results', 'next_offset'); its source
# dict points at the next_page_url template string.
next_search = Template(
    name='next_result',
    attrs=(
        Attr(
            name='next_limit',
            selector=('results', 'next_offset'),
            func='sel_text',
            source={'src_template': next_page_url},
        ),
    ),
)

# One-element tuple holding the initial search request.
sources = (Source(url=search_url),)

# Two JSONParser phases, both declared with the same `sources` tuple:
# search results + next-page lookup, then the article template.
volkskrant = ScrapeModel(
    name='volkskrant',
    domain='volkskrant',
    phases=[
        Phase(
            parser=JSONParser,
            n_workers=5,
            sources=sources,
            templates=(search_result, next_search),
        ),
        Phase(
            parser=JSONParser,
            n_workers=5,
            sources=sources,
            templates=(article,),
        ),
    ],
)
示例#19
0
                                attrs=[
                                    Attr(name='submenu_item',
                                         selector='.c-product-tile__meta > a',
                                         func='sel_url',
                                         source={'active': False}),
                                    Attr(name='pagination_item',
                                         selector='li.is-nexy > a',
                                         source=True)
                                ])

# Rebind the generic attribute objects to variants carrying the
# selectors for this site.
product_name = product_name(selector='.c-offer__title')
price = price(selector='div.c-offer__price')
nutrition = nutrition(selector='.c-offer__nutrition table td')

# Template 'product' (db='foods', table='spar2') built from those three.
product = Template(
    name='product',
    db='foods',
    table='spar2',
    db_type='MongoDB',
    attrs=[product_name, price, nutrition],
)

# Three-phase model: top-level menu, then the product menu, then the
# product template.
# NOTE(review): the keyword here is `cookie`; verify that ScrapeModel
# accepts that spelling (rather than e.g. `cookies`) -- TODO confirm.
spar = ScrapeModel(
    name='spar.nl',
    cookie=cookie,
    domain='https://spar.nl',
    phases=[
        Phase(
            sources=(Source(url='https://spar.nl/boodschappen/'), ),
            templates=[menu_template],
        ),
        Phase(templates=[productmenu_template]),
        Phase(templates=[product]),
    ],
)
示例#20
0
 # Phase that POSTs one tracking request per id in range(5000, 50000000)
 # (generated lazily) and applies the 'shipment' and 'event' templates.
 Phase(sources=(Source(
     url=
     "https://mercury.landmarkglobal.com/tracking/track.php?trck=LTN{}N1&Submit=Track"
     .format(i),
     method='post',
     # Form fields as a list of pairs: 'options[]' appears twice.
     data=[('sid', str(i)), ('options[]', 'display_full_history'),
           ('options[]', 'use_cached_data_only'),
           ('action', 'View+Complete+Tracking+History')])
                for i in range(5000, 50000000)),
       templates=[
           # Shipment header fields, selected from the info box.
           Template(
               name='shipment',
               selector=None,
               db='shipments',
               db_type='MongoDB',
               table='shipment',
               attrs=[
                   Attr(
                       name='carrier',
                       selector=
                       '#large_shipment_info_box > div:nth-child(2) > div:nth-child(1)',
                       func='sel_text',
                       # Raw string: '\s' and '\w' are invalid escape
                       # sequences in a normal literal (warning today,
                       # error in a future Python). Pattern unchanged.
                       kws={'regex': r'Carrier:\s(\w+)'}),
                   Attr(
                       name='shipped_to',
                       selector=
                       '#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(1) .align_left',
                       func='sel_text'),
                   Attr(
                       name='shipped_from',
                       selector=
                       '#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(2) .align_left',
                       func='sel_text'),
               ]),
           # One 'event' per table row except the first.
           Template(name='event',
                    selector='table tr:not(:nth-child(1))',
                    db_type='MongoDB',
                    db='shipments',
                    table='events',
                    attrs=[
                        Attr(name='description',
                             selector='td:nth-of-type(1)',
                             func='sel_text'),
                        Attr(name='date',
                             selector='td:nth-of-type(2)',
                             func='sel_text'),
                        Attr(name='location',
                             selector='td:nth-of-type(3)',
                             func='sel_text'),
                    ]),
       ])
示例#21
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.workers import WebSource
from modelscraper.parsers import HTMLParser

uefa = ScrapeModel(
    name='eufa',
    domain='http://uefa.com',
    num_getters=2,
    phases=[
        Phase(sources=(Source(
            url="http://www.uefa.com/uefaeuro/season=2016/teams/index.html"),
                       ),
              templates=[
                  Template(name='team',
                           selector='.teams--qualified',
                           attrs=[
                               Attr(name='url',
                                    selector='a',
                                    func='sel_url',
                                    source={'active': False}),
                           ])
              ]),
        Phase(templates=[
            Template(name='player',
                     selector='.squad--team-player',
                     db_type='MongoDB',
                     db='uefa',
                     table='players',
                     attrs=[
                         Attr(name='name',
                              selector='.squad--player-name',
示例#22
0
import dns.query
import dns.zone

# Template 'ip': text-parsed, with a single attribute that applies a
# dotted-quad regex to the text. db and table are left empty here.
ip_template = Template(name='ip',
                       db_type='MongoDB',
                       db='',
                       table='',
                       parser=TextParser,
                       attrs=(Attr(
                           name='ip',
                           func='sel_text',
                           # Raw string: '\d' and '\.' are invalid escape
                           # sequences in a normal literal (warning today,
                           # error in a future Python). Pattern unchanged.
                           kws={'regex': r'(\d+\.\d+\.\d+\.\d+)'},
                       ), ))

# Phase with n_workers=10 that applies ip_template to the output of the
# 'host {}' command template.
ip_phase = Phase(
    n_workers=10,
    templates=[ip_template],
    source_worker=ProgramSource(function='host {}'),
)

port_template = Template(name='ports',
                         selector='port',
                         db_type='MongoDB',
                         db='monog',
                         table='ports',
                         attrs=(Attr(name='portnumber',
                                     func='sel_attr',
                                     kws={'attr': 'portid'}),
                                Attr(name='state',
                                     selector='state',
                                     func='sel_attr',
                                     kws={'attr': 'state'}),
                                Attr(name='service',
示例#23
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.sources import WebSource
from modelscraper.parsers import HTMLParser
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

motorparts = ScrapeModel(
    name='motorparts', domain='http://www.2wheelpros.com', num_sources=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser,
            sources=(Source(url='http://www.2wheelpros.com/oem-parts/'),),
            templates=(
                Template(name='brand',
                         selector='#nav > ul > li:nth-of-type(1) > a', attrs=(
                             Attr(name='url', func='sel_url',
                                  source={'active': False}),
                         ),),)
            ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='year', selector='.yearlink', attrs=(
                Attr(name='url', func='sel_url', source={'active': False}),)),),
          ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='model', selector='.modellink', attrs=(
                Attr(name='url', func='sel_url', source={'active': False}),
            )
            ),
        ),
        ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='partCategory', db='motorparts', db_type='MongoDB',
                     table='part_categories', source={'active':False,
示例#24
0
                        func='sel_text')
# Text attributes for the author and the tags of an article.
author_attr = Attr(
    name='author',
    selector='span.author',
    func='sel_text',
)
tags_attr = Attr(
    name='tags',
    selector='.article.tags a span',
    func='sel_text',
)

# Template 'article' over '.column-content-background', configured with
# db='nu_nl' and table='articles'.
article = Template(
    name='article',
    selector='.column-content-background',
    db='nu_nl',
    db_type='mongo_db',
    table='articles',
    attrs=(title_attr, text_attr, date_attr, author_attr, tags_attr),
)

next_page = Template(
    name='next_page',
    selector='paging',
    attrs=(
        Attr(name='paging',

ad = ScrapeModel(
    name='ad.nl', domain='mobileapi.ad.nl', phases=[
        Phase(n_workers=2, sources=sources, templates=(

示例#25
0
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient

# Mongo client, plus a lazy stream of Sources built from the first 'url'
# entry of every document in dwdd.episode_urls.
cl = MongoClient()
sources = (
    Source(url=doc['url'][0])
    for doc in cl.dwdd.episode_urls.find()
)

# Phase over the A-Z programme listing (only page 0 is requested); the
# 'program' template reads title and url from each matching column.
programs_az = Phase(
    sources=[
        Source(url="http://www.npo.nl/programmas/a-z", params={'page': page})
        for page in range(1)
    ],
    templates=(
        Template(
            name='program',
            selector='.content-column.quarter',
            db_type='mongo_db',
            db='npo_tv_programs',
            table='programs',
            attrs=(
                Attr(name='title', selector='h3', func='sel_text'),
                # source is for next run
                Attr(
                    name='url',
                    selector='a.full-link',
                    func='sel_url',
                    source=Source(active=False),
                ),
            ),
        ),
    ),
)

# Search URL template; '{}' takes the value of the `start` query
# parameter. Split over adjacent literals, joined value unchanged.
nos_search = ('https://www.npo.nl/de-wereld-draait-door/VARA_101377717/search'
              '?media_type=broadcast&start_date=&end_date='
              '&start={}&rows=100')
episodes_phase = Phase(n_workers=5,
                       sources=(Source(url=nos_search.format(start))
                                for start in range(0, 2194, 100)),
                       templates=(Template(
示例#26
0
                     json_key='CVE_Items') for year in years)

# Template 'meta' (db='defcon', table='cve_meta'): extracts the text
# after 'lastModifiedDate:' and attaches cve_source to that attribute.
meta_template = Template(
    name='meta',
    db='defcon',
    table='cve_meta',
    db_type='MongoDB',
    attrs=[
        Attr(
            name='last_modified',
            func='sel_text',
            kws={'regex': 'lastModifiedDate:(.*)'},
            source=cve_source,
        ),
    ],
)

# Template for a single CVE item (db='defcon', table='cve'); every
# selector is a key path into the JSON document.
# NOTE(review): name='meta' is the same name meta_template uses -- looks
# like a copy/paste leftover; confirm whether 'cve' was intended.
cve_template = Template(
    name='meta',
    db='defcon',
    table='cve',
    db_type='MongoDB',
    # update-by-key is currently switched off:
    # func='update',
    # kws={'key': 'id'},
    attrs=[
        Attr(
            name='id',
            func='sel_text',
            selector=['cve', 'CVE_data_meta', 'ID'],
        ),
        Attr(
            name='cpes',
            func='sel_text',
            selector=['configurations', 'nodes', 'cpe', 'cpe23Uri'],
        ),
        Attr(
            name='affects',
            func='sel_dict',
            selector=['cve', 'affects'],
        ),
        Attr(
            name='problem_type',
            func='sel_text',
            selector=[
                'cve', 'problemtype', 'problemtype_data', 'description',
                'value',
            ],
        ),
        Attr(
            name='description',
            func='sel_dict',
            selector=['cve', 'description', 'description_data', 'value'],
        ),
        Attr(
            name='impact',
            func='sel_dict',
            selector=['impact'],
        ),
    ],
)

# Single-phase model: cve_source parsed as JSON with cve_template.
cve = ScrapeModel(
    name='CVE',
    domain='static.nvd.nist.gov',
    phases=[
        Phase(
            sources=cve_source,
            parser=JSONParser,
            templates=[cve_template],
        ),
    ],
)
示例#27
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient


# Mongo client, plus a lazy stream of subtitle Sources: the URL is the
# tt888 prefix followed by the last path segment of the episode's first
# url; the episode's whole 'url' value is attached as an attribute.
cl = MongoClient()
urls = (
    Source(
        url='https://tt888.omroep.nl/tt888/' + episode['url'][0].split('/')[-1],
        attrs=(Attr(name='url', value=episode['url']),),
    )
    for episode in cl.nos_journaal.episodes.find()
)

# Single TextParser phase: the 'subtitle' template is configured with
# func='update' keyed on 'url' against nos_journaal.episodes.
subtitles = ScrapeModel(
    name='subtitles',
    domain='https://tt888.omroep.nl/',
    phases=[
        Phase(
            n_workers=5,
            sources=urls,
            parser=TextParser,
            templates=(
                Template(
                    name='subtitle',
                    db_type='mongo_db',
                    db='nos_journaal',
                    table='episodes',
                    func='update',
                    kws={'key': 'url'},
                    attrs=(
                        Attr(name='subtitles', func='sel_text'),
                    ),
                ),
            ),
        ),
    ],
)
# Drop the module-level client name, then register and run the model.
del cl
d = Dispatcher()
d.add_scraper(subtitles)
d.run()
示例#28
0
                        Attr(name='title', selector='.title', func='sel_text'),
                        Attr(name='excerpt',
                             selector='.excerpt',
                             func='sel_text')
                    ])

# Text attributes that make up an article.
title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='p', func='sel_text')
date_attr = Attr(
    name='date',
    selector='.published span.small',
    func='sel_text',
)
author_attr = Attr(name='author', selector='span.author', func='sel_text')
tags_attr = Attr(name='tags', selector='.article.tags a span', func='sel_text')

# Template 'article' over '.column-content-background', configured with
# db='nu_nl' and table='articles'.
article = Template(
    name='article',
    selector='.column-content-background',
    db='nu_nl',
    db_type='MongoDB',
    table='articles',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr,
    ),
)

# Two-phase model: headline template on `sources`, then the article
# template in a phase that declares no sources of its own.
nu = ScrapeModel(
    name='nu.nl',
    domain='http://nu.nl',
    phases=[
        Phase(
            n_workers=2,
            sources=sources,
            templates=(headline, ),
        ),
        Phase(
            n_workers=2,
            templates=(article, ),
        ),
    ],
)
示例#29
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient

# Load the stored product URLs and turn the first 'url' entry of each
# document into a Source (list is built eagerly).
cl = MongoClient()
product_sources = cl.makro.product_urls.find()
sources = [
    Source(url=product['url'][0])
    for product in product_sources
]

# Phase over the products landing page; the 'product_category' template
# reads the URL of every top-level navigation link and passes it the
# replacers/substitute keyword arguments below.
categories = Phase(
    sources=(Source(url="https://www.makro.nl/cat/nl/products"), ),
    templates=(Template(
        name='product_category',
        selector='#left-navigation-container ul.vertical > li > a',
        db_type='mongo_db',
        db='makro',
        table='product_categories',
        attrs=[
            Attr(name='url',
                 func='sel_url',
                 source={'active': False},
                 kws={
                     # Raw string: '\d' is an invalid escape sequence in a
                     # normal literal (warning today, error in a future
                     # Python). The pattern text itself is unchanged.
                     'replacers': r'pageSize=(\d+)',
                     'substitute': 'pageSize=96'
                 }),
        ]), ))

product_lists = Phase(templates=[
    Template(name='product_urls',
             selector='.product-list .product-tiles',
             db_type='mongo_db',
             db='makro',
             table='product_urls',
示例#30
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser

# Build the set of candidate words: every line of the wordlist reduced to
# its alphabetic characters (duplicates collapse in the set).
with open('/usr/share/wordlists/nmap.lst', encoding='utf8') as fle:
    # A set comprehension over the file iterator gives the same result as
    # set([... for w in fle.readlines()]) without materialising the list
    # of lines and the intermediate list of words first.
    words = {''.join(c for c in line if c.isalpha()) for line in fle}
# URL template; '{}' takes the local part of the e-mail address.
# Split over adjacent literals, joined value unchanged.
base = ('https://www.spotify.com/nl/xhr/json/isEmailAvailable.php'
        '?email={}@mailinator.com')
# Lazily yields one Source per word, each carrying the word as its
# 'username' attribute.
sources = (
    Source(
        url=base.format(word),
        attrs=(Attr(name='username', value=word), ),
    )
    for word in words
)

# Template 'username' (db='spotify', table='users') with one text
# attribute named 'exists'.
result = Template(
    name='username',
    db='spotify',
    table='users',
    db_type='MongoDB',
    attrs=[Attr(name='exists', func='sel_text')],
)

# Single-phase model with n_workers=10 over the generated sources.
user = ScrapeModel(
    name='spotify',
    phases=[
        Phase(
            sources=sources,
            n_workers=10,
            templates=[result],
        ),
    ],
)