示例#1
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
from modelscraper.workers import WebSource
from modelscraper.parsers import HTMLParser

# Template scoped to the second 'li.dropdown' element; its only attribute
# takes URLs from the anchors found inside that element.
category_menu = Template(
    name='category_menu',
    selector='li.dropdown:nth-child(2)',
    attrs=[
        Attr(
            name='category',
            selector='a',
            # NOTE(review): presumably registers the URL as a not-yet-active
            # source for a later phase -- confirm against modelscraper.
            source={'active': False},
            func='sel_url',
        ),
    ],
)

# Stand-alone attribute definitions, one per CSS selector.
name = Attr(
    name='name',
    selector='h1.box-title',
    func='sel_text',
)
street = Attr(
    name='street',
    selector='.street-address',
    func='sel_text',
)
postal = Attr(
    name='postal',
    selector='.postal-code',
    func='sel_text',
)
city = Attr(
    name='city',
    selector='.locality',
    func='sel_text',
)
telephone = Attr(
    name='telephone',
    selector='.tel',
    func='sel_text',
)
website = Attr(
    name='website',
    selector='.url a',
    func='sel_url',
)
# The variable is called ``mail`` but the stored attribute name is 'email'.
mail = Attr(
    name='email',
    selector='.mail a',
    func='sel_url',
)
kvk = Attr(
    name='kvk',
    selector='.kvk a',
    func='sel_text',
)
description = Attr(
    name='description',
    selector='div[itemprop="description"] > p',
    func='sel_text',
)
branches = Attr(
    name='branches',
    selector='.omschrijving a',
    func='sel_text',
)

company = Template(name='company',
                   selector=None,
                   db_type='mongo_db',
示例#2
0
from modelscraper.parsers import TextParser, CSVParser
from .objects.networking import ip_phase, ip_template, port_phase

from pymongo import MongoClient

# Companies whose 'website' field is not None and that have no 'ip' field.
no_ip = MongoClient().defcon.companies.find(
    {'website': {'$ne': None}, 'ip': {'$exists': False}})
# Companies whose 'website' field is not None.
companies2 = MongoClient().defcon.companies.find(
    {'website': {'$ne': None}})

# Template carrying only the database settings for the 'defcon' database.
defcon_base = Template(db='defcon', db_type='MongoDB')

# Single-host sources: the bare host name plus two probe URLs built from it.
jacco_base = 'jackling.nl'
jacco = (
    Source(url='jackling.nl'),
)
jacco_git = (
    Source(url='http://{}/.git/config'.format(jacco_base)),
)
jacco_ds = (
    Source(url='http://{}/.DS_STORE'.format(jacco_base)),
)

# One probe Source per company for an exposed '.git/config' and '.DS_STORE'.
# Each company's 'id' is attached as the 'kvk' attribute of its source.
#
# Each generator iterates its own clone of the ``companies2`` cursor. A
# PyMongo cursor can only be consumed once, so sharing the same cursor between
# both generators would leave whichever one is consumed second empty.
git_sources = (
    Source(url='http://{}/.git/config'.format(c['website']),
           attrs=[Attr(name='kvk', value=c['id'])],
           copy_attrs='kvk')
    for c in companies2.clone()
)
ds_store_sources = (
    Source(url='http://{}/.DS_STORE'.format(c['website']),
           attrs=[Attr(name='kvk', value=c['id'])],
           copy_attrs='kvk')
    for c in companies2.clone()
)

git_template = Template(name='Git exposed',
                        db_type='MongoDB',
示例#3
0
# Attribute definitions for a single article page.
title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='.article_body', func='sel_text')
# Reads the 'datetime' attribute of the first <time> element.
date_attr = Attr(name='date', selector='time:nth-of-type(1)',
                 func='sel_attr', kws={'attr': 'datetime'})
author_attr = Attr(name='author', selector='span[itemprop="author"]',
                   func='sel_text')
tags_attr = Attr(name='tags', selector='.ib.space-right a.link-grey',
                 func='sel_text')

# Template bundling the attributes above under the name 'article'.
article = Template(
    name='article',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr,
    ),
)
Phase(source_worker=WebSource,
      parser=HTMLParser,
      sources=nos_sources,
      templates=[
          Template(name='article_url',
                   selector='#archief li',
                   db_type='mongo_db',
                   db='nos_nl',
                   table='article_urls',
                   attrs=[
                       Attr(name='url',
                            selector='a',
                            func='sel_attr',
                            kws={'attr': 'href'},
示例#4
0
from modelscraper.workers import WebSource
from modelscraper.parsers import HTMLParser

uefa = ScrapeModel(
    name='eufa',
    domain='http://uefa.com',
    num_getters=2,
    phases=[
        Phase(sources=(Source(
            url="http://www.uefa.com/uefaeuro/season=2016/teams/index.html"),
                       ),
              templates=[
                  Template(name='team',
                           selector='.teams--qualified',
                           attrs=[
                               Attr(name='url',
                                    selector='a',
                                    func='sel_url',
                                    source={'active': False}),
                           ])
              ]),
        Phase(templates=[
            Template(name='player',
                     selector='.squad--team-player',
                     db_type='MongoDB',
                     db='uefa',
                     table='players',
                     attrs=[
                         Attr(name='name',
                              selector='.squad--player-name',
                              func='sel_text'),
                         Attr(name='player_url',
示例#5
0
col = db.episode

# Lazily yields one Source per listing page; range(1, 50) covers pages 1-49.
sources = (
    Source(url="http://www.luckytv.nl/afleveringen/page/{}/".format(page))
    for page in range(1, 50)
)

# Single-phase model: every 'article.video' element on a listing page becomes
# an 'episode' record with url, title and date.
LuckyTV = ScrapeModel(
    name='Lucky TV',
    domain='http://www.luckytv.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=sources,
            templates=(
                Template(
                    name='episode',
                    selector='article.video',
                    db_type='mongo_db',
                    db='lucky_tv',
                    table='episodes',
                    attrs=(
                        Attr(name='url', selector='a:nth-of-type(1)',
                             func='sel_url'),
                        Attr(name='title', selector='.video__title',
                             func='sel_text'),
                        Attr(name='date', selector='.video__date',
                             func='sel_text'),
                    ),
                ),
            ),
        ),
    ],
)
示例#6
0
# Start sources: the elements of ``binnenland`` followed by ``buitenland``.
sources = [*binnenland, *buitenland]

# Attribute definitions for a single article page.
title_attr = Attr(
    name='title',
    selector='h1',
    func='sel_text',
)
text_attr = Attr(
    name='text',
    selector='.article_body',
    func='sel_text',
)
# Reads the 'datetime' attribute of the first <time> element.
date_attr = Attr(
    name='date',
    selector='time:nth-of-type(1)',
    func='sel_attr',
    kws={'attr': 'datetime'},
)
author_attr = Attr(
    name='author',
    selector='span[itemprop="author"]',
    func='sel_text',
)
tags_attr = Attr(
    name='tags',
    selector='.tag',
    func='sel_text',
)

# Template bundling the attributes above under the name 'article'.
article = Template(
    name='article',
    attrs=(title_attr, text_attr, date_attr, author_attr, tags_attr),
)
headline_phase = Phase(
    sources=sources, n_workers=5, templates=[
        Template(
            name='headline', selector='.row', db='metronieuws',
            db_type='MongoDB', kws={'key':'url'},
            table='article_urls', attrs=[
                Attr(name='url', selector='a.shadow-block',
                        func='sel_attr', kws={'attr': 'href'},
                        source=Source(active=False, copy_attrs='category')),
                Attr(name='title', selector='h3', func='sel_text'),
                Attr(name='excerpt', selector='div > p', func='sel_text'),
示例#7
0
        # "SelectedStore":{"StoreId":27,"StoreReferenceKey":384},
        "SelectedStore": {
            "StoreId": 207,
            "StoreReferenceKey": 493
        },
        "HasSelectedStore": True,
        "AcceptedCookies": None,
        "LastViewedProducts": None
    }
}

# Template with a single attribute that takes URLs from the category tiles.
menu_template = Template(
    name='menu',
    attrs=[
        Attr(
            name='menu_item',
            selector='.c-category-tile__item',
            func='sel_url',
            # NOTE(review): 'src_template' presumably wraps the extracted URL
            # so that '?ppp=72' is appended -- confirm against modelscraper.
            source={'active': False, 'src_template': '{}?ppp=72'},
        ),
    ],
)

productmenu_template = Template(name='submenu',
                                selector='.c-product-tile',
                                attrs=[
                                    Attr(name='submenu_item',
                                         selector='.c-product-tile__meta > a',
                                         func='sel_url',
                                         source={'active': False}),
                                    Attr(name='pagination_item',
                                         selector='li.is-nexy > a',
                                         source=True)
示例#8
0
from modelscraper.parsers import HTMLParser
from pymongo import MongoClient

cl = MongoClient()

cookie = {'nl_cookiewall_version': '1'}

# Search result pages; the placeholder receives the page number.
telegraaf_url = 'http://www.telegraaf.nl/jsp/search_result_page.jsp?method=&keyword=de&pagenr={}'
# Lazily yields one Source per page number from 1 through 5000.
telegraaf_search = (
    Source(url=telegraaf_url.format(page_nr))
    for page_nr in range(1, 5001)
)

# URLs found in table-cell anchors; the source is for the next run.
calendar = Template(
    name='archive_url',
    selector='',
    attrs=(
        Attr(
            name='url',
            selector='td a',
            func='sel_url',
            source=Source(active=False),
        ),
    ),
)

# URLs found in the year list items; the source is for the next run.
year = Template(
    name='archive_url_year',
    selector='.year-list__item',
    attrs=(
        Attr(
            name='url',
            selector='a',
            func='sel_url',
            source=True,
        ),
    ),
)

article_url = Template(
    name='article_url',
示例#9
0
# Attribute definitions keyed on fields of the article payload. A tuple
# selector such as ('tags', 'name') names a nested path.
author_attr = Attr(name='author', selector='written_by', func='sel_text')
tags_attr = Attr(name='tags', selector=('tags', 'name'), func='sel_text')
category_attr = Attr(name='category', selector=('section', 'name'),
                     func='sel_text')
counters_attr = Attr(name='counters', selector='counters', func='sel_text')
intro_attr = Attr(name='excerpt', selector='intro', func='sel_text')
# Stored under its own name 'type'. Reusing 'excerpt' here would collide with
# ``intro_attr`` inside the same template.
type_attr = Attr(name='type', selector='type', func='sel_text')

# Article template written to the 'articles' table of the 'volkskrant' db.
article = Template(
    name='article',
    db='volkskrant', table='articles',
    db_type='mongo_db',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr,
        category_attr,
        counters_attr,
        intro_attr,
        type_attr
    )
)

# Endpoint for one article; the placeholder receives the article identifier.
article_url = (
    'http://vkplusmobilebackend.persgroep.net'
    '/rest/content/articles/{}'
)
# Search endpoint with an empty query and a limit of 10000.
search_url = (
    'http://vkplusmobilebackend.persgroep.net'
    '/rest/content/articles'
    '?query=&metadataNeeded=true&limit=10000'
)
# Same search endpoint with an additional offset placeholder.
next_page_url = (
    'http://vkplusmobilebackend.persgroep.net'
    '/rest/content/articles'
    '?query=&metadataNeeded=true&limit=10000&offset={}'
)

search_result = Template(
    name='search_result', db='volkskrant', table='article_urls',
    func='create',
示例#10
0
# Search URL pattern; the placeholder receives a search term.
search_url = 'https://www.rtlnieuws.nl/search/nieuws/{}'
search_terms = ['economie', 'nederland']

# Attribute definitions for a single article page.
title_attr = Attr(
    name='title',
    selector='h1',
    func='sel_text',
)
text_attr = Attr(
    name='text',
    selector='p',
    func='sel_text',
)
date_attr = Attr(
    name='date',
    selector='time',
    func='sel_text',
)
author_attr = Attr(
    name='author',
    selector='span[itemprop="author"]',
    func='sel_text',
)
tags_attr = Attr(
    name='tags',
    selector='.tag-list a.cta',
    func='sel_text',
)

# Article template scoped to the '.col__inner' element.
article = Template(
    name='article',
    selector='.col__inner',
    attrs=(title_attr, text_attr, date_attr, author_attr, tags_attr),
)

rtl= ScrapeModel(
    name='rtl', domain='http://www.rtlnieuws.nl/',
    num_getters=1, phases=[
        Phase(sources=[
            Source(url="http://www.parool.nl/archief/2012")],
            templates=(calendar, year)
            ),
        Phase(templates=(
                article_url(db_type='mongo_db', db='parool',
                            table='article_urls'),
示例#11
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.workers import WebSource
from modelscraper.parsers import HTMLParser

# Base attributes without selectors. They are called later with
# site-specific arguments to derive concrete attributes.
title = Attr(
    name='title',
    func='sel_text',
)
artist = Attr(
    name='artist',
    func='sel_text',
)
midi_url = Attr(name='midi_url')

# Base template: songs go to the 'songs' table of the 'midi' database.
song = Template(
    name='song',
    db_type='MongoDB',
    db='midi',
    table='songs',
    attrs=[title, artist, midi_url],
)

# Variant of ``song`` for freemidi.org, created by calling the base template
# with a different table, a selector and site-specific attribute selectors.
freemidi_template = song(
    table='freemidi',
    selector='#mainContent div.col-xs-12:nth-of-type(1)',
    attrs=[
        title(
            selector='li.active:nth-child(3) > a:nth-child(1) > span:nth-child(1)',
        ),
        artist(
            selector='ol.breadcrumb:nth-child(1) > li:nth-child(2) > a:nth-child(1) > span:nth-child(1)',
        ),
    ])

# Lazily yields one Source per id in range(25803). Each download page carries
# the matching 'getter' URL as a preset 'midi_url' attribute value.
freemidi_sources = (
    Source(
        url='https://freemidi.org/download-{}'.format(song_id),
        attrs=[
            midi_url(value='https://freemidi.org/getter-{}'.format(song_id)),
        ],
    )
    for song_id in range(25803)
)
示例#12
0
# Databases
test_db = Sqlite(db='test')
test_mongo = MongoDB(db='test')

# Parsers
jsonp = JSONParser()
textp = TextParser()
csvp = CSVParser()
htmlp = HTMLParser()

# Template using two selectors in sequence: a JSON selection of 'html'
# followed by an HTML selection of '.content'. It is stored in both test
# databases.
json_nested = Template(
    source=json_test,
    database=[test_db, test_mongo],
    table='tst',
    name='json_nested',
    selector=[
        jsonp.select('html'),
        htmlp.select('.content'),
    ],
    attrs=[
        Attr(
            name='url',
            func=htmlp.text(selector='h1', template='partialtest {}'),
        ),
    ],
)

html_functions = Template(source=html,
                          name='html_functions',
                          database=test_mongo,
                          table='html_test',
                          dated=True,
                          emits=html2,
                          selector=htmlp.select('html'),
                          attrs=[
                              Attr(name='table',
示例#13
0
# Base vehicle attributes without selectors. Site-specific templates call
# them with a selector to derive concrete attributes.
make = Attr(name='make', func='sel_text')
year = Attr(name='year', func='sel_text', kws={'numbers': True}, type=int)
# Parser options are passed through ``kws``, the same keyword ``year`` uses.
# The original spelling ``kw`` did not match any other Attr in this file.
mileage = Attr(name='mileage', func='sel_text', kws={'numbers': True},
               type=int)
city = Attr(name='city', func='sel_text')
url = Attr(name='url', func='sel_url')
# The variable is called ``zipcode`` but the stored attribute name is 'zip'.
zipcode = Attr(name='zip', func='sel_text')
power = Attr(name='power', func='sel_text')

# Base template for vehicle records in the 'vehicles' MongoDB database.
vehicle = Template(
    name='vehicle',
    db_type='MongoDB',
    db='vehicles',
    attrs=[vehicle_type, price, brand, make, year,
           mileage, city, url, zipcode, power],
)

autoscout_template = vehicle(
    table='autoscout', regex='var articlesFromServer = (.+)\|\|',
    attrs=[
        vehicle_type,
        price(selector='price_raw'),
        brand(selector='mk'),
        make(selector='md'),
示例#14
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
import string


start_url = Source(url='https://www.jasminedirectory.com/')

# Attributes shared by both category templates below.
sub_page_url = Attr(
    name='sub_page',
    func='sel_url',
    source={'active': False, 'parent': True},
)
sub_page_name = Attr(name='pagename', func='sel_text')

# Stored in the 'maincats' table.
category_temp = Template(
    name='sub_category',
    selector='li strong a:nth-of-type(1)',
    db='jasminedirectory',
    table='maincats',
    db_type='mongo_db',
    attrs=(sub_page_url, sub_page_name),
)

# Same selector and attributes as above, stored in the 'subcats' table.
category_temp2 = Template(
    name='sub_page',
    selector='li strong a:nth-of-type(1)',
    db='jasminedirectory',
    table='subcats',
    db_type='mongo_db',
    attrs=(sub_page_url, sub_page_name),
)


website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')
示例#15
0
 name='npo_tv_programs',
 domain='http://npo.nl',
 num_getters=2,
 phases=[
     Phase(
         source_worker=WebSource,
         parser=HTMLParser,
         sources=[Source(url=series_url.format(i)) for i in range(0, 242)],
         templates=(
             Template(
                 name='program',
                 selector='.content-column.quarter',
                 db_type='mongo_db',
                 db='npo_tv_programs',
                 table='programs',
                 attrs=(
                     Attr(name='title', selector='h3', func='sel_text'),
                     Attr(name='url',
                          selector='a.full-link',
                          func='sel_url',
                          source=Source(
                              active=False)),  # source is for next run
                 )),
             Template(name='next_url'),
         )),
     Phase(
         source_worker=WebSource,
         parser=HTMLParser,
         templates=(
             Template(
                 name='episodes',
                 selector='.item-list.item-container div.item',
示例#16
0
    ('options[]', 'use_cached_data_only'),
    ('action', 'View+Complete+Tracking+History')])
         for i in range(5000, 50000000)),
templates=[
    Template(
        name='shipment',
        selector=None,
        db='shipments',
        db_type='MongoDB',
        table='shipment',
        attrs=[
            Attr(
                name='carrier',
                selector=
                '#large_shipment_info_box > div:nth-child(2) > div:nth-child(1)',
                func='sel_text',
                kws={'regex': 'Carrier:\s(\w+)'}),
            Attr(
                name='shipped_to',
                selector=
                '#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(1) .align_left',
                func='sel_text'),
            Attr(
                name='shipped_from',
                selector=
                '#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(2) .align_left',
                func='sel_text'),
        ]),
    Template(name='event',
             selector='table tr:not(:nth-child(1))',
             db_type='MongoDB',
             db='shipments',
示例#17
0
from modelscraper.components import ScrapeModel, Template, Attr
from modelscraper.sources import WebSource

# Base article attributes. Only ``pictures`` carries a selector here.
text = Attr(name='text', func='sel_html')
title = Attr(name='title', func='sel_text')
# Reads the 'src' attribute of <img> elements.
pictures = Attr(name='pictures',
                func='sel_attr',
                selector='img',
                kws={'attr': 'src'})
date = Attr(name='date', func='sel_text')
related = Attr(name='related', func='sel_url')
author = Attr(name='author', func='sel_text')
# Stored under its own name 'tags'. Reusing 'author' here would collide with
# the ``author`` attribute inside the same template.
tags = Attr(name='tags', func='sel_text')

# Base article template targeting the 'news' MongoDB database.
article = Template(name='article',
                   attrs=(text, title, date, author, tags, pictures, related),
                   db='news',
                   db_type='MongoDB')

article_url = Attr(name='url', func='sel_url')

# The list template's URL attribute emits into this source object.
tweakers_article_source = WebSource()
tweakers_list = Template(
    selector='',
    attrs=[
        article_url(selector='', emits=tweakers_article_source),
    ],
)

tweakers = article(source=tweakers_article_source,
                   table='tweakers.net',
                   selector='#contentArea',
                   attrs=(
                       text(selector='.article p'),
                       title(selector='h1'),
示例#18
0
import re
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import JSONParser
import vehicle


city = Attr(name='city', func='sel_text')
zipcode = Attr(name='zipcode', func='sel_text')

# Template combining every attribute from ``vehicle.attrs`` with the two
# location attributes defined above.
occasion = Template(
    name='occasion',
    db_type='mongo_db',
    attrs=[
        *vehicle.attrs,
        city,
        zipcode,
    ],
)


autotrader_template = vehicle(
    table='autotrader', selector='.result',
    attrs=[
        brand(selector='h2', kws={'regex': '(^\w+)'}),
        make(selector='h2', kws={'regex': '^\w+ (.*)'}),
        price(selector='.result-price-label'),
        year(selector='.col-left',kws={'regex': '\w{3} (\d{4})'}),
        mileage(selector='.col-left', kws={'regex': '(.*) km'}),
        url(selector='a.tracker'),
        #Attr(name='dealer_name', selector='.dealer-info div', func=sel_text),
        city,
        zipcode,
        power,
示例#19
0
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient

cl = MongoClient()
# One subtitle Source per stored episode. The last path segment of the
# episode's first URL is appended to the tt888 base URL, and the episode's
# full 'url' value is attached so the result can be matched back to it.
urls = (
    Source(
        url='https://tt888.omroep.nl/tt888/' + episode['url'][0].rsplit('/', 1)[-1],
        attrs=(Attr(name='url', value=episode['url']), ),
    )
    for episode in cl.dwdd.episodes.find()
)

# Single-phase model: the fetched text is stored as 'subtitles' by updating
# the 'episodes' table with 'url' as the key.
subtitles = ScrapeModel(
    name='subtitles',
    domain='https://tt888.omroep.nl/',
    phases=[
        Phase(
            n_workers=5,
            sources=urls,
            parser=TextParser,
            templates=(
                Template(
                    name='subtitle',
                    db_type='MongoDB',
                    db='dwdd',
                    table='episodes',
                    func='update',
                    kws={'key': 'url'},
                    attrs=(
                        Attr(name='subtitles', func='sel_text'),
                    ),
                ),
            ),
        ),
    ],
)
# Dropping the name is safe: the generator above evaluated ``find()`` when it
# was created, so it already holds its own reference to the cursor.
del cl
d = Dispatcher()
d.add_scraper(subtitles)
d.run()
示例#20
0
from modelscraper.components import Phase, Template, Attr
from modelscraper.sources import BaseSourceWorker, ProgramSource
from modelscraper.parsers import TextParser
import dns.resolver
import dns.query
import dns.zone

# Template that pulls four dot-separated digit groups out of text output.
# ``db`` and ``table`` are empty strings here.
# NOTE(review): presumably they are filled in by whoever reuses this
# template -- confirm against callers.
ip_template = Template(
    name='ip',
    db_type='MongoDB',
    db='',
    table='',
    parser=TextParser,
    attrs=(
        Attr(
            name='ip',
            func='sel_text',
            # Raw string: '\d' and '\.' are regex escapes, not Python string
            # escapes. A plain literal triggers an invalid-escape warning.
            kws={'regex': r'(\d+\.\d+\.\d+\.\d+)'},
        ),
    ),
)

# Phase with 10 workers whose source worker is built around 'host {}'.
ip_phase = Phase(
    n_workers=10,
    templates=[ip_template],
    source_worker=ProgramSource(function='host {}'),
)

port_template = Template(name='ports',
                         selector='port',
                         db_type='MongoDB',
                         db='monog',
                         table='ports',
                         attrs=(Attr(name='portnumber',
                                     func='sel_attr',
                                     kws={'attr': 'portid'}),
                                Attr(name='state',
示例#21
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
import string


start_url = Source(url='http://www.startpagina.nl/dochters/')

# Link and label of each sub page.
sub_page_url = Attr(
    name='sub_page',
    func='sel_url',
    source={'active': False, 'parent': True},
)
sub_page_name = Attr(name='pagename', func='sel_text')

# Sub pages are taken from '.sections a' and stored in 'subpages'.
category_temp = Template(
    name='sub_page',
    selector='.sections a',
    db='startpagina',
    table='subpages',
    db_type='mongo_db',
    attrs=(sub_page_url, sub_page_name),
)

website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')

# Websites are taken from '#columns a' and stored in 'websites'.
website_temp = Template(
    name='website',
    selector='#columns a',
    db='startpagina',
    table='websites',
    db_type='mongo_db',
    attrs=(website_url, website_name),
)

model = ScrapeModel(name='startpagina', domain='http://www.startpagina.nl',
                    phases=[
示例#22
0
# Attribute definitions for a single article page.
title_attr = Attr(
    name='title',
    selector='h1',
    func='sel_text',
)
text_attr = Attr(
    name='text',
    selector='p',
    func='sel_text',
)
date_attr = Attr(
    name='date',
    selector='.published span.small',
    func='sel_text',
)
author_attr = Attr(
    name='author',
    selector='span.author',
    func='sel_text',
)
tags_attr = Attr(
    name='tags',
    selector='.article.tags a span',
    func='sel_text',
)

# Article template stored in the 'articles' table of the 'nu_nl' database.
article = Template(
    name='article',
    selector='.column-content-background',
    db='nu_nl',
    db_type='mongo_db',
    table='articles',
    attrs=(title_attr, text_attr, date_attr, author_attr, tags_attr),
)

next_page = Template(
    name='next_page',
    selector='paging',
    attrs=(
        Attr(name='paging',

ad = ScrapeModel(
    name='ad.nl', domain='mobileapi.ad.nl', phases=[
示例#23
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.sources import WebSource
from modelscraper.parsers import HTMLParser
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

motorparts = ScrapeModel(
    name='motorparts', domain='http://www.2wheelpros.com', num_sources=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser,
            sources=(Source(url='http://www.2wheelpros.com/oem-parts/'),),
            templates=(
                Template(name='brand',
                         selector='#nav > ul > li:nth-of-type(1) > a', attrs=(
                             Attr(name='url', func='sel_url',
                                  source={'active': False}),
                         ),),)
            ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='year', selector='.yearlink', attrs=(
                Attr(name='url', func='sel_url', source={'active': False}),)),),
          ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='model', selector='.modellink', attrs=(
                Attr(name='url', func='sel_url', source={'active': False}),
            )
            ),
        ),
        ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='partCategory', db='motorparts', db_type='MongoDB',
                     table='part_categories', source={'active':False,
示例#24
0
from modelscraper.parsers import JSONParser, TextParser
import datetime


# Current local date as an eight-character 'YYYYMMDD' string.
now = datetime.datetime.now().strftime('%Y%m%d')

JSON_URL = 'https://static.nvd.nist.gov/feeds/json/cve/1.0/nvdcve-1.0-{}.json.zip'
# This second assignment is the one that takes effect: feeds are read from a
# server on port 8000 instead of the NVD URL assigned just above.
JSON_URL = 'http://0.0.0.0:8000/nvdcve-1.0-{}.json'

# NOTE(review): range() excludes its stop value, so the current year is not
# part of ``years`` -- confirm this is intended.
years = range(2002, datetime.datetime.now().year)
# For a single-feed run: years = [2002]

# Lazily yields one Source per year, reading the 'CVE_Items' key of the feed.
cve_source = (
    Source(url=JSON_URL.format(feed_year),
           compression='',
           json_key='CVE_Items')
    for feed_year in years
)

# Template storing the text captured after 'lastModifiedDate:' in the
# 'cve_meta' table. Its attribute is wired to ``cve_source``.
meta_template = Template(
    name='meta',
    db='defcon',
    table='cve_meta',
    db_type='MongoDB',
    attrs=[
        Attr(
            name='last_modified',
            func='sel_text',
            kws={'regex': 'lastModifiedDate:(.*)'},
            source=cve_source,
        ),
    ],
)

cve_template = Template(
    name='meta', db='defcon', table='cve', db_type='MongoDB', #func='update',
    #kws={'key': 'id'},
    attrs=[
        Attr(name='id', func='sel_text',
             selector=['cve', 'CVE_data_meta', 'ID']),
        Attr(name='cpes', func='sel_text',
             selector=['configurations', 'nodes', 'cpe', 'cpe23Uri']),
        Attr(name='affects', func='sel_dict',
             selector=['cve', 'affects']),
        Attr(name='problem_type', func='sel_text',
             selector=['cve', 'problemtype', 'problemtype_data', 'description',
示例#25
0
cl = MongoClient()
# One Source per stored episode-URL document, built from element 0 of the
# document's 'url' value.
sources = (
    Source(url=doc['url'][0])
    for doc in cl.dwdd.episode_urls.find()
)

# Phase over the A-Z programme listing; range(0, 1) yields page 0 only.
programs_az = Phase(
    sources=[
        Source(url="http://www.npo.nl/programmas/a-z", params={'page': page})
        for page in range(0, 1)
    ],
    templates=(
        Template(
            name='program',
            selector='.content-column.quarter',
            db_type='mongo_db',
            db='npo_tv_programs',
            table='programs',
            attrs=(
                Attr(name='title', selector='h3', func='sel_text'),
                # source is for next run
                Attr(name='url', selector='a.full-link', func='sel_url',
                     source=Source(active=False)),
            ),
        ),
    ),
)

# Broadcast search URL; the placeholder receives the 'start' offset.
nos_search = (
    'https://www.npo.nl/de-wereld-draait-door/VARA_101377717/search'
    '?media_type=broadcast&start_date=&end_date=&start={}&rows=100'
)
episodes_phase = Phase(n_workers=5,
                       sources=(Source(url=nos_search.format(start))
                                for start in range(0, 2194, 100)),
                       templates=(Template(
                           name='episodes',
                           selector='.list-item',
                           db_type='mongo_db',
示例#26
0
    'buitenland', 'binnenland', 'economie', 'algemeen', 'tech', 'sport'
]
# Lazily yields one Source per (section, offset) pair, with offsets stepping
# by 20 from 0 up to but not including 200000. The section is attached as a
# one-element 'category' list.
sources = (
    Source(
        url=base_url.format(section=section, offset=offset),
        copy_attrs=['category'],
        attrs=[Attr(name='category', value=[section])],
    )
    for section in sections
    for offset in range(0, 200000, 20)
)

# One record per <li>: article URL, title and excerpt, stored in the
# 'article_urls' table.
headline = Template(
    name='headline',
    selector='li',
    db='nu_nl',
    db_type='MongoDB',
    table='article_urls',
    attrs=[
        Attr(
            name='url',
            selector='a',
            func='sel_url',
            source={'active': False, 'copy_attrs': 'category'},
        ),
        Attr(name='title', selector='.title', func='sel_text'),
        Attr(name='excerpt', selector='.excerpt', func='sel_text'),
    ],
)

# Attribute definitions for a single article page.
title_attr = Attr(
    name='title',
    selector='h1',
    func='sel_text',
)
text_attr = Attr(
    name='text',
    selector='p',
    func='sel_text',
)
date_attr = Attr(
    name='date',
    selector='.published span.small',
    func='sel_text',
)
author_attr = Attr(
    name='author',
    selector='span.author',
    func='sel_text',
)
示例#27
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient


cl = MongoClient()
# One subtitle Source per stored episode. The last path segment of the
# episode's first URL is appended to the tt888 base URL, and the episode's
# full 'url' value is attached so the result can be matched back to it.
urls = (
    Source(
        url='https://tt888.omroep.nl/tt888/' + episode['url'][0].rsplit('/', 1)[-1],
        attrs=(Attr(name='url', value=episode['url']), ),
    )
    for episode in cl.nos_journaal.episodes.find()
)

# Single-phase model: the fetched text is stored as 'subtitles' by updating
# the 'episodes' table with 'url' as the key.
subtitles = ScrapeModel(
    name='subtitles',
    domain='https://tt888.omroep.nl/',
    phases=[
        Phase(
            n_workers=5,
            sources=urls,
            parser=TextParser,
            templates=(
                Template(
                    name='subtitle',
                    db_type='mongo_db',
                    db='nos_journaal',
                    table='episodes',
                    func='update',
                    kws={'key': 'url'},
                    attrs=(
                        Attr(name='subtitles', func='sel_text'),
                    ),
                ),
            ),
        ),
    ],
)
# Dropping the name is safe: the generator above evaluated ``find()`` when it
# was created, so it already holds its own reference to the cursor.
del cl
d = Dispatcher()
d.add_scraper(subtitles)
d.run()
示例#28
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.sources import ProgramSource

# One record per 'port' element: its 'portid' attribute, the 'state'
# attribute of the child 'state' element and the 'name' attribute of the
# child 'service' element.
port_template = Template(
    name='ports',
    selector='port',
    db_type='mongo_db',
    db='ports',
    table='ports',
    attrs=(
        Attr(name='portnumber', func='sel_attr', kws={'attr': 'portid'}),
        Attr(name='state', selector='state', func='sel_attr',
             kws={'attr': 'state'}),
        Attr(name='service', selector='service', func='sel_attr',
             kws={'attr': 'name'}),
    ),
)

# The source 'url' is a command line handed to ProgramSource rather than a
# web address; '-oX -' makes nmap write XML to standard output.
nmap = ScrapeModel(
    name='nmap_test',
    domain='',
    phases=[
        Phase(
            sources=(Source(url='nmap -oX - duwo.multiposs.nl'), ),
            templates=[port_template],
            source_worker=ProgramSource,
        ),
    ],
)

disp = Dispatcher()
disp.add_scraper(nmap)
示例#29
0
from pymongo import MongoClient

cl = MongoClient()
# Every stored product-URL document becomes a Source built from element 0 of
# its 'url' value. The list is built eagerly at import time.
product_sources = cl.makro.product_urls.find()
sources = [
    Source(url=product['url'][0])
    for product in product_sources
]

# Phase over the product overview page: each category link in the left
# navigation is stored in 'product_categories'.
categories = Phase(
    sources=(Source(url="https://www.makro.nl/cat/nl/products"), ),
    templates=(
        Template(
            name='product_category',
            selector='#left-navigation-container ul.vertical > li > a',
            db_type='mongo_db',
            db='makro',
            table='product_categories',
            attrs=[
                Attr(
                    name='url',
                    func='sel_url',
                    source={'active': False},
                    kws={
                        # Raw string: '\d' is a regex escape, not a Python
                        # string escape. A plain literal triggers an
                        # invalid-escape warning.
                        # NOTE(review): presumably the matched page size is
                        # replaced by 'substitute' -- confirm against
                        # modelscraper.
                        'replacers': r'pageSize=(\d+)',
                        'substitute': 'pageSize=96',
                    },
                ),
            ],
        ),
    ),
)

product_lists = Phase(templates=[
    Template(name='product_urls',
             selector='.product-list .product-tiles',
             db_type='mongo_db',
             db='makro',
             table='product_urls',
             attrs=[
示例#30
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

dabanga = ScrapeModel(
    name='dabanga',
    domain='https://www.dabangasudan.org/en',
    num_getters=2,
    phases=[
        Phase(
            sources=(Source(url="https://www.dabangasudan.org/en/all-news"), ),
            templates=(
                Template(name='article_url',
                         selector='.list-item.news-item-small',
                         db_type='mongo_db',
                         db='dabanga',
                         table='article_urls',
                         attrs=[
                             Attr(name='url',
                                  selector='a:nth-of-type(1)',
                                  func='sel_url',
                                  source={'active': False}),
                         ]),
                Template(name='pagination',
                         selector='.pager',
                         attrs=[
                             Attr(name='url',
                                  selector='a',
                                  func='sel_url',
                                  source=True),
                         ]),
            )),
        Phase(synchronize=True,