示例#1
0
 # Listing phase: WebSource is the source worker and HTMLParser the parser
 # for the single experience-list URL below.
 Phase(
     source_worker=WebSource,
     parser=HTMLParser,
     sources=[
         Source(
             url=
             "https://www.erowid.org/experiences/exp.cgi?ShowViews=1&Cellar=0&Start=0&Max=24777"
         )
     ],
     # One 'report_url' template per `.exp-list-table tr` row; every attr
     # below reads one cell of that row by its column position.
     templates=(Template(name='report_url',
                         selector='.exp-list-table tr',
                         source={
                             'active': False,
                             'copy_attrs': True
                         },
                         attrs=(
                             # 'url' and 'title' share the link in column 2.
                             Attr(name='url',
                                  selector='td:nth-of-type(2) a',
                                  func='sel_url'),
                             Attr(name='title',
                                  selector='td:nth-of-type(2) a',
                                  func='sel_text'),
                             # The rating is taken from the image's alt text.
                             Attr(name='rating',
                                  selector='td:nth-of-type(1) img',
                                  func='sel_attr',
                                  kws={'attr': 'alt'}),
                             Attr(name='author',
                                  selector='td:nth-of-type(3)',
                                  func='sel_text'),
                             # Presumably '&' is replaced by ',' before the
                             # regex is applied -- TODO confirm against
                             # sel_text's handling of these keywords.
                             Attr(name='substances',
                                  selector='td:nth-of-type(4)',
                                  func='sel_text',
                                  kws={
                                      'replacers':
                                      '&',
                                      'substitute':
                                      ',',
                                      # Raw string: `\-` and `\s` are not
                                      # valid Python string escapes, so a
                                      # plain literal triggers an "invalid
                                      # escape sequence" warning. The
                                      # pattern text itself is unchanged.
                                      # NOTE(review): `A-z` also spans the
                                      # ASCII characters between 'Z' and
                                      # 'a' ([ \ ] ^ _ `); confirm whether
                                      # `A-Za-z` was intended.
                                      'regex':
                                      r'([A-z0-9\-]+\s*[A-z0-9\-*\s]*)'
                                  }),
                             Attr(name='date',
                                  selector='td:nth-of-type(5)',
                                  func='sel_text'),
                             Attr(name='views',
                                  selector='td:nth-of-type(6)',
                                  func='sel_text'),
                         )), )),
示例#2
0
 # Phase over the forum board index: WebSource is the source worker and
 # HTMLParser the parser for the single start URL below.
 Phase(
     source_worker=WebSource,
     parser=HTMLParser,
     sources=[
         Source(
             url="https://ghostlyhaks.com/forum/rom-eeprom-bios-efi-uefi"
         )
     ],
     templates=(
         # One 'forum_post' per `.kbody tr` row, configured with db_type
         # 'mongo_db', db 'efi_dumps' and table 'forum_post'.
         Template(
             name='forum_post',
             selector='.kbody tr',
             db_type='mongo_db',
             db='efi_dumps',
             table='forum_post',
             attrs=(
                 Attr(name='url',
                      selector='a.ktopic-title',
                      func='sel_url',
                      source=Source(
                          active=False)),  # source is for next run
                 # 'user' and 'user_url' read the same `.kwho-user` element,
                 # once as text and once as a URL.
                 Attr(name='user',
                      selector='.kwho-user',
                      func='sel_text'),
                 Attr(name='user_url',
                      selector='.kwho-user',
                      func='sel_url'),
             )),
         # Pagination links. Unlike the 'forum_post' url, this Source() is
         # created without active=False.
         # NOTE(review): presumably that makes the link followed within this
         # phase -- confirm against Source's defaults.
         Template(
             name='next_page',
             selector='.kpagination',
             attrs=[
                 Attr(name='url',
                      selector='a',
                      func='sel_url',
                      source=Source())  # source is for next run
             ]),
     )),
示例#3
0

cl = MongoClient()
db = cl.nytimes
col = db.menu

nytimes = ScrapeModel(name='nytimes', domain='https://www.nytimes.com/',
    num_getters=2, phases=[

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="https://www.nytimes.com/")],
        templates=(
            Template(
                name='menu', selector='#site-index-navigation li',
                db_type='MongoDB', db='nytimes', table='menu',
                attrs=(
                    Attr(name='url', selector='a', func='sel_url',
                        source=Source(active=False)), # source is for next run
                )
            ),
        )
    ),

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="https://www.nytimes.com/")],
        templates=(
            Template(
                name='articlelist', selector='',
                db_type='MongoDB', db='nytimes', table='articles',
                attrs=(
                    Attr(name='title', selector='h1', func='sel_text'),
belgian_parlement_roles = ScrapeModel(
    name='belgian_parlement_roles',
    domain='https://fr.wikipedia.org/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(
                    url=
                    "https://fr.wikipedia.org/wiki/Liste_des_gouvernements_de_la_Belgique"
                )
            ],
            templates=(
                Template(
                    name='government',
                    selector='.wikitable tr td:nth-of-type(2)',
                    attrs=(
                        Attr(name='url',
                             selector='a',
                             func='sel_url',
                             source=Source(
                                 active=False)),  # source is for next run
                    )), )),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              templates=(Template(name='government',
                                  selector='table:nth-of-type(1) tr',
                                  db_type='mongo_db',
                                  db='belgian_politics',
示例#5
0
 # Phase over five listing pages, one per category. Each Source carries a
 # fixed 'sex' Attr naming its category, and every URL asks for
 # pagesize=5000.
 Phase(
     source_worker=WebSource,
     parser=HTMLParser,
     sources=[
         Source(
             url=
             'http://www.kinky.nl/sex-afspraken/mannen/default.aspx?pagesize=5000',
             attrs=[Attr(name='sex', value='man')]),
         Source(
             url=
             'http://www.kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=5000',
             attrs=[Attr(name='sex', value='vrouw')]),
         Source(
             url=
             'http://www.kinky.nl/sex-afspraken/transsexuelen/default.aspx?pagesize=5000',
             attrs=[Attr(name='sex', value='trans')]),
         Source(
             url=
             'http://www.kinky.nl/sex-afspraken/stellen/default.aspx?pagesize=5000',
             attrs=[Attr(name='sex', value='stellen')]),
         Source(
             url=
             'http://www.kinky.nl/sex-afspraken/gay/default.aspx?pagesize=5000',
             attrs=[Attr(name='sex', value='gay')]),
     ],
     templates=[
         # One 'advert' per direct child div of #advertenties, configured
         # with db_type 'mongo_db', db 'kinky' and table 'adds'.
         Template(name='advert',
                  selector='#advertenties > div',
                  db_type='mongo_db',
                  db='kinky',
                  table='adds',
                  attrs=[
                      # The regex captures whatever follows the Dutch label
                      # "Mijn telefoonnummer: " ("My phone number: ").
                      # NOTE(review): 'debug': True looks like a leftover
                      # debugging switch -- confirm it is still wanted.
                      Attr(name='phone',
                           selector='.quickinfo > span',
                           func='sel_text',
                           kws={
                               'children': True,
                               'debug': True,
                               'regex': 'Mijn telefoonnummer: (.*)'
                           }),
                      Attr(name='city',
                           selector='.quickinfo span.country',
                           func='sel_text'),
                      # Reads the raw href attribute via sel_attr rather
                      # than using sel_url.
                      Attr(name='url',
                           selector='.advertentie_kop a',
                           func='sel_attr',
                           kws={'attr': 'href'})
                  ])
     ]),
示例#6
0
southpark = ScrapeModel(
    name='southpark',
    domain='http://southpark.cc.com/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="http://southpark.cc.com/")],
            templates=(
                Template(
                    name='video',
                    selector='',
                    db_type='mongo_db',
                    db='southpark',
                    table='video',
                    attrs=(
                        Attr(name='url',
                             selector='a',
                             func='sel_url',
                             source=Source(
                                 active=False)),  # source is for next run
                        Attr(name='title', selector='h1', func='sel_text'),
                        Attr(name='text', selector='p', func='sel_text'),
                    )), )),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="http://southpark.cc.com/")],
            templates=(
                Template(
示例#7
0
    # Phase over the "videos" page of six YouTube channels/users.
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url='https://www.youtube.com/user/ozzymanreviews/videos'),
        Source(url='https://www.youtube.com/user/Draadstal/videos'),
        Source(url='https://www.youtube.com/channel/UCQMs9pijXYAdqvkEMJyCM4g/videos'),
        Source(url='https://www.youtube.com/channel/UCi1LpRIlG1tDY5Z54VTel2w/videos'),
        Source(url='https://www.youtube.com/user/vpro/videos'),
        Source(url='https://www.youtube.com/user/nprmusic/videos'),
    ],
        templates=(
            # One 'channel_videos' record per `li.channels-content-item`,
            # configured with db_type 'mongo_db', db 'youtube_channel' and
            # table 'channel_videos'.
            Template(
                name='channel_videos', selector='li.channels-content-item',
                db_type='mongo_db', db='youtube_channel', table='channel_videos',
                attrs=[
                    Attr(name='url', selector='h3.yt-lockup-title a', func='sel_url'),

                    Attr(name='title', selector='h3', func='sel_text'),

                    # The regex captures the text before " weergaven" (Dutch
                    # for "views"), so it only matches the Dutch-language
                    # page. 'numbers': True presumably converts the capture
                    # to a number -- TODO confirm.
                    Attr(name='views', selector='.yt-lockup-meta-info', func='sel_text',
                         kws={'regex': '(.*) weergaven', 'numbers': True}),
                ]
            ),
            # "Load more" button: the attr has no selector of its own and
            # reads `data-uix-load-more-href`. The resulting Source wraps
            # that value in 'http://youtube.com{}' and names two JSON keys.
            # NOTE(review): presumably the response is JSON and those keys
            # hold the HTML to parse -- confirm against Source.json_key.
            Template(
                name='next_videos', selector='.browse-items-load-more-button',
                attrs=[
                    Attr(name='url', func='sel_attr',
                         kws={'attr': 'data-uix-load-more-href'},
                         source=Source(src_template='http://youtube.com{}',
                                       json_key=['content_html', 'load_more_widget_html']))
                ]),
        )
    ),
示例#8
0
cl = MongoClient()
db = cl.headlines
col = db.category

headlines = ScrapeModel(name='headlines', domain='http://www.headlines24.nl/',
    num_getters=2, phases=[
    
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="http://www.headlines24.nl/")],
        templates=(
            Template(
                name='category', selector='',
                db_type='mongo_db', db='headlines', table='category',
                attrs=(
                    Attr(name='url', selector='a', func='sel_url',
                        source=Source(active=False)), # source is for next run

                    Attr(name='title', selector='h1', func='sel_text'),

                    Attr(name='text', selector='p', func='sel_text'),
                )
            ),
        )
    ),
    
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="http://www.headlines24.nl/")],
        templates=(
            Template(
                name='category', selector='',
                db_type='mongo_db', db='headlines', table='category',
示例#9
0
from workers import WebSource
from parsers import HTMLParser
import string
from components import ScrapeModel, Source, Phase, Attr, Template

paradiso = ScrapeModel(
    name='paradiso',
    domain='https://paradiso.nl',
    phases=[
        Phase(source_worker=WebSource,
              sources=[Source(url='https://paradiso.nl/web/Agenda.htm')],
              parser=HTMLParser,
              templates=[
                  Template(name='event_link',
                           selector='a.event-link',
                           attrs=[
                               Attr(name='url',
                                    func='sel_attr',
                                    kws={'attr': 'href'},
                                    source={'active': False})
                           ])
              ]),
        Phase(templates=[
            Template(name='event',
                     db_type='MongoDB',
                     db='paradiso',
                     table='events',
                     attrs=[
                         Attr(name='name',
                              selector='meta[name=evenementts]',
                              func='sel_attr',
示例#10
0
today = datetime.datetime.now().year
print(today)

volkskrant = ScrapeModel(
    name='volkskrant',
    domain='http://www.volkskrant.nl/',
    num_getters=2,
    cookies={'nl_cookiewall_version': '1'},
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(url="http://www.volkskrant.nl/archief/{}".format(year))
                for year in range(1987, today)
            ],
            templates=(Template(name='day_url',
                                selector='td',
                                attrs=(Attr(
                                    name='url',
                                    selector='a',
                                    func='sel_url',
                                    source=Source(active=False)), )), )),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                Template(name='article_url',
                         selector='article',
                         attrs=(Attr(name='url',
                                     selector='a',
                                     func='sel_url',
示例#11
0
文件: namen.py 项目: 0xh/modelscraper
 name='namen', domain='http://www.meertens.knaw.nl/', num_getters=2, phases=[
 Phase(source_worker=WebSource, parser=HTMLParser, sources=(
     Source(url="http://www.meertens.knaw.nl/nvb/naam/begintmet/" + l)
                 for l in ['Aad']), # string.ascii_lowercase),
     templates=[
         Template(
             name='name', selector='tr.data',
             db_type='mongo_db', db='names', table='name_count_test',
             attrs=[
                 Attr(name='name', selector='td:nth-of-type(1)',
                             func='sel_text'),
                 Attr(name='men', selector='td:nth-of-type(2)',
                             func='sel_text', kws={'numbers': True}),
                 Attr(name='women', selector='td:nth-of-type(3)',
                             func='sel_text', kws={'numbers': True}),
                 Attr(name='url', selector='td:nth-of-type(1) a',
                             func='sel_attr', kws={'attr': 'href'},
                             source={'active': False},
                             source_condition={'women': '> 50',
                                               'men': '> 50'}),
             ]
         ),
         Template(
             name='next_url', selector='.right',
             attrs=[
                 Attr(name='next', selector='abc',  func='sel_attr',
                             kws={'attr': 'href'}, source={'active': True}),
             ])
         ]
 ),
 Phase(source_worker=WebSource, parser=HTMLParser, templates=[
示例#12
0
from components import ScrapeModel, Phase, Template, Attr, Source
from workers import WebSource
from parsers import HTMLParser

pornstars = ScrapeModel(
    name='pornhub_pornstars',
    domain='http://pornhub.com',
    num_getters=2,
    phases=[
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              sources=[Source(url='http://www.pornhub.com/pornstars?o=a')],
              templates=[
                  Template(name='alphabet',
                           selector='.alphabetFilter .dropdownWrapper li',
                           attrs=[
                               Attr(name='url',
                                    selector='a',
                                    func='sel_url',
                                    source=Source(active=False))
                           ])
              ]),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              templates=[
                  Template(name='pornstar',
                           selector='.pornstarIndex li',
                           db_type='MongoDB',
                           db='pornstars',
                           collection='ranking',
                           attrs=[
示例#13
0
from pymongo import MongoClient


# Scrape model for the signature list of one petities.nl petition: a single
# phase with a pagination template and a per-signature template.
petitions = ScrapeModel(
    name='petitions',
    domain='https://petities.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=(
                Source(
                    url="https://petities.nl/petitions/borstkankeronderzoek-vervroegen/signatures?locale=nl",
                ),
            ),
            templates=[
                # Pagination links inside the navigation bar; the url attr
                # is flagged with source=True.
                Template(
                    name='next_page',
                    selector='.navigation-bar .navigation-bar',
                    attrs=[
                        Attr(
                            name='url',
                            selector='a',
                            func='sel_url',
                            kws={'attr': 'href'},
                            source=True,
                        ),
                    ],
                ),
                # Signature fields, configured with db_type 'mongo_db',
                # db 'petitions' and table 'borstkanker'.
                Template(
                    name='signature',
                    selector='.petition-signature-list',
                    db_type='mongo_db',
                    db='petitions',
                    table='borstkanker',
                    attrs=[
                        Attr(
                            name='name',
                            selector='.petition-signature-name',
                            func='sel_text',
                        ),
                        Attr(
                            name='time',
                            selector='.signature-time',
                            func='sel_text',
                        ),
                        Attr(
                            name='location',
                            selector='.petition-signature-location',
                            func='sel_text',
                        ),
                        Attr(
                            name='occupation',
                            selector='.petition-signature-occupation',
                            func='sel_text',
                        ),
                    ],
                ),
            ],
        ),
    ],
)

# Register the petitions model with a Dispatcher and start it. This runs at
# module level, i.e. whenever the file is executed or imported.
# NOTE(review): no import of Dispatcher is visible in this snippet -- confirm
# it is imported elsewhere in the file.
ds = Dispatcher()
ds.add_scraper(petitions)
ds.run()
示例#14
0
 # Phase with one Source per (season, episode) pair: seasons 1-9 and
 # episodes 1-29, as produced by the two range() calls below.
 Phase(
     source_worker=WebSource,
     parser=HTMLParser,
     sources=[
         Source(url="http://watchtheofficeonline.com/s{}e{}".format(
             season, episode)) for season in range(1, 10)
         for episode in range(1, 30)
     ],
     # The 'episode' template is configured with db_type 'shell_command';
     # presumably the command below is run once per scraped record with
     # {season}, {episode} and {url} filled in from the attrs -- TODO confirm.
     templates=(Template(
         name='episode',
         selector='.so-panel.widget.widget_siteorigin-panels-builder',
         db_type='shell_command',
         db='theoffice',
         table='season',
         kws={
             # '&&' instead of '&': a single '&' runs mkdir in the
             # background, so youtube-dl could start before the directory
             # exists and would run even if mkdir failed.
             # NOTE(review): the directory is created under `filepath`, but
             # the download goes to the hard-coded /mnt/Movies -- confirm
             # that both refer to the same location.
             # NOTE(review): scraped values end up in a sudo shell command;
             # confirm the framework quotes/escapes them.
             'command':
             'sudo mkdir -p ' + filepath + '/{season}/ &&' +
             ' sudo youtube-dl -o /mnt/Movies/{season}/{episode} {url}'
         },
         attrs=(
             # Two funcs with a parallel list of keyword dicts: sel_url gets
             # no extra keywords, sel_text gets the 'needle' pattern.
             Attr(name='url',
                  selector='a',
                  func=['sel_url', 'sel_text'],
                  kws=[{}, {
                      'needle': r'.*(s\d+e\d+)'
                  }]),
             # 'episode' and 'season' read the same `.textwidget` selector
             # at different indexes (3 and 1).
             Attr(name='episode',
                  selector='.textwidget',
                  func='sel_text',
                  kws={
                      'index': 3,
                      'substitute': '_',
                      'replacers': ' '
                  }),
             Attr(name='season',
                  selector='.textwidget',
                  func='sel_text',
                  kws={
                      'index': 1,
                      'replacers': ' '
                  }),
         )), )),
# Handles for the 'gsmhelpdesk_nummerreeksen' database and its
# 'number_range' collection, obtained from a MongoClient created with
# default arguments.
# NOTE(review): `cl`, `db` and `col` are not referenced in the visible part
# of this script -- confirm they are still needed.
cl = MongoClient()
db = cl.gsmhelpdesk_nummerreeksen
col = db.number_range

# Scrape model for the number-range table on gsmhelpdesk.nl: one phase, one
# template, two numeric columns per table row.
gsmhelpdesk_nummerreeksen = ScrapeModel(
    name='gsmhelpdesk_nummerreeksen',
    domain='http://www.gsmhelpdesk.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(
                    url="http://www.gsmhelpdesk.nl/helpdesk/30/nummerreeksen",
                ),
            ],
            templates=(
                # One 'number_range' record per `tr`, configured with
                # db_type 'mongo_db', db 'gsmhelpdesk_nummerreeksen' and
                # table 'number_range'.
                Template(
                    name='number_range',
                    selector='tr',
                    db_type='mongo_db',
                    db='gsmhelpdesk_nummerreeksen',
                    table='number_range',
                    attrs=(
                        Attr(
                            name='start',
                            selector='td:nth-of-type(1)',
                            func='sel_text',
                            kws={'numbers': True},
                        ),
                        Attr(
                            name='end',
                            selector='td:nth-of-type(2)',
                            func='sel_text',
                            kws={'numbers': True},
                        ),
                    ),
                ),
            ),
        ),
    ],
)

# Register the number-range model with a Dispatcher and start it. This runs
# at module level, i.e. whenever the file is executed or imported.
disp = Dispatcher()
disp.add_scraper(gsmhelpdesk_nummerreeksen)
disp.run()
示例#16
0
 # Phase over four funda.nl Amsterdam listings: rent ("huur") and buy
 # ("koop"), each for houses ("woonhuis") and apartments ("appartement").
 Phase(
     parser=HTMLParser,
     source_worker=WebSource,
     sources=[
         Source(url='http://funda.nl/huur/amsterdam/woonhuis/'),
         Source(url='http://funda.nl/huur/amsterdam/appartement/'),
         Source(url='http://funda.nl/koop/amsterdam/woonhuis/'),
         Source(url='http://funda.nl/koop/amsterdam/appartement'),
     ],
     templates=[
         # One 'house' record per `.search-result`, configured with db_type
         # 'mongo_db', db 'funda' and table 'for_hire'.
         # NOTE(review): the table is named 'for_hire' although two of the
         # sources are "koop" (buy) listings -- confirm this is intended.
         # All regexes below are raw strings: `\d` and `\w` are not valid
         # Python string escapes, so plain literals trigger an "invalid
         # escape sequence" warning. The pattern text is unchanged.
         Template(
             name='house',
             selector='.search-result',
             db_type='mongo_db',
             db='funda',
             table='for_hire',
             attrs=[
                 Attr(name='price',
                      selector='.search-result-price',
                      func='sel_text',
                      kws={'numbers': True}),
                 Attr(name='street',
                      selector='.search-result-title',
                      func='sel_text'),
                 Attr(name='realtor',
                      selector='.realtor',
                      func='sel_text'),
                 # Captures the digits in front of "kamers" (Dutch: rooms).
                 Attr(name='rooms',
                      selector='.search-result-info',
                      func='sel_text',
                      kws={
                          'regex': r'(\d+) kamers',
                          'numbers': True
                      }),
                 # 'zip' and 'city' parse the same subtitle: four digits, a
                 # space and two word characters, then the first word after.
                 Attr(name='zip',
                      selector='.search-result-subtitle',
                      func='sel_text',
                      kws={'regex': r'(\d{4} \w{2})'}),
                 Attr(name='city',
                      func='sel_text',
                      selector='.search-result-subtitle',
                      kws={'regex': r'\d{4} \w{2} (\w+)'}),
                 # "Woonoppervlakte" is Dutch for living area.
                 Attr(
                     name='living_area',
                     func='sel_text',
                     selector=
                     '.search-result-info span[title="Woonoppervlakte"]',
                     kws={
                         'regex': r'(\d+)',
                         'numbers': True
                     }),
                 # The href is wrapped in '{}bezichtiging/' to form the
                 # source URL; the source is created with 'active': False.
                 Attr(name='meeting_url',
                      selector='.search-result-header a',
                      func='sel_attr',
                      kws={'attr': 'href'},
                      source={
                          'src_template': '{}bezichtiging/',
                          'active': False
                      }),
             ]),
         # Pagination links. The source is commented out, so this attr only
         # reads the href.
         # NOTE(review): unlike the other templates, this one has no name --
         # confirm Template accepts that.
         Template(
             selector='.pagination',
             attrs=[
                 Attr(
                     name='url',
                     selector='a',
                     func='sel_attr',
                     kws={'attr': 'href'},
                     # source=Source()
                 )
             ])
     ]),
示例#17
0
 # Phase starting at the thuisbezorgd.nl home page, with one template for
 # section links and one for restaurant entries.
 Phase(
     source_worker=WebSource,
     parser=HTMLParser,
     sources=[Source(url="https://www.thuisbezorgd.nl/")],
     templates=(
         # 'sections' uses an empty template selector and picks links whose
         # href contains "eten-bestellen-". Its Source() is created with
         # default arguments.
         # NOTE(review): presumably that makes these links followed within
         # this phase -- confirm against Source's defaults.
         Template(
             name='sections',
             selector='',
             attrs=(
                 Attr(name='url',
                      selector='a[href*="eten-bestellen-"]',
                      func='sel_url',
                      source=Source()),  # source is for next run
             )),
         # One 'restaurant' record per `.restaurant` element, configured
         # with db_type 'MongoDB', db 'thuisbezorgd' and table 'restaurants'.
         Template(
             name='restaurant',
             selector='.restaurant',
             db_type='MongoDB',
             db='thuisbezorgd',
             table='restaurants',
             attrs=(
                 # 'url' and 'name' read the same `a.restaurantname` link;
                 # the url's Source is inactive and uses the pass-through
                 # template '{}'.
                 Attr(name='url',
                      selector='a.restaurantname',
                      func='sel_url',
                      source=Source(
                          active=False,
                          src_template='{}')),  # source is for next run
                 Attr(name='name',
                      selector='a.restaurantname',
                      func='sel_text'),
             )),
     )),
示例#18
0
# TODO Set the right classes for the websites.
from dispatcher import Dispatcher
from components import Attr, Source, HTMLObject, Phase, ScrapeModel, Follow

kinky =ScrapeModel(name='kinky', domain='kinky.nl', phases=[
    Phase(source=[Source(url='http://kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=4500')],
        templates=[Template(name='advert', selector='.advertentie_kop > a',
                 attrs=[Attr(name='url', source={'active': False})])
        ]),
    Phase(templates=[
        Template(name='advert', selector='.advertentie_kop > a', db='kinky.nl', table='adverts',
                Attr(name='add_text', func= 'sel_text', selector='description p'),
                Attr(name='possibilities', func= 'sel_text', selector= '.possibilities li'),
                Attr(name='update', func= 'sel_text', selector='update'),
                Attr(name='town', func= 'sel_text', selector= '.naw div', kws={'regex': '.*Plaats: ([A-z]*);'}),
                Attr(name='work_area', func= 'sel_text', selector='naw div', kws={'regex': '.*Werkgebied: '}),
                Attr(name='prices', func= 'sel_text', selector='.prizes td:nth-of-type(2)'),
                Attr(name='pictures', func= 'sel_attr', selector='.galleryRow img', kws={'attr': 'src'}),
                Attr(name='phone', selector='.mainprofile .webbutton span', func='sel_text'),
                Attr(name='name', selector='h1.title', func='sel_text'),
                    '__reg__age': ['.naw div', '.*Leeftijd: (\d\d)'],
                    '__reg__length': ['.naw div', '.*Lengte: (\d\d\d)'],
                    '__reg__hair_color': ['.naw div', '.*Kleur haar: ([a-z]*);'],
                    '__reg__build': ['.naw div', '.*Lichaamsbouw: ([A-z]*);'],
                    '__reg__looks': ['.naw div', '.*Uiterlijk: ([A-z]*);'],
                ,
            ,
        ,
        'start': [
            'meta': {
                'sex': 'female',