Exemplo n.º 1
0
    def make_objects(self, template, selected, getter):
        """Build one object per selected element and return them as a list.

        For every item in ``selected`` a new ``Template`` carrying
        ``template.name`` is created. It receives ``getter.url``, a duplicate
        of each attribute in ``getter.attrs``, and then one ``Attr`` per entry
        in ``template.attrs`` whose value is the result of calling that
        entry's ``func``.

        When a template attribute has a truthy ``getter`` and produced a
        truthy value, a new ``Getter`` is built from it for every parsed
        value and passed to ``self._handle_getter``. When ``template.getter``
        is truthy, the finished object is passed to
        ``self._handle_object_getter`` before being collected.

        Args:
            template: Object providing ``name``, ``attrs`` and ``getter``.
            selected: Iterable of selections; each one is handed to every
                template attribute's ``func``.
            getter: Object providing ``url`` and ``attrs``.

        Returns:
            list: The created objects, in the order of ``selected``.
        """
        objects = []
        for sel in selected:
            objct = Template(name=template.name)
            objct.url = getter.url

            # Predefined attributes from the getter: each object gets the
            # result of duplicate() rather than the getter's own instance.
            for attr in getter.attrs:
                objct.attrs.append(attr.duplicate())

            # Attribute values parsed from the current selection.
            for temp_attr in template.attrs:
                parsed = temp_attr.func(sel, temp_attr.selector,
                                        **temp_attr.kws)
                objct.attrs.append(Attr(name=temp_attr.name, value=parsed))

                # Only follow up with new requests when the attribute asks
                # for it and something was actually parsed.
                if not (temp_attr.getter and parsed):
                    continue

                # isinstance (not an exact type comparison) so that list
                # subclasses are iterated instead of being wrapped and then
                # assigned to a getter url as a whole list.
                values = parsed if isinstance(parsed, list) else [parsed]
                for value in values:
                    new_getter = Getter(**temp_attr.getter)
                    new_getter.url = value
                    self._handle_getter(new_getter)

            if template.getter:
                self._handle_object_getter(objct)
            objects.append(objct)
        return objects
Exemplo n.º 2
0
templates=(Template(name='report_url',
                    selector='.exp-list-table tr',
                    source={
                        'active': False,
                        'copy_attrs': True
                    },
                    attrs=(
                        Attr(name='url',
                             selector='td:nth-of-type(2) a',
                             func='sel_url'),
                        Attr(name='title',
                             selector='td:nth-of-type(2) a',
                             func='sel_text'),
                        Attr(name='rating',
                             selector='td:nth-of-type(1) img',
                             func='sel_attr',
                             kws={'attr': 'alt'}),
                        Attr(name='author',
                             selector='td:nth-of-type(3)',
                             func='sel_text'),
                        Attr(name='substances',
                             selector='td:nth-of-type(4)',
                             func='sel_text',
                             kws={
                                 'replacers':
                                 '&',
                                 'substitute':
                                 ',',
                                 'regex':
                                 '([A-z0-9\-]+\s*[A-z0-9\-*\s]*)'
                             }),
                        Attr(name='date',
                             selector='td:nth-of-type(5)',
                             func='sel_text'),
                        Attr(name='views',
                             selector='td:nth-of-type(6)',
                             func='sel_text'),
                    )), )),
Exemplo n.º 3
0

# Module-level MongoDB handles (presumably pymongo's MongoClient — the import
# is not visible here). The client is created with no arguments; `nytimes`
# and `menu` are reached through attribute access and match the db/table
# names given to the 'menu' Template below.
# NOTE(review): cl/db/col are not referenced in the visible part of this
# file — confirm they are used further down.
cl = MongoClient()
db = cl.nytimes
col = db.menu

nytimes = ScrapeModel(name='nytimes', domain='https://www.nytimes.com/',
    num_getters=2, phases=[

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="https://www.nytimes.com/")],
        templates=(
            Template(
                name='menu', selector='#site-index-navigation li',
                db_type='MongoDB', db='nytimes', table='menu',
                attrs=(
                    Attr(name='url', selector='a', func='sel_url',
                        source=Source(active=False)), # source is for next run
                )
            ),
        )
    ),

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="https://www.nytimes.com/")],
        templates=(
            Template(
                name='articlelist', selector='',
                db_type='MongoDB', db='nytimes', table='articles',
                attrs=(
                    Attr(name='title', selector='h1', func='sel_text'),
Exemplo n.º 4
0
 sources=[
     Source(
         url="https://ghostlyhaks.com/forum/rom-eeprom-bios-efi-uefi"
     )
 ],
 templates=(
     Template(
         name='forum_post',
         selector='.kbody tr',
         db_type='mongo_db',
         db='efi_dumps',
         table='forum_post',
         attrs=(
             Attr(name='url',
                  selector='a.ktopic-title',
                  func='sel_url',
                  source=Source(
                      active=False)),  # source is for next run
             Attr(name='user',
                  selector='.kwho-user',
                  func='sel_text'),
             Attr(name='user_url',
                  selector='.kwho-user',
                  func='sel_url'),
         )),
     Template(
         name='next_page',
         selector='.kpagination',
         attrs=[
             Attr(name='url',
                  selector='a',
Exemplo n.º 5
0
                    url=
                    'http://www.kinky.nl/sex-afspraken/gay/default.aspx?pagesize=5000',
                    attrs=[Attr(name='sex', value='gay')]),
            ],
            templates=[
                Template(name='advert',
                         selector='#advertenties > div',
                         db_type='mongo_db',
                         db='kinky',
                         table='adds',
                         attrs=[
                             Attr(name='phone',
                                  selector='.quickinfo > span',
                                  func='sel_text',
                                  kws={
                                      'children': True,
                                      'debug': True,
                                      'regex': 'Mijn telefoonnummer: (.*)'
                                  }),
                             Attr(name='city',
                                  selector='.quickinfo span.country',
                                  func='sel_text'),
                             Attr(name='url',
                                  selector='.advertentie_kop a',
                                  func='sel_attr',
                                  kws={'attr': 'href'})
                         ])
            ]),
    ])
sexjobs = ScrapeModel(
    name='sexjobs',
    domain='http://www.sexjobs.nl/',
Exemplo n.º 6
0
 phases=[
     Phase(
         source_worker=WebSource,
         parser=HTMLParser,
         sources=[
             Source(
                 url=
                 "https://fr.wikipedia.org/wiki/Liste_des_gouvernements_de_la_Belgique"
             )
         ],
         templates=(
             Template(
                 name='government',
                 selector='.wikitable tr td:nth-of-type(2)',
                 attrs=(
                     Attr(name='url',
                          selector='a',
                          func='sel_url',
                          source=Source(
                              active=False)),  # source is for next run
                 )), )),
     Phase(source_worker=WebSource,
           parser=HTMLParser,
           templates=(Template(name='government',
                               selector='table:nth-of-type(1) tr',
                               db_type='mongo_db',
                               db='belgian_politics',
                               table='politicians',
                               attrs=(
                                   Attr(name='url',
                                        selector='td:nth-of-type(2) a',
                                        func='sel_url'),
Exemplo n.º 7
0
    phases=[
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url='https://www.youtube.com/user/ozzymanreviews/videos'),
        Source(url='https://www.youtube.com/user/Draadstal/videos'),
        Source(url='https://www.youtube.com/channel/UCQMs9pijXYAdqvkEMJyCM4g/videos'),
        Source(url='https://www.youtube.com/channel/UCi1LpRIlG1tDY5Z54VTel2w/videos'),
        Source(url='https://www.youtube.com/user/vpro/videos'),
        Source(url='https://www.youtube.com/user/nprmusic/videos'),
    ],
        templates=(
            Template(
                name='channel_videos', selector='li.channels-content-item',
                db_type='mongo_db', db='youtube_channel', table='channel_videos',
                attrs=[
                    Attr(name='url', selector='h3.yt-lockup-title a', func='sel_url'),

                    Attr(name='title', selector='h3', func='sel_text'),

                    Attr(name='views', selector='.yt-lockup-meta-info', func='sel_text',
                         kws={'regex': '(.*) weergaven', 'numbers': True}),
                ]
            ),
            Template(
                name='next_videos', selector='.browse-items-load-more-button',
                attrs=[
                    Attr(name='url', func='sel_attr',
                         kws={'attr': 'data-uix-load-more-href'},
                         source=Source(src_template='http://youtube.com{}',
                                       json_key=['content_html', 'load_more_widget_html']))
                ]),
        )
    ),
Exemplo n.º 8
0
# Module-level MongoDB handles (presumably pymongo's MongoClient — the import
# is not visible here). The client is created with no arguments; `headlines`
# and `category` are reached through attribute access and match the db/table
# names given to the 'category' Template below.
# NOTE(review): cl/db/col are not referenced in the visible part of this
# file — confirm they are used further down.
cl = MongoClient()
db = cl.headlines
col = db.category

headlines = ScrapeModel(name='headlines', domain='http://www.headlines24.nl/',
    num_getters=2, phases=[
    
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="http://www.headlines24.nl/")],
        templates=(
            Template(
                name='category', selector='',
                db_type='mongo_db', db='headlines', table='category',
                attrs=(
                    Attr(name='url', selector='a', func='sel_url',
                        source=Source(active=False)), # source is for next run

                    Attr(name='title', selector='h1', func='sel_text'),

                    Attr(name='text', selector='p', func='sel_text'),
                )
            ),
        )
    ),
    
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="http://www.headlines24.nl/")],
        templates=(
            Template(
                name='category', selector='',
                db_type='mongo_db', db='headlines', table='category',
                attrs=(
Exemplo n.º 9
0
from parsers import HTMLParser
import string
from components import ScrapeModel, Source, Phase, Attr, Template

paradiso = ScrapeModel(
    name='paradiso',
    domain='https://paradiso.nl',
    phases=[
        Phase(source_worker=WebSource,
              sources=[Source(url='https://paradiso.nl/web/Agenda.htm')],
              parser=HTMLParser,
              templates=[
                  Template(name='event_link',
                           selector='a.event-link',
                           attrs=[
                               Attr(name='url',
                                    func='sel_attr',
                                    kws={'attr': 'href'},
                                    source={'active': False})
                           ])
              ]),
        Phase(templates=[
            Template(name='event',
                     db_type='MongoDB',
                     db='paradiso',
                     table='events',
                     attrs=[
                         Attr(name='name',
                              selector='meta[name=evenementts]',
                              func='sel_attr',
                              kws={'attr': 'content'}),
                         Attr(name='date',
Exemplo n.º 10
0
 name='volkskrant',
 domain='http://www.volkskrant.nl/',
 num_getters=2,
 cookies={'nl_cookiewall_version': '1'},
 phases=[
     Phase(
         source_worker=WebSource,
         parser=HTMLParser,
         sources=[
             Source(url="http://www.volkskrant.nl/archief/{}".format(year))
             for year in range(1987, today)
         ],
         templates=(Template(name='day_url',
                             selector='td',
                             attrs=(Attr(
                                 name='url',
                                 selector='a',
                                 func='sel_url',
                                 source=Source(active=False)), )), )),
     Phase(
         source_worker=WebSource,
         parser=HTMLParser,
         templates=(
             Template(name='article_url',
                      selector='article',
                      attrs=(Attr(name='url',
                                  selector='a',
                                  func='sel_url',
                                  source=Source(active=False)), )),
             Template(name='next_page_url',
                      selector='a.pager',
Exemplo n.º 11
0
meertens = ScrapeModel(
    name='namen', domain='http://www.meertens.knaw.nl/', num_getters=2, phases=[
    Phase(source_worker=WebSource, parser=HTMLParser, sources=(
        Source(url="http://www.meertens.knaw.nl/nvb/naam/begintmet/" + l)
                    for l in ['Aad']), # string.ascii_lowercase),
        templates=[
            Template(
                name='name', selector='tr.data',
                db_type='mongo_db', db='names', table='name_count_test',
                attrs=[
                    Attr(name='name', selector='td:nth-of-type(1)',
                                func='sel_text'),
                    Attr(name='men', selector='td:nth-of-type(2)',
                                func='sel_text', kws={'numbers': True}),
                    Attr(name='women', selector='td:nth-of-type(3)',
                                func='sel_text', kws={'numbers': True}),
                    Attr(name='url', selector='td:nth-of-type(1) a',
                                func='sel_attr', kws={'attr': 'href'},
                                source={'active': False},
                                source_condition={'women': '> 50',
                                                  'men': '> 50'}),
                ]
            ),
            Template(
                name='next_url', selector='.right',
                attrs=[
                    Attr(name='next', selector='abc',  func='sel_attr',
                                kws={'attr': 'href'}, source={'active': True}),
                ])
            ]
Exemplo n.º 12
0
from workers import WebSource
from parsers import HTMLParser

pornstars = ScrapeModel(
    name='pornhub_pornstars',
    domain='http://pornhub.com',
    num_getters=2,
    phases=[
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              sources=[Source(url='http://www.pornhub.com/pornstars?o=a')],
              templates=[
                  Template(name='alphabet',
                           selector='.alphabetFilter .dropdownWrapper li',
                           attrs=[
                               Attr(name='url',
                                    selector='a',
                                    func='sel_url',
                                    source=Source(active=False))
                           ])
              ]),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              templates=[
                  Template(name='pornstar',
                           selector='.pornstarIndex li',
                           db_type='MongoDB',
                           db='pornstars',
                           collection='ranking',
                           attrs=[
                               Attr(name='name',
                                    selector='.title',
Exemplo n.º 13
0
from dispatcher import Dispatcher
from components import ScrapeModel, Phase, Template, Attr, Source
from workers import WebSource
from parsers import HTMLParser
import string
from pymongo import MongoClient


petitions = ScrapeModel(
    name='petitions', domain='https://petities.nl/', num_getters=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser, sources=(
            Source(url="https://petities.nl/petitions/borstkankeronderzoek-vervroegen/signatures?locale=nl"),),
            templates=[
                Template(name='next_page', selector='.navigation-bar .navigation-bar',
                         attrs=[Attr(name='url', selector='a', func='sel_url', kws={'attr': 'href'},
                                     source=True)]),
                Template(name='signature', selector='.petition-signature-list',
                         db_type='mongo_db', db='petitions', table='borstkanker',
                         attrs=[
                             Attr(name='name', selector='.petition-signature-name',
                                  func='sel_text'),
                             Attr(name='time', selector='.signature-time', func='sel_text'),
                             Attr(name='location', selector='.petition-signature-location',
                                  func='sel_text'),
                             Attr(name='occupation', selector='.petition-signature-occupation',
                                  func='sel_text')
                        ])
            ]
        )
    ]
Exemplo n.º 14
0
templates=(Template(
    name='episode',
    selector='.so-panel.widget.widget_siteorigin-panels-builder',
    db_type='shell_command',
    db='theoffice',
    table='season',
    kws={
        'command':
        'sudo mkdir -p ' + filepath + '/{season}/ &' +
        ' sudo youtube-dl -o /mnt/Movies/{season}/{episode} {url}'
    },
    attrs=(
        Attr(name='url',
             selector='a',
             func=['sel_url', 'sel_text'],
             kws=[{}, {
                 'needle': r'.*(s\d+e\d+)'
             }]),
        Attr(name='episode',
             selector='.textwidget',
             func='sel_text',
             kws={
                 'index': 3,
                 'substitute': '_',
                 'replacers': ' '
             }),
        Attr(name='season',
             selector='.textwidget',
             func='sel_text',
             kws={
                 'index': 1,
                 'replacers': ' '
             }),
    )), )),
Exemplo n.º 15
0
# MongoDB handles; the attribute names match the db/table strings passed to
# the Template further down.
cl = MongoClient()
db = cl.gsmhelpdesk_nummerreeksen
col = db.number_range

# Scrape model with a single phase: one source URL and one template that
# selects every `tr` and reads two attributes from its first two cells.
gsmhelpdesk_nummerreeksen = ScrapeModel(
    name='gsmhelpdesk_nummerreeksen',
    domain='http://www.gsmhelpdesk.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(url="http://www.gsmhelpdesk.nl/helpdesk/30/nummerreeksen"),
            ],
            templates=(
                Template(
                    name='number_range',
                    selector='tr',
                    db_type='mongo_db',
                    db='gsmhelpdesk_nummerreeksen',
                    table='number_range',
                    attrs=(
                        Attr(
                            name='start',
                            selector='td:nth-of-type(1)',
                            func='sel_text',
                            kws={'numbers': True},
                        ),
                        Attr(
                            name='end',
                            selector='td:nth-of-type(2)',
                            func='sel_text',
                            kws={'numbers': True},
                        ),
                    ),
                ),
            ),
        ),
    ],
)

# Register the model with a dispatcher and run it.
disp = Dispatcher()
disp.add_scraper(gsmhelpdesk_nummerreeksen)
disp.run()
Exemplo n.º 16
0
 Template(
     name='house',
     selector='.search-result',
     db_type='mongo_db',
     db='funda',
     table='for_hire',
     attrs=[
         Attr(name='price',
              selector='.search-result-price',
              func='sel_text',
              kws={'numbers': True}),
         Attr(name='street',
              selector='.search-result-title',
              func='sel_text'),
         Attr(name='realtor',
              selector='.realtor',
              func='sel_text'),
         Attr(name='rooms',
              selector='.search-result-info',
              func='sel_text',
              kws={
                  'regex': '(\d+) kamers',
                  'numbers': True
              }),
         Attr(name='zip',
              selector='.search-result-subtitle',
              func='sel_text',
              kws={'regex': '(\d{4} \w{2})'}),
         Attr(name='city',
              func='sel_text',
              selector='.search-result-subtitle',
              kws={'regex': '\d{4} \w{2} (\w+)'}),
         Attr(
             name='living_area',
             func='sel_text',
             selector=
             '.search-result-info span[title="Woonoppervlakte"]',
             kws={
                 'regex': '(\d+)',
                 'numbers': True
             }),
         Attr(name='meeting_url',
              selector='.search-result-header a',
              func='sel_attr',
              kws={'attr': 'href'},
              source={
                  'src_template': '{}bezichtiging/',
                  'active': False
              }),
     ]),
Exemplo n.º 17
0
thuisbezorgd = ScrapeModel(
    name='thuisbezorgd',
    domain='http://thuisbezorgd.nl',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="https://www.thuisbezorgd.nl/")],
            templates=(
                Template(
                    name='sections',
                    selector='',
                    attrs=(
                        Attr(name='url',
                             selector='a[href*="eten-bestellen-"]',
                             func='sel_url',
                             source=Source()),  # source is for next run
                    )),
                Template(
                    name='restaurant',
                    selector='.restaurant',
                    db_type='MongoDB',
                    db='thuisbezorgd',
                    table='restaurants',
                    attrs=(
                        Attr(name='url',
                             selector='a.restaurantname',
                             func='sel_url',
                             source=Source(
Exemplo n.º 18
0
# Module-level MongoDB handles (presumably pymongo's MongoClient — the import
# is not visible here). The client is created with no arguments; `southpark`
# and `video` are reached through attribute access and match the db/table
# names given to the 'video' Template below.
# NOTE(review): cl/db/col are not referenced in the visible part of this
# file — confirm they are used further down.
cl = MongoClient()
db = cl.southpark
col = db.video

southpark = ScrapeModel(name='southpark', domain='http://southpark.cc.com/',
    num_getters=2, phases=[
    
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="http://southpark.cc.com/")],
        templates=(
            Template(
                name='video', selector='',
                db_type='MongoDB', db='southpark', table='video',
                attrs=(
                    Attr(name='url', selector='a', func='sel_url',
                        source=Source(active=False)), # source is for next run

                    Attr(name='title', selector='h1', func='sel_text'),

                    Attr(name='text', selector='p', func='sel_text'),
                )
            ),
        )
    ),
    
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="http://southpark.cc.com/")],
        templates=(
            Template(
                name='video', selector='',
                db_type='MongoDB', db='southpark', table='video',
                attrs=(