예제 #1
0
def build_10m():
    """Assemble and return an ``Instrument`` from a fixed set of components.

    Nine components are created, in the order listed below, and passed to
    ``Instrument`` as positional arguments in that same order: source,
    velocity selector, attenuator, source aperture, sample aperture,
    guides, sample, beam stop and detector.

    Every position (``loc``) and dimension is hard-coded here; the units
    are not stated in this file.

    Returns:
        The ``Instrument`` built from the nine components.
    """
    components = (
        Source(loc=-800, flux=100000),
        VelocitySelector(loc=-600, wavelen=5, spread=0.14),
        Attenuator(loc=-510, number=0, factor=0),
        # Source-side aperture, followed by the aperture located at loc=0.
        Aperture(loc=-500, shape='circle', dims=[2.54, 2.54]),
        Aperture(loc=0, shape='circle', dims=[0.635, 0.635]),
        Guide(loc=-500, number=0, dims=[5, 5], parms=[2, 2]),
        Sample(
            loc=5,
            label='Cylinders in d2O',
            dims=[2.54, 2.54, 0.254],
            model='cylinder',
        ),
        BeamStop(
            loc=495,
            coords=[0.2, 5.3],
            bsnum=2,
            bsdims=[5.08, 5.08],
            A=20.1,
            B=2.0477,
        ),
        Detector(
            loc=500,
            dpix=[0.508, 0.508],
            npix=[128, 128],
            beam_cntr=[64.1, 65.2],
        ),
    )
    return Instrument(*components)
예제 #2
0
# Create a MongoClient with no arguments (presumably pymongo's MongoClient;
# the import is not visible in this excerpt), then take attribute-style
# handles to the `efi_dumps` database and its `forum_post` collection.
# These names match the db='efi_dumps' / table='forum_post' values given to
# the Template below.
cl = MongoClient()
db = cl.efi_dumps
col = db.forum_post

efi_dumps = ScrapeModel(
    name='efi_dumps',
    domain='https://ghostlyhaks.com/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(
                    url="https://ghostlyhaks.com/forum/rom-eeprom-bios-efi-uefi"
                )
            ],
            templates=(
                Template(
                    name='forum_post',
                    selector='.kbody tr',
                    db_type='mongo_db',
                    db='efi_dumps',
                    table='forum_post',
                    attrs=(
                        Attr(name='url',
                             selector='a.ktopic-title',
                             func='sel_url',
                             source=Source(
                                 active=False)),  # source is for next run
예제 #3
0
from parsers import HTMLParser

# Cookie name/value pairs, kept as plain dicts.
# `kinkycookies` is passed as `cookies=` to the `kinky` ScrapeModel below.
kinkycookies = {'ckieLegalIds': '4e17b168-5eb9-4c72-b7ee-3e8aebfd963e'}
# `sexjobscookies` is not referenced in the visible part of this excerpt.
sexjobscookies = {'algemeneVoorwaardenVersie': '3'}

kinky = ScrapeModel(
    name='kinky',
    domain='http://www.kinky.nl/',
    cookies=kinkycookies,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(
                    url=
                    'http://www.kinky.nl/sex-afspraken/mannen/default.aspx?pagesize=5000',
                    attrs=[Attr(name='sex', value='man')]),
                Source(
                    url=
                    'http://www.kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=5000',
                    attrs=[Attr(name='sex', value='vrouw')]),
                Source(
                    url=
                    'http://www.kinky.nl/sex-afspraken/transsexuelen/default.aspx?pagesize=5000',
                    attrs=[Attr(name='sex', value='trans')]),
                Source(
                    url=
                    'http://www.kinky.nl/sex-afspraken/stellen/default.aspx?pagesize=5000',
                    attrs=[Attr(name='sex', value='stellen')]),
                Source(
                    url=
예제 #4
0
# Create a MongoClient with no arguments (presumably pymongo's MongoClient;
# the import is not visible in this excerpt), then take attribute-style
# handles to the `belgian_parlement_roles` database and its `government`
# collection. 'government' is also the name of the first Template below.
cl = MongoClient()
db = cl.belgian_parlement_roles
col = db.government

belgian_parlement_roles = ScrapeModel(
    name='belgian_parlement_roles',
    domain='https://fr.wikipedia.org/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(
                    url=
                    "https://fr.wikipedia.org/wiki/Liste_des_gouvernements_de_la_Belgique"
                )
            ],
            templates=(
                Template(
                    name='government',
                    selector='.wikitable tr td:nth-of-type(2)',
                    attrs=(
                        Attr(name='url',
                             selector='a',
                             func='sel_url',
                             source=Source(
                                 active=False)),  # source is for next run
                    )), )),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
예제 #5
0

# Create a MongoClient with no arguments (presumably pymongo's MongoClient;
# the import is not visible in this excerpt), then take attribute-style
# handles to the `youtube_channel` database and its `channel_videos`
# collection -- the same names used for db=/table= in the Template below.
cl = MongoClient()
db = cl.youtube_channel
col = db.channel_videos

# The base URL of the website. Not referenced in the visible part of this
# excerpt; the ScrapeModel below repeats the same literal in `domain=`.
url = 'https://youtube.com/'

# The number of workers that will fetch the information
# (presumably refers to `num_getters=2` in the ScrapeModel below -- confirm).

youtube_channel = ScrapeModel(
    name='youtube_channel', domain='https://youtube.com/', num_getters=2, awaiting=True,
    phases=[
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url='https://www.youtube.com/user/ozzymanreviews/videos'),
        Source(url='https://www.youtube.com/user/Draadstal/videos'),
        Source(url='https://www.youtube.com/channel/UCQMs9pijXYAdqvkEMJyCM4g/videos'),
        Source(url='https://www.youtube.com/channel/UCi1LpRIlG1tDY5Z54VTel2w/videos'),
        Source(url='https://www.youtube.com/user/vpro/videos'),
        Source(url='https://www.youtube.com/user/nprmusic/videos'),
    ],
        templates=(
            Template(
                name='channel_videos', selector='li.channels-content-item',
                db_type='mongo_db', db='youtube_channel', table='channel_videos',
                attrs=[
                    Attr(name='url', selector='h3.yt-lockup-title a', func='sel_url'),

                    Attr(name='title', selector='h3', func='sel_text'),
예제 #6
0
from workers import WebSource
from parsers import HTMLParser

# Create a MongoClient with no arguments (presumably pymongo's MongoClient;
# the import is not visible in this excerpt), then take attribute-style
# handles to the `southpark` database and its `video` collection -- the
# same names used for db=/table= in the Template below.
cl = MongoClient()
db = cl.southpark
col = db.video

southpark = ScrapeModel(
    name='southpark',
    domain='http://southpark.cc.com/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="http://southpark.cc.com/")],
            templates=(
                Template(
                    name='video',
                    selector='',
                    db_type='mongo_db',
                    db='southpark',
                    table='video',
                    attrs=(
                        Attr(name='url',
                             selector='a',
                             func='sel_url',
                             source=Source(
                                 active=False)),  # source is for next run
                        Attr(name='title', selector='h1', func='sel_text'),
                        Attr(name='text', selector='p', func='sel_text'),
예제 #7
0
from dispatcher import Dispatcher
from components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
from workers import WebSource
from parsers import HTMLParser


# Create a pymongo client with no arguments (the driver's default connection
# settings apply), then take attribute-style handles to the `headlines`
# database and its `category` collection -- the same names used for
# db=/table= in the Template below.
cl = MongoClient()
db = cl.headlines
col = db.category

headlines = ScrapeModel(name='headlines', domain='http://www.headlines24.nl/',
    num_getters=2, phases=[
    
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="http://www.headlines24.nl/")],
        templates=(
            Template(
                name='category', selector='',
                db_type='mongo_db', db='headlines', table='category',
                attrs=(
                    Attr(name='url', selector='a', func='sel_url',
                        source=Source(active=False)), # source is for next run

                    Attr(name='title', selector='h1', func='sel_text'),

                    Attr(name='text', selector='p', func='sel_text'),
                )
            ),
        )
    ),
예제 #8
0
from dispatcher import Dispatcher
from components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
from workers import WebSource
from parsers import HTMLParser


# Create a pymongo client with no arguments (the driver's default connection
# settings apply), then take attribute-style handles to the
# `gsmhelpdesk_nummerreeksen` database and its `number_range` collection --
# the same names used for db=/table= in the Template below.
cl = MongoClient()
db = cl.gsmhelpdesk_nummerreeksen
col = db.number_range

gsmhelpdesk_nummerreeksen = ScrapeModel(name='gsmhelpdesk_nummerreeksen', domain='http://www.gsmhelpdesk.nl/', num_getters=2,
    phases=[

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="http://www.gsmhelpdesk.nl/helpdesk/30/nummerreeksen"),
    ],
    templates=(
        Template(
            name='number_range', selector='tr',
            db_type='mongo_db', db='gsmhelpdesk_nummerreeksen', table='number_range',
            attrs=(
                Attr(name='start', selector='td:nth-of-type(1)',
                            func='sel_text', kws={'numbers': True}),
                Attr(name='end', selector='td:nth-of-type(2)',
                            func='sel_text', kws={'numbers': True}),
                )
            ),
        )
    ),
    ]
예제 #9
0
col = db.artikelen

# NOTE: despite its name, `today` holds the current calendar year as an int
# (the `.year` of a naive local `datetime.now()`), not a date.
# It is used below as the exclusive upper bound of range(1987, today), so
# the current year's archive page is not among the generated sources.
today = datetime.datetime.now().year
print(today)  # echoes the computed year when this module-level code runs

volkskrant = ScrapeModel(
    name='volkskrant',
    domain='http://www.volkskrant.nl/',
    num_getters=2,
    cookies={'nl_cookiewall_version': '1'},
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(url="http://www.volkskrant.nl/archief/{}".format(year))
                for year in range(1987, today)
            ],
            templates=(Template(name='day_url',
                                selector='td',
                                attrs=(Attr(
                                    name='url',
                                    selector='a',
                                    func='sel_url',
                                    source=Source(active=False)), )), )),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                Template(name='article_url',
                         selector='article',
예제 #10
0
파일: namen.py 프로젝트: 0xh/modelscraper
from dispatcher import Dispatcher
from components import ScrapeModel, Phase, Template, Attr, Source
from workers import WebSource
from parsers import HTMLParser
import string


meertens = ScrapeModel(
    name='namen', domain='http://www.meertens.knaw.nl/', num_getters=2, phases=[
    Phase(source_worker=WebSource, parser=HTMLParser, sources=(
        Source(url="http://www.meertens.knaw.nl/nvb/naam/begintmet/" + l)
                    for l in ['Aad']), # string.ascii_lowercase),
        templates=[
            Template(
                name='name', selector='tr.data',
                db_type='mongo_db', db='names', table='name_count_test',
                attrs=[
                    Attr(name='name', selector='td:nth-of-type(1)',
                                func='sel_text'),
                    Attr(name='men', selector='td:nth-of-type(2)',
                                func='sel_text', kws={'numbers': True}),
                    Attr(name='women', selector='td:nth-of-type(3)',
                                func='sel_text', kws={'numbers': True}),
                    Attr(name='url', selector='td:nth-of-type(1) a',
                                func='sel_attr', kws={'attr': 'href'},
                                source={'active': False},
                                source_condition={'women': '> 50',
                                                  'men': '> 50'}),
                ]
            ),
            Template(
예제 #11
0
from dispatcher import Dispatcher
import re
from components import ScrapeModel, Phase, Template, Attr, Source
from workers import WebSource
from parsers import HTMLParser

pornstars = ScrapeModel(
    name='pornhub_pornstars',
    domain='http://pornhub.com',
    num_getters=2,
    phases=[
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              sources=[Source(url='http://www.pornhub.com/pornstars?o=a')],
              templates=[
                  Template(name='alphabet',
                           selector='.alphabetFilter .dropdownWrapper li',
                           attrs=[
                               Attr(name='url',
                                    selector='a',
                                    func='sel_url',
                                    source=Source(active=False))
                           ])
              ]),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              templates=[
                  Template(name='pornstar',
                           selector='.pornstarIndex li',
                           db_type='MongoDB',
                           db='pornstars',
예제 #12
0
from dispatcher import Dispatcher
from components import ScrapeModel, Phase, Template, Attr, Source
from workers import WebSource
from parsers import HTMLParser
import string
from pymongo import MongoClient


petitions = ScrapeModel(
    name='petitions', domain='https://petities.nl/', num_getters=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser, sources=(
            Source(url="https://petities.nl/petitions/borstkankeronderzoek-vervroegen/signatures?locale=nl"),),
            templates=[
                Template(name='next_page', selector='.navigation-bar .navigation-bar',
                         attrs=[Attr(name='url', selector='a', func='sel_url', kws={'attr': 'href'},
                                     source=True)]),
                Template(name='signature', selector='.petition-signature-list',
                         db_type='mongo_db', db='petitions', table='borstkanker',
                         attrs=[
                             Attr(name='name', selector='.petition-signature-name',
                                  func='sel_text'),
                             Attr(name='time', selector='.signature-time', func='sel_text'),
                             Attr(name='location', selector='.petition-signature-location',
                                  func='sel_text'),
                             Attr(name='occupation', selector='.petition-signature-occupation',
                                  func='sel_text')
                        ])
            ]
        )
    ]
예제 #13
0
# Create a MongoClient with no arguments (presumably pymongo's MongoClient;
# the import is not visible in this excerpt), then take attribute-style
# handles to the `theoffice` database and its `season` collection -- the
# same names used for db=/table= in the Template below.
cl = MongoClient()
db = cl.theoffice
col = db.season

# Base directory concatenated into the shell command template below.
# NOTE(review): this value already ends with '/', and the template appends
# '/{season}/', so the mkdir path contains a double slash. The youtube-dl
# output path in the same template hard-codes '/mnt/Movies/{season}/...'
# and does not use `filepath` (no 'theoffice' component) -- confirm intent.
filepath = '/mnt/Movies/theoffice/'

theoffice = ScrapeModel(
    name='theoffice',
    domain='http://watchtheofficeonline.com',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(url="http://watchtheofficeonline.com/s{}e{}".format(
                    season, episode)) for season in range(1, 10)
                for episode in range(1, 30)
            ],
            templates=(Template(
                name='episode',
                selector='.so-panel.widget.widget_siteorigin-panels-builder',
                db_type='shell_command',
                db='theoffice',
                table='season',
                kws={
                    'command':
                    'sudo mkdir -p ' + filepath + '/{season}/ &' +
                    ' sudo youtube-dl -o /mnt/Movies/{season}/{episode} {url}'
                },
                attrs=(
                    Attr(name='url',
예제 #14
0
from dispatcher import Dispatcher
from components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
from workers import WebSource
from parsers import HTMLParser


# Create a pymongo client with no arguments (the driver's default connection
# settings apply), then take attribute-style handles to the `nytimes`
# database and its `menu` collection -- the same names used for db=/table=
# in the first Template below.
cl = MongoClient()
db = cl.nytimes
col = db.menu

nytimes = ScrapeModel(name='nytimes', domain='https://www.nytimes.com/',
    num_getters=2, phases=[

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="https://www.nytimes.com/")],
        templates=(
            Template(
                name='menu', selector='#site-index-navigation li',
                db_type='MongoDB', db='nytimes', table='menu',
                attrs=(
                    Attr(name='url', selector='a', func='sel_url',
                        source=Source(active=False)), # source is for next run
                )
            ),
        )
    ),

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="https://www.nytimes.com/")],
        templates=(
예제 #15
0
from dispatcher import Dispatcher
from workers import WebSource
from parsers import HTMLParser
import string
from components import ScrapeModel, Source, Phase, Attr, Template

paradiso = ScrapeModel(
    name='paradiso',
    domain='https://paradiso.nl',
    phases=[
        Phase(source_worker=WebSource,
              sources=[Source(url='https://paradiso.nl/web/Agenda.htm')],
              parser=HTMLParser,
              templates=[
                  Template(name='event_link',
                           selector='a.event-link',
                           attrs=[
                               Attr(name='url',
                                    func='sel_attr',
                                    kws={'attr': 'href'},
                                    source={'active': False})
                           ])
              ]),
        Phase(templates=[
            Template(name='event',
                     db_type='MongoDB',
                     db='paradiso',
                     table='events',
                     attrs=[
                         Attr(name='name',
                              selector='meta[name=evenementts]',
예제 #16
0
# Create a MongoClient with no arguments (presumably pymongo's MongoClient;
# the import is not visible in this excerpt), then take attribute-style
# handles to the `erowid` database and its `drug_report` collection.
cl = MongoClient()
db = cl.erowid
col = db.drug_report

erowid = ScrapeModel(
    name='erowid',
    domain='https://www.erowid.org/experiences/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(
                    url=
                    "https://www.erowid.org/experiences/exp.cgi?ShowViews=1&Cellar=0&Start=0&Max=24777"
                )
            ],
            templates=(Template(name='report_url',
                                selector='.exp-list-table tr',
                                source={
                                    'active': False,
                                    'copy_attrs': True
                                },
                                attrs=(
                                    Attr(name='url',
                                         selector='td:nth-of-type(2) a',
                                         func='sel_url'),
                                    Attr(name='title',
                                         selector='td:nth-of-type(2) a',
                                         func='sel_text'),
예제 #17
0
from components import ScrapeModel, Phase, Source, Template, Attr
from dispatcher import Dispatcher
from workers import WebSource
from parsers import HTMLParser

funda = ScrapeModel(
    name='funda.nl',
    domain='http://funda.nl',
    num_sources=1,
    phases=[
        Phase(
            parser=HTMLParser,
            source_worker=WebSource,
            sources=[
                Source(url='http://funda.nl/huur/amsterdam/woonhuis/'),
                Source(url='http://funda.nl/huur/amsterdam/appartement/'),
                Source(url='http://funda.nl/koop/amsterdam/woonhuis/'),
                Source(url='http://funda.nl/koop/amsterdam/appartement'),
            ],
            templates=[
                Template(
                    name='house',
                    selector='.search-result',
                    db_type='mongo_db',
                    db='funda',
                    table='for_hire',
                    attrs=[
                        Attr(name='price',
                             selector='.search-result-price',
                             func='sel_text',
                             kws={'numbers': True}),
예제 #18
0
from workers import WebSource
from parsers import HTMLParser

# Create a MongoClient with no arguments (presumably pymongo's MongoClient;
# the import is not visible in this excerpt), then take attribute-style
# handles to the `thuisbezorgd` database (the same name used for db= in the
# 'restaurant' Template below) and its `reviews` collection.
cl = MongoClient()
db = cl.thuisbezorgd
col = db.reviews

thuisbezorgd = ScrapeModel(
    name='thuisbezorgd',
    domain='http://thuisbezorgd.nl',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="https://www.thuisbezorgd.nl/")],
            templates=(
                Template(
                    name='sections',
                    selector='',
                    attrs=(
                        Attr(name='url',
                             selector='a[href*="eten-bestellen-"]',
                             func='sel_url',
                             source=Source()),  # source is for next run
                    )),
                Template(
                    name='restaurant',
                    selector='.restaurant',
                    db_type='MongoDB',
                    db='thuisbezorgd',
예제 #19
0
# TODO Set the right classes for the websites.
from dispatcher import Dispatcher
from components import Attr, Source, HTMLObject, Phase, ScrapeModel, Follow

kinky =ScrapeModel(name='kinky', domain='kinky.nl', phases=[
    Phase(source=[Source(url='http://kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=4500')],
        templates=[Template(name='advert', selector='.advertentie_kop > a',
                 attrs=[Attr(name='url', source={'active': False})])
        ]),
    Phase(templates=[
        Template(name='advert', selector='.advertentie_kop > a', db='kinky.nl', table='adverts',
                Attr(name='add_text', func= 'sel_text', selector='description p'),
                Attr(name='possibilities', func= 'sel_text', selector= '.possibilities li'),
                Attr(name='update', func= 'sel_text', selector='update'),
                Attr(name='town', func= 'sel_text', selector= '.naw div', kws={'regex': '.*Plaats: ([A-z]*);'}),
                Attr(name='work_area', func= 'sel_text', selector='naw div', kws={'regex': '.*Werkgebied: '}),
                Attr(name='prices', func= 'sel_text', selector='.prizes td:nth-of-type(2)'),
                Attr(name='pictures', func= 'sel_attr', selector='.galleryRow img', kws={'attr': 'src'}),
                Attr(name='phone', selector='.mainprofile .webbutton span', func='sel_text'),
                Attr(name='name', selector='h1.title', func='sel_text'),
                    '__reg__age': ['.naw div', '.*Leeftijd: (\d\d)'],
                    '__reg__length': ['.naw div', '.*Lengte: (\d\d\d)'],
                    '__reg__hair_color': ['.naw div', '.*Kleur haar: ([a-z]*);'],
                    '__reg__build': ['.naw div', '.*Lichaamsbouw: ([A-z]*);'],
                    '__reg__looks': ['.naw div', '.*Uiterlijk: ([A-z]*);'],
                ,
            ,
        ,
        'start': [
            'meta': {
                'sex': 'female',