Exemplos de make_soup em Python, exemplos de parser.make_soup em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: activity_scraper.py Projeto: devtayls/education_com_scraper

def scrape_activity(relative_url):
    """Scrape a single activity page into a dict of its metadata.

    The page is fetched via ``make_soup(get_url(relative_url=...))``. Each
    field is extracted on a best-effort basis: when an element is missing
    the corresponding key is simply left out of the result.

    Args:
        relative_url: URL of the activity page, passed to ``get_url`` as
            ``relative_url``.

    Returns:
        dict: always contains ``grade_levels`` and ``subject_matter``;
        contains ``title``, ``description``, ``download_link``,
        ``lesson_text`` and ``image_link`` only when they were found.
    """
    print(relative_url)

    activity = {}
    page = make_soup(get_url(relative_url=relative_url))

    # ``except Exception`` (rather than a bare ``except``) keeps the
    # best-effort behaviour for missing elements while still letting
    # KeyboardInterrupt / SystemExit stop a long scraping run.
    try:
        activity['title'] = page.findChild('h1').text
    except Exception:
        pass
    try:
        activity['description'] = (page.findChild('div', {'class': 'description'})).text
    except Exception:
        pass
    try:
        activity['download_link'] = ((page.findChild('div', {'class': 'main-actions'})).findChild('a'))['href']
    except Exception:
        pass
    activity['grade_levels'] = get_activity_iterable_elements(page, {'class': 'grades'})
    activity['subject_matter'] = get_activity_iterable_elements(page, {'class': 'subjects'})
    try:
        activity['lesson_text'] = page.findChild('div', {'class': 'bottom-text'}).text
    except Exception:
        pass
    try:
        activity['image_link'] = page.findChild('img', {'class': 'main-image'})['src']
        # Everything after the last dot of the image URL is used as the
        # file extension (raw string: same pattern, no invalid escapes).
        file_format = re.findall(r".*\.(\S*)", activity['image_link'])[0]
        # The image is named after the title, so this step is skipped
        # (KeyError swallowed below) when no title was found above.
        download_resource(activity['image_link'], __image_directory__, activity['title'], file_format)
    except Exception:
        pass

    # todo debugging code
    print(activity)
    return activity

Exemplo n.º 2

0

Exibir arquivo

def get_last_pagination_num(url):
    """Return the number shown on the last pagination link of a page.

    Args:
        url: passed straight to ``make_soup``.

    Returns:
        int: the text of the last ``<a>`` whose ``title`` attribute
        contains ``"Page "``, converted to an integer.
    """
    # Collect every pagination anchor; the final one carries the last page number.
    pagination_links = make_soup(url).select('a[title*="Page "]')
    final_link = pagination_links[-1]
    return int(final_link.string)

Exemplo n.º 3

0

Exibir arquivo

def get_item_urls(content, selector, limit=0):
    """Collect the ``href`` of every element matching *selector* across pages.

    Args:
        content: forwarded to ``get_url`` as ``content``.
        selector: CSS selector handed to ``soup.select`` on each page.
        limit: when greater than zero, the number of pages to walk instead
            of the last pagination number found on the first page.

    Returns:
        set: the distinct ``href`` values gathered from all visited pages.
    """
    page_count = get_last_pagination_num(get_url(content=content))
    if limit > 0:
        page_count = limit

    collected = set()

    # Pages are numbered from 1 up to and including page_count.
    for page_number in range(1, page_count + 1):
        # todo debugging code:
        print('page {} of {}'.format(page_number, page_count))

        page_url = get_url(content=content,
                           query_string='?page=' + str(page_number))
        listing = make_soup(page_url)

        # Keep the link target of every element the selector matches.
        collected.update(link.attrs['href'] for link in listing.select(selector))

    # todo debugging code:
    print(collected)

    return collected

Exemplo n.º 4

0

Exibir arquivo

def scrape_lesson_plan(relative_url):
    """Scrape a lesson-plan page into a dict of its metadata.

    Args:
        relative_url: URL of the lesson plan; must contain
            ``lesson-plan/<name>/`` because the name is taken from it.

    Returns:
        dict with the keys ``name``, ``grade_levels``, ``subject_matter``,
        ``attachments`` (a list of ``{'label', 'href'}`` dicts) and
        ``learning_objectives``.

    Raises:
        IndexError: if *relative_url* does not match ``lesson-plan/<name>/``.
        AttributeError: if the "Learning Objectives" heading or the
            paragraph following it is missing from the page.
        KeyError: if a matched attachment link lacks ``href`` or
            ``aria-label``.
    """
    lesson_plan = {}

    page = make_soup(get_url(relative_url=relative_url))

    # TODO think this might be broken
    # Raw string: identical pattern, without invalid "\/" string escapes.
    lesson_plan['name'] = (re.findall(r'lesson-plan\/(.+)\/', relative_url))[0]
    lesson_plan['grade_levels'] = get_iterable_elements(page, 'Grade')
    lesson_plan['subject_matter'] = get_iterable_elements(page, 'Subject')

    # NOTE(review): a non-dict second argument is, as far as the
    # BeautifulSoup docs describe, matched against the CSS class rather
    # than the href attribute - confirm whether
    # href=re.compile('attachment') was intended here.
    attachments = page.findChildren('a', ['href', re.compile('attachment')])

    # Fix: the original stored the link target under 'label' and the
    # aria-label text under 'href'; each value now goes under its own key.
    lesson_plan['attachments'] = [
        {'label': link['aria-label'], 'href': link['href']}
        for link in attachments
    ]

    # The objectives are the first child of the <p> following the heading.
    lesson_plan['learning_objectives'] = ((page.findChild('h4', text='Learning Objectives')).find_next('p')).contents[0]

    return lesson_plan

Exemplo n.º 5

0

Exibir arquivo

Arquivo: packet_scraper.py Projeto: devtayls/education_com_scraper

def scrape_workbook(relative_url):
    """Scrape a workbook (packet) page into a dict and download its image.

    ``title``, ``description`` and ``download_link`` are best-effort and
    are omitted from the result when the element is missing. The main
    image, by contrast, is required: a page without it raises.

    Args:
        relative_url: URL of the workbook page, passed to ``get_url`` as
            ``relative_url``.

    Returns:
        dict: always contains ``grade_levels``, ``subject_matter`` and
        ``image_link``; ``title``, ``description`` and ``download_link``
        only when found.

    Raises:
        TypeError: if the page has no ``img`` with class ``main-image``.
        IndexError: if the image link contains no dot.
    """
    print(relative_url)

    packet = {}
    page = make_soup(get_url(relative_url=relative_url))

    # ``except Exception`` (rather than a bare ``except``) keeps the
    # best-effort behaviour for missing elements while still letting
    # KeyboardInterrupt / SystemExit stop a long scraping run.
    try:
        packet['title'] = page.findChild('h1').text
    except Exception:
        pass
    try:
        packet['description'] = (page.findChild('div',
                                                {'class': 'description'})).text
    except Exception:
        pass
    try:
        packet['download_link'] = page.findChild(
            'a', {'class': 'download-link'})['href']
    except Exception:
        pass
    packet['grade_levels'] = get_packet_iterable_elements(
        page, {'class': 'grades'})
    packet['subject_matter'] = get_packet_iterable_elements(
        page, {'class': 'subjects'})

    packet['image_link'] = page.findChild('img',
                                          {'class': 'main-image'})['src']
    # Raw string: identical pattern, without invalid string escapes.
    # NOTE(review): the extracted extension is never used - the download
    # below hard-codes file_name='test' and file_format='jpg', so every
    # workbook image is requested under the same name. This looks like
    # leftover debugging; confirm before switching to the title/extension.
    file_format = re.findall(r".*\.(\S*)", packet['image_link'])[0]
    print('calling download_resource...')
    download_resource(url=packet['image_link'],
                      relative_path='./downloads/packets/images',
                      file_name='test',
                      file_format='jpg')

    # todo debugging code
    print(packet)
    return packet

Exemplo n.º 6

0

Exibir arquivo

def scrape_worksheet(relative_url):
    """Scrape a worksheet page into a dict and download its preview image.

    The worksheet name comes from the URL and is required. Every other
    section is extracted inside its own ``try`` block; a failure is
    reported through ``capture_failure(<section>)`` and scraping continues.

    Args:
        relative_url: URL of the worksheet, appended to
            ``__absolute_url__``; must contain ``article/<name>/``.

    Returns:
        dict: always contains ``name`` and ``grade_levels``. Depending on
        what the page offers it may also contain ``description``,
        ``worksheet_file_format``, ``original_download_link``,
        ``set_link``, ``set_title``, ``subject_matter`` and
        ``guided_lesson_module`` (a dict with ``title`` and
        ``download_link``).

    Raises:
        IndexError: if *relative_url* does not match ``article/<name>/``.
    """
    worksheet_content = {}
    url = __absolute_url__ + relative_url

    worksheet_page = make_soup(url)

    # Raw strings below keep the exact same patterns while avoiding
    # invalid string escapes such as "\/" and "\d".
    worksheet_content['name'] = (re.findall(r'article\/(.+)\/', relative_url))[0]

    # grab the tag of primary main content of this worksheet
    worksheet_module_content = worksheet_page.findChildren('div', re.compile('worksheet-module_content_'))  # the main worksheet module

    # ``except Exception`` (rather than a bare ``except``) still records
    # every ordinary failure but lets KeyboardInterrupt / SystemExit
    # interrupt a long scraping run.
    try:
        # download the associated image
        image_tag = (worksheet_module_content[0].findChildren('img'))[0]  # the image of the worksheet
        file_format = (re.findall(r'/\d+\/.*\.(\w*)', image_tag['src']))[0]
        download_resource(image_tag['src'], __image_directory__, worksheet_content['name'], file_format)
    except Exception:
        capture_failure('image_meta')

    try:
        # the written description of the worksheet
        worksheet_content['description'] = str(worksheet_module_content[0].findChild('p').contents)
    except Exception:
        capture_failure('description')

    try:
        worksheet_tag = (worksheet_module_content[0].findChildren('a', re.compile('worksheet-module_mainActionButton_')))[0]
        file_format = (re.findall(r'/\d+\/.*\.(\w*)', worksheet_tag['href']))[0]
        # TODO: This should work but I need a subscription first
        # download_resource(worksheet_tag['href'], __worksheet_directory, worksheet_content['name'], file_format)
        worksheet_content['worksheet_file_format'] = file_format
        worksheet_content['original_download_link'] = worksheet_tag['href']
    except Exception:
        capture_failure('worksheet_meta')

    try:
        # TODO: this fails sometimes. Do I care enough to fix it?
        #  related worksheet set e.g., trace the letters set
        worksheet_set_tag = (worksheet_module_content[0].findChild('a', re.compile('Action-module_action')))  # set link
        if worksheet_set_tag is not None:
            if worksheet_set_tag.has_attr('href'):
                worksheet_content['set_link'] = worksheet_set_tag['href']
            if worksheet_set_tag.has_attr('title'):
                worksheet_content['set_title'] = worksheet_set_tag['title']
    except Exception:
        capture_failure('set_meta')

    worksheet_content['grade_levels'] = get_iterable_elements(worksheet_page, 'Grade')

    # Fix: the original repeated the 'Grade' lookup on a second line; by
    # analogy with scrape_lesson_plan the second lookup is taken to be the
    # subjects. It is guarded so that it cannot introduce a new crash.
    # NOTE(review): confirm 'Subject' is the right label on worksheet pages.
    try:
        worksheet_content['subject_matter'] = get_iterable_elements(worksheet_page, 'Subject')
    except Exception:
        capture_failure('subject_matter')

    # TODO this breaks often. Do I care enough to fix it?
    # grab the link to the related guided lesson module
    try:
        # Fix: this was a list, so the string-keyed assignments below
        # always raised TypeError and the guided lesson was never stored.
        guided_lesson = {}

        related_guided_lesson_module = worksheet_page.findChildren('div', re.compile('RelatedGuidedLesson-module_container'))[0]

        if related_guided_lesson_module is not None:
            guided_lesson['title'] = related_guided_lesson_module.findChild('h3', re.compile('RelatedGuidedLesson-module_lessonTitle_')).text
            guided_lesson['download_link'] = related_guided_lesson_module.findChild('div', re.compile('RelatedGuidedLesson-module_downloadPrintables_')).contents[0]['href']
            worksheet_content['guided_lesson_module'] = guided_lesson
    except Exception:
        capture_failure('guided_lesson_meta')

    # TODO remove debug statement
    print(worksheet_content)

    return worksheet_content