def scrape_activity(relative_url):
    """Scrape a single activity page into a dict and download its main image.

    Every field is collected on a best-effort basis: when the expected
    element is absent from the page the corresponding key is simply left
    out of the result instead of aborting the scrape.

    Args:
        relative_url: Value forwarded to ``get_url(relative_url=...)`` to
            build the address of the activity page.

    Returns:
        A dict that always holds ``'grade_levels'`` and ``'subject_matter'``
        and, when found on the page, ``'title'``, ``'description'``,
        ``'download_link'``, ``'lesson_text'`` and ``'image_link'``.
    """
    print(relative_url)
    activity = {}
    page = make_soup(get_url(relative_url=relative_url))

    # Errors produced when a lookup hits a missing element (None result)
    # or a missing attribute.  Catching only these, rather than using a
    # bare ``except``, keeps Ctrl-C / SystemExit working.
    # NOTE(review): assumes make_soup returns a BeautifulSoup tree -- confirm.
    lookup_errors = (AttributeError, KeyError, TypeError)

    try:
        activity['title'] = page.findChild('h1').text
    except lookup_errors:
        pass

    try:
        activity['description'] = page.findChild(
            'div', {'class': 'description'}).text
    except lookup_errors:
        pass

    try:
        actions = page.findChild('div', {'class': 'main-actions'})
        activity['download_link'] = actions.findChild('a')['href']
    except lookup_errors:
        pass

    activity['grade_levels'] = get_activity_iterable_elements(
        page, {'class': 'grades'})
    activity['subject_matter'] = get_activity_iterable_elements(
        page, {'class': 'subjects'})

    try:
        activity['lesson_text'] = page.findChild(
            'div', {'class': 'bottom-text'}).text
    except lookup_errors:
        pass

    try:
        activity['image_link'] = page.findChild(
            'img', {'class': 'main-image'})['src']
        # Everything after the last dot of the link is used as the extension.
        file_format = re.findall(r".*\.(\S*)", activity['image_link'])[0]
        download_resource(activity['image_link'], __image_directory__,
                          activity['title'], file_format)
    except Exception:
        # Deliberately broad: the image is optional, and a missing title,
        # an extension-less link or a failed download must not abort the
        # scrape.  Unlike a bare ``except`` this still lets
        # KeyboardInterrupt and SystemExit propagate.
        pass

    # todo debugging code
    print(activity)
    return activity
def get_last_pagination_num(url):
    """Return the page number shown on the last pagination link at *url*.

    Args:
        url: Address handed straight to ``make_soup``.

    Returns:
        The text of the final matching pagination anchor converted to int.

    Raises:
        IndexError: If the page contains no matching pagination anchors.
    """
    # Pagination anchors are those whose title attribute contains "Page ";
    # the final match is taken to be the last page.
    pagination_links = make_soup(url).select('a[title*="Page "]')
    final_link = pagination_links[-1]
    return int(final_link.string)
def get_item_urls(content, selector, limit=0):
    """Collect the href of every element matching *selector* on a paginated listing.

    Args:
        content: Value forwarded to ``get_url(content=...)`` to build the
            listing address.
        selector: CSS selector passed to ``soup.select`` on each page.
        limit: Maximum number of pages to visit.  ``0`` (the default) or a
            negative value visits every page.

    Returns:
        A set with the ``href`` attribute of each selected element.
    """
    last_page_num = get_last_pagination_num(get_url(content=content))
    if limit > 0:
        # Clamp to the real page count: a limit larger than the number of
        # available pages must not trigger requests for nonexistent pages.
        last_page_num = min(limit, last_page_num)

    item_urls = set()
    # loop through all the available pages starting at the first pagination url
    for page_num in range(1, last_page_num + 1):
        # todo debugging code:
        print('page ' + str(page_num) + ' of ' + str(last_page_num))
        soup = make_soup(
            get_url(content=content, query_string='?page=' + str(page_num)))
        # grab every link matched by the selector on this page
        item_urls.update(link.attrs['href'] for link in soup.select(selector))

    # todo debugging code:
    print(item_urls)
    return item_urls
def scrape_lesson_plan(relative_url):
    """Scrape a lesson-plan page into a dict.

    Args:
        relative_url: Value forwarded to ``get_url(relative_url=...)``; it
            must contain ``lesson-plan/<name>/`` because the name is taken
            from the URL itself.

    Returns:
        A dict with the keys ``'name'``, ``'grade_levels'``,
        ``'subject_matter'``, ``'attachments'`` (a list of
        ``{'label': ..., 'href': ...}`` dicts) and ``'learning_objectives'``.

    Raises:
        IndexError: If *relative_url* does not match the lesson-plan pattern.
        KeyError: If a matched attachment anchor lacks ``href`` or
            ``aria-label``.
        AttributeError: If the "Learning Objectives" heading (or the
            paragraph after it) is not found.
    """
    lesson_plan = {}
    page = make_soup(get_url(relative_url=relative_url))

    # TODO think this might be broken
    lesson_plan['name'] = re.findall(r'lesson-plan\/(.+)\/', relative_url)[0]
    lesson_plan['grade_levels'] = get_iterable_elements(page, 'Grade')
    lesson_plan['subject_matter'] = get_iterable_elements(page, 'Subject')

    # NOTE(review): the second argument is a list, not an attribute dict;
    # in BeautifulSoup a non-dict value here is presumably treated as a CSS
    # class filter.  If the intent was "anchors whose href contains
    # 'attachment'" this should be {'href': re.compile('attachment')} --
    # confirm against a live page before changing.
    attachments = page.findChildren('a', ['href', re.compile('attachment')])

    # The human-readable label comes from aria-label and the link target
    # from href (the two were previously stored under each other's key).
    lesson_plan['attachments'] = [
        {'label': link['aria-label'], 'href': link['href']}
        for link in attachments
    ]

    objectives_heading = page.findChild('h4', text='Learning Objectives')
    lesson_plan['learning_objectives'] = (
        objectives_heading.find_next('p').contents[0])
    return lesson_plan
def scrape_workbook(relative_url):
    """Scrape a workbook page into a dict and download its main image.

    Title, description and download link are collected on a best-effort
    basis (the key is omitted when the element is missing).  The main image
    is required: its absence raises.

    Args:
        relative_url: Value forwarded to ``get_url(relative_url=...)`` to
            build the address of the workbook page.

    Returns:
        A dict that always holds ``'grade_levels'``, ``'subject_matter'``
        and ``'image_link'`` and, when found on the page, ``'title'``,
        ``'description'`` and ``'download_link'``.

    Raises:
        TypeError: If the page has no ``img`` with class ``main-image``.
        IndexError: If the image link contains no dot to take an extension
            from.
    """
    print(relative_url)
    packet = {}
    page = make_soup(get_url(relative_url=relative_url))

    # Errors produced when a lookup hits a missing element (None result)
    # or a missing attribute.  Catching only these, rather than using a
    # bare ``except``, keeps Ctrl-C / SystemExit working.
    # NOTE(review): assumes make_soup returns a BeautifulSoup tree -- confirm.
    lookup_errors = (AttributeError, KeyError, TypeError)

    try:
        packet['title'] = page.findChild('h1').text
    except lookup_errors:
        pass

    try:
        packet['description'] = page.findChild(
            'div', {'class': 'description'}).text
    except lookup_errors:
        pass

    try:
        packet['download_link'] = page.findChild(
            'a', {'class': 'download-link'})['href']
    except lookup_errors:
        pass

    packet['grade_levels'] = get_packet_iterable_elements(
        page, {'class': 'grades'})
    packet['subject_matter'] = get_packet_iterable_elements(
        page, {'class': 'subjects'})

    packet['image_link'] = page.findChild(
        'img', {'class': 'main-image'})['src']
    # Everything after the last dot of the link is used as the extension.
    file_format = re.findall(r".*\.(\S*)", packet['image_link'])[0]

    print('calling download_resource...')
    # Name the file after the workbook and keep its real extension; the
    # previous placeholder values ('test' / 'jpg') gave every workbook image
    # the same name and extension.  'test' remains only as the fallback for
    # pages without a title.
    download_resource(url=packet['image_link'],
                      relative_path='./downloads/packets/images',
                      file_name=packet.get('title', 'test'),
                      file_format=file_format)

    # todo debugging code
    print(packet)
    return packet
def scrape_worksheet(relative_url):
    """Scrape a worksheet page into a dict and download its preview image.

    Each optional section (image, description, worksheet file, related set,
    guided lesson) is scraped independently; when one fails the failure is
    reported through ``capture_failure`` with a section tag and scraping
    continues with the next section.

    Args:
        relative_url: Path appended to ``__absolute_url__``; it must contain
            ``article/<name>/`` because the worksheet name is taken from it.

    Returns:
        A dict that always holds ``'name'`` and ``'grade_levels'`` and, for
        each section that was scraped successfully, ``'description'``,
        ``'worksheet_file_format'``, ``'original_download_link'``,
        ``'set_link'``, ``'set_title'`` and ``'guided_lesson_module'``.

    Raises:
        IndexError: If *relative_url* does not match the article pattern.
    """
    worksheet_content = {}
    url = __absolute_url__ + relative_url
    worksheet_page = make_soup(url)
    worksheet_content['name'] = re.findall(r'article\/(.+)\/', relative_url)[0]

    # grab the tag of primary main content of this worksheet
    worksheet_module_content = worksheet_page.findChildren(
        'div', re.compile('worksheet-module_content_'))

    # Each section below catches Exception (not a bare ``except``) so that a
    # scraping problem is recorded via capture_failure while
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        # download the associated image
        image_tag = worksheet_module_content[0].findChildren('img')[0]
        file_format = re.findall(r'/\d+\/.*\.(\w*)', image_tag['src'])[0]
        download_resource(image_tag['src'], __image_directory__,
                          worksheet_content['name'], file_format)
    except Exception:
        capture_failure('image_meta')

    try:
        # the written description of the worksheet
        worksheet_content['description'] = str(
            worksheet_module_content[0].findChild('p').contents)
    except Exception:
        capture_failure('description')

    try:
        worksheet_tag = worksheet_module_content[0].findChildren(
            'a', re.compile('worksheet-module_mainActionButton_'))[0]
        file_format = re.findall(r'/\d+\/.*\.(\w*)', worksheet_tag['href'])[0]
        # TODO: downloading the worksheet file itself needs a subscription
        # first, so only its metadata is recorded for now.
        worksheet_content['worksheet_file_format'] = file_format
        worksheet_content['original_download_link'] = worksheet_tag['href']
    except Exception:
        capture_failure('worksheet_meta')

    try:
        # TODO: this fails sometimes. Do I care enough to fix it?
        # related worksheet set e.g., trace the letters set
        worksheet_set_tag = worksheet_module_content[0].findChild(
            'a', re.compile('Action-module_action'))
        if worksheet_set_tag is not None:
            if worksheet_set_tag.has_attr('href'):
                worksheet_content['set_link'] = worksheet_set_tag['href']
            if worksheet_set_tag.has_attr('title'):
                worksheet_content['set_title'] = worksheet_set_tag['title']
    except Exception:
        capture_failure('set_meta')

    # NOTE(review): this assignment used to appear twice in a row; the
    # duplicate was dropped.  scrape_lesson_plan also stores
    # 'subject_matter' via get_iterable_elements(page, 'Subject'), so the
    # second line may have been meant to do the same here -- confirm.
    worksheet_content['grade_levels'] = get_iterable_elements(
        worksheet_page, 'Grade')

    # grab the link to the related guided lesson module
    try:
        # Must be a dict: the fields are stored under string keys.  With the
        # former list this section raised TypeError on every page.
        guided_lesson = {}
        related_guided_lesson_module = worksheet_page.findChildren(
            'div', re.compile('RelatedGuidedLesson-module_container'))[0]
        if related_guided_lesson_module is not None:
            guided_lesson['title'] = related_guided_lesson_module.findChild(
                'h3',
                re.compile('RelatedGuidedLesson-module_lessonTitle_')).text
            guided_lesson['download_link'] = (
                related_guided_lesson_module.findChild(
                    'div',
                    re.compile('RelatedGuidedLesson-module_downloadPrintables_')
                ).contents[0]['href'])
            worksheet_content['guided_lesson_module'] = guided_lesson
    except Exception:
        capture_failure('guided_lesson_meta')

    # TODO remove debug statement
    print(worksheet_content)
    return worksheet_content