def enqueue_based_on_url_re(self, rsrc_info, link_url, context):
        """
        Handler helper method used to call `enqueue_url_and_context` with the
        context kind attribute set based on `link_url` type.
        """
        if MOD_SUBPAGE_RE.match(link_url):
            if rsrc_info['source_id'] in TESSA_AUDIO_RESOURCES_SUBPAGES:
                context.update({'kind': 'audio_resources_subpage'})
            else:
                context.update({'kind': 'subpage'})
            self.enqueue_url_and_context(link_url, context)

        elif MOD_CONTENT_RE.match(link_url):
            context.update({'kind': 'oucontent'})
            self.enqueue_url_and_context(link_url, context)

        elif MOD_RESOURCE_RE.match(link_url):
            context.update({'kind': 'resource'})
            self.enqueue_url_and_context(link_url, context)

        elif MOD_URL_RE.match(link_url):
            # mod/url/ links are resolved to their destination URL before being processed (usually oucontent)
            head_response = self.make_request(link_url, method='HEAD')
            if not head_response:
                LOGGER.warning('HEAD request failed for link_url ' + link_url)
                return
            new_link_url = head_response.url
            self.enqueue_based_on_url_re(rsrc_info, new_link_url, context)

        else:
            LOGGER.debug('____ Skipping link ' + link_url)
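    # The MOD_*_RE constants used above are module-level regexes defined
    # elsewhere in this file. A rough sketch of what they are assumed to look
    # like (patterns inferred from the OpenLearn Create URL scheme, not copied
    # from the source):
    #
    #   MOD_SUBPAGE_RE = re.compile(r'.*mod/subpage/view\.php.*')
    #   MOD_CONTENT_RE = re.compile(r'.*mod/oucontent/view\.php.*')
    #   MOD_RESOURCE_RE = re.compile(r'.*mod/resource/view\.php.*')
    #   MOD_URL_RE = re.compile(r'.*mod/url/view\.php.*')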
    def on_resource(self, url, page, context):
        LOGGER.info('Processing resource ' + url + ' title:' + context['title'])
        resource_dict = dict(
            kind='TessaResource',
            url=url,
            children=[],
        )
        resource_dict.update(context)

        # attach this resource as another child in parent page
        context['parent']['children'].append(resource_dict)

        pagecontent_div = page.find(class_="pagecontent-content")
        links = pagecontent_div.find_all('a')
        for link in links:
            if link.has_attr('href'):
                link_url = urljoin(url, link['href'])
                link_title = get_text(link)
                if not self.should_ignore_url(link_url):
                    link_context = dict(
                        parent=resource_dict,
                        title=link_title,
                    )
                    self.enqueue_url_and_context(link_url, link_context)
            else:
                LOGGER.warning('Found a link with no href ' + str(link))
def _recusive_section_remover(subtree):
        """
        Nested helper that recursively walks `subtree`, dropping back-links and
        children whose titles mark them as sections. Relies on `breadcrumbs`,
        `section_str`, and `lang` from the enclosing scope.
        """
        breadcrumbs.append(subtree['url'])

        if 'children' in subtree:

            new_children = []
            for child in subtree['children']:

                # remove back-links
                if child['url'] in breadcrumbs:
                    LOGGER.warning('Found a back-link ' + child['url'])
                    continue

                # filter sections
                if 'title' in child:
                    title = child['title']
                    if title.startswith(section_str):
                        pass
                    elif lang == 'sw' and title.startswith('Section'):
                        pass  # special case since certain SW modules are in English
                    else:
                        new_children.append(child)
                else:
                    LOGGER.warning('Found a child with no title ' + child['url'])
                    new_children.append(child)

            subtree['children'] = new_children

            # recurse
            for child in subtree['children']:
                _recusive_section_remover(child)

        breadcrumbs.pop()
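    # A likely invocation of the helper above (the surrounding names are
    # assumptions inferred from the closure variables it uses, not taken from
    # the source):
    #
    #   breadcrumbs = []              # stack of URLs on the current path
    #   section_str = 'Section'       # localized prefix that marks section nodes
    #   _recusive_section_remover(web_resource_tree)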
    def on_oucontent(self, url, page, context):
        LOGGER.info('Processing oucontent ' + url + ' title:' +
                    context['title'])
        oucontent_dict = dict(
            kind='TessaContent',
            url=url,
            children=[],
        )
        oucontent_dict.update(context)

        # attach this page as another child in parent page
        context['parent']['children'].append(oucontent_dict)
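    # These on_<kind> handlers are assumed to be dispatched by BasicCrawler via a
    # kind_handlers mapping registered in the crawler's __init__, roughly:
    #
    #   self.kind_handlers = {
    #       'subpage': self.on_subpage,
    #       'oucontent': self.on_oucontent,
    #       'resource': self.on_resource,
    #       'audio_resources_subpage': self.on_audio_resources_subpage,
    #       'audio_resource_topic_subpage': self.on_audio_resource_topic_subpage,
    #   }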
    def on_subpage(self, url, page, context):
        LOGGER.info('Processing subpage ' + url + ' title:' + context['title'])
        subpage_dict = dict(
            # kind='TessaSubpage',
            url=url,
            children=[],
        )
        subpage_dict.update(context)

        # attach this page as another child in parent page
        context['parent']['children'].append(subpage_dict)

        course_content_div = page.find(class_="pagecontent-content")
        activity_lis = course_content_div.find_all("li", class_="activity")
        for activity_li in activity_lis:
            link = activity_li.find('a')
            if link:
                if link.has_attr('href'):
                    link_url = urljoin(url, link['href'])
                    if not self.should_ignore_url(link_url):
                        rsrc_info = get_resource_info(activity_li)
                        link_title = rsrc_info['title']
                        link_context = dict(
                            parent=subpage_dict,
                            title=link_title,
                        )
                        self.enqueue_based_on_url_re(rsrc_info, link_url,
                                                     link_context)
                    else:
                        LOGGER.debug('Ignoring link ' + link_url +
                                     ' on page ' + url)
                else:
                    LOGGER.warning('Found a link with no href ' + str(link))
    def on_tessa_language_page(self, url, page, context):
        """
        Basic handler that adds current page to parent's children array and adds
        all links on current page to the crawling queue.
        """
        LOGGER.info('Processing tessa_language_page ' + url)
        page_dict = dict(
            # kind='TessaLangWebRessourceTree',
            url=url,
            title=self.get_title(page),
            children=[],
        )
        page_dict.update(context)

        # attach this page as another child in parent page
        context['parent']['children'].append(page_dict)

        course_content_div = page.find(class_="course-content")
        activity_lis = course_content_div.find_all("li", class_="activity")
        for activity_li in activity_lis:
            link = activity_li.find('a')
            if link:
                if link.has_attr('href'):
                    link_url = urljoin(url, link['href'])
                    if not self.should_ignore_url(link_url):
                        rsrc_info = get_resource_info(activity_li)
                        link_title = rsrc_info['title']
                        link_context = dict(
                            parent=page_dict,
                            title=link_title,
                        )
                        self.enqueue_based_on_url_re(rsrc_info, link_url,
                                                     link_context)
                    else:
                        LOGGER.debug('Ignoring link ' + link_url +
                                     ' on page ' + url)
    def on_audio_resources_subpage(self, url, page, context):
        """
        Special handler for pages that link to audio resources on different topics.
        """
        LOGGER.info('Processing audio_resources_subpage ' + url + ' title:' +
                    context['title'])
        subpage_dict = dict(
            # kind='TessaAudioResourcesSubpage',
            title=context['title'],
            url=url,
            children=[],
        )
        subpage_dict.update(context)

        # attach this page as another child in parent page
        context['parent']['children'].append(subpage_dict)

        course_content_div = page.find(class_="pagecontent-content")
        activity_lis = course_content_div.find_all("li", class_="activity")
        for activity_li in activity_lis:
            link = activity_li.find('a')
            if link:
                if link.has_attr('href'):
                    link_url = urljoin(url, link['href'])
                    if not self.should_ignore_url(link_url):
                        rsrc_info = get_resource_info(activity_li)
                        link_title = rsrc_info['title']
                        link_context = dict(
                            parent=subpage_dict,
                            title=link_title,
                        )
                        if MOD_SUBPAGE_RE.match(link_url):
                            link_context.update(
                                {'kind': 'audio_resource_topic_subpage'})
                            self.enqueue_url_and_context(link_url, link_context)
                        else:
                            LOGGER.debug(
                                ':::audio_resources_subpage::: Skipping link '
                                + link_url + ' ' + link_title)
                else:
                    LOGGER.warning('Found a link with no href ' + str(link))
Example #8
#!/usr/bin/env python

from urllib.parse import urljoin

from basiccrawler.crawler import BasicCrawler, LOGGER, logging

LOGGER.setLevel(logging.INFO)


class TakeHomeCrawler(BasicCrawler):
    MAIN_SOURCE_DOMAIN = 'http://chef-take-home-test.learningequality.org'
    START_PAGE = 'http://chef-take-home-test.learningequality.org/'
    START_PAGE_CONTEXT = {'kind': 'channel'}

    SOURCE_DOMAINS = [MAIN_SOURCE_DOMAIN]
    IGNORE_URLS = []

    CRAWLING_STAGE_OUTPUT = 'chefdata/trees/takehome_web_resource_tree.json'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.kind_handlers = {   # mapping from web resource kinds (user-defined) to handler methods
            'channel': self.on_channel_or_topic,
            'topic': self.on_channel_or_topic,
            'audio': self.on_content,
            'video': self.on_content,
            'document': self.on_content,
        }
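    # `on_channel_or_topic` and `on_content` are not shown in this excerpt. A
    # minimal sketch of the shape such BasicCrawler handlers take (an assumption,
    # not the original implementation):
    #
    # def on_content(self, url, page, context):
    #     content_dict = dict(url=url, children=[])
    #     content_dict.update(context)
    #     context['parent']['children'].append(content_dict)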
    def on_audio_resource_topic_subpage(self, url, page, context):
        """
        Process pages like http://www.open.edu/openlearncreate/mod/subpage/view.php?id=67220
        which contain sections on different subtopics, e.g. "The show must go on"
        """
        LOGGER.info('Processing audio_resource_topic_subpage ' + url +
                    ' title:' + context['title'])
        topic_subpage_dict = dict(
            # kind='TessaAudioResourceTopicSubpage',
            title=context['title'],
            url=url,
            children=[],
        )
        topic_subpage_dict.update(context)

        # attach this page as another child in parent page
        context['parent']['children'].append(topic_subpage_dict)

        course_content_div = page.find('div', class_="course-content")
        section_lis = course_content_div.find_all('li', class_="section")

        for section_li in section_lis:
            section_name = section_li.find(class_="sectionname")
            if section_name is None:  # handle empty section_li edge case
                continue
            section_title = get_text(section_name)
            subtopic_dict = dict(
                kind='TessaAudioResourceSection',
                url=url + '#' + quote_plus(section_title),
                title=section_title,
                parent=topic_subpage_dict,
                children=[],
            )
            subtopic_dict['source_id'] = subtopic_dict['url']
            sections_ul = section_li.find('ul', class_="section")
            activity_lis = sections_ul.find_all('li', class_="activity")
            subtopic_description = ''
            for activity_li in activity_lis:
                activity_type = get_modtype(activity_li)
                # grab the resource info up front so the title is available in
                # every branch below (including the skip-log in the else branch)
                rsrc_info = get_resource_info(activity_li)

                if activity_type in ['label', 'heading']:
                    subtopic_description += activity_li.get_text()

                elif activity_type == 'resource':
                    link = activity_li.find('a')
                    link_url = urljoin(url, link['href'])
                    verdict, head_response = self.is_media_file(link_url)
                    if head_response is None:
                        LOGGER.warning('HEAD ' + link_url +
                                       ' did not return response.')
                    # CASE A. Direct-links to media files
                    if verdict:
                        media_rsrc_dict = self.create_media_url_dict(
                            link_url, head_response)
                        media_rsrc_dict['title'] = rsrc_info['title']
                        subtopic_dict['children'].append(media_rsrc_dict)
                    # CASE B. Indirect-links to mp3 files on HTML page (AR only)
                    else:
                        resource_dict = dict(
                            kind='resource',
                            url=link_url,
                            parent=subtopic_dict,
                            title=rsrc_info['title'],
                            children=[],
                        )
                        self.enqueue_url_and_context(link_url, resource_dict)
                else:
                    LOGGER.debug(
                        ':::audio_resource_topic_subpage::: Skipping activity '
                        + str(activity_type) + ' ' + rsrc_info['title'])
            subtopic_dict['description'] = subtopic_description
            topic_subpage_dict['children'].append(subtopic_dict)
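    # Shape of one subtopic_dict node built by the loop above (values are
    # illustrative, based on the example URL and section name in the docstring;
    # the `parent` back-reference is omitted):
    #
    #   {
    #       'kind': 'TessaAudioResourceSection',
    #       'url': 'http://www.open.edu/openlearncreate/mod/subpage/view.php?id=67220#The+show+must+go+on',
    #       'source_id': '...#The+show+must+go+on',
    #       'title': 'The show must go on',
    #       'description': '...',
    #       'children': [],
    #   }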
    def _recursive_restrucutre_walk(subtree, depth):
        # set source_id
        if subtree['kind'] in ['oucontent', 'subpage', 'resource']:
            subtree['source_id'] = subtree['kind'] + ':' + url_to_id(subtree['url'])

        # rename kind to scraper-recognized names
        # special case for non-module overview content pages on the homepage
        if subtree['kind'] == 'oucontent' and depth == 1:
            subtree['kind'] = 'TessaContentPage'
        #
        elif subtree['kind'] == 'oucontent':
            subtree['kind'] = 'TessaModule'
        #
        elif subtree['kind'] == 'subpage':
            subtree['kind'] = 'TessaSubpage'
        #
        elif subtree['kind'] == 'audio_resources_subpage':
            subtree['kind'] = 'TessaAudioResourcesSubpage'
            subtree['source_id'] = subtree['url']
        #
        elif subtree['kind'] == 'audio_resource_topic_subpage':
            subtree['kind'] = 'TessaAudioResourceTopicSubpage'
            subtree['source_id'] = subtree['url']
        #
        # MP3 and PDF media files
        elif subtree['kind'] == 'MediaWebResource':
            if subtree['content-type'] == 'application/pdf':
                subtree['kind'] = 'TessaPDFDocument'
                subtree['source_id'] = subtree['url']
                LOGGER.info('Found PDF ' + subtree['url'])

            elif subtree['content-type'] == 'audio/mp3':
                subtree['kind'] = 'TessaAudioResouce'
                subtree['source_id'] = subtree['url']
                LOGGER.info('Found MP3 ' + subtree['url'])

            else:
                LOGGER.warning('Unsupported format ' +
                               subtree['content-type'] + ' url=' +
                               subtree['url'])
        #
        # handle special case for 'ar' where mp3 resources appear on a subpage, not direct links
        elif subtree['kind'] == 'resource':
            if len(subtree['children']) != 1:
                LOGGER.error('Expected a single mp3 MediaWebResource child on ' +
                             subtree['url'] + ' but found ' +
                             str(len(subtree['children'])))
                if not subtree['children']:
                    return  # nothing to promote; skip this node
            mp3_child_node = subtree['children'][0]
            subtree['children'] = []
            subtree['kind'] = 'TessaAudioResouce'
            subtree['url'] = mp3_child_node['url']
            subtree['source_id'] = subtree['url']
            subtree['content-type'] = mp3_child_node['content-type']
            if 'content-disposition' in mp3_child_node:
                subtree['content-disposition'] = mp3_child_node[
                    'content-disposition']
            if 'content-length' in mp3_child_node:
                subtree['content-length'] = mp3_child_node['content-length']
            LOGGER.info('Found MP3 ' + subtree['url'])
        #
        else:
            LOGGER.warning('Unknown kind ' + subtree['kind'] + ' url=' +
                           subtree['url'])

        # set lang on all nodes based on the top-level channel lang property
        subtree['lang'] = lang

        # recurse
        if 'children' in subtree:
            for child in subtree['children']:
                _recursive_restrucutre_walk(child, depth + 1)
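    # The restructuring walk above is presumably started from the root of the
    # crawled web resource tree, e.g. (an assumption, not shown in this excerpt):
    #
    #   _recursive_restrucutre_walk(web_resource_tree, 0)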