예제 #1
0
def create_document_node(path, title, target_node, source_id, **details):
    """Create a document node for the file at `path` under `target_node`.

    The node's source id is `source_id`, a hyphen, and a slug of `title`
    (spaces replaced by hyphens, then lowercased). Any extra keyword
    arguments in `details` are forwarded unchanged to `nodes.DocumentNode`.
    """
    attachment = files.DocumentFile(path)
    slug = title.replace(" ", "-").lower()
    node_source_id = "{}-{}".format(source_id, slug)
    document = nodes.DocumentNode(
        source_id=node_source_id,
        title=title,
        files=[attachment],
        **details
    )
    target_node.add_child(document)
예제 #2
0
    def construct_channel(self, *args, **kwargs):
        """
        Build the channel tree: one topic per book and, under each topic,
        one document node per chapter obtained by splitting the book's PDF.
        Args:
          - args: positional arguments, forwarded unchanged to `self.get_channel`.
          - kwargs: extra arguments and options, forwarded unchanged to
            `self.get_channel`.
        Returns: the channel node, once `raise_for_invalid_channel` has
        been called on it.
        """
        def make_tags():
            # Built anew on every call so each node owns its own list object.
            return [
                "Teacher facing",
                "Professional development",
                "Life skills",
                "Intercultural skills",
                "Mentorship",
                "Formal contexts",
            ]

        channel = self.get_channel(*args, **kwargs)

        for book in load_json_from_file(JSON_FILE):
            book_title = book['book_title']
            book_source_id = book_title.replace(" ", "_")
            book_url = book['path_or_url']

            book_topic = nodes.TopicNode(
                source_id=book_source_id,
                title=book_title,
                tags=make_tags(),
            )
            channel.add_child(book_topic)

            # Split the book into per-chapter PDFs using its table of contents.
            splitter = pdf.PDFParser(book_url, toc=book['chapters'])
            splitter.open()
            for chapter in splitter.split_chapters():
                chapter_title = chapter['title']
                chapter_file = files.DocumentFile(chapter['path'])
                chapter_node = nodes.DocumentNode(
                    source_id="{} {}".format(book_title, chapter_title),
                    title=chapter_title,
                    author="INTO",
                    tags=make_tags(),
                    files=[chapter_file],
                    license=licenses.get_license(
                        CHANNEL_LICENSE, "INTO", LICENSE_DESCRIPTION),
                    copyright_holder="INTO",
                )
                book_topic.add_child(chapter_node)

        # Check for errors in channel construction before handing it back.
        raise_for_invalid_channel(channel)
        return channel
def add_file_node(target_node, url, title, **details):
    """Attach a document node for `url` to the topic node `target_node`.

    The new node's source id is the target's own `source_id`, a hyphen, and
    a slug of `title` (spaces replaced by hyphens, then lowercased). Extra
    keyword arguments are forwarded unchanged to `nodes.DocumentNode`.
    """
    attachment = files.DocumentFile(path=url)
    slug = title.replace(" ", "-").lower()
    node_source_id = "{}-{}".format(target_node.source_id, slug)
    target_node.add_child(
        nodes.DocumentNode(
            source_id=node_source_id,
            title=title,
            files=[attachment],
            **details
        )
    )
def save_book(book_detail, channel):
    """Create a document node for one book and file it in the channel tree.

    The book is placed under its language topic and reading-level topic
    and, when it has tags, under one tag topic per tag instead of directly
    under the level topic.

    Args:
        book_detail: mapping with the keys "id", "name", "readingLevel",
            "language" (itself a mapping with "id"), "tags", "epubUrl" and
            "pdfUrl" (a mapping of PDF variants, or a falsy value).
        channel: node passed to `get_or_create_language_topic`.

    Raises:
        NoFileAvailableError: when the book has neither a PDF nor an EPUB URL.
    """
    source_id = get_book_source_id(book_detail["id"])
    title = book_detail["name"]
    level_id = book_detail["readingLevel"]
    language = book_detail["language"]
    language_id = language["id"]
    tags = book_detail["tags"]
    epub_url = book_detail["epubUrl"]
    pdf_variants = book_detail["pdfUrl"]

    # Use the first available PDF layout: portrait, then landscape, then booklet.
    pdf_url = ""
    if pdf_variants:
        pdf_url = (pdf_variants.get("portraitUrl", "")
                   or pdf_variants.get("landscapeUrl", "")
                   or pdf_variants.get("bookletUrl", ""))

    if not (pdf_url or epub_url):
        LOGGER.error("No file found for \n {}".format(source_id))
        raise NoFileAvailableError()

    attachments = []
    if pdf_url:
        attachments.append(files.DocumentFile(path=pdf_url))
    if epub_url:
        attachments.append(files.EPubFile(path=epub_url))

    # TODO: get a real license and copyright holder
    book = nodes.DocumentNode(
        source_id=source_id,
        title=title,
        license=licenses.PUBLIC_DOMAIN,
        files=attachments,
    )

    language_topic = get_or_create_language_topic(language, channel)
    level_topic = get_or_create_level_topic(
        level_id, language_id, language_topic)

    if tags:
        # NOTE(review): the same node object is attached under every tag
        # topic -- confirm the node library supports several parents.
        for tag in tags:
            get_or_create_tag_topic(
                tag, language_id, level_id, level_topic).add_child(book)
    else:
        level_topic.add_child(book)
예제 #5
0
def generate_pdf_nodes(data, topic, source=""):
    """
        Walk the chapter data of a split pdf and attach matching nodes to `topic`
        Args:
            - data (iterable of dict) entries with either a 'header' (a section,
              with its nested entries under 'chapters') or a 'chapter' (a
              document, with its file under 'path' and optional 'exercises')
            - topic (TopicNode) node to add sub nodes to
            - source (str) prefix used to build the source ids of new nodes
        Returns None
    """
    for entry in data:
        section_title = entry.get('header')
        if section_title:
            # Section: becomes a sub-topic whose contents are handled recursively
            section_id = "{}-{}".format(source, section_title)
            section = nodes.TopicNode(title=section_title,
                                      source_id=section_id)
            topic.add_child(section)
            generate_pdf_nodes(entry['chapters'], section, source=section_id)
            continue

        chapter_title = entry.get("chapter")
        if not chapter_title:
            # Neither a section nor a chapter: nothing to create
            continue

        # Chapter: one document node for the pdf itself
        chapter_id = "{}-{}".format(source, chapter_title)
        chapter_file = files.DocumentFile(entry['path'])
        document = nodes.DocumentNode(title=chapter_title,
                                      source_id=chapter_id,
                                      copyright_holder=COPYRIGHT_HOLDER,
                                      license=LICENSE,
                                      files=[chapter_file])
        topic.add_child(document)

        # ... followed by one exercise node per exercise, as siblings of it
        exercises = entry.get("exercises") or []
        for position, exercise in enumerate(exercises):
            exercise_node = nodes.ExerciseNode(
                title=chapter_title,
                source_id="{} Exercise {}".format(chapter_id, position),
                description=exercise.get('description'),
                copyright_holder=COPYRIGHT_HOLDER,
                license=LICENSE,
            )
            topic.add_child(exercise_node)
            create_exercise_questions(exercise_node,
                                      exercise.get('questions') or [])
예제 #6
0
def create_slideshow(images, source_id, title, language_name):
    """
        Build a node that presents `images` as a slideshow or as a PDF.

        images: sequence of dicts with a 'url' key and an optional 'caption'
        key; the first image is also used as the node thumbnail.

        When '--slides' is on the command line a SlideshowNode with one slide
        per image is returned. Otherwise the images are combined into a
        single PDF stored in DOCUMENT_DOWNLOAD_DIR (named after the MD5 hex
        digest of `source_id`, and reused if it already exists) and a
        DocumentNode pointing at that PDF is returned.
    """
    thumbnail = files.ThumbnailFile(images[0]['url'])

    if '--slides' not in sys.argv:
        digest = hashlib.md5(source_id.encode('utf-8')).hexdigest()
        pdf_path = '{}{}{}.pdf'.format(
            DOCUMENT_DOWNLOAD_DIR, os.path.sep, digest)

        if not os.path.exists(pdf_path):
            pages = []
            for entry in images:
                page = Image.open(BytesIO(downloader.read(entry['url'])))
                # RGBA pages are flattened to RGB before going into the PDF.
                pages.append(
                    page.convert('RGB') if page.mode == 'RGBA' else page)

            first_page, *remaining_pages = pages
            first_page.save(pdf_path,
                            save_all=True,
                            append_images=remaining_pages)

        return nodes.DocumentNode(
            source_id=source_id,
            title=title,
            license=LICENSE,
            language=LANGUAGE_MAP[language_name],
            files=[thumbnail, files.DocumentFile(pdf_path)],
        )

    slides = []
    for entry in images:
        slides.append(
            files.SlideImageFile(entry['url'],
                                 caption=entry.get('caption', '')))
    return nodes.SlideshowNode(
        source_id=source_id,
        title=title,
        license=LICENSE,
        language=LANGUAGE_MAP[language_name],
        files=[thumbnail] + slides,
    )
def make_content_node(kind, source_id, title, license, filepath, optionals):
    """
    Build the content node matching `kind` from required args and optionals.

    Video, audio and document kinds are supported; each gets a single file
    built from `filepath`. "author" and "description" are read from
    `optionals` for every kind, "thumbnail" only for audio and documents
    (videos are created with `derive_thumbnail=True` instead).
    Returns None for any other kind.
    """
    def shared_fields():
        # Evaluated only for a supported kind, so `optionals` is left
        # untouched when no node is created.
        return {
            "source_id": source_id,
            "title": title,
            "license": license,
            "author": optionals.get("author", None),
            "description": optionals.get("description", None),
        }

    if kind == content_kinds.VIDEO:
        fields = shared_fields()
        return nodes.VideoNode(
            derive_thumbnail=True,  # video-specific data
            files=[files.VideoFile(path=filepath)],
            **fields
        )

    if kind == content_kinds.AUDIO:
        fields = shared_fields()
        return nodes.AudioNode(
            thumbnail=optionals.get("thumbnail", None),
            files=[files.AudioFile(path=filepath)],
            **fields
        )

    if kind == content_kinds.DOCUMENT:
        fields = shared_fields()
        return nodes.DocumentNode(
            thumbnail=optionals.get("thumbnail", None),
            files=[files.DocumentFile(path=filepath)],
            **fields
        )

    return None
예제 #8
0
    def create_leaf_node(self, module, subject_node, subject_id):
        """
        Attach the content described by `module` to `subject_node`, then
        set a role on `subject_node` and on every node below it.

        A '.zip' file is handed to `self.get_scorm_topic_tree` (zips are
        always SCORMs in this case). A '.pdf' file becomes a DocumentNode
        whose source id is '<subject_id>-<module id>'. Any other extension
        adds no content. The role applied is module['role'] when that key is
        present, otherwise roles.LEARNER.
        """
        assert 'file' in module, "Invalid module: {}".format(module)
        if 'file' not in module:
            # Only reachable when assertions are disabled (python -O).
            return

        extension = os.path.splitext(module['file'])[1].lower()
        if extension == '.pdf':
            pdf_license = licenses.SpecialPermissionsLicense(
                copyright_holder="ProFuturo",
                description="FIXME: Get license info")
            pdf_source_id = '{}-{}'.format(subject_id, module['id'])
            pdf_file = files.DocumentFile(path=module['file'])
            pdf_node = nodes.DocumentNode(title=module['title'],
                                          source_id=pdf_source_id,
                                          files=[pdf_file],
                                          license=pdf_license)
            subject_node.add_child(pdf_node)
        elif extension == '.zip':
            self.get_scorm_topic_tree(subject_node, module['file'])

        default_role = roles.LEARNER
        role = module['role'] if 'role' in module else default_role

        # Pre-order walk over the whole subtree rooted at subject_node.
        pending = [subject_node]
        while pending:
            current = pending.pop()
            current.role = role
            pending.extend(list(current.children)[::-1])
예제 #9
0
def _build_tree(node, sourcetree):
    """
    Parse nodes given in `sourcetree` and add as children of `node`.
    """
    for child_source_node in sourcetree:
        try:
            main_file = child_source_node['files'][0] if 'files' in child_source_node else {}
            kind = guess_content_kind(path=main_file.get('path'), web_video_data=main_file.get('youtube_id') or main_file.get('web_url'), questions=child_source_node.get("questions"))
        except UnknownContentKindError:
            continue

        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            node.add_child(child_node)

            source_tree_children = child_source_node.get("children", [])

            _build_tree(child_node, source_tree_children)

        elif kind == content_kinds.VIDEO:
            child_node = nodes.VideoNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=get_license(child_source_node.get("license"), description="Description of license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                derive_thumbnail=True, # video-specific data
                thumbnail=child_source_node.get('thumbnail'),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.AUDIO:
            child_node = nodes.AudioNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.DOCUMENT:
            child_node = nodes.DocumentNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        elif kind == content_kinds.EXERCISE:
            child_node = nodes.ExerciseNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                exercise_data={}, # Just set to default
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            for q in child_source_node.get("questions"):
                question = create_question(q)
                child_node.add_question(question)
            node.add_child(child_node)

        elif kind == content_kinds.HTML5:
            child_node = nodes.HTML5AppNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=child_source_node.get("license"),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)

        else:                   # unknown content file format
            continue

    return node
예제 #10
0
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`.

    Each source node is a dict with a required 'kind' and 'title'. Every kind
    except topics also requires 'source_id' and a 'license' dict, which is
    expanded into the keyword arguments of `get_license`. Topics recurse into
    their 'children'; exercises receive their 'questions' via `add_questions`;
    all other kinds receive their 'files' via `add_files`. Missing or null
    'children', 'questions' and 'files' are treated as empty.

    Raises NotImplementedError when a source node has an unexpected kind.
    Returns `parent_node`.
    """
    # Kinds whose payload is attached with `add_files`, with their node class.
    file_node_classes = {
        VIDEO_NODE: nodes.VideoNode,
        AUDIO_NODE: nodes.AudioNode,
        DOCUMENT_NODE: nodes.DocumentNode,
        HTML5_NODE: nodes.HTML5AppNode,
    }
    # Kept as a list so the membership test compares with `==` and works
    # even for a kind value that is not hashable.
    expected_node_types = [TOPIC_NODE, EXERCISE_NODE] + list(file_node_classes)

    for source_node in sourcetree:
        kind = source_node['kind']
        if kind not in expected_node_types:
            # %-style arguments: a non-string kind (e.g. null in the json)
            # must not turn this report into a TypeError.
            LOGGER.critical('Unexpected node type found: %s', kind)
            raise NotImplementedError(
                'Unexpected node type found in json data.')

        # Metadata accepted by every node class.
        node_args = dict(
            title=source_node["title"],
            author=source_node.get("author"),
            description=source_node.get("description"),
            language=source_node.get('language', None),
            thumbnail=source_node.get("thumbnail"),
        )

        if kind == TOPIC_NODE:
            # Topics carry no license and may omit their source_id.
            child_node = nodes.TopicNode(
                source_id=source_node.get("source_id", None), **node_args)
            parent_node.add_child(child_node)
            build_tree_from_json(child_node,
                                 source_node.get("children") or [])
            continue

        node_args.update(
            source_id=source_node["source_id"],
            license=get_license(**source_node['license']),
        )

        if kind == EXERCISE_NODE:
            child_node = nodes.ExerciseNode(questions=[], **node_args)
            add_questions(child_node, source_node.get("questions") or [])
        else:
            if kind == VIDEO_NODE:
                # video-specific option
                node_args['derive_thumbnail'] = source_node.get(
                    'derive_thumbnail', True)
            child_node = file_node_classes[kind](**node_args)
            add_files(child_node, source_node.get("files") or [])

        parent_node.add_child(child_node)

    return parent_node
예제 #11
0
def build_tree_from_json(parent_node, sourcetree):
    """
    Recursively parse nodes in the list `sourcetree` and add them as children
    to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`.

    Each source node is a dict with a required 'kind' and 'title'. Every kind
    except topics also requires 'source_id' and a 'license' dict (expanded
    into the keyword arguments of `get_license`), and takes a 'role' that
    defaults to `roles.LEARNER`. Topics recurse into their 'children';
    exercises receive their 'questions' via `add_questions`; all other kinds
    receive their 'files' via `add_files`. Missing or null 'children',
    'questions' and 'files' are treated as empty.

    Raises NotImplementedError when a source node has an unexpected kind.
    Returns `parent_node`.
    """
    # Kinds whose payload is attached with `add_files`, with their node class.
    file_node_classes = {
        VIDEO_NODE: nodes.VideoNode,
        AUDIO_NODE: nodes.AudioNode,
        DOCUMENT_NODE: nodes.DocumentNode,
        HTML5_NODE: nodes.HTML5AppNode,
        SLIDESHOW_NODE: nodes.SlideshowNode,
    }
    # Kept as a list so the membership test compares with `==` and works
    # even for a kind value that is not hashable.
    expected_node_types = [TOPIC_NODE, EXERCISE_NODE] + list(file_node_classes)

    for source_node in sourcetree:
        kind = source_node['kind']
        if kind not in expected_node_types:
            # %-style arguments: a non-string kind (e.g. null in the json)
            # must not turn this report into a TypeError.
            LOGGER.critical('Unexpected node kind found: %s', kind)
            raise NotImplementedError('Unexpected node kind found in json data.')

        # Metadata accepted by every node class.
        node_args = dict(
            title=source_node['title'],
            description=source_node.get('description'),
            author=source_node.get('author'),
            aggregator=source_node.get('aggregator'),
            provider=source_node.get('provider'),
            language=source_node.get('language'),
            thumbnail=source_node.get('thumbnail'),
            tags=source_node.get('tags'),
        )

        if kind == TOPIC_NODE:
            # Topics carry no license and may omit their source_id; they get
            # no role either (computed dynamically from descendants).
            child_node = nodes.TopicNode(
                source_id=source_node.get('source_id', None), **node_args)
            parent_node.add_child(child_node)
            build_tree_from_json(child_node,
                                 source_node.get('children') or [])
            continue

        node_args.update(
            source_id=source_node['source_id'],
            license=get_license(**source_node['license']),
            role=source_node.get('role', roles.LEARNER),
        )

        if kind == EXERCISE_NODE:
            child_node = nodes.ExerciseNode(
                exercise_data=source_node.get('exercise_data'),
                questions=[],
                **node_args
            )
            add_questions(child_node, source_node.get('questions') or [])
        else:
            if kind == VIDEO_NODE:
                # video-specific option
                node_args['derive_thumbnail'] = source_node.get(
                    'derive_thumbnail', True)
            child_node = file_node_classes[kind](**node_args)
            add_files(child_node, source_node.get('files') or [])

        parent_node.add_child(child_node)

    return parent_node
def scrape_page(channel):
    """ Read main page for Saylor (https://www.saylor.org/books/)

    Every <h3> inside the main content row becomes a subject topic under
    `channel`, and every <li> of the <ul> that follows it becomes one book:
    a DocumentNode when a "PDF" link is offered, otherwise whatever
    `scrape_book` returns for the book's HTML link. Subjects without a book
    list and books without a usable link are skipped.

    VIDEO_MAPPING is written to VIDEO_MAP_JSON when the function exits,
    whether or not scraping succeeded.
    """
    try:
        page = BeautifulSoup(read_source(BASE_URL, loadjs=True), 'html.parser')
        # attrs must be a dict; the previous {'class', 'row'} was a set
        # literal, which does not express "class attribute equals row".
        contents = page.find('div', {
            'class': 'main-content'
        }).find('div', {'class': 'row'})

        # Site doesn't have special designation for subjects, so get headers
        for subject in contents.find_all('h3'):

            # Create subject topic
            title = subject.text.replace(u'\xa0', u' ').replace('\n', '')
            source_id = generate_id(title)
            category_topic = nodes.TopicNode(source_id=source_id, title=title)
            channel.add_child(category_topic)
            LOGGER.info(title)

            # Get list from subject
            book_list = subject.findNext('ul')
            if book_list is None:
                LOGGER.warning("    No book list found for " + title)
                continue

            for book in book_list.find_all('li'):
                license = LICENSE

                # Some books have subsections for different formats/licenses
                # e.g. See Business-General/Miscellaneous > Information Systems for Business and Beyond
                format_note = book.find('small')
                if format_note:
                    # Determine what license to use
                    for choice in licenses.choices:
                        if choice[0] in format_note.text:
                            license = choice[0]
                            break
                    booktitle = book.contents[0]
                    LOGGER.info("    " + booktitle)
                    # Download one of the sublinks
                    for sublink in book.find_all('a'):
                        if not sublink.get('href'):
                            continue
                        elif "PDF" in sublink.text:
                            category_topic.add_child(
                                nodes.DocumentNode(
                                    source_id=source_id +
                                    os.path.basename(sublink['href']),
                                    title=booktitle,
                                    license=license,
                                    copyright_holder=COPYRIGHT_HOLDER,
                                    files=[
                                        files.DocumentFile(
                                            path=sublink['href'])
                                    ]))
                            break  # only need to download one format of the book
                        elif "HTML" in sublink.text:
                            html_node = scrape_book(sublink['href'],
                                                    license=license)
                            if html_node:
                                category_topic.add_child(html_node)
                                break  # only need to download one format of the book

                # Most book links go straight to an html page
                else:
                    link = book.find('a')
                    # Same rule as for sublinks above: no href, nothing to scrape
                    if link is None or not link.get('href'):
                        continue
                    html_node = scrape_book(link['href'], license)
                    if html_node:
                        category_topic.add_child(html_node)
    finally:
        # No matter what, add link to video mapping for future runs
        with open(VIDEO_MAP_JSON, "w") as videojson:
            json.dump(VIDEO_MAPPING, videojson)