def construct_channel(self, **kwargs):
    """Build the sample channel: one "Ebooks" topic holding a single ePub document."""
    channel = self.get_channel(**kwargs)

    # Topic that will contain the ebook
    ebooks_topic = TopicNode(
        source_id="ebooksfolder",
        title="Ebooks",
        language=languages.getlang('en').code,
    )
    channel.add_child(ebooks_topic)

    # Wrap the sample ePub file in a DocumentNode
    sample_epub = EPubFile(path='samplefiles/documents/laozi_tao-te-ching.epub')
    tao_te_ching = DocumentNode(
        source_id="<en_doc_id>",
        title='Tao Te Ching',
        author='Lao Zi',
        description='This is a sample epub document',
        license=licenses.PUBLIC_DOMAIN,
        language=languages.getlang('en').code,
        files=[sample_epub],
    )
    ebooks_topic.add_child(tao_te_ching)

    return channel
def parse_page(self, channel, page):
    """Parse the course-list page.

    Creates one TopicNode per category filter found in the page's filter
    nav, then extracts the course list from the embedded `__data__` script
    tag and attaches a TopicNode per course under its category, recursing
    into each course via `self.parse_course`.
    """
    categories = {}

    # One TopicNode per category filter link
    filters = page.find("nav", class_="course-list__filters").find_all("a")
    for category in filters:
        category_id = category["data-filterby"]
        # Exclude the pseudo-filter which contains all the courses
        if category_id == "all":
            continue
        category_node = TopicNode(source_id=category.text, title=category.text)
        categories[category_id] = category_node
        channel.add_child(category_node)

    # Course data is embedded as JS inside <script id="__data__">.
    data = page.find("script", {"id": "__data__"}).text
    # FIX: raw string literal -- "\[" in a non-raw string is an invalid
    # escape sequence (SyntaxWarning since Python 3.12).
    pattern = re.compile(r"courses: \[(.*?)}]")
    # The split on "}, " strips the closing brace of every object (the last
    # one lost its brace to the regex), so "}" is re-appended before parsing.
    # NOTE(review): this breaks if any course field contains "}, " -- fragile.
    courses = pattern.search(data).group(1).split("}, ")
    for item in courses:
        course = json.loads(item + "}")
        course_url = "{base}course/{slug}?enroll-success=1".format(
            base=BASE_URL, slug=course["slug"])
        course_node = TopicNode(
            source_id="{lang}-{course}".format(lang=CHANNEL_LANGUAGE,
                                               course=course["slug"]),
            # Titles appear to contain literal \uXXXX escapes; decode them.
            title=course["title"].encode("utf-8").decode("unicode_escape"),
            thumbnail=course["image"],
        )
        categories[course["category"]].add_child(course_node)
        self.parse_course(course_node, course_url)
def get_files(topic_name, directory, files):
    """Return a TopicNode for `topic_name` containing one node per file in `files`."""
    topic = TopicNode(title=topic_name, source_id="{}_id".format(topic_name))
    for filename in files:
        # H5P files get no explicit title; all others are titled after the
        # filename without its extension.
        title = None if filename.endswith("h5p") else os.path.splitext(filename)[0]
        child = get_file(topic_name, directory, filename, title)
        if child:
            topic.add_child(child)
    return topic
def construct_channel(*args, **kwargs):
    """Create the channel with two topics built from Wikipedia list pages."""
    channel = create_channel(*args, **kwargs)

    # (source_id, title, Wikipedia list URL) for each top-level topic, in order
    topic_specs = (
        ("List_of_citrus_fruits", "Citrus!",
         "https://en.wikipedia.org/wiki/List_of_citrus_fruits"),
        ("List_of_potato_cultivars", "Potatoes!",
         "https://en.wikipedia.org/wiki/List_of_potato_cultivars"),
    )
    for source_id, title, url in topic_specs:
        topic = TopicNode(source_id=source_id, title=title)
        channel.add_child(topic)
        add_subpages_from_wikipedia_list(topic, url)

    return channel
def construct_channel(self, **kwargs):
    """Build the channel as a two-level tree (title -> group) of HTML5 apps.

    Rows from `index.no_dl_index()` appear to arrive grouped, so a change in
    title or group starts a new parent node.
    """
    channel = self.get_channel(**kwargs)

    current_title = None
    current_group = None
    title_node = None
    group_node = None
    counter = 0
    for metadata, zfilename, (title, group) in index.no_dl_index():
        counter += 1
        # New title => start a new top-level topic; group context resets.
        if title != current_title:
            current_title = title
            title_node = TopicNode(source_id=title + str(counter),
                                   title=replace(title))
            channel.add_child(title_node)
            current_group = None
        # New group within the current title => new second-level topic.
        if group != current_group:
            current_group = group
            group_node = TopicNode(source_id=title + group + str(counter + 0.5),
                                   title=replace(group))
            title_node.add_child(group_node)
        app_node = HTML5AppNode(
            title=metadata.title,
            description=metadata.description,
            source_id=zfilename + str(counter + 0.9),
            license=LICENCE,
            language='es',
            files=[HTMLZipFile(path=zfilename)],
        )
        group_node.add_child(app_node)
    return channel
def build_pdf_topics(main_topic, sections, lang_code):
    """
    Adds the documents from the sections tree to the `main_topic`.
    - CASE A = no children => add as DocumentNode
    - CASE B = has children => add as TopicNode and add all children as DocumentNode
    """
    LICENSE = get_license("CC BY-NC-SA", copyright_holder=POINTB)

    def _doc_node(entry, description):
        # One DocumentNode per PDF; source_id combines filename and language.
        abspath = entry['path']
        filename = os.path.basename(abspath)
        return DocumentNode(
            title=entry['title'],
            description=description,
            source_id='%s-%s' % (filename, lang_code),
            license=LICENSE,
            aggregator=LE,
            language=lang_code,
            role=roles.COACH,
            files=[DocumentFile(path=abspath, language=lang_code)])

    for i, section in enumerate(sections):
        if 'children' not in section:
            # CASE A: leaf section -> document directly under main topic
            main_topic.add_child(_doc_node(
                section,
                'Chapter from A GUIDE TO BECOMING A 21ST CENTURY TEACHER'))
        else:
            # CASE B: section with children -> intermediate TopicNode
            section_topic = TopicNode(title=section['title'],
                                      source_id="pointb_section_" + str(i))
            main_topic.add_child(section_topic)
            for subsection in section['children']:
                section_topic.add_child(_doc_node(subsection, ''))
    return main_topic
def construct_channel(self, **kwargs):
    """Build a one-topic channel containing a single potato-growing PDF."""
    channel = self.get_channel(**kwargs)

    topic = TopicNode(title="Potatoes!", source_id="<potatos_id>")
    channel.add_child(topic)

    pdf = DocumentFile(
        path='https://www.gov.mb.ca/inr/pdf/pubs/mafri-potatoe.pdf',
        language='en')
    article = DocumentNode(
        title='Growing potatoes',
        description='An article about growing potatoes on your rooftop.',
        source_id='pubs/mafri-potatoe',
        license=get_license('CC BY', copyright_holder='University of Alberta'),
        language='en',
        files=[pdf],
    )
    topic.add_child(article)

    return channel
def addAssignment(a_info):
    """Create a TopicNode for an assignment dict.

    `a_info` must contain 'title' and 'id'; 'description' is optional and is
    simply omitted from the node when absent.
    """
    try:
        assignment = TopicNode(title=a_info['title'],
                               source_id=a_info['id'],
                               description=a_info['description'],
                               language='en',
                               derive_thumbnail=True,
                               thumbnail=None)
    except KeyError:
        # 'description' key missing -- build the node without it.
        # (Was a bare `except:`, which silently swallowed every error,
        # including KeyboardInterrupt and real TopicNode failures.)
        assignment = TopicNode(title=a_info['title'],
                               source_id=a_info['id'],
                               language='en',
                               derive_thumbnail=True,
                               thumbnail=None)
    return assignment
def test_generate_tiled_thumbnail(self, document, html, video, audio):
    """A topic's thumbnail is tiled from its processed children's thumbnails."""
    topic = TopicNode('test-topic', 'Topic')
    for fixture in (document, html, video, audio):
        topic.add_child(fixture)

    config.THUMBNAILS = True
    # Children must be processed before the topic so their thumbnails exist.
    for child in topic.children:
        child.process_files()
    filenames = topic.process_files()

    assert len(filenames) == 1, 'expected one filename'
    self.check_has_thumbnail(topic)
def parse_topics(self, topics, channel):
    """ Parse the topics on the site. """
    final_topics = []
    main_topics = []
    for topic in topics:
        href = topic["href"].split(MAIN_PAGE_HREF)
        subject = href[-1].split("/")[0]
        if subject in SUBJECT_BLACKLIST:
            continue
        # Build the subject TopicNode from the link's text and target
        subject_link = BASE_URL.format(href[-1])
        subject_title = topic.text.strip()
        subject_topic = TopicNode(source_id=subject_title, title=subject_title)
        # An "l2" list item marks a subtopic of the most recent main topic;
        # anything else hangs directly off the channel. Parent is resolved
        # before this topic is appended, so [-1] is the previous main topic.
        is_subtopic = topic.parent.parent.attrs["class"][0] == "l2"
        parent = main_topics[-1] if is_subtopic else channel
        main_topics.append(subject_topic)
        final_topics.append((subject_topic, subject_link, parent))
    return final_topics
def hier(medium, curriculum_tags):
    """Attach curriculum tags for `medium` into the global `nodes` tree.

    Returns the TopicNodes for tags that are not an ancestor of any other
    tag (the "leaf" tags). Children whose ancestor node was not created yet
    on the first pass are re-attached in a second pass.
    """
    out_tags = []
    all_ancestors = []
    for tag in curriculum_tags:
        all_ancestors.extend(tag['ancestor_ids'])

    # Failed attachments grouped by (medium, ancestor). FIX: the previous
    # plain dict kept only the LAST failed child per ancestor, silently
    # dropping siblings that failed under the same missing ancestor.
    retry = {}
    for tag in sorted(curriculum_tags, key=lambda x: x['id']):
        slug = tag['slug']
        _id = tag['id']
        name = tag['name']
        if not tag['ancestor_ids']:
            ancestor = "ROOT"
        else:
            ancestor = tag['ancestor_ids'][-1]
        # attach to tree
        if _id not in nodes[medium]:
            nodes[medium][_id] = TopicNode(source_id=slug, title=name)
        try:
            add_child_replacement(nodes[medium][ancestor],
                                  nodes[medium][_id],
                                  before=True)
        except Exception:
            # Ancestor node doesn't exist yet -- queue for the retry pass.
            retry.setdefault((medium, ancestor), []).append(nodes[medium][_id])
        if _id not in all_ancestors:
            out_tags.append(nodes[medium][_id])

    for (retry_medium, retry_ancestor), children in retry.items():
        for child in children:
            add_child_replacement(nodes[retry_medium][retry_ancestor],
                                  child, before=True)

    assert out_tags
    return out_tags
def parse_through_tree(tree, parent_topic, as_booklist):
    """
    Recursively parsing through the tree and adding TopicNodes and DocumentNodes.

    Parameters:
    * tree - The tree that contains information about category, publisher,
      language, level, and book and is going to be parsed
    * parent_topic - The parent node that will be attached with Nodes created later
    * as_booklist - the list of books from African Storybook
    """
    for topic_name in sorted(tree):
        subtree = tree[topic_name]
        # Integer-like keys are reading levels; other keys keep their name.
        try:
            title = "Level {}".format(int(topic_name))
        except ValueError:
            title = topic_name
        node = TopicNode(
            source_id="{}_{}".format(parent_topic.source_id,
                                     topic_name.replace(" ", "_")),
            title=title,
        )
        # Lists hold books directly; anything else is a nested sub-tree.
        if type(subtree) is list:
            add_node_document(subtree, node, as_booklist)
        else:
            parse_through_tree(subtree, node, as_booklist)
        # Only add the topic when it actually ended up with children.
        if node.children:
            parent_topic.add_child(node)
def construct_channel(*args, **kwargs):
    """Create the channel and add one topic built from a Wikipedia list page."""
    channel = create_channel(*args, **kwargs)
    cities = TopicNode(source_id="List_of_largest_cities", title="Cities!")
    channel.add_child(cities)
    add_subpages_from_wikipedia_list(
        cities, "https://en.wikipedia.org/wiki/List_of_largest_cities")
    return channel
def make_topic_for_settings(title, ffmpeg_settings):
    """ Assumes global VIDEO_URLS available. """
    topic = TopicNode(
        source_id=title,
        title=title,
        description='',
        author=None,
        language=getlang('en').id,
        thumbnail=None,
    )
    # One child per URL, titled "Video 1", "Video 2", ...
    for vid_number, video_url in enumerate(VIDEO_URLS, start=1):
        video_node = make_video_node('Video {}'.format(vid_number),
                                     video_url,
                                     ffmpeg_settings=ffmpeg_settings)
        topic.add_child(video_node)
    return topic
def construct_channel(self, **kwargs):
    """Build a channel holding one "Potatoes!" topic with a single PDF document."""
    channel = self.get_channel(**kwargs)

    topic = TopicNode(title="Potatoes!", source_id="<potatoes_id>")
    channel.add_child(topic)

    pdf = DocumentFile(
        path="https://www.gov.mb.ca/inr/pdf/pubs/mafri-potatoe.pdf",
        language="en",
    )
    topic.add_child(DocumentNode(
        title="Growing potatoes",
        description="An article about growing potatoes on your rooftop.",
        source_id="pubs/mafri-potatoe",
        license=get_license("CC BY", copyright_holder="University of Alberta"),
        language="en",
        files=[pdf],
    ))

    return channel
def get_node(_path, rootparent, nodes):
    """Return (creating if needed) the TopicNode for the topic path `_path`.

    `nodes` maps path tuples to already-created TopicNodes; missing parent
    nodes are created recursively up to `rootparent`. Titles are cleaned up
    via the `badlist`/`arabic` lookup tables before a node is created.
    Mutates the module-level `badcount` counter.
    """
    global badcount
    path = tuple(_path)
    # Fast path: node already exists for this exact path.
    if path in nodes:
        return nodes[path]
    # Resolve (and create if necessary) the parent node first.
    if len(path) > 1:
        parent = get_node(path[:-1], rootparent, nodes)
        assert parent
    else:
        parent = rootparent
    if path not in nodes:
        title = path[-1]
        # Apply manual renames from the badlist table.
        if title in badlist.rename:
            badcount = badcount + 1
            print("badlist: RENAME", title, badcount)
            title = badlist.rename[title]
        # mangle title:
        if arabic.grade in title:
            print("grade: SKIP GRADE", title)
        else:
            print("grade: NO GRADE", title)
            # Look for a drop-word; presumably its presence means the title
            # should be replaced by a canonical subject name -- TODO confirm.
            drop = False
            for drop_word in arabic.drop_words:
                if drop_word in title:
                    drop = drop_word
                    print("grade: ", drop)
            if drop:
                newtitle = False
                for subject in arabic.subjects:
                    if subject in title:
                        title = arabic.subjects[subject]
                        newtitle = True
                assert newtitle, title
                print("grade: CHANGE TO", title)
                # check if new title present!
                path = list(path)
                path[-1] = title
                path = tuple(path)
                # The rewritten title may map onto an existing node.
                if path in nodes:
                    print("NEW PATH")
                    return nodes[path]
        nodes[path] = TopicNode(source_id="topic" + title, title=title)
        # Blacklisted titles get a node but are NOT attached to the tree.
        if title in badlist.badlist:
            badcount = badcount + 1
            print("badlist: BAD: ", title, badcount)
            return nodes[path]  # unconnected!
        parent.add_child(nodes[path])
    return nodes[path]
def construct_channel(self, **kwargs):
    """Build the channel: one topic per category, one HTML5 app per crawled link."""
    channel = self.get_channel(**kwargs)
    for name, _id in catnum.items():
        cat_node = TopicNode(source_id=str(_id), title=name)
        channel.add_child(cat_node)
        for link in list(crawl.get_all_links(_id)):
            zipfilename, title = localise.zip_from_url(link)
            appzip = HTMLZipFile(zipfilename)
            # A "<zip>_2.jpg" next to the zip file serves as the thumbnail.
            thumb_path = zipfilename + "_2.jpg"
            thumb = ThumbnailFile(thumb_path) if os.path.exists(thumb_path) else None
            zipnode = HTML5AppNode(source_id=link,
                                   title=title,
                                   license=licenses.CC_BY,
                                   copyright_holder=CHANNEL_NAME,
                                   files=[appzip],
                                   author=crawl.author_lookup[link],
                                   thumbnail=thumb)
            zipnode.validate()
            cat_node.add_child(zipnode)
        cat_node.validate()
    print("DONE")
    return channel
def construct_channel(self, **kwargs):
    """Scrape goalkicker.com: one topic per book page, each holding its PDF."""
    channel = self.get_channel(**kwargs)

    # Soupify goalkicker main page
    gk_url = 'https://' + self.channel_info['CHANNEL_SOURCE_DOMAIN'] + '/'
    gk_soup = get_soup(gk_url)

    # Every bookContainer element links to one book page
    book_page_urls = [gk_url + container.find('a')['href']
                      for container in gk_soup.find_all(class_='bookContainer')]

    for page_url in book_page_urls:
        # Soupify the book page and extract its metadata
        page_soup = get_soup(page_url)
        book_info = parse_book_info(page_soup)
        book_info['absolute_url'] = page_url + book_info['relative_url']

        # One topic per book subject, with the PDF document underneath
        subject_topic = TopicNode(title=book_info['subject'],
                                  source_id='topic/' + book_info['subject'])
        channel.add_child(subject_topic)
        subject_topic.add_child(DocumentNode(
            title=book_info['title'],
            description=book_info['description'],
            source_id=book_info['source_id'],
            license=get_license('CC BY-SA', copyright_holder='Creative Commons'),
            language='en',
            files=[DocumentFile(path=book_info['absolute_url'], language='en')],
        ))

    return channel
def construct_channel(self, *args, **kwargs):
    """Build a channel mirroring Wikipedia's "Articles containing video clips" category."""
    channel = self.get_channel(**kwargs)

    videos_topic = TopicNode(
        source_id="/wiki/Category:Articles_containing_video_clips",
        title="Articles containing video clips")
    channel.add_child(videos_topic)

    # HTML5 app node for the category page itself
    thumbnail_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ee/A_Is_for_Atom_1953.webm/220px--A_Is_for_Atom_1953.webm.jpg'
    page = download_wikipedia_page('/wiki/Category:Articles_containing_video_clips', thumbnail_url, 'A Is for Atom')
    videos_topic.add_child(page)

    # Video node plus one subtitle file per language
    video_url = 'https://upload.wikimedia.org/wikipedia/commons/e/ee/A_Is_for_Atom_1953.webm'
    video_node = VideoNode(title='A Is for Atom 1953',
                           source_id='A_Is_for_Atom_1953.webm',
                           files=[VideoFile(path=video_url)],
                           license=licenses.PublicDomainLicense())
    subtitle_url = 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3AA_Is_for_Atom_1953.webm&lang={}&trackformat=srt'
    for lang in ('en', 'es'):
        video_node.add_file(SubtitleFile(path=subtitle_url.format(lang),
                                         language=lang,
                                         subtitlesformat='srt'))
    videos_topic.add_child(video_node)

    return channel
def get_things(all_things, parent_node, new_node=True):
    """Attach a video node and an HTML5 app node for every thing.

    When `new_node` is true each thing gets its own TopicNode under
    `parent_node`; otherwise both children hang off `parent_node` directly.
    """
    for thing in all_things:
        _id = thing.url.strip('/').split('/')[-1]  # TODO hash
        if new_node:
            container = TopicNode(source_id=thing.url, title=thing.title)
        else:
            container = parent_node

        video_node = make_youtube_video(thing.youtube,
                                        "Video: {}".format(thing.title),
                                        "video__{}".format(thing.url))  # TODO hash
        if video_node is not None:
            container.add_child(video_node)

        # Dump the app payload to html/<id>.zip and wrap it as an HTML5 app
        try:
            os.mkdir('html')
        except Exception:
            pass  # best-effort: directory presumably exists already
        fn = "html/{}.zip".format(_id)
        with open(fn, "wb") as f:
            f.write(thing.app)
        app_zip = HTMLZipFile(fn)
        # Pick the grammatically correct article for the title
        an = "an" if thing.title[0] in "AEIOUaeiou" else "a"
        container.add_child(HTML5AppNode(
            source_id="app_{}".format(thing.url),
            title="Being {} {}".format(an, thing.title),
            license=LICENCE,
            files=[app_zip]))

        if new_node:
            parent_node.add_child(container)
def construct_channel(self, *args, **kwargs):
    """ Create ChannelNode and build topic tree. """
    # ChannelNode is created from the data in self.channel_info
    channel = self.get_channel(*args, **kwargs)
    cities = TopicNode(source_id="List_of_largest_cities", title="Cities!")
    channel.add_child(cities)
    add_subpages_from_wikipedia_list(
        cities, "https://en.wikipedia.org/wiki/List_of_largest_cities")
    return channel
def construct_channel(self, **kwargs):
    """Build the channel: one topic per category, filled recursively with books."""
    channel = self.get_channel(**kwargs)
    # add topics and corresponding books to the channel
    channel_tree = download_all()
    as_booklist = get_AS_booklist_dict()
    for category in sorted(channel_tree):
        topic = TopicNode(source_id=category.replace(" ", "_"), title=category)
        channel.add_child(topic)
        parse_through_tree(channel_tree[category], topic, as_booklist)
    return channel
def construct_channel(self, **kwargs):
    """Download, crop and split the source PDFs, then build the English and
    Burmese document/video topic trees."""
    # these 2 methods output to the downloads folder
    download_documents()
    crop_documents()
    en_chapters = split_pdfs('English')
    my_chapters = split_pdfs('Burmese')

    channel = self.get_channel(**kwargs)

    doc_thumb = DOWNLOADS_FOLDER + '/thumbnail.png'
    video_thumb = DOWNLOADS_FOLDER + '/videothumbnail.png'
    main_topic_en = TopicNode(title="21ST CENTURY GUIDE English Topic",
                              source_id="main_en",
                              thumbnail=doc_thumb)
    topic_videos_en = TopicNode(title="Videos",
                                source_id="pointb_en_videos",
                                thumbnail=video_thumb)
    main_topic_my = TopicNode(title="21ST CENTURY GUIDE Burmese Topic",
                              source_id="main_my",
                              thumbnail=doc_thumb)
    topic_videos_my = TopicNode(title="Videos",
                                source_id="pointb_my_videos",
                                thumbnail=video_thumb)

    add_documents(main_topic_en, en_chapters, 'en')
    add_documents(main_topic_my, my_chapters, 'my')
    channel.add_child(main_topic_en)
    channel.add_child(main_topic_my)

    # download_videos fills each video topic and returns it
    topic_videos_en = download_videos(topic_videos_en, 'en')
    topic_videos_my = download_videos(topic_videos_my, 'my')
    channel.add_child(topic_videos_en)
    channel.add_child(topic_videos_my)
    return channel
def generate_child_topics(arvind_contents, main_topic, lang_obj, topic_type):
    """Attach video topics for one language to `main_topic`.

    STANDARD_TOPIC creates a child TopicNode per topic name; SINGLE_TOPIC
    downloads everything directly into `main_topic`.
    """
    # Create a topic for each languages
    data = arvind_contents[lang_obj.name]
    for topic_name in data:
        if topic_type == STANDARD_TOPIC:
            child = TopicNode(title=topic_name,
                              source_id='arvind-child-topic-{0}'.format(topic_name))
            download_video_topics(data, topic_name, child, lang_obj)
            main_topic.add_child(child)
        if topic_type == SINGLE_TOPIC:
            download_video_topics(data, topic_name, main_topic, lang_obj)
    return main_topic
def add_documents(topic, chapters, language):
    """Attach chapter PDFs to `topic`; chapters with children become sub-topics."""
    thumbnail = DOWNLOADS_FOLDER + '/thumbnail.png'
    # NOTE(review): copyright_holder='NC-SA 4.0' looks like a fragment of the
    # license name rather than an actual holder -- confirm with the source.
    for idx, chapter in enumerate(chapters):
        doc_title = chapter['title']
        if 'children' in chapter:
            # Chapter with sub-chapters: wrap them in a child TopicNode
            child_topic_node = TopicNode(title=doc_title,
                                         source_id=language + doc_title,
                                         thumbnail=thumbnail)
            for child in chapter['children']:
                child_doc_title = child['title']
                child_topic_node.add_child(DocumentNode(
                    title=child_doc_title,
                    description=f'Chapter {idx} from {doc_title}',
                    source_id=language + child_doc_title,
                    license=get_license('CC BY', copyright_holder='NC-SA 4.0'),
                    language=language,
                    thumbnail=thumbnail,
                    files=[DocumentFile(path=child['path'], language=language)],
                ))
            topic.add_child(child_topic_node)
        else:
            # Plain chapter: document goes directly under the topic
            topic.add_child(DocumentNode(
                title=doc_title,
                description=f'Chapter {idx} from 21ST CENTURY GUIDE',
                source_id=language + doc_title,
                license=get_license('CC BY', copyright_holder='NC-SA 4.0'),
                language=language,
                thumbnail=thumbnail,
                files=[DocumentFile(path=chapter['path'], language=language)],
            ))
def generate_child_topics(arvind_contents, main_topic, lang_obj, topic_type):
    """Attach per-topic video nodes for one language to `main_topic`.

    STANDARD_TOPIC creates a child TopicNode per topic name; SINGLE_TOPIC
    downloads everything directly into `main_topic`.
    """
    # Create a topic for each languages
    # (removed unused local: pp = pprint.PrettyPrinter() was never referenced)
    data = arvind_contents[lang_obj.name]
    for topic_index in data:
        if topic_type == STANDARD_TOPIC:
            topic_node = TopicNode(title=topic_index,
                                   source_id=lang_obj.code + '_' + topic_index)
            download_video_topics(data, topic_index, topic_node, lang_obj)
            main_topic.add_child(topic_node)
        if topic_type == SINGLE_TOPIC:
            download_video_topics(data, topic_index, main_topic, lang_obj)
    return main_topic
def download_subject(self, subject, link, parent):
    """ Parse each subject page.

    Attaches `subject` under `parent` and fills it with one age-range topic
    per AGE_RANGE bucket that has content, paging through results 20 at a
    time via self.download_content.
    """
    LOGGER.info("Processing subject: {}".format(subject.title))
    # No need to parse the content under the subject when link is not valid
    if "javascript:void(0);" in link:
        parent.add_child(subject)
        return
    # Parse each subject's index page
    resp = downloader.make_request(link)
    soup = BeautifulSoup(resp.content, "html.parser")
    selected_category = soup.find("option", {
        "class": "level0",
        "selected": "selected"
    })
    # Bail out entirely if the page has no selected category option.
    if not selected_category:
        return
    parent.add_child(subject)
    for item in AGE_RANGE.keys():
        # Base query params; taga[i] entries select the age-range tags.
        params = OrderedDict([("category", selected_category["value"]),
                              ("moduleId", "282"), ("format", "count")])
        for index in range(len(AGE_RANGE[item])):
            params["taga[{}]".format(index)] = AGE_RANGE[item][index]
        # Parse the topics of age range under each subject
        resp = downloader.make_request("{}/itemlist/filter".format(link),
                                       params=params)
        # First line of the response is the result count for this filter.
        count = int(resp.text.split('\n')[0])
        if count == 0:
            continue
        LOGGER.info("Processing topic: {}".format(item))
        age_topic = TopicNode(source_id=item, title=item)
        subject.add_child(age_topic)
        # Results are paged 20 per request.
        total_pages = ceil(count / 20)
        for i in range(total_pages):
            page_params = OrderedDict(params)
            LOGGER.info("Processing page: {}".format(i))
            self.download_content(age_topic, link, page_params,
                                  selected_category["value"], i * 20)
def parse_through_tree(tree, parent_topic, as_booklist):
    """Recursively mirror `tree` under `parent_topic` as topics and documents."""
    for topic_name in sorted(tree):
        subtree = tree[topic_name]
        # Integer-like keys are reading levels; other keys keep their name.
        try:
            title = 'Level {}'.format(int(topic_name))
        except ValueError:
            title = topic_name
        node = TopicNode(
            source_id='{}_{}'.format(parent_topic.source_id,
                                     topic_name.replace(' ', '_')),
            title=title,
        )
        parent_topic.add_child(node)
        # Lists hold books directly; anything else is a nested sub-tree.
        if type(subtree) is list:
            add_node_document(subtree, node, as_booklist)
        else:
            parse_through_tree(subtree, node, as_booklist)
def get_subtopics(parent, path):
    """Add a TopicNode (with its lessons) under `parent` for every sidebar link."""
    doc = get_page(path)
    try:
        # The subtopic menu lives in the col-md-2 column of the body row.
        menu_row = doc.find('div', {'id': 'body-row'}).find('div', {'class': 'col-md-2'})
    except Exception as e:
        LOGGER.error('get_subtopics: %s : %s' % (e, doc))
        return
    for subtopic in menu_row.find_all('a'):
        try:
            title = subtopic.get_text().strip()
            source_id = get_source_id(subtopic['href'])
            LOGGER.info('  subtopic: %s: %s' % (source_id, title))
            node = TopicNode(title=title, source_id=source_id)
            parent.add_child(node)
            get_lessons(node, subtopic['href'])
        except Exception as e:
            # One broken link shouldn't abort the whole menu.
            LOGGER.error('get_subtopics: %s : %s' % (e, subtopic))
def download_category(self, parent, cat_id, categories, sims, keywords, language):
    """ Process a category, and add all its sub-categories, and its simulations/videos. """
    print("Processing category:", cat_id)
    cat = categories[str(cat_id)]

    blacklist = ID_BLACKLIST_BY_LANG.get(language, ID_BLACKLIST_BY_LANG['en'])
    # loop through all subtopics and recursively add them
    # (reverse order seems to give most rational results)
    for child_id in reversed(cat["childrenIds"]):
        # look up the child category by ID, skipping blacklisted ones
        subcat = categories[str(child_id)]
        if subcat["name"] in blacklist:
            continue
        # make the title human-readable, and clean it up
        title = (subcat["name"].replace("-", " ").title()
                 .replace(" And ", " and ")
                 .replace("Mathconcepts", "Concepts")
                 .replace("Mathapplications", "Applications"))
        if language == "ar":
            title = ARABIC_NAME_CATEGORY[title]
        # create the topic node, and add it to the parent
        subtopic = TopicNode(
            source_id=subcat["name"],
            title=title,
        )
        parent.add_child(subtopic)
        # recursively download the contents of the topic
        self.download_category(subtopic, child_id, categories, sims,
                               keywords, language)

    # loop through all sims in this topic and add them, but only if we're at a leaf topic
    if len(parent.children) == 0:
        for sim_id in list(set(cat["simulationIds"])):
            # skip ones that aren't found (probably as they aren't HTML5)
            if sim_id not in sims:
                continue
            self.download_sim(parent, sims[sim_id], keywords, language)