def license_objects():
    """Build license objects for every regular license id.

    Each id is instantiated four times: with both optional kwargs, with
    each one alone, and bare — covering all constructor combinations.
    """
    regular_ids = [
        CC_BY, CC_BY_SA, CC_BY_ND, CC_BY_NC, CC_BY_NC_SA, CC_BY_NC_ND,
        ALL_RIGHTS_RESERVED, PUBLIC_DOMAIN,
    ]
    # Keyword-argument combinations, in the same order as the original
    # hand-unrolled code: both, description only, holder only, bare.
    kwarg_variants = [
        dict(copyright_holder='Some name', description='Le description'),
        dict(description='Le description solo2'),
        dict(copyright_holder='Some name3'),
        dict(),
    ]
    results = []
    for regular_id in regular_ids:
        for kwargs in kwarg_variants:
            licence_obj = get_license(regular_id, **kwargs)
            assert licence_obj, 'licence_obj should exist'
            results.append(licence_obj)
    return results
def pdfNode(infoDict):
    """Download the PDF at infoDict['pdfPath'] and wrap it in a DocumentNode.

    Side effects: writes the PDF to a local file named infoDict['pdfTitle']
    (de-duplicated via the module-global pdfCopy counter, mutating infoDict)
    and records the created path in the module-global filesCreated list.
    """
    #Get response from converted-to-pdf path
    response = requests.get(infoDict['pdfPath'], auth=auth)
    global pdfCopy
    # If a file with this title already exists locally, append the counter
    # to the name so the earlier download is not overwritten.
    if os.path.exists(infoDict['pdfTitle']):
        infoDict['pdfTitle'] = infoDict['pdfTitle'].replace(
            ".pdf", str(pdfCopy) + ".pdf")
        pdfCopy += 1
    #Write pdf to local file
    with open(infoDict['pdfTitle'], 'wb') as f:
        f.write(response.content)
    filesCreated.append(infoDict['pdfTitle'])
    #Create Document Node
    pdfNode = DocumentNode(
        source_id=str(infoDict['id']),
        title=infoDict['pdfTitle'],
        language="en",
        description="",
        license=get_license(licenses.CC_BY,
                            copyright_holder='Copyright holder name'),
        files=[DocumentFile(
            path=infoDict['pdfTitle'],
            language="en",
        )],
    )
    return pdfNode
def make_video_node(title, video_url, video_language='en', ffmpeg_settings=None):
    """
    Create a VideoNode from video_url.
    Assumes title is unique within containing topic.
    """
    lang_id = getlang(video_language).id
    video_file = VideoFile(
        path=video_url,
        language=lang_id,
        ffmpeg_settings=ffmpeg_settings,
    )
    return VideoNode(
        source_id=title,
        title=title,
        author='Sikana',
        description='',
        language=lang_id,
        license=get_license(licenses.CC_BY_NC_ND,
                            copyright_holder='Sikana Education'),
        thumbnail=None,
        derive_thumbnail=True,
        files=[video_file],
    )
def add_content_to_tree(self, channel):
    """Populate `channel` with the Class > Subject > Video hierarchy
    stored in self.channel_tree (under the 'English' top-level key)."""
    tree = self.channel_tree
    lang = 'English'
    lang_obj = getlang("en")
    for class_name in tree[lang]:
        class_obj = tree[lang][class_name]
        # class_id is only used to build the subject/item source ids below;
        # NOTE(review): the class TopicNode itself uses the bare class_name
        # as source_id — confirm that is intentional.
        class_id = "{}-{}".format(lang, class_name)
        class_node = nodes.TopicNode(source_id=class_name, title=class_name)
        for subject_name in class_obj:
            subject_id = "{}-{}".format(class_id, subject_name)
            subject_node = nodes.TopicNode(source_id=subject_id,
                                           title=subject_name)
            subject_obj = class_obj[subject_name]
            # Each item row becomes one VideoNode under the subject topic.
            for item in subject_obj['items']:
                item_id = "{}-{}".format(subject_id, get_column(item, 'id'))
                video = nodes.VideoNode(
                    source_id=item_id,
                    title=get_column(item, 'name'),
                    description=get_column(item, 'description'),
                    files=[
                        files.VideoFile(path=get_column(item, 'file'))
                    ],
                    language=lang_obj,
                    # FIXME: Use the column's license field instead of hardcoding.
                    license=licenses.get_license(
                        le_licenses.CC_BY,
                        copyright_holder=get_column(item, "copyright")),
                    # thumbnail=get_column(item, "thumbnail")
                )
                subject_node.add_child(video)
            class_node.add_child(subject_node)
        channel.add_child(class_node)
def add_node_document(booklist, level_topic, as_booklist):
    """Add an HTML5AppNode child to `level_topic` for each book in `booklist`.

    Books published by the African Storybook Initiative that are found in
    `as_booklist` are re-attributed to the africanstorybook.org domain and
    get that catalog's book id; all others use the storyweaver domain.
    Books whose HTML5 zip cannot be produced are skipped silently.
    """
    for item in booklist:
        # Initialize the source domain and content_id (fixed comment typo).
        domain = uuid.uuid5(uuid.NAMESPACE_DNS, 'storyweaver.org.in')
        book_id = str(item['source_id'])
        # If the publisher is AS and the book is found there, switch the
        # source domain and content id to the African Storybook catalog.
        if item['publisher'] == 'African Storybook Initiative':
            check = check_if_story_in_AS(as_booklist, item['title'])
            if check[0]:  # fixed: truthiness instead of `== True`
                domain = uuid.uuid5(uuid.NAMESPACE_DNS,
                                    'www.africanstorybook.org')
                book_id = check[1]
        link = get_html5_app_zip_path(item['slug'])
        if link:
            html5_file = HTMLZipFile(path=link)
            book = HTML5AppNode(
                title=item['title'],
                source_id=book_id,
                author=item['author'],
                files=[html5_file],
                license=get_license(licenses.CC_BY,
                                    copyright_holder='Pratham Books'),
                thumbnail=item.get('thumbnail'),
                description=item['description'],
                domain_ns=domain,
            )
            level_topic.add_child(book)
def vimeoNode(url):
    """Fetch a Vimeo page, derive a title from its <title> tag, and return
    a VideoNode wrapping the page URL as a WebVideoFile."""
    page_html = requests.get(url).text  # grabs request of the URL
    soup = bs4.BeautifulSoup(page_html, "html.parser")
    # Get video title (find_all returns a list; stringify then strip tags)
    title_tags = soup.find_all('title', limit=1)
    cleaned_title = str(re.sub('<.*?>', '', str(title_tags)))
    # Remove the brackets left over from stringifying the tag list.
    # May have to delete if there are brackets in title.
    cleaned_title = cleaned_title.replace("]", '').replace("[", '')
    web_file = WebVideoFile(web_url=url, language='en')
    # Create Video Node
    video_node = VideoNode(
        source_id=url,  # set to url
        title=str(cleaned_title),
        license=get_license(licenses.CC_BY,
                            copyright_holder='Copyright holder name'),
        language=getlang('en').id,
        derive_thumbnail=True,
        thumbnail=None,
        files=[web_file],
    )
    return video_node
def googleNode(url):
    """Download a Google document/presentation/spreadsheet as a PDF and
    return a DocumentNode for it.

    Raises:
        ValueError: if the URL is not one of the three recognized Google
        Docs types. (Previously the function fell through with `fileName`
        unbound and crashed with NameError.)
    """
    # Get doc id (renamed local: `id` shadowed the builtin)
    doc_id = getIdFromUrl(url)
    # Call the proper google download function; each returns the filename
    # of the PDF it produced.
    if url.find('document') != -1:
        print("Found Document")
        fileName = downloadDocument(doc_id)
    elif url.find('presentation') != -1:
        print("Found Presentation")
        fileName = downloadPowerpoint(doc_id)
    elif url.find('spreadsheets') != -1:
        print("Found Spreadsheet")
        fileName = downloadSpreadsheet(doc_id)
    else:
        raise ValueError("Unsupported Google Docs URL: %s" % url)
    # Use download and filename to create node
    googleNode = DocumentNode(
        source_id=doc_id,
        title=fileName,
        language="en",
        description="",
        license=get_license(licenses.CC_BY,
                            copyright_holder='Copyright holder name'),
        files=[DocumentFile(
            path=fileName,
            language="en",
        )],
    )
    return googleNode
def linkAssignment(linkData):
    """Download the page referenced by a classroom 'link' attachment, zip it
    together with its static assets, and return an HTML5AppNode for the zip.

    Side effects: creates a folder and zip named 'myzipper<N>' using the
    module-global zipId counter, and appends both paths to filesCreated.
    """
    #Get URL and Title from JSON info
    url = linkData['attachments']['links']['link'][0]['url']
    title = linkData['attachments']['links']['link'][0]['title']
    #Make session and request to get HTML
    session = requests.Session()
    # Browser-like UA so the site serves the normal desktop page.
    session.headers[
        "User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    html = session.get(url).content
    #HTML parser
    soup = bs4.BeautifulSoup(html, "html.parser")
    #Path for folder to hold content
    global zipId
    filename = 'myzipper' + str(zipId)
    print("\n\n\n" + filename + "\n\n\n")
    zipId = zipId + 1
    #Delete folder if it already exists
    if (os.path.exists(filename)):
        shutil.rmtree(filename)
        #os.unlink(filename)
    #Download all assets(html, css, js,...) from url
    doc = download_static_assets(soup,
                                 filename,
                                 url,
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist)
    # Write out the HTML source.
    with open(os.path.join(filename, "index.html"), "w", encoding="utf-8") as f:
        f.write(str(doc))
    #Outputs files being downloaded
    print(" ... downloaded to %s" % filename)
    filesCreated.append(filename)
    #Make zip file from folder contents
    shutil.make_archive(filename, 'zip', filename)
    filesCreated.append(filename + '.zip')
    #Creation of file and node
    link_file = HTMLZipFile(path=(filename + '.zip'))
    link_node = HTML5AppNode(source_id=url,
                             title=title,
                             license=get_license(
                                 licenses.CC_BY,
                                 copyright_holder='Copyright holder name'),
                             language=getlang('en').id,
                             derive_thumbnail=False,
                             thumbnail=None,
                             files=[link_file])
    return link_node
def __init__(self, source_id=None, lang="en", name=None):
    """Initialize the resource: stores identity fields, creates the Menu,
    and attaches the channel-wide CC BY-NC-SA license as a dict."""
    self.lang = lang
    self.name = name
    self.source_id = source_id
    self.filepath = None
    self.menu = Menu(lang=lang, name=name)
    license_obj = get_license(licenses.CC_BY_NC_SA,
                              copyright_holder=COPYRIGHT_HOLDER)
    self.license = license_obj.as_dict()
def __init__(self, source_id, lang="en", lincese="", name=None):
    """Initialize a resource record from `source_id` (path or absolute URL).

    NOTE(review): the `lincese` parameter (misspelled 'license') is never
    used by this constructor; it is kept only so existing callers with the
    same signature do not break.
    """
    self.filename = get_name_from_url(source_id)
    # Relative paths are resolved against BASE_URL; absolute URLs pass through.
    if source_id.startswith("/"):
        self.source_id = urljoin(BASE_URL, source_id)
    else:
        self.source_id = source_id
    self.filepath = None
    self.lang = lang
    self.name = "{}_{}".format(name, self.filename)
    self.license = get_license(licenses.CC_BY_NC_SA,
                               copyright_holder=COPYRIGHT_HOLDER).as_dict()
def construct_channel(self, *args, **kwargs):
    """ Creates ChannelNode and build topic tree
    Args:
    - args: arguments passed in during upload_channel (currently None)
    - kwargs: extra argumens and options not handled by `uploadchannel`.
      For example, add the command line option   lang="fr"  and the string
      "fr" will be passed along to `construct_channel` as kwargs['lang'].
    Returns: ChannelNode
    """
    channel = self.get_channel(
        *args,
        **kwargs)  # Create ChannelNode from data in self.channel_info
    # Each entry in the JSON file describes one book: its title, the PDF
    # URL/path, and a chapter table-of-contents used to split it.
    topics = load_json_from_file(JSON_FILE)
    for topic in topics:
        book_title = topic['book_title']
        source_id = book_title.replace(" ", "_")
        url = topic['path_or_url']
        # NOTE(review): the same tag list is duplicated on the topic and on
        # every chapter node below — consider hoisting to a constant.
        topic_node = nodes.TopicNode(source_id=source_id,
                                     title=book_title,
                                     tags=[
                                         "Teacher facing",
                                         "Professional development",
                                         "Life skills",
                                         "Intercultural skills",
                                         "Mentorship", "Formal contexts"
                                     ])
        channel.add_child(topic_node)
        # Split the source PDF into one file per chapter from the TOC.
        parser = pdf.PDFParser(url, toc=topic['chapters'])
        parser.open()
        chapters = parser.split_chapters()
        for chapter in chapters:
            title = chapter['title']
            pdf_path = chapter['path']
            pdf_file = files.DocumentFile(pdf_path)
            pdf_node = nodes.DocumentNode(
                source_id="{} {}".format(book_title, title),
                title=title,
                author="INTO",
                tags=[
                    "Teacher facing", "Professional development",
                    "Life skills", "Intercultural skills", "Mentorship",
                    "Formal contexts"
                ],
                files=[pdf_file],
                license=licenses.get_license(CHANNEL_LICENSE, "INTO",
                                             LICENSE_DESCRIPTION),
                copyright_holder="INTO")
            topic_node.add_child(pdf_node)
    raise_for_invalid_channel(
        channel)  # Check for errors in channel construction
    return channel
def linkAssignment(material):
    """Zip a linked web page with its scripts and stylesheets and return an
    HTML5AppNode for the resulting './myzipper.zip'.

    Fixes over the previous version:
    - removed `open("javascript_files.js")` which opened a possibly absent
      file for reading (FileNotFoundError) and was never written to;
    - removed the stray `open("index.html", "w")` that created an empty
      file on disk without ever writing to it;
    - removed the unused `script`/`extra_head` tag strings and the debug
      `print(css_file, file=f)` that dumped URLs into 'css_files.css'.
    The zip contents produced via HTMLWriter are unchanged.
    """
    url = material["link"]["url"]
    session = requests.Session()
    # Browser-like UA so the site serves the normal desktop page.
    session.headers[
        "User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    html = session.get(url).content
    soup = bs(html, "html.parser")
    # Collect absolute URLs of all external scripts and stylesheets.
    script_files = [
        urljoin(url, script.attrs.get("src"))
        for script in soup.find_all("script") if script.attrs.get("src")
    ]
    css_files = [
        urljoin(url, css.attrs.get("href"))
        for css in soup.find_all("link") if css.attrs.get("href")
    ]
    with HTMLWriter('./myzipper.zip') as zipper:
        zipper.write_index_contents(soup.prettify())
        for js_file in script_files:
            zipper.write_url(js_file, "scripts.js", directory="src")
        for css_file in css_files:
            zipper.write_url(css_file, "style.css", directory="styles")
    link_file = HTMLZipFile(path='./myzipper.zip')
    link_node = HTML5AppNode(source_id=material["link"]["url"],
                             title=material["link"]["title"],
                             license=get_license(
                                 licenses.CC_BY,
                                 copyright_holder='Copyright holder name'),
                             language=getlang('en').id,
                             derive_thumbnail=False,
                             thumbnail=None,
                             files=[link_file])
    return link_node
def download_videos(topic, language):
    """Download the videos listed in scraped_video_urls.json for `language`
    and attach a VideoNode for each to `topic`. Returns `topic`.

    Fixes over the previous version:
    - on a youtube-dl error the loop now `continue`s; previously it fell
      through and crashed with NameError because `vinfo` was never bound;
    - removed the unused `nodes = []` local.
    """
    scraped_video_urls_path = os.path.join(DOWNLOADS_FOLDER,
                                           'scraped_video_urls.json')
    with open(scraped_video_urls_path) as f:
        scraped_video_urls = json.load(f)
    video_urls_list = scraped_video_urls[language]['urls']
    video_descriptions_list = scraped_video_urls[language]['descriptions']
    for video_num, video_url in enumerate(video_urls_list):
        ydl_options = {
            # uses output templates, see youtube-dl documentation
            'outtmpl': f'downloads/videos/{video_num}_{language}.%(ext)s',
            'writethumbnail': False,
            'no_warnings': True,
            'continuedl': False,
            'restrictfilenames': True,
            'quiet': False,
            # Format specification is important so we get mp4 no taller than 480.
            'format': "bestvideo[height<=480][ext=mp4]+bestaudio[ext=m4a]/best[height<=480][ext=mp4]",
        }
        with youtube_dl.YoutubeDL(ydl_options) as ydl:
            try:
                ydl.add_default_info_extractors()
                vinfo = ydl.extract_info(video_url, download=True)
            except (youtube_dl.utils.DownloadError,
                    youtube_dl.utils.ContentTooShortError,
                    youtube_dl.utils.ExtractorError) as e:
                print('error_occured')
                continue  # skip this video; vinfo is not available
        # Reconstruct the merged-download filename from the video format id.
        ext_p1 = vinfo['requested_formats'][0]['format_id']
        ext_p2 = vinfo['requested_formats'][0]['ext']
        video_path = f'downloads/videos/{video_num}_{language}.f{ext_p1}.{ext_p2}'
        video_node = VideoNode(
            source_id=vinfo['webpage_url'],
            title=vinfo['title'],
            description=video_descriptions_list[video_num],
            # aggregator=LE,
            thumbnail=vinfo['thumbnail'],
            # NOTE(review): 'CC BY' with holder 'NC-SA 4.0' looks like a
            # mangled 'CC BY-NC-SA 4.0' — confirm intended license/holder.
            license=get_license('CC BY', copyright_holder='NC-SA 4.0'),
            # role=roles.COACH,
            files=[VideoFile(path=video_path, language=language)])
        topic.add_child(video_node)
    return topic
def test_license_serilizibility(license_objects, special_license):
    """Round-trip every license through as_dict -> JSON -> get_license and
    verify all attributes survive serialization."""
    all_licenses = license_objects
    all_licenses.append(special_license)
    for licence_orig in all_licenses:
        # serialize to JSON text
        payload = json.dumps(licence_orig.as_dict())
        # deserialize back into a license object
        license_copy = get_license(**json.loads(payload))
        same_attributes = _compare_licence_objects(licence_orig, license_copy)
        assert same_attributes, 'License attributes not the same after serizlize'
def construct_channel(self, *args, **kwargs):
    """
    This method is responsible for creating a `ChannelNode` object and
    populating it with `TopicNode` and `ContentNode` children.

    Builds two top-level topics (K-12 and Adult Continuing Education),
    one level topic per known level, and one DocumentNode per scraped
    PDF resource.

    Fix over the previous version: both level loops used `enumerate` but
    never used the index — replaced with plain iteration.
    """
    # Create channel
    ########################################################################
    channel = self.get_channel(*args, **kwargs)  # uses self.channel_info

    # Create topics to add to your channel
    ########################################################################
    teen_topic = TopicNode(source_id="topic-teen", title="K-12 Resources")
    adult_topic = TopicNode(source_id="topic-adult",
                            title="Adult Continue Education Resources")
    channel.add_child(teen_topic)
    channel.add_child(adult_topic)

    # One TopicNode per level, keyed by level name for lookup below.
    level_map = {}
    for teen_level in teen_levels:
        level_map[teen_level] = TopicNode(source_id="topic-teen-" + teen_level,
                                          title=teen_level)
    for adult_level in adult_levels:
        level_map[adult_level] = TopicNode(
            source_id="topic-adult-" + adult_level, title=adult_level)

    # Attach each scraped PDF resource under its subtopic and level.
    for level, subtopics in parse_website().items():
        for subtopic, resources in subtopics.items():
            subtopic_node = TopicNode(source_id=subtopic, title=subtopic)
            for resource in resources:
                resource_file = DocumentFile(path=resource['link'])
                resource_pdf = DocumentNode(title=resource['title'],
                                            source_id=resource['title'],
                                            files=[resource_file],
                                            license=get_license(
                                                licenses.PUBLIC_DOMAIN))
                subtopic_node.add_child(resource_pdf)
            level_map[level].add_child(subtopic_node)

    # Hang each level topic under the appropriate top-level topic.
    for key, value in level_map.items():
        if key in teen_levels:
            teen_topic.add_child(value)
        elif key in adult_levels:
            adult_topic.add_child(value)

    # the `construct_channel` method returns a ChannelNode that will be
    # processed by the ricecooker framework
    return channel
def _build_tree(node, sourcetree):
    """ Parse nodes given in `sourcetree` and add as children of `node`.

    Recursively converts a list of source dicts into TopicNode/VideoNode
    children. Nodes with an unknown content kind are skipped. Returns `node`.
    """
    for child_source_node in sourcetree:
        try:
            # The first entry of 'files' (if any) drives kind detection.
            main_file = child_source_node['files'][
                0] if 'files' in child_source_node else {}
            kind = guess_content_kind(
                path=main_file.get('path'),
                web_video_data=main_file.get('youtube_id')
                or main_file.get('web_url'),
                questions=child_source_node.get("questions"))
        except UnknownContentKindError:
            # Skip entries whose kind cannot be determined.
            continue
        if kind == content_kinds.TOPIC:
            child_node = nodes.TopicNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                thumbnail=child_source_node.get("thumbnail"),
            )
            node.add_child(child_node)
            # Recurse into this topic's children.
            source_tree_children = child_source_node.get("children", [])
            _build_tree(child_node, source_tree_children)
        elif kind == content_kinds.VIDEO:
            child_node = nodes.VideoNode(
                source_id=child_source_node["id"],
                title=child_source_node["title"],
                license=get_license(child_source_node.get("license"),
                                    description="Description of license",
                                    copyright_holder=child_source_node.get(
                                        'copyright_holder')),
                author=child_source_node.get("author"),
                description=child_source_node.get("description"),
                derive_thumbnail=True,  # video-specific data
                thumbnail=child_source_node.get('thumbnail'),
            )
            add_files(child_node, child_source_node.get("files") or [])
            node.add_child(child_node)
        else:  # unknown content file format
            continue
    return node
def build_pdf_topics(main_topic, sections, lang_code):
    """
    Adds the documents from the sections tree to the `main_topic`.
     - CASE A = no children  => add as DocumentNode
     - CASE B = has children => add as TopicNode and add all children as DocumentNode
    Returns `main_topic`.
    """
    # All documents in this channel share the same license object.
    LICENSE = get_license("CC BY-NC-SA", copyright_holder=POINTB)
    for i, section in enumerate(sections):
        # CASE A: All sections except Section 2
        if 'children' not in section:
            title = section['title']
            abspath = section['path']
            # source_id is derived from the PDF filename + language code.
            filename = os.path.basename(abspath)
            doc_node = DocumentNode(
                title=title,
                description=
                'Chapter from A GUIDE TO BECOMING A 21ST CENTURY TEACHER',
                source_id='%s-%s' % (filename, lang_code),
                license=LICENSE,
                aggregator=LE,
                language=lang_code,
                role=roles.COACH,
                files=[DocumentFile(path=abspath, language=lang_code)])
            main_topic.add_child(doc_node)
        # CASE B: Section 2
        else:
            section_topic = TopicNode(title=section['title'],
                                      source_id="pointb_section_" + str(i))
            main_topic.add_child(section_topic)
            for subsection in section['children']:
                title = subsection['title']
                abspath = subsection['path']
                filename = os.path.basename(abspath)
                subsection_doc_node = DocumentNode(
                    title=title,
                    description='',
                    source_id='%s-%s' % (filename, lang_code),
                    license=LICENSE,
                    aggregator=LE,
                    language=lang_code,
                    role=roles.COACH,
                    files=[DocumentFile(path=abspath, language=lang_code)])
                section_topic.add_child(subsection_doc_node)
    return main_topic
def add_documents(topic, chapters, language):
    """Attach DocumentNodes for `chapters` to `topic`.

    Chapters with a 'children' list become a TopicNode holding one
    DocumentNode per child; flat chapters become DocumentNodes directly.

    NOTE(review): get_license('CC BY', copyright_holder='NC-SA 4.0') looks
    like a mangled 'CC BY-NC-SA 4.0' license string — confirm the intended
    license id and copyright holder.
    NOTE(review): `idx` is 0-based, so the first description reads
    'Chapter 0' — confirm that is intentional.
    """
    for idx, chapter in enumerate(chapters):
        # if chapter has 'children'
        if 'children' in chapter.keys():
            doc_title = chapter['title']
            child_topic_node = TopicNode(title=doc_title,
                                         source_id=language + doc_title,
                                         thumbnail=DOWNLOADS_FOLDER +
                                         '/thumbnail.png')
            for child in chapter['children']:
                child_doc_title = child['title']
                doc_node = DocumentNode(
                    title=child_doc_title,
                    description=f'Chapter {idx} from {doc_title}',
                    source_id=language + child_doc_title,
                    license=get_license('CC BY',
                                        copyright_holder='NC-SA 4.0'),
                    language=language,
                    thumbnail=DOWNLOADS_FOLDER + '/thumbnail.png',
                    files=[
                        DocumentFile(path=child['path'], language=language)
                    ],
                )
                child_topic_node.add_child(doc_node)
            topic.add_child(child_topic_node)
        else:
            doc_title = chapter['title']
            doc_node = DocumentNode(
                title=doc_title,
                description=f'Chapter {idx} from 21ST CENTURY GUIDE',
                source_id=language + doc_title,
                license=get_license('CC BY', copyright_holder='NC-SA 4.0'),
                language=language,
                thumbnail=DOWNLOADS_FOLDER + '/thumbnail.png',
                files=[DocumentFile(path=chapter['path'], language=language)],
            )
            topic.add_child(doc_node)
def process_file(self, download=False, filepath=None):
    """Run the download step and, if a file was produced, build the
    ricecooker node dict for this video and store it on self.node.

    When self.filepath stays falsy (download skipped or failed) self.node
    is left untouched.
    """
    self.download(download=download, base_path=filepath)
    if self.filepath:
        # Main video file plus any subtitle file dicts for this resource.
        files = [dict(file_type=content_kinds.VIDEO, path=self.filepath)]
        files += self.subtitles_dict()
        self.node = dict(kind=content_kinds.VIDEO,
                         source_id=self.resource_url,
                         title=self.filename,
                         description='',
                         files=files,
                         language=self.lang,
                         license=get_license(
                             licenses.CC_BY,
                             copyright_holder=COPYRIGHT_HOLDER).as_dict())
def documentAssignment(material):
    """Build a DocumentNode for a classroom 'driveFile' material, pointing
    at the locally converted PDF under documents/."""
    drive_file = material["driveFile"]["driveFile"]
    docPath = "documents/" + slugify(drive_file["title"]) + ".pdf"
    lang_id = getlang('en').id
    document_node = DocumentNode(
        source_id=drive_file["id"],
        title=drive_file["title"],
        language=lang_id,
        license=get_license(licenses.CC_BY,
                            copyright_holder='Copyright holder name'),
        derive_thumbnail=True,
        thumbnail=None,
        files=[DocumentFile(path=str(docPath), language=lang_id)],
    )
    # NOTE(review): debug output kept from the original — consider removing.
    print(courseDataCopy["Assignments"])
    return document_node
def construct_channel(self, **kwargs):
    """Build the channel: one 'Potatoes!' topic holding a single PDF
    document node."""
    channel = self.get_channel(**kwargs)
    topic = TopicNode(title="Potatoes!", source_id="<potatos_id>")
    channel.add_child(topic)
    pdf_file = DocumentFile(
        path='https://www.gov.mb.ca/inr/pdf/pubs/mafri-potatoe.pdf',
        language='en')
    article_node = DocumentNode(
        title='Growing potatoes',
        description='An article about growing potatoes on your rooftop.',
        source_id='pubs/mafri-potatoe',
        license=get_license('CC BY', copyright_holder='University of Alberta'),
        language='en',
        files=[pdf_file],
    )
    topic.add_child(article_node)
    return channel
def videoAssignment(material):
    """Build a VideoNode (with subtitles) from a classroom 'youtubeVideo'
    material."""
    yt_id = material["youtubeVideo"]["id"]  # usually set source_id to youtube_id
    yt_files = [
        YouTubeVideoFile(youtube_id=yt_id, high_resolution=False,
                         language='en'),
        YouTubeSubtitleFile(youtube_id=yt_id, language='en'),
    ]
    return VideoNode(
        source_id=yt_id,
        title=material["youtubeVideo"]["title"],
        license=get_license(licenses.CC_BY,
                            copyright_holder='Copyright holder name'),
        language=getlang('en').id,
        derive_thumbnail=True,  # video-specific flag
        thumbnail=None,
        files=yt_files,
    )
def include_video_topic(topic_node, video_data, lang_obj):
    """Append a VideoNode built from `video_data` to `topic_node`.

    Fixes over the previous version:
    - removed the unused `create_id = uuid.uuid4()...` local;
    - removed the pointless `video = video_data` alias.
    """
    # For YouTube imports, set source_id to the youtube_id
    video_source_id = str(video_data.uid)
    video_node = VideoNode(source_id=video_source_id,
                           title=clean_video_title(video_data.title, lang_obj),
                           description=video_data.description,
                           author=ARVIND,
                           thumbnail=video_data.thumbnail,
                           license=get_license("CC BY-NC",
                                               copyright_holder=ARVIND),
                           files=[
                               YouTubeVideoFile(youtube_id=video_data.uid,
                                                language=video_data.language)
                           ])
    topic_node.add_child(video_node)
def include_video_topic(topic_node, video_data, lang_obj):
    """Create a VideoNode for `video_data` and attach it to `topic_node`."""
    uid = video_data.uid
    source_id = 'arvind-video-{0}'.format(uid)
    yt_file = YouTubeVideoFile(
        youtube_id=uid,
        language=video_data.language,
        high_resolution=False,
    )
    node = VideoNode(
        source_id=source_id,
        title=clean_video_title(video_data.title, lang_obj),
        description=video_data.description,
        author=ARVIND,
        thumbnail=video_data.thumbnail,
        license=get_license("CC BY-NC", copyright_holder=ARVIND),
        files=[yt_file],
    )
    topic_node.add_child(node)
def construct_channel(self, **kwargs):
    """Build the channel: one 'Potatoes!' topic holding a single PDF
    document node."""
    channel = self.get_channel(**kwargs)
    topic = TopicNode(title="Potatoes!", source_id="<potatoes_id>")
    channel.add_child(topic)
    pdf_file = DocumentFile(
        path="https://www.gov.mb.ca/inr/pdf/pubs/mafri-potatoe.pdf",
        language="en",
    )
    article_node = DocumentNode(
        title="Growing potatoes",
        description="An article about growing potatoes on your rooftop.",
        source_id="pubs/mafri-potatoe",
        license=get_license("CC BY", copyright_holder="University of Alberta"),
        language="en",
        files=[pdf_file],
    )
    topic.add_child(article_node)
    return channel
def build_burmese_video_topics(topic):
    """Download the Burmese videos and add a VideoNode for each to `topic`.

    Returns `topic` on success, False when no video data was downloaded.
    Fix over the previous version: the `enumerate` index was never used —
    replaced with plain iteration.
    """
    video_data = download_videos(LANG_CODE_MY)
    if not video_data:
        print('==> Download of Videos FAILED!')
        return False
    for video in video_data:
        video_node = VideoNode(
            source_id=video.uid,
            title=video.title,
            description=video.description,
            aggregator=LE,
            thumbnail=video.thumbnail,
            license=get_license("CC BY-NC-SA", copyright_holder=POINTB),
            role=roles.COACH,
            files=[VideoFile(path=video.filepath, language=LANG_CODE_MY)])
        topic.add_child(video_node)
    return topic
def construct_channel(self, **kwargs): channel = self.get_channel(**kwargs) # Soupify goalkicker main page gk_url = 'https://' + self.channel_info['CHANNEL_SOURCE_DOMAIN'] + '/' gk_soup = get_soup(gk_url) # Get urls for each goalkicker book els_with_page_urls = gk_soup.find_all(class_='bookContainer') page_urls = [ gk_url + el.find('a')['href'] for el in els_with_page_urls ] for page_url in page_urls: # Soupify book page page_soup = get_soup(page_url) # Extract and construct book info book_info = parse_book_info(page_soup) book_info['absolute_url'] = page_url + book_info['relative_url'] # Add book to channel tree topic_node_source_id = 'topic/' + book_info['subject'] page_topic_node = TopicNode(title=book_info['subject'], source_id=topic_node_source_id) channel.add_child(page_topic_node) doc_node = DocumentNode( title=book_info['title'], description=book_info['description'], source_id=book_info['source_id'], license=get_license('CC BY-SA', copyright_holder='Creative Commons'), language='en', files=[ DocumentFile(path=book_info['absolute_url'], language='en') ], ) page_topic_node.add_child(doc_node) return channel
def download_video_topics(topic_node, playlist_item, lang_obj, use_cache=True, to_sheet=False):
    """ Scrape, collect, and download the videos from playlist.

    Only videos with an entry in VIDEO_DESCRIPTION_MAP are included; all
    others are skipped. Errors while building a single video node are
    logged and do not abort the rest of the playlist.

    Fixes over the previous version: removed the unused `videos` list
    comprehension and the unused `video_url` local.
    NOTE(review): the `to_sheet` parameter is currently unused; kept for
    caller compatibility.
    """
    playlist_obj = RefugeeResponsePlaylist(playlist_item, use_cache)
    playlist_info = playlist_obj.get_playlist_info()
    for video in playlist_info.get('children'):
        video_id = video['id']
        video_source_id = 'refugee-response-{0}-{1}'.format(
            lang_obj.name, video_id)
        if video_id in VIDEO_DESCRIPTION_MAP:
            video_description = VIDEO_DESCRIPTION_MAP[video_id]
        else:
            # Exclude videos without a curated description
            continue
        LOGGER.info("Video Description: '%s'", video_description)
        try:
            video_node = nodes.VideoNode(
                source_id=video_source_id,
                title=video['title'],
                description=video_description,
                author=REFUGEE_RESPONSE,
                language=lang_obj.code,
                provider=REFUGEE_RESPONSE,
                thumbnail=video['thumbnail'],
                license=licenses.get_license(
                    "CC BY-NC-ND", copyright_holder=REFUGEE_RESPONSE),
                files=[
                    files.YouTubeVideoFile(youtube_id=video_id,
                                           language=lang_obj.code)
                ])
            topic_node.add_child(video_node)
        except Exception as e:
            LOGGER.error('Error downloading this video: %s', e)
def youtubeNode(url):
    """Build a VideoNode (with subtitles) for the YouTube video at `url`.

    The page title is scraped and used as the node title; the 'v' query
    parameter is used as source_id and youtube_id.
    """
    # BUGFIX: the original called `urlparse.parse_qs(...)`, which raises
    # AttributeError when `urlparse` is the function imported from
    # urllib.parse — use urllib.parse.parse_qs instead.
    from urllib.parse import parse_qs

    # Picking out youtube video ID from URL
    url_data = urlparse(url)
    query = parse_qs(url_data.query)
    videoID = query["v"][0]

    r = requests.get(url).text  # grabs request of the URL

    # Get video title (find_all returns a list; stringify then strip tags)
    bs = bs4.BeautifulSoup(r, "html.parser")
    videoTitle = bs.find_all('title', limit=1)
    #videoTitle includes html tags, stripping them
    newTitle = str(re.sub('<.*?>', '', str(videoTitle)))
    #May have to delete if there are brackets in title
    newTitle = newTitle.replace("]", '')
    newTitle = newTitle.replace("[", '')

    #Create Video Node
    video_node = VideoNode(
        source_id=videoID,  # usually set source_id to youtube_id
        title=str(newTitle),
        license=get_license(licenses.CC_BY,
                            copyright_holder='Copyright holder name'),
        language=getlang('en').id,
        derive_thumbnail=True,  # video-specific flag
        thumbnail=None,
        files=[
            YouTubeVideoFile(youtube_id=videoID,
                             high_resolution=False,
                             language='en'),
            YouTubeSubtitleFile(youtube_id=videoID, language='en')
        ])
    #Return Video Node
    return video_node
def build_english_video_topics(topic):
    """Download the English videos and add a VideoNode for each to `topic`.

    Titles have the '(English Language)' suffix stripped. Returns `topic`
    on success, False when no video data was downloaded.
    Fix over the previous version: the `enumerate` index was never used —
    replaced with plain iteration.
    """
    video_data = download_videos(LANG_CODE_EN)
    if not video_data:
        print('==> Download of Videos FAILED!')
        return False
    # NOTE(cpauya): VideoNode constructor has no argument for language code?
    for video in video_data:
        title = video.title.replace('(English Language)', '').strip()
        video_node = VideoNode(
            source_id=video.uid,
            title=title,
            description=video.description,
            aggregator=LE,
            thumbnail=video.thumbnail,
            license=get_license("CC BY-NC-SA", copyright_holder=POINTB),
            role=roles.COACH,
            files=[VideoFile(path=video.filepath, language=LANG_CODE_EN)])
        topic.add_child(video_node)
    return topic