def get_video_transcript(self, video_id):
    """Retrieve the English captions of a YouTube video as plain text.

    TODO: If no captions are available, download the audio track and pass
    it into Cloud Speech-to-Text? For now we just return None, implying
    that we can't perform sentiment analysis on the video content itself.

    :param str video_id:
        YouTube video id; it is interpolated into the watch URL.
    :returns:
        The caption texts joined by single spaces, passed through
        ``clean_html`` and reduced to stripped text content, or ``None``
        when the video has no English caption track.
    """
    video = YouTube('https://www.youtube.com/watch?v={}'.format(video_id))
    captions = video.captions.get_by_language_code('en')
    if not captions:
        logger.info('Unable to return transcript for video %r!', video_id)
        return None

    # Parse the caption XML; if the parser chokes on the unicode string,
    # retry with explicitly UTF-8 encoded bytes.
    try:
        root = ElementTree.fromstring(captions.xml_captions)
    except UnicodeEncodeError:
        root = ElementTree.fromstring(captions.xml_captions.encode("utf-8"))

    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9. Newlines become spaces, and the doubled spaces that this
    # can produce are collapsed before HTML entities are unescaped.
    captions_list = [
        u"{text} ".format(
            text=unescape(
                (subtitle.text or u'').replace('\n', ' ').replace('  ', ' ')
            )
        )
        for subtitle in root
    ]

    # Format captions as plaintext and strip trailing whitespace and html.
    transcript = clean_html(html.fromstring(u''.join(captions_list).strip()))
    return transcript.text_content().strip()
def xml_caption_to_srt(self, xml_captions):
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Each child element of the XML root becomes one numbered srt segment
    made of a sequence number, a ``start --> end`` line and the caption
    text. Segments are separated by a blank line.

    :param str xml_captions:
        XML formatted caption tracks. Every child element must carry
        ``start`` and ``dur`` attributes that parse as floats.
    :returns:
        The srt document as a single string with surrounding whitespace
        stripped (an empty string when the root has no children).
    :raises KeyError:
        If a child element lacks the ``start`` or ``dur`` attribute.
    """
    root = ElementTree.fromstring(xml_captions)
    segments = []
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9. Sequence numbers are 1-indexed.
    for sequence_number, child in enumerate(root, start=1):
        text = child.text or ''
        # Newlines become spaces; collapse the doubled spaces this can
        # leave behind before unescaping HTML entities.
        caption = unescape(text.replace('\n', ' ').replace('  ', ' '))
        duration = float(child.attrib['dur'])
        start = float(child.attrib['start'])
        end = start + duration
        segments.append(
            '{seq}\n{start} --> {end}\n{text}\n'.format(
                seq=sequence_number,
                start=self.float_to_srt_time_format(start),
                end=self.float_to_srt_time_format(end),
                text=caption,
            )
        )
    return '\n'.join(segments).strip()