示例#1
0
    def get_video_transcript(self, video_id):
        """
        Retrieve the 'en' captions of a YouTube video as one plain-text string.

        TODO: If no captions are available, download audio track and pass into
        Cloud Speech-to-Text? For now we just return None, implying that we
        can't perform sentiment analysis on the video content itself.

        :param str video_id:
            YouTube video id; it is interpolated into the watch URL.
        :returns:
            The caption texts joined with single spaces, passed through
            ``clean_html`` and stripped, or ``None`` when no caption track
            with language code 'en' is found.
        """
        video = YouTube('https://www.youtube.com/watch?v={}'.format(video_id))
        captions = video.captions.get_by_language_code('en')
        if not captions:
            logger.info('Unable to return transcript for video %r!', video_id)
            return None

        # Read the XML once so the fallback below re-uses the same payload.
        xml_captions = captions.xml_captions
        try:
            root = ElementTree.fromstring(xml_captions)
        except UnicodeEncodeError:
            # Fallback kept from the original code: retry with UTF-8 bytes
            # when the text form cannot be encoded by the parser.
            root = ElementTree.fromstring(xml_captions.encode("utf-8"))

        # format captions as plaintext and strip trailing whitespace and html.
        # Iterate the element directly: Element.getchildren() was removed in
        # Python 3.9.
        captions_list = []
        for subtitle in root:
            text = subtitle.text or u''
            caption = unescape(text.replace('\n', ' ').replace('  ', ' '))
            captions_list.append(u"{text} ".format(text=caption))
        transcript = clean_html(
            html.fromstring(u''.join(captions_list).strip()))
        return transcript.text_content().strip()
示例#2
0
    def xml_caption_to_srt(self, xml_captions):
        """Convert xml caption tracks to "SubRip Subtitle (srt)".

        Each child element of the XML root becomes one numbered srt segment
        (numbering starts at 1). Newlines inside a caption are replaced with
        spaces and HTML entities are unescaped.

        :param str xml_captions:
            XML formatted caption tracks. Every child element must carry a
            ``start`` attribute; a missing ``dur`` attribute is treated as a
            zero-length caption.
        :rtype: str
        :returns:
            The srt segments separated by blank lines, with leading and
            trailing whitespace stripped.
        """
        segments = []
        root = ElementTree.fromstring(xml_captions)
        # Iterate the element directly: Element.getchildren() was removed in
        # Python 3.9. enumerate(start=1) yields the 1-based srt sequence number.
        for sequence_number, child in enumerate(root, start=1):
            text = child.text or ''
            caption = unescape(text.replace('\n', ' ').replace('  ', ' '))
            start = float(child.attrib['start'])
            # Some caption entries omit "dur"; fall back to a zero duration
            # rather than failing the whole conversion with a KeyError.
            duration = float(child.attrib.get('dur', 0.0))
            end = start + duration
            segments.append(
                '{seq}\n{start} --> {end}\n{text}\n'.format(
                    seq=sequence_number,
                    start=self.float_to_srt_time_format(start),
                    end=self.float_to_srt_time_format(end),
                    text=caption,
                )
            )
        return '\n'.join(segments).strip()