def vttToWebCaptions(operator_object, vttObject):
    """Fetch a WebVTT file from S3 and convert its cues to web captions.

    Args:
        operator_object: Workflow operator handle. On a read failure its
            workflow status is set to "Error" and the failure text is stored
            under the ``WebCaptionsError`` metadata key.
        vttObject: Mapping holding the ``"Bucket"`` and ``"Key"`` of the VTT
            object in S3.

    Returns:
        A list of dicts, one per cue, with the keys ``"start"`` and ``"end"``
        (the cue timestamps passed through ``formatTimeVTTtoSeconds``) and
        ``"caption"`` (the cue text).

    Raises:
        MasExecutionError: If the object cannot be fetched or decoded.
    """
    s3_client = boto3.client('s3')
    try:
        print("Getting data from s3://" + vttObject["Bucket"] + "/" +
              vttObject["Key"])
        response = s3_client.get_object(Bucket=vttObject["Bucket"],
                                        Key=vttObject["Key"])
        vtt_text = response['Body'].read().decode('utf-8')
    except Exception as err:
        # Record the failure on the workflow before aborting the operator.
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            WebCaptionsError="Unable read VTT file. " + str(err))
        raise MasExecutionError(operator_object.return_output_object())

    return [
        {
            "start": formatTimeVTTtoSeconds(cue.start),
            "end": formatTimeVTTtoSeconds(cue.end),
            "caption": cue.text,
        }
        for cue in webvtt.read_buffer(StringIO(vtt_text))
    ]
    def parse_webvtt_subtitles_to_text(subtitle_data):
        """
        Convert raw WebVTT subtitle data into de-duplicated plain text.

        Blank captions are discarded. Each remaining caption is compared
        with the one right before it and is dropped when it contains the
        previous caption's text; the survivors are joined with newlines.

        Return values:
            subtitles, as a text string (None when there is nothing to return)
            retryable_error, boolean: if we should discard this and try again
                later (e.g. a weird network error or rate-limiting)
            non_retryable_error, boolean: if we shouldn't retry, e.g. because
                there were no subtitles
        """
        if not subtitle_data:
            # if there's no subtitle data, it's a non-retryable error
            return None, False, True

        if SUBTITLE_RATE_LIMIT_STRING in subtitle_data:
            log.info("subtitle_data {}".format(subtitle_data))
            return None, True, False  # if we're rate-limited, it's a retryable error

        subtitle_lines = [
            caption.text
            for caption in webvtt.read_buffer(StringIO(subtitle_data))
            if caption.text.strip() != ''
        ]
        if not subtitle_lines:
            # The data parsed but held no non-blank caption. Indexing
            # subtitle_lines[0] below would raise IndexError, so report it
            # the same way as missing subtitles.
            return None, False, True

        subtitle_lines_deduped = [subtitle_lines[0]]
        for line_a, line_b in zip(subtitle_lines[:-1], subtitle_lines[1:]):
            if line_a not in line_b:
                subtitle_lines_deduped.append(line_b)
        subs = '\n'.join(subtitle_lines_deduped)
        return subs, False, False
Exemplo n.º 3
0
    def _get_sentences(self, file_uri: str) -> List[Dict[str, Union[str, float]]]:
        """Group the caption blocks of a caption file into timestamped sentences.

        Caption blocks are accumulated until one matches
        ``self.end_of_sentence_pattern``; the accumulated text is then emitted
        as a sentence spanning from the start of the first accumulated block
        to the end of the matching block. Any trailing blocks that never hit
        an end-of-sentence match are emitted as a final sentence.

        Args:
            file_uri: Location of the caption file, passed to
                ``self._request_caption_content``.

        Returns:
            A list of dicts with the keys ``'start_time'``, ``'end_time'``
            and ``'text'``.
        """
        # Parse from a file-like view of the caption content; the context
        # manager closes the buffer even if parsing raises.
        with io.StringIO(self._request_caption_content(file_uri)) as buffer:
            captions = webvtt.read_buffer(buffer).captions

        sentences = []
        # Caption texts collected for the sentence currently being built
        lines = []
        # None means "no sentence in progress". A falsy check cannot be used
        # here because 0.0 is a legitimate start time for the first caption.
        start_time = None
        for caption in captions:
            if start_time is None:
                start_time = caption.start_in_seconds
            lines.append(caption.text)
            # Caption block closes the current sentence
            if re.search(self.end_of_sentence_pattern, caption.text):
                sentences.append({'start_time': start_time,
                                  'end_time': caption.end_in_seconds,
                                  'text': ' '.join(lines)})
                # Reset for the start of a new sentence
                lines = []
                start_time = None

        # If any leftovers in lines, add a sentence for that.
        if lines:
            sentences.append({'start_time': start_time,
                              'end_time': captions[-1].end_in_seconds,
                              'text': ' '.join(lines)})
        return sentences
Exemplo n.º 4
0
    def vttToCaptions(self, vttObject):
        """Read a WebVTT file from S3 and convert its cues to caption dicts.

        Args:
            vttObject: Mapping holding the ``"Bucket"`` and ``"Key"`` of the
                VTT object in S3.

        Returns:
            A list of dicts, one per cue, with the keys ``"start"`` and
            ``"end"`` (the cue timestamps passed through
            ``self.formatTimeVTTtoSeconds``) and ``"caption"`` (the cue text).
            An empty list is returned when the file cannot be read.
        """
        try:
            self.logger.debug("Getting data from s3://" + vttObject["Bucket"] +
                              "/" + vttObject["Key"])
            vtt = S3Helper().readFromS3(vttObject["Bucket"], vttObject["Key"])
            self.logger.debug(vtt)
        except Exception as e:
            # Best-effort: report the read failure and return no captions.
            # Falling through with an empty string would make the WebVTT
            # parser reject the payload as malformed, hiding the real error.
            self.logger.error(e)
            return []

        return [
            {
                "start": self.formatTimeVTTtoSeconds(vttcaption.start),
                "end": self.formatTimeVTTtoSeconds(vttcaption.end),
                "caption": vttcaption.text,
            }
            for vttcaption in webvtt.read_buffer(StringIO(vtt))
        ]
Exemplo n.º 5
0
    def test_read_memory_buffer(self):
        """read_buffer() parses an in-memory text buffer into a caption list."""
        with open(self._get_file('sample.vtt'), 'r', encoding='utf-8') as sample:
            contents = sample.read()

        parsed = webvtt.read_buffer(io.StringIO(contents))
        self.assertIsInstance(parsed.captions, list)
Exemplo n.º 6
0
 def _parse_subs(self, subs: str) -> List[str]:
     """Return the text of every caption found in a WebVTT string.

     Args:
         subs: WebVTT document content.

     Returns:
         The caption texts in file order. A caption whose text cannot be
         read is skipped.
     """
     lines = []
     for caption in webvtt.read_buffer(StringIO(subs)):
         try:
             lines.append(caption.text)
         except Exception:
             # Best-effort: skip the unreadable caption. A bare ``except``
             # would also swallow KeyboardInterrupt and SystemExit.
             continue
     return lines
Exemplo n.º 7
0
 def _get_captions(
     self, closed_caption_content: str
 ) -> List[webvtt.structures.Caption]:
     """Parse WebVTT content and return its caption blocks.

     Args:
         closed_caption_content: Text content of a WebVTT caption file.

     Returns:
         The list of caption blocks found in the content.
     """
     # Wrap the content in a file-like object; the context manager closes
     # it even when the parser raises.
     with io.StringIO(closed_caption_content) as buffer:
         return webvtt.read_buffer(buffer).captions
Exemplo n.º 8
0
def scrape_3c_media(url):
    """Collect the captions of every track in the playlist found at ``url``.

    The page is fetched and handed to ``get_w3_info``, which yields a title
    and a config. Each track file listed under the config's playlist is
    downloaded and parsed as WebVTT.

    Returns:
        A ``(title, transcript_log)`` tuple where ``transcript_log`` is a
        list of ``(cue start, cue text)`` pairs in playlist order.
    """
    transcript_log = []
    with get_session() as ses:
        title, config = get_w3_info(ses.get(url))
        for video in config['playlist']:
            for track in video['tracks']:
                track_response = ses.get(track['file'])
                with StringIO(track_response.text) as vtt_buffer:
                    transcript_log.extend(
                        (cue.start, cue.text)
                        for cue in webvtt.read_buffer(vtt_buffer)
                    )
    return (title, transcript_log)
Exemplo n.º 9
0
def translateVTT(subid: ObjectId, language: str, translator: str):
    """Return the translation of a stored VTT subtitle, using a cache.

    The subtitle is looked up by ``subid``. Under the per-subtitle edit lock
    and a Mongo transaction, the translation cache is consulted for the
    ``(subid, language, translator)`` combination. A cached entry is reused
    unless its ``version`` is older than the subtitle's
    ``meta.modified_at``; otherwise the subtitle is translated and the cache
    entry is inserted or refreshed.

    Args:
        subid: Id of the subtitle document.
        language: Target language handed to the translator.
        translator: ``'googletrans'`` or ``'baidutrans'``.

    Returns:
        The translated content (freshly produced or taken from the cache).

    Raises:
        UserError: ``ITEM_NOT_FOUND`` if the subtitle does not exist,
            ``ONLY_VTT_SUPPORTED`` if it is not in VTT format,
            ``UNSUPPORTED_TRANSLATOR`` for any other translator name.
    """
    sub_obj = db.subtitles.find_one({'_id': subid})
    if sub_obj is None:
        raise UserError('ITEM_NOT_FOUND')
    if sub_obj['format'] != 'vtt':
        raise UserError('ONLY_VTT_SUPPORTED')

    with redis_lock.Lock(rdb, "subtitleEdit:" +
                         str(subid)), MongoTransaction(client) as s:
        cached = db.subtitle_translation_cache.find_one(
            {
                "subid": subid,
                "lang": language,
                "translator": translator
            },
            session=s())

        # Cache hit: an entry exists and is not older than the subtitle.
        if cached is not None and not (
                cached['version'] < sub_obj['meta']['modified_at']):
            return cached['content']

        # Cache miss: translate the parsed subtitle.
        parsed = webvtt.read_buffer(io.StringIO(sub_obj['content']))
        if translator == 'googletrans':
            translated = translate_google(parsed, language)
        elif translator == 'baidutrans':
            # Calls to the Baidu translator are serialized by their own lock.
            with redis_lock.Lock(rdb, "lock-baidutrans"):
                translated = translate_baidu(parsed, language)
        else:
            raise UserError('UNSUPPORTED_TRANSLATOR')

        if cached is None:
            db.subtitle_translation_cache.insert_one(
                {
                    'subid': subid,
                    'translator': translator,
                    'lang': language,
                    'version': sub_obj['meta']['modified_at'],
                    'content': translated
                },
                session=s())
        else:
            db.subtitle_translation_cache.update_one(
                {'_id': cached['_id']}, {
                    '$set': {
                        'version': sub_obj['meta']['modified_at'],
                        'content': translated
                    }
                },
                session=s())
        s.mark_succeed()
        return translated
Exemplo n.º 10
0
def translate_captions_file(inbuf, outbuf, method='inplace'):
    """Translate the captions read from ``inbuf`` and write them to ``outbuf``.

    Args:
        inbuf: Readable buffer containing the WebVTT input.
        outbuf: Writable buffer that receives the processed captions.
        method: Forwarded unchanged to ``translate_texts``.
    """
    vtt = webvtt.read_buffer(inbuf)

    # Preparation steps applied before translating.
    encode_names(vtt)
    fix_hyphenation(vtt)

    translate_texts(vtt, method)

    # Follow-up steps applied after translating.
    encode_names(vtt, back=True)
    revert_hyphenation(vtt)

    vtt.write(outbuf)
Exemplo n.º 11
0
 def test_read_memory_buffer_carriage_return(self):
     """A buffer with carriage-return line endings yields all three captions.

     See https://github.com/glut23/webvtt-py/issues/29
     """
     crlf_payload = textwrap.dedent('''\
         WEBVTT\r
         \r
         00:00:00.500 --> 00:00:07.000\r
         Caption text #1\r
         \r
         00:00:07.000 --> 00:00:11.890\r
         Caption text #2\r
         \r
         00:00:11.890 --> 00:00:16.320\r
         Caption text #3\r
     ''')
     parsed = webvtt.read_buffer(io.StringIO(crlf_payload))
     self.assertEqual(len(parsed.captions), 3)
Exemplo n.º 12
0
def infer_vtt_indic_en():
    """Translate the cue text of a WebVTT payload and re-align it to the cues.

    The WebVTT document is read from the ``text`` form field. All cue texts
    are joined into one lower-cased paragraph with punctuation stripped,
    re-punctuated with ``rpunct``, split into sentences with ``splitter`` and
    translated with ``model.batch_translate``. The translated words are then
    handed back to the original cues in proportion to each cue's share of
    the source words, and the cue texts are overwritten in place.

    Returns:
        A dict whose ``'text'`` entry is the content of the updated WebVTT
        document.
    """
    start_time = time.time()
    model, source_lang, target_lang = get_inference_params()
    source_text = request.form['text']

    vad = webvtt.read_buffer(StringIO(source_text))
    source_sentences = [
        v.text.replace('\r', '').replace('\n', ' ') for v in vad
    ]

    # Flatten every cue into one lower-cased, punctuation-free paragraph
    # before restoring punctuation.
    large_sentence = ' '.join(source_sentences).lower()
    large_sentence = re.sub(r'[^\w\s]', '', large_sentence)
    punctuated = rpunct.punctuate(large_sentence, batch_size=32)
    end_time = time.time()
    print("Time Taken for punctuation: {} s".format(end_time - start_time))

    start_time = time.time()
    split_sents = splitter([punctuated])
    output_sents = model.batch_translate(split_sents, source_lang, target_lang)

    # Join the sentence lists directly. Pairing them through a dict keyed by
    # the source sentence would collapse repeated sentences and lose their
    # translations.
    punct_para = ' '.join(split_sents)
    nmt_para = ' '.join(output_sents)
    nmt_words = nmt_para.split(' ')

    len_punct = len(punct_para.split(' '))  # always >= 1: ''.split(' ') == ['']
    len_nmt = len(nmt_words)

    # Allocate translated words cumulatively. Each cue ends where its running
    # share of the source words falls in the translation, so rounding does
    # not accumulate and the tail of the translation is not dropped.
    start = 0
    source_words_seen = 0
    for caption in vad:
        if caption.text == '':
            continue

        source_words_seen += len(caption.text.split(' '))
        end = min(len_nmt, source_words_seen * len_nmt // len_punct)

        caption.text = ' '.join(nmt_words[start:end])
        start = end

    end_time = time.time()

    print("Time Taken for translation: {} s".format(end_time - start_time))

    return {
        'text': vad.content,
    }
Exemplo n.º 13
0
 def test_read_malformed_buffer(self):
     """read_buffer() raises MalformedFileError for each malformed payload."""
     for bad_content in ('', 'MOCK MELFORMED CONTENT'):
         source = io.StringIO(bad_content)
         with self.assertRaises(MalformedFileError):
             webvtt.read_buffer(source)
Exemplo n.º 14
0
 def test_read_file_buffer(self):
     """read_buffer() accepts an open text file and exposes a caption list."""
     sample_path = self._get_file('sample.vtt')
     with open(sample_path, 'r', encoding='utf-8') as handle:
         parsed = webvtt.read_buffer(handle)
         self.assertIsInstance(parsed.captions, list)
Exemplo n.º 15
0
def print_text_from_vtt(inbuf):
    """Print the caption text of a WebVTT buffer, one caption per line.

    A hyphen ending one caption immediately followed by a hyphen starting
    the next is removed together with the line break between them, so the
    two captions are joined on a single line.
    """
    cue_texts = [cue.text for cue in webvtt.read_buffer(inbuf)]
    joined = '\n'.join(cue_texts)
    print(re.sub('-\n-', '', joined))