def convert_sup_to_srt(filename, file_info):
    """Write one WebVTT file per language column of a subtitle sheet.

    ``get_gformat_subs(filename)`` supplies ``(headers, group)``: one header
    per language and the subtitle rows. In every row, element 0 is used as the
    cue start, element 1 as the cue end, and the text for the n-th language is
    read from element ``n + 2`` (the first language is column 3).

    For each language the cues are saved to ``temp.vtt`` and then copied,
    prefixed with a ``WEBVTT`` header line, to
    ``live/subtitles/<basename><tag>.vtt``. ``prog_dict`` keeps the largest cue
    count seen per base name, and a ``[shortname, cue_count, path, language]``
    entry is appended to ``file_info``.

    The output file is written with plain file I/O rather than shell commands,
    so file names containing spaces or shell metacharacters are handled
    safely. An unwritable destination therefore raises ``IOError``/``OSError``
    instead of failing silently in a subshell.
    """
    column = 2
    try:
        (headers, group) = get_gformat_subs(filename)
        shortname = os.path.splitext(os.path.basename(filename))[0]
        for language in headers:
            subs = pysrt.SubRipFile()
            column += 1
            tag = language.replace(" ", "_").decode('ascii', 'ignore')
            for line in group:
                # Cells holding at most one character are treated as empty.
                if len(line[column]) > 1:
                    current_sub = pysrt.SubRipItem()
                    # WebVTT timestamps use '.' where SRT uses ','.
                    current_sub.start = line[0].replace(',', '.')
                    current_sub.end = line[1].replace(',', '.')
                    current_sub.text = line[column].decode('utf-8')
                    subs.append(current_sub)
            subs.save('temp.vtt')
            new_filename = 'live/subtitles/' + shortname + tag + '.vtt'
            # Same result as `echo WEBVTT > f; cat temp.vtt >> f`, without
            # handing the file name to a shell.
            with open('temp.vtt', 'rb') as cue_file:
                cue_bytes = cue_file.read()
            with open(new_filename, 'wb') as vtt_file:
                vtt_file.write(b'WEBVTT\n')
                vtt_file.write(cue_bytes)
            # Remember the largest cue count seen for this programme.
            prog_dict[shortname] = max(prog_dict.get(shortname, 0), len(subs))
            file_info.append([shortname, len(subs), new_filename, language])
            print(new_filename)
    except AttributeError:
        # We would expect this to be because we've been handed a file that's
        # outside our type.
        # TODO: we should identify exactly where this error appears for
        # various types of tests
        pass
예제 #2
0
def make_subtitles(frames_time, frames_annotation, user_id):
    """Build ``subtitles.srt`` for a user from frame times and annotations.

    Args:
        frames_time: sequence of records; element 0 of each record is a
            zero-based number (the cue index is that value plus one) and
            element 1 is assigned to the cue's ``start.seconds``.
        frames_annotation: sequence parallel to ``frames_time``; element 1 of
            each record is the cue text.
        user_id: integer used to build the output path
            ``MAIN_DIRECTORY + '<user_id>/subtitles.srt'``.

    Each cue ends where the next one starts. The final cue has no successor,
    so its end time is left at the ``SubRipItem`` default.
    NOTE(review): that leaves the last cue ending before it starts - confirm
    whether a real end time should be supplied.

    An empty ``frames_time`` now produces an empty subtitle file instead of
    raising ``IndexError``.
    """
    srt_file = pysrt.SubRipFile(encoding='utf-8')
    last = len(frames_time) - 1
    for i, frame in enumerate(frames_time):
        sub = pysrt.SubRipItem()
        sub.index = frame[0] + 1
        sub.start.seconds = frame[1]
        if i < last:
            sub.end.seconds = frames_time[i + 1][1]
        sub.text = frames_annotation[i][1]
        srt_file.append(sub)
    srt_file.save(MAIN_DIRECTORY + '%d/subtitles.srt' % user_id)
예제 #3
0
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``((start, end), text)`` pairs as SRT-formatted text.

    Cues are numbered from 1. Each start is moved earlier by ``show_before``
    (clamped at 0) and each end is moved later by ``show_after``. The rendered
    cues are joined with newlines and returned as one text string.
    """
    cues = pysrt.SubRipFile()
    for number, (span, caption) in enumerate(subtitles, start=1):
        begin, finish = span
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = six.text_type(caption)
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        cues.append(cue)
    rendered = [six.text_type(cue) for cue in cues]
    return '\n'.join(rendered)
예제 #4
0
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``((start, end), text)`` pairs as SRT-formatted unicode text.

    Each start is moved earlier by ``show_before`` (clamped at 0) and each end
    is moved later by ``show_after``. No cue index is assigned here, so every
    cue keeps the ``SubRipItem`` default index.
    """
    srt = pysrt.SubRipFile()
    for span, caption in subtitles:
        cue = pysrt.SubRipItem()
        cue.text = force_unicode(caption)
        begin, finish = span
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        srt.append(cue)
    return '\n'.join(unicode(cue) for cue in srt)
예제 #5
0
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``((start, end, num), text)`` pairs as SRT-formatted text.

    The range of every entry must have exactly three elements; the third one
    is not used. Cues are numbered from 1, starts are moved earlier by
    ``show_before`` (clamped at 0) and ends are moved later by ``show_after``.
    """
    srt = pysrt.SubRipFile()
    for number, (span, caption) in enumerate(subtitles, 1):
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = force_unicode(caption)
        # Three-way unpack kept on purpose: ranges of another length must fail.
        begin, finish, _unused = span
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        srt.append(cue)
    rendered = [six.text_type(cue) for cue in srt]
    return '\n'.join(rendered)
예제 #6
0
def criarArquivoSRT(tempodalegenda, preenchimento_antes=0, preenchimento_depois=0):
    """Render ``((start, end), text)`` pairs as SRT-formatted text.

    ``preenchimento_antes`` is subtracted from every start (clamped at 0) and
    ``preenchimento_depois`` is added to every end. Cues are numbered from 1
    and the rendered cues are returned joined by newlines.
    """
    cues = pysrt.SubRipFile()
    for number, (span, caption) in enumerate(tempodalegenda, start=1):
        begin, finish = span
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = six.text_type(caption)
        cue.start.seconds = max(0, begin - preenchimento_antes)
        cue.end.seconds = finish + preenchimento_depois
        cues.append(cue)
    return '\n'.join([six.text_type(cue) for cue in cues])
예제 #7
0
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``(range, text)`` pairs as SRT-formatted text.

    Only the first two elements of each range (start and end) are read. Cues
    are numbered from 1, starts are moved earlier by ``show_before`` (clamped
    at 0) and ends are moved later by ``show_after``.
    """
    srt = pysrt.SubRipFile()
    for number, (span, caption) in enumerate(subtitles, 1):
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = force_unicode(caption)
        begin, finish = span[0], span[1]
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        srt.append(cue)
    return '\n'.join(str(cue) for cue in srt)
예제 #8
0
def xml_to_srt(xml_data):
    """
    xml_data - iterable of XML elements (per the original note: an
    ElementTree received from Google's servers).

    Returns a SubRipFile with one cue per child element. The cue text is the
    unescaped element text, the cue starts at the "start" attribute and ends
    at "start" + "dur".
    """
    srt = pysrt.SubRipFile()
    for node in xml_data:
        cue = pysrt.SubRipItem()
        cue.text = h.unescape(node.text)
        begin = float(node.attrib["start"])
        cue.start.seconds = begin
        cue.end.seconds = begin + float(node.attrib["dur"])
        srt.append(cue)
    return srt
예제 #9
0
def srt_formatter(subtitles, padding_before=0, padding_after=0):
    """
    Turn ``((start, end), text)`` pairs into SRT-formatted text.

    Cues are numbered from 1. ``padding_before`` is subtracted from each start
    (never going below 0) and ``padding_after`` is added to each end.
    """
    cues = pysrt.SubRipFile()
    for number, (span, caption) in enumerate(subtitles, start=1):
        begin, finish = span
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = six.text_type(caption)
        cue.start.seconds = max(0, begin - padding_before)
        cue.end.seconds = finish + padding_after
        cues.append(cue)
    rendered = [six.text_type(cue) for cue in cues]
    return '\n'.join(rendered)
예제 #10
0
 def generate(self,
              subtitles,
              show_before=0,
              show_after=0,
              *args,
              **kwargs) -> str:
     """Render ``((start, end), text)`` pairs as SRT-formatted text.

     Cues are numbered from 1; starts are moved earlier by ``show_before``
     (clamped at 0) and ends are moved later by ``show_after``. Extra
     positional and keyword arguments are accepted and ignored.
     """
     cues = pysrt.SubRipFile()
     for number, (span, caption) in enumerate(subtitles, start=1):
         begin, finish = span
         cue = pysrt.SubRipItem()
         cue.index = number
         cue.text = str(caption)
         cue.start.seconds = max(0, begin - show_before)
         cue.end.seconds = finish + show_after
         cues.append(cue)
     return '\n'.join([str(cue) for cue in cues])
예제 #11
0
    def write_to_file(self, output_path: Opt[str] = None):
        """Append ``self.subtitles`` to the SRT file at ``output_path``.

        The file is touched first and then opened with pysrt, so cues already
        present in an existing file are kept and the new ones are added after
        them. Each subtitle's ``start_time`` / ``end_time`` is multiplied by
        1000 before conversion (presumably seconds to milliseconds - confirm
        against the subtitle type). Cue indexes start at 0.

        Returns:
            ``output_path``, unchanged.
        """
        util.touch(output_path)
        srt = pysrt.open(output_path, encoding='utf-8')

        for position, entry in enumerate(self.subtitles):
            begin = pysrt.SubRipTime.from_ordinal(entry.start_time * 1000)
            finish = pysrt.SubRipTime.from_ordinal(entry.end_time * 1000)
            cue = pysrt.SubRipItem(index=position,
                                   text=entry.text,
                                   start=begin,
                                   end=finish)
            srt.append(cue)

        srt.save(output_path, encoding='utf-8')
        return output_path
예제 #12
0
def combine_srt(srt_list):
    """
    srt_list - a list of SubRipFiles
    Combines the text of all SubRipFiles in srt_list and returns a SubRipFile
    instance, or None when srt_list is None or empty.

    The first file is the reference: it decides how many cues are produced and
    supplies every cue's start and end time. Cue ``i`` of the result holds the
    text of cue ``i`` from each file, one per line, with trailing whitespace
    stripped. Every file must therefore have at least as many cues as the
    first one, otherwise indexing raises IndexError.
    """
    if not srt_list:
        return None
    reference = srt_list[0]
    combined = pysrt.SubRipFile()
    # range() rather than xrange() so the function runs on Python 2 and 3.
    for index in range(len(reference)):
        sub = pysrt.SubRipItem()
        sub.text = "\n".join(srt[index].text for srt in srt_list).rstrip()
        sub.start = reference[index].start
        sub.end = reference[index].end
        combined.append(sub)
    return combined
예제 #13
0
def write_transcripts(transcript_filename, transcript, reg):
    """Write ``transcript`` lines with their time regions as an SRT file.

    Args:
        transcript_filename: file name appended to the module-level
            ``output_filepath`` to form the destination path.
        transcript: sequence of subtitle texts.
        reg: sequence of ``(start, end)`` pairs parallel to ``transcript``.

    Cues are numbered from 0 and a negative start is clamped to 0. The file is
    written UTF-8 encoded. Always returns True.
    """
    print(transcript)
    import six
    cues = pysrt.SubRipFile()
    total = len(transcript)
    for position, ((begin, finish), line) in enumerate(zip(reg, transcript)):
        if position >= total:
            break
        print(position, begin, finish, line)
        cue = pysrt.SubRipItem()
        cue.index = position
        cue.text = six.text_type(line)
        cue.start.seconds = max(0, begin)
        cue.end.seconds = finish
        cues.append(cue)
    rendered = '\n'.join([six.text_type(cue) for cue in cues])
    with open(output_filepath + transcript_filename, "wb") as handle:
        handle.write(rendered.encode("utf-8"))
    print("+ Successfully Generated Subtitles.")
    return True
예제 #14
0
파일: main.py 프로젝트: mihir97/Videocorum
def do_subtitles_generation(sub_write_file, filename, chunk_sound_file, start_chunk):
    """Split an audio chunk at silences, transcribe each piece and append cues.

    Args:
        sub_write_file: subtitle file object the new cues are appended to; it
            is saved to ``filename + '.srt'`` after every cue.
        filename: base path of the ``.srt`` file (without extension).
        chunk_sound_file: audio segment to analyse; it is sliced by the split
            offsets and each slice is exported as a WAV file.
        start_chunk: offset added to every split position to obtain the cue
            times assigned to ``start.milliseconds`` / ``end.milliseconds``.

    Returns:
        The last split offset inside ``chunk_sound_file`` (0 when no usable
        non-silent range was found).

    Relies on the module-level names ``counter`` (running cue index, advanced
    here), ``threshold``, ``r`` and ``sr``.
    """
    global counter

    # NOTE(review): earlier comments spoke of "half a second" and "-16 dBFS",
    # which does not match the values actually passed (50 and -29).
    voices = detect_nonsilent(chunk_sound_file,
                              min_silence_len=50,
                              silence_thresh=-29)
    print(voices)

    # A range that reaches the very end of the chunk is dropped.
    if voices and voices[-1][1] == len(chunk_sound_file):
        del voices[-1]

    # Start a new split whenever a range ends more than `threshold` past the
    # previous split point.
    splits = [0]
    for voice in voices:
        range_end = voice[1]
        if range_end > splits[-1] + threshold:
            splits.append(range_end)

    # Make sure the end of the last remaining range is a split point.
    if voices and splits[-1] != voices[-1][1]:
        splits.append(voices[-1][1])
    end_chunk = splits[-1]

    print(splits)

    print("Split complete")

    for piece_no, (begin, finish) in enumerate(zip(splits, splits[1:])):
        out_file = ".//splitAudio//chunk{0}.wav".format(piece_no)
        print("exporting", out_file)
        chunk_sound_file[begin:finish].export(out_file, format="wav")
        with sr.AudioFile(out_file) as source:
            audio = r.record(source)
            text = r.recognize_sphinx(audio)
            cue = pysrt.SubRipItem()
            cue.index = counter
            counter += 1
            cue.start.milliseconds = start_chunk + begin
            cue.end.milliseconds = start_chunk + finish
            cue.text = text
            sub_write_file.append(cue)
            # Saved after every cue so partial results are already on disk.
            sub_write_file.save(filename + '.srt', encoding='utf-8')
            print(text)

    return end_chunk
예제 #15
0
def export_to_srt(subtitulation):
    """Serialize the subtitles of ``subtitulation`` into an in-memory SRT file.

    Rows of ``db.subtitle`` whose ``subtitulation_id`` matches are read in
    order of their ``starts`` field. Byte-string bodies are decoded with
    ``ENCODING``; unicode bodies are used as they are. Cue indexes start at 0.

    Returns:
        A ``StringIO`` buffer holding the SRT text, rewound to position 0.
    """
    rows = db(db.subtitle.subtitulation_id == subtitulation.id).select(
        orderby=db.subtitle.starts)
    import pysrt
    import StringIO
    buf = StringIO.StringIO()
    srt = pysrt.SubRipFile(encoding=ENCODING)
    for position, row in enumerate(rows):
        cue = pysrt.SubRipItem()
        if isinstance(row.body, unicode):
            print("is unicode")
            cue.text = row.body
        else:
            print("is not unicode")
            cue.text = unicode(row.body, ENCODING)
        cue.start = pysrt.SubRipTime.from_time(row.starts)
        cue.end = pysrt.SubRipTime.from_time(row.ends)
        cue.index = position
        srt.append(cue)
    srt.write_into(buf)
    buf.seek(0)
    return buf
def merge_subtitles(sub1, sub2):
    """Return one SubRipItem that spans both ``sub1`` and ``sub2``.

    The result uses the smaller of the two indexes, the earlier start and the
    later end (times are compared through ``str_to_time_obj``). On a tie the
    value from ``sub2`` is taken. The text comes from ``sub1`` only.
    NOTE(review): the text of ``sub2`` is discarded - confirm this is intended.
    """
    merged_index = min(sub1.index, sub2.index)

    # Earlier of the two start times.
    first_start = str_to_time_obj(str(sub1.start))
    second_start = str_to_time_obj(str(sub2.start))
    earliest = sub1.start if first_start < second_start else sub2.start

    # Later of the two end times.
    first_end = str_to_time_obj(str(sub1.end))
    second_end = str_to_time_obj(str(sub2.end))
    latest = sub1.end if first_end > second_end else sub2.end

    return pysrt.SubRipItem(merged_index,
                            start=str(earliest),
                            end=str(latest),
                            text=str(sub1.text))
예제 #17
0
def merge_sub(sub1, sub2, bar, driver):
    """Merge two subtitle tracks into one dual-language SubRipFile.

    Both tracks are converted with ``dataframe_sub`` (``sub1`` tagged "en",
    ``sub2`` tagged "ru"), their cues are clustered on ``(start, end)`` with
    agglomerative clustering, and each cluster becomes one output cue whose
    text joins the English and Russian parts.

    Args:
        sub1: first subtitle track, passed to ``dataframe_sub`` as "en".
        sub2: second subtitle track, passed to ``dataframe_sub`` as "ru".
        bar: total progress amount this call may add to ``progressBar``; it is
            divided evenly between the clusters.
        driver: passed through unchanged to ``translate_sub``.

    Returns:
        The merged ``pysrt.SubRipFile`` after ``clean_indexes()``.

    Reads several module-level GUI objects (``space_var``,
    ``clusters_auto_var``, ``clusters_manual_entry``,
    ``clusters_method_combobox``, ``translate_var``, ``root``,
    ``progressBar``).
    """
    # Separator placed between the two language texts of one cue.
    if space_var.get() == 1:
        space_sub = '\n&nbsp;\n'
    else:
        space_sub = '\n'
    sub1_df = dataframe_sub(sub1, "en")
    sub2_df = dataframe_sub(sub2, "ru")
    df = pd.concat([sub1_df, sub2_df], axis=0)
    # NOTE(review): 'sum' and 'plus' are not read again in this function.
    df['sum'] = df[['start', 'end']].sum(axis=1)
    df['plus'] = (df['start'] + df['end']) / 2
    df = df.sort_values(by='start', ascending=True)
    # agglomerative clustering
    if clusters_auto_var.get() == 1:
        clusters_list = []
        # quality assessment using the silhouette score
        silhouette = []
        # Try 20 distance thresholds between 2000 and 10000 and record, for
        # each, the resulting cluster count and its silhouette score.
        for i in np.linspace(0.2, 1, 20):
            root.update()
            threshold = float(i) * 10000
            clustering = AgglomerativeClustering(
                n_clusters=None,
                distance_threshold=threshold).fit(df[['start', 'end']])
            clusters = clustering.labels_
            clusters_list.append(len(pd.unique(clusters)))
            score = silhouette_score(df[['start', 'end']], clusters)
            silhouette.append(score)
        # Re-fit with the cluster count that scored best.
        max_silhouette = np.argmax(silhouette)
        clustering = AgglomerativeClustering(
            n_clusters=clusters_list[max_silhouette]).fit(df[['start', 'end']])
    else:
        # Manual mode: threshold and linkage come from the GUI widgets.
        threshold = float(clusters_manual_entry.get()) * 10000
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=threshold,
            linkage=clusters_method_combobox.get()).fit(df[['start', 'end']])
    clusters = clustering.labels_
    # add the clusters that were found
    df['cluster'] = clusters
    # Progress increment per cluster.
    bar_subs = float(bar) / float(len(pd.unique(clusters)))
    # create the new subtitle file
    double_sub = pysrt.SubRipFile(encoding='utf-8')
    translate_list = pysrt.SubRipFile(encoding='utf-8')
    for n, i in enumerate(pd.unique(clusters)):
        root.update()
        progressBar['value'] += bar_subs
        df_en = df[(df['language'] == 'en') & (df['cluster'] == i)]
        df_ru = df[(df['language'] == 'ru') & (df['cluster'] == i)]
        # Collapse each language's cues in this cluster into a single row:
        # texts joined by a space, earliest start, latest end.
        df_group_en = df_en.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        df_group_ru = df_ru.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        # Stack the two per-language rows and fold them into one row; the
        # 'language' column becomes the concatenation of the languages present.
        df_group = df_group_en.merge(
            df_group_ru,
            on=['cluster', 'text', 'start', 'end', 'language'],
            how='outer').groupby('cluster').agg({
                'text': space_sub.join,
                'start': 'first',
                'end': 'first',
                'language': ''.join
            })
        sub = pysrt.SubRipItem(index=n + 1,
                               start=int(df_group.iloc[0]['start']),
                               end=int(df_group.iloc[0]['end']),
                               text=str(df_group.iloc[0]['text']))
        double_sub.append(sub)
        # Cues whose combined language is exactly 'en' are queued for
        # translation when translation is enabled.
        if translate_var.get() == 1 and df_group['language'].values == 'en':
            translate_list.append(sub)
    if translate_var.get() == 1 and translate_list:
        translate_sub(translate_list, bar, driver)
    # re-index the subtitles
    double_sub.clean_indexes()
    return double_sub
		### CASO 4: the previus added line ends in a punctuation mark, no combination needed. Extra SRT lines should be created
		if  prevText =='' or prevText[-1] in END_PUNCTUATION :

			prevTimeStart = line.start
			prevTimeEnd = line.start + {'seconds':ratioTime * duration}
			for newSent in range(1,numSentences):

				#Updating text and end from current line
				#print(count, " to srt2 ", lineAux[:firstPunctuationIdx+1])
				print(count, " to srt2 ", arrStrings[newSent-1])
				newIndex = lastAdded+1
				
				#newLine = pysrt.SubRipItem(index=newIndex, start=prevEnd , end=prevEnd, text=lineAux[:firstPunctuationIdx+1])

				
				newLine = pysrt.SubRipItem(index=newIndex, start= prevTimeStart, end=prevTimeEnd, text=arrStrings[newSent-1])
				srtTransformed.append(newLine)
				newIndex+=1

				#print(lineAux,"|", lineAux[firstPunctuationIdx+1:])

				srtOriginal[count].text = arrStrings[newSent]
				srtOriginal[count].start = prevTimeEnd
				numFirstWords = len(nltk.word_tokenize(arrStrings[newSent]))
				duration = (line.end - srtOriginal[count].start).seconds#.to_time().to_seconds() 
				ratioTime = float(numFirstWords)/float(numTotalWords)

				divtime = srtOriginal[count].start + {'seconds':ratioTime * duration}
				srtOriginal[count].end = divtime

				#prevTimeStart
예제 #19
0
def find_summary_regions(srt_filename, summarizer, duration, language,
                         bonusWords, stigmaWords, videonamepart):
    """Select summary subtitles close to a target duration and save them.

    The number of sentences to request from ``summarize`` is first estimated
    from the average subtitle duration, then adjusted one sentence at a time
    until the total duration of the selected regions reaches ``duration`` (or
    no further progress is possible). The selected subtitles are re-timed back
    to back starting at 0 and written to ``videonamepart + ".srt"``.

    Args:
        srt_filename: path of the source SRT file.
        summarizer, language, bonusWords, stigmaWords: passed through to
            ``summarize``.
        duration: target total duration of the summary, in the same unit that
            ``total_duration_of_regions`` returns (presumably seconds -
            confirm).
        videonamepart: output path without the ``.srt`` extension.

    Returns:
        The region list returned by ``summarize`` for the final sentence
        count; each entry is indexed as ``(start, end)``.

    NOTE(review): an SRT file without any subtitle makes the average
    computation divide by zero.
    """
    srt_file = pysrt.open(srt_filename)

    # Average time each subtitle is shown.
    clipList = list(map(srt_item_to_range, srt_file))
    avg_subtitle_duration = total_duration_of_regions(clipList) / len(srt_file)

    # Estimated number of sentences needed for the summary video.
    n_sentences = duration / avg_subtitle_duration
    print("nsentance : " + str(n_sentences))

    summary, summarizedSubtitles = summarize(srt_file, summarizer, n_sentences,
                                             language, bonusWords, stigmaWords)
    total_time = total_duration_of_regions(summary)
    print("total_time : " + str(total_time))
    prev_total_time = -1
    if total_time < duration:
        # Summary is too short: add one sentence at a time until it is long
        # enough, or until adding a sentence no longer changes the total.
        while total_time < duration:
            if prev_total_time == total_time:
                print("1 : Maximum summarization time reached")
                break
            print("1 : total_time : duration " + str(total_time) + " " +
                  str(duration))
            n_sentences += 1
            summary, summarizedSubtitles = summarize(
                srt_file, summarizer, n_sentences, language, bonusWords,
                stigmaWords)
            prev_total_time = total_time
            total_time = total_duration_of_regions(summary)
    else:
        # Summary is too long: remove one sentence at a time until it fits,
        # but never go below two sentences.
        while total_time > duration:
            if n_sentences <= 2:
                print("2 : Minimum summarization time reached")
                break
            print("2 : total_time : duration " + str(total_time) + " " +
                  str(duration))
            n_sentences -= 1
            summary, summarizedSubtitles = summarize(
                srt_file, summarizer, n_sentences, language, bonusWords,
                stigmaWords)
            total_time = total_duration_of_regions(summary)

    print("************ THis is summary array *********")
    print(summary)
    print("**********************************")

    print(
        "************************THis is summarizedSubtitles array *******************"
    )
    print(summarizedSubtitles)
    print("**********************************************************")

    # Re-time the selected subtitles back to back: each one starts where the
    # previous one ended and keeps its original length.
    starting = 0
    sub_rip_file = pysrt.SubRipFile()
    for index, item in enumerate(summarizedSubtitles):
        newSubitem = pysrt.SubRipItem()
        newSubitem.index = index
        newSubitem.text = item.text
        clip_duration = summary[index][1] - summary[index][0]
        ending = starting + clip_duration
        newSubitem.start.seconds = starting
        newSubitem.end.seconds = ending
        sub_rip_file.append(newSubitem)
        starting = ending

    print(sub_rip_file)

    path = videonamepart + ".srt"
    # The with-block closes the file; no explicit close() is needed.
    with open(path, "w+") as sf:
        for sub_item in sub_rip_file:
            sf.write(str(sub_item))
            sf.write("\n")

    # Return the resulting summarized region list.
    return summary
예제 #20
0
# Build the tuple of speaker prefixes used to recognise translation messages.
# For every configured name, a variant with a space before the colon is added
# too. The loop iterates over the original tuple, because rebinding
# `name_list` inside the body does not affect the iteration.
name_list = vtuber_list.vtuber_tl_list
for name in name_list:
    name_list = name_list + (name.replace(':', ' :'),)
    
# Add the single-letter tags "[a]" .. "[z]" to the name list.
for letter in range(ord('a'), ord('z') + 1):
    name_list = name_list + ('[' + chr(letter) + ']',)

# Read the chat log CSV given as the first command-line argument and turn
# every message that starts with a language tag (or, for English with the
# colon option enabled, with a known name and fewer than two colons) into one
# subtitle cue. `index`, `sub_count`, `sub_file`, `args` and `lang_dict` are
# expected to be defined earlier in the script.
with open(sys.argv[1]) as f:
    records = csv.DictReader(f)
    for row in records:
        msg = row['message']
        msg_lower = row['message'].lower()
        for tag in lang_dict[args.lang.lower()]:
            if msg_lower.startswith(tag) or (msg_lower.startswith(name_list) and args.lang.lower() == 'en' and args.colon and msg_lower.count(':') < 2):
                sub = pysrt.SubRipItem()
                sub.index = index
                # The cue starts at the message time shifted back by the
                # configured offset and lasts `args.duration`.
                sub_start = int(row['time_in_seconds']) - args.offset
                sub.start.seconds = sub_start
                sub.end.seconds = sub_start + args.duration
                # Remove the tag in its plain, upper-case and title-case
                # spellings, then a leading ": " and surrounding whitespace.
                sub.text = msg.replace(tag, '').replace(tag.upper(), '').replace(tag.title(), '')
                if sub.text.startswith(": "):
                    sub.text = sub.text.replace(": ", "", 1)
                sub.text = sub.text.strip()
                sub_file.append(sub)
                index += 1
                sub_count += 1
                # One cue per message: stop at the first matching tag.
                break
            
if not sub_count:
    print("No subtitles found")
예제 #21
0
                    del subs[i]
                if '\n' in subs[i].text:
                    # Split the subtitle at the hyphen and format the list
                    lines = [
                        line[1:] if line[0] == '-' else line
                        for line in subs[i].text.split('\n')
                    ]
                    length_milli = 1000 * abs(
                        float(
                            (subs[i].end.seconds - subs[i].start.seconds - 60)
                            % 60)) + float(subs[i].end.milliseconds -
                                           subs[i].start.milliseconds)
                    interval_milli = int(length_milli / len(lines))
                    dummy = pysrt.SubRipItem(
                        0,
                        start=str(subs[i].start),
                        end=str(subs[i].end),
                        text=""
                    )  # Use this just to get the right formatting for the time
                    dummy.shift(
                        milliseconds=+interval_milli
                    )  # Shift the dummy so its start time is now the end time we want
                    for j in xrange(len(lines)):
                        new_sub = pysrt.SubRipItem(0,
                                                   start=str(subs[i].start),
                                                   end=str(dummy.start),
                                                   text=lines[j])
                        new_sub.shift(milliseconds=+(j * interval_milli))
                        subs.append(new_sub)
                    del subs[i]
            subs.clean_indexes()
예제 #22
0
    def process(self):
        """Re-time the sentences of an SRT file from word-level alignment data.

        Reads the alignment JSON at ``self.jsonfile`` (keys ``transcript`` and
        ``words``) and the SRT file at ``self.srtfile``, and writes a new SRT
        file to ``self.outfile``. The transcript is split into sentences on
        newlines and must contain exactly as many sentences as the SRT file
        has cues.

        For each sentence the start of its first successfully aligned word and
        the end of its last successfully aligned word become the cue times.
        If no word of the sentence was aligned, the times of the matching cue
        of the input SRT file are kept.

        Raises:
            AssertionError: if the sentence count differs from the cue count,
                or if a word of the transcript does not match the next word of
                the alignment data.
        """
        gentle_file = self.jsonfile
        out_file = self.outfile
        srt_file = self.srtfile

        # Close the JSON file deterministically instead of leaking the handle.
        with open(gentle_file, 'r') as json_handle:
            g = json.load(json_handle)
        t = g['transcript']
        g_words = [
            w for w in g['words'] if w['case'] != 'not-found-in-transcript'
        ]

        sentences = t.split('\n')
        sentences = [sent.replace('-', ' ') for sent in sentences]

        inputsrt_elems = pysrt.open(srt_file)
        assert (len(inputsrt_elems) == len(sentences))

        # Compiled once: a run of word characters, optionally joined by
        # apostrophes.
        word_pattern = re.compile(r'(\w|\’\w|\'\w)+', re.UNICODE)

        srt_elems = pysrt.SubRipFile()
        # Position in g_words; advances across sentences, so transcript words
        # and alignment words are consumed in lock-step.
        counter = 0
        for sent_i, sent in enumerate(sentences):
            if not isinstance(sent, type(u'')):
                sent = sent.decode('utf-8')

            start_time_found = False
            for cur_word in sent.split():
                for match in word_pattern.finditer(cur_word):
                    clean_word = match.group()
                    gentle_word = g_words[counter]
                    assert (
                        clean_word.lower() == gentle_word['word'].lower())

                    if gentle_word['case'] == 'success':
                        # Retain the first valid time boundary.
                        if not start_time_found:
                            start_time = gentle_word['start']
                            start_time_found = True

                        # Keep scanning until the last valid time boundary.
                        end_time = gentle_word['end']

                    counter += 1

            if not start_time_found:
                # No aligned word: fall back to the original cue times.
                start_time = inputsrt_elems[sent_i].start
                end_time = inputsrt_elems[sent_i].end
            else:
                start_time = extract_time_tuple(start_time)
                end_time = extract_time_tuple(end_time)

            elem = pysrt.SubRipItem()
            elem.index = sent_i + 1
            elem.text = sent
            elem.start = start_time
            elem.end = end_time

            srt_elems.append(elem)

        srt_elems.save(out_file, encoding='utf-8')
                                         features='html.parser')
                    maindiv = soup.findAll("div", {"id": "show"})[0]
                    basicdiv = maindiv.findAll("b")
                    break
                except:
                    # html = driver.find_element_by_tag_name('html')
                    time.sleep(1)
                    driver.refresh()

            submissing = False
            if html.find(
                    "Sorry, there are no subtitle available for this video."
            ) != -1:
                file = pysrt.SubRipFile()
                sub = pysrt.SubRipItem(1,
                                       start='00:00:00,000',
                                       end='00:00:01,000',
                                       text="Sub was not found")
                file.append(sub)
                subtype = "F-"
                file.save("H:\#Everything Else\#Project Ashwini\SRT\\" +
                          channelName + "\\" + subtype + channelName + "-" +
                          str(videolinknum) + ".srt",
                          encoding='utf-8')
                submissing = True
                print("Sub missing")

            elif len(basicdiv) <= 1:
                file = pysrt.SubRipFile()
                sub = pysrt.SubRipItem(1,
                                       start='00:00:00,000',
                                       end='00:00:01,000',