def convert_sup_to_srt(filename, file_info):
    """Write one ``.vtt`` file per language column of a subtitle sheet.

    ``get_gformat_subs(filename)`` supplies ``(headers, group)``: one header
    per language and a sequence of rows. In each row, item 0 is the start
    time, item 1 the end time, and the language texts start at item 3 (the
    column counter starts at 2 and is incremented before first use).

    For every language the cues are saved through pysrt to ``temp.vtt`` and
    then copied, behind a ``WEBVTT`` header line, to
    ``live/subtitles/<basename><tag>.vtt``.

    Side effects:
        * appends ``[shortname, cue_count, new_filename, language]`` to
          ``file_info`` (mutated in place);
        * keeps the largest cue count seen per ``shortname`` in the
          module-level ``prog_dict``;
        * leaves ``temp.vtt`` in the working directory;
        * prints each generated file name.

    The output file is written with plain file I/O rather than a shell
    command, so file names containing spaces or shell metacharacters are
    handled correctly. An unwritable output path now raises ``IOError``
    where the shell command used to fail silently.

    ``AttributeError`` is swallowed on purpose: it is expected when the
    input is not a file of the supported type.
    """
    column = 2
    try:
        (headers, group) = get_gformat_subs(filename)
        shortname = os.path.splitext(os.path.basename(filename))[0]
        for language in headers:
            subs = pysrt.SubRipFile()
            column = column + 1
            tag = language.replace(" ", "_").decode('ascii', 'ignore')
            for line in group:
                # Cells of length 0 or 1 are treated as "no text".
                if len(line[column]) > 1:
                    current_sub = pysrt.SubRipItem()
                    current_sub.start = line[0].replace(',', '.')
                    current_sub.end = line[1].replace(',', '.')
                    current_sub.text = line[column].decode('utf-8')
                    subs.append(current_sub)
            subs.save('temp.vtt')
            new_filename = 'live/subtitles/' + shortname + tag + '.vtt'
            # Same bytes as `echo WEBVTT > f; cat temp.vtt >> f`, without
            # routing the file name through a shell.
            with open(new_filename, 'wb') as vtt_file:
                vtt_file.write(b'WEBVTT\n')
                with open('temp.vtt', 'rb') as body_file:
                    vtt_file.write(body_file.read())
            # Remember the largest cue count seen for this programme.
            if shortname not in prog_dict or prog_dict[shortname] < len(subs):
                prog_dict[shortname] = len(subs)
            file_info.append([shortname, len(subs), new_filename, language])
            print(new_filename)
    except AttributeError:
        # We would expect this to be because we've been handed a file
        # that's outside our type.
        # TODO: we should identify exactly where this error appears for
        # various types of tests
        pass
def make_subtitles(frames_time, frames_annotation, user_id):
    """Build ``<MAIN_DIRECTORY><user_id>/subtitles.srt`` from frame data.

    Args:
        frames_time: sequence of entries whose item 0 is a zero-based cue
            number and item 1 the cue start, written to the ``seconds``
            field of the start time.
        frames_annotation: sequence of entries whose item 1 is the cue text;
            read at the same positions as ``frames_time``.
        user_id: integer formatted with ``%d`` into the output path.

    Each cue ends where the next one starts. The last cue has no successor,
    so its end time is left at pysrt's default (zero), as before.

    An empty ``frames_time`` now produces an empty subtitle file instead of
    raising ``IndexError``.
    """
    srt_file = pysrt.SubRipFile(encoding='utf-8')
    length = len(frames_time)
    for i in range(length):
        sub = pysrt.SubRipItem()
        sub.index = frames_time[i][0] + 1
        sub.start.seconds = frames_time[i][1]
        if i + 1 < length:
            sub.end.seconds = frames_time[i + 1][1]
        # NOTE(review): the final cue keeps end == 00:00:00,000 because no
        # duration is available here -- confirm whether callers rely on it.
        sub.text = frames_annotation[i][1]
        srt_file.append(sub)
    srt_file.save(MAIN_DIRECTORY + '%d/subtitles.srt' % user_id)
def srt_formatter(subtitles, show_before=0, show_after=0):
    """
    Render ``subtitles`` as SRT text.

    subtitles - iterable of ((start, end), text) pairs
    show_before - amount subtracted from every start (clamped at 0)
    show_after - amount added to every end

    The start/end values are written to the ``seconds`` field of each cue's
    times. Cues are numbered from 1 and joined with a newline.
    """
    srt = pysrt.SubRipFile()
    for number, ((begin, finish), caption) in enumerate(subtitles, start=1):
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = six.text_type(caption)
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        srt.append(cue)

    rendered = [six.text_type(cue) for cue in srt]
    return '\n'.join(rendered)
def srt_formatter(subtitles, show_before=0, show_after=0):
    """
    Render ``subtitles`` as SRT text.

    subtitles - iterable of (rng, text) pairs, where rng unpacks to
                (start, end)
    show_before - amount subtracted from every start (clamped at 0)
    show_after - amount added to every end

    Cues are numbered from 1. Previously no index was assigned, so every
    cue carried pysrt's default index of 0.
    """
    f = pysrt.SubRipFile()
    for i, (rng, text) in enumerate(subtitles, 1):
        item = pysrt.SubRipItem()
        item.index = i
        item.text = force_unicode(text)
        start, end = rng
        item.start.seconds = max(0, start - show_before)
        item.end.seconds = end + show_after
        f.append(item)
    return '\n'.join(map(unicode, f))
def srt_formatter(subtitles, show_before=0, show_after=0):
    """
    Render ``subtitles`` as SRT text.

    subtitles - iterable of (rng, text) pairs; rng must unpack to exactly
                three values (start, end, and a third value that is unused)
    show_before - amount subtracted from every start (clamped at 0)
    show_after - amount added to every end

    Cues are numbered from 1 and joined with a newline.
    """
    srt = pysrt.SubRipFile()
    for position, pair in enumerate(subtitles, 1):
        rng, text = pair
        cue = pysrt.SubRipItem()
        cue.index = position
        cue.text = force_unicode(text)
        # Three-way unpack kept so a malformed range still fails loudly.
        begin, finish, _unused = rng
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        srt.append(cue)
    return '\n'.join([six.text_type(cue) for cue in srt])
def criarArquivoSRT(tempodalegenda, preenchimento_antes=0,
                    preenchimento_depois=0):
    """
    Build the SRT text for a list of timed captions.

    tempodalegenda - iterable of ((start, end), text) pairs
    preenchimento_antes - padding subtracted from each start (clamped at 0)
    preenchimento_depois - padding added to each end

    Returns the cues, numbered from 1, joined with a newline.
    """
    srt = pysrt.SubRipFile()
    number = 0
    for (begin, finish), caption in tempodalegenda:
        number += 1
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = six.text_type(caption)
        cue.start.seconds = max(0, begin - preenchimento_antes)
        cue.end.seconds = finish + preenchimento_depois
        srt.append(cue)
    return '\n'.join(map(six.text_type, srt))
def srt_formatter(subtitles, show_before=0, show_after=0):
    """
    Render ``subtitles`` as SRT text.

    subtitles - iterable of (rng, text) pairs; rng[0] is the start and
                rng[1] the end (any further items are ignored)
    show_before - amount subtracted from every start (clamped at 0)
    show_after - amount added to every end

    Cues are numbered from 1; each cue is rendered with ``str`` and the
    results are joined with a newline.
    """
    srt = pysrt.SubRipFile()
    for position, (rng, text) in enumerate(subtitles, 1):
        cue = pysrt.SubRipItem()
        cue.index = position
        cue.text = force_unicode(text)
        begin, finish = rng[0], rng[1]
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        srt.append(cue)
    return '\n'.join(str(cue) for cue in srt)
def xml_to_srt(xml_data):
    """
    xml_data - ET
    Converts XML data received from Google's servers and returns a
    SubRipFile instance.

    Every child element becomes one cue: its text is HTML-unescaped, its
    "start" attribute is the cue start and "start" + "dur" the cue end
    (both parsed as floats and written to the ``seconds`` field).

    An element without text (``child.text`` is None) yields a cue with
    empty text instead of failing inside ``h.unescape``.
    Raises KeyError if a child lacks the "start" or "dur" attribute.
    """
    f = pysrt.SubRipFile()
    for child in xml_data:
        sub = pysrt.SubRipItem()
        # ElementTree reports an empty element's text as None.
        sub.text = h.unescape(child.text or "")
        start = float(child.attrib["start"])
        sub.start.seconds = start
        sub.end.seconds = start + float(child.attrib["dur"])
        f.append(sub)
    return f
def srt_formatter(subtitles, padding_before=0, padding_after=0):
    """
    Serialize a list of subtitles according to the SRT format, with optional
    time padding.

    subtitles - iterable of ((start, end), text) pairs
    padding_before - subtracted from each start (the result is clamped at 0)
    padding_after - added to each end
    """
    def build_cue(number, begin, finish, caption):
        # One numbered cue with the padding applied to its times.
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = six.text_type(caption)
        cue.start.seconds = max(0, begin - padding_before)
        cue.end.seconds = finish + padding_after
        return cue

    srt = pysrt.SubRipFile()
    for number, ((begin, finish), caption) in enumerate(subtitles, start=1):
        srt.append(build_cue(number, begin, finish, caption))

    return '\n'.join(six.text_type(cue) for cue in srt)
def generate(self, subtitles, show_before=0, show_after=0, *args, **kwargs) -> str:
    """Render ``subtitles`` as SRT text.

    Args:
        subtitles: iterable of ``((start, end), text)`` pairs.
        show_before: amount subtracted from every start (clamped at 0).
        show_after: amount added to every end.
        *args, **kwargs: accepted and ignored.

    Returns:
        The cues, numbered from 1, each rendered with ``str`` and joined
        with a newline.
    """
    srt = pysrt.SubRipFile()
    number = 0
    for (begin, finish), caption in subtitles:
        number += 1
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = str(caption)
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        srt.append(cue)
    return '\n'.join([str(cue) for cue in srt])
def write_to_file(self, output_path: Opt[str] = None):
    """Append ``self.subtitles`` to the SRT file at ``output_path`` and save it.

    The path is touched first and then opened with pysrt, so cues already
    present in that file are kept and the new ones are appended after them.
    Each subtitle's ``start_time`` / ``end_time`` is multiplied by 1000
    before being handed to ``SubRipTime.from_ordinal`` (presumably a
    seconds-to-milliseconds conversion -- confirm against the subtitle type).
    New cues are indexed from 0 in iteration order.

    Returns:
        ``output_path`` unchanged.
    """
    util.touch(output_path)
    target = pysrt.open(output_path, encoding='utf-8')

    position = 0
    for entry in self.subtitles:
        begins = pysrt.SubRipTime.from_ordinal(entry.start_time * 1000)
        ends = pysrt.SubRipTime.from_ordinal(entry.end_time * 1000)
        target.append(
            pysrt.SubRipItem(index=position,
                             text=entry.text,
                             start=begins,
                             end=ends))
        position += 1

    target.save(output_path, encoding='utf-8')
    return output_path
def combine_srt(srt_list):
    """
    srt_list - a list of SubRipFiles
    Stacks the texts found at the same position in every SubRipFile, one per
    line, and returns the result as a new SubRipFile. Timing is copied from
    the first file. Returns None when srt_list is None or empty.

    The first file decides how many cues are produced; a shorter file later
    in the list raises IndexError.
    """
    if srt_list is None or not len(srt_list):
        return None

    reference = srt_list[0]
    combined = pysrt.SubRipFile()
    for position in xrange(len(reference)):
        merged = pysrt.SubRipItem()
        # Joining then right-stripping yields the same text as appending
        # "<text>\n" per file and stripping the trailing whitespace.
        merged.text = "\n".join(
            srt[position].text for srt in srt_list).rstrip()
        merged.start = reference[position].start
        merged.end = reference[position].end
        combined.append(merged)
    return combined
def write_transcripts(transcript_filename, transcript, reg):
    """Write ``transcript`` as a UTF-8 SRT file and return True.

    Args:
        transcript_filename: name appended to the module-level
            ``output_filepath`` to form the output path.
        transcript: sequence of cue texts.
        reg: sequence of ``(start, end)`` pairs, matched to ``transcript``
            by position; extra entries on either side are ignored.

    Cues are indexed from 0. Starts are clamped at 0. The transcript and
    every cue are printed while processing.
    """
    print(transcript)
    import six

    srt = pysrt.SubRipFile()
    for i, ((start, end), text) in enumerate(zip(reg, transcript)):
        print(i, start, end, text)
        cue = pysrt.SubRipItem()
        cue.index = i
        cue.text = six.text_type(text)
        cue.start.seconds = max(0, start)
        cue.end.seconds = end
        srt.append(cue)

    rendered = '\n'.join([six.text_type(cue) for cue in srt])
    payload = rendered.encode("utf-8")
    with open(output_filepath + transcript_filename, "wb") as handle:
        handle.write(payload)

    print("+ Successfully Generated Subtitles.")
    return True
def do_subtitles_generation(sub_write_file, filename, chunk_sound_file,
                            start_chunk):
    """Transcribe one audio chunk and append the result to a subtitle file.

    The chunk is cut at the ends of detected non-silent ranges, each piece
    is exported to ``.//splitAudio//chunk<i>.wav``, recognised with
    ``r.recognize_sphinx`` and appended to ``sub_write_file`` as one cue.
    The subtitle file is saved to ``filename + '.srt'`` after every cue.

    Args:
        sub_write_file: subtitle container providing ``append`` and ``save``.
        filename: output path without the ``.srt`` extension.
        chunk_sound_file: sliceable audio object with ``export`` and ``len``.
        start_chunk: offset added to every split position; the sum is
            written to the ``milliseconds`` field of the cue times.

    Returns:
        The last split position within this chunk (``end_chunk``).

    Uses the module-level ``counter`` (cue index, incremented per cue),
    ``threshold`` (minimum distance between two split points), ``r`` and
    ``sr``.
    """
    voices = detect_nonsilent(chunk_sound_file,
                              # minimum silence length given to the detector
                              # (value 50; presumably milliseconds -- confirm)
                              min_silence_len=50,
                              # silence threshold given to the detector
                              # (value -29; presumably dBFS -- confirm)
                              silence_thresh=-29
                              )
    global counter
    print(voices)
    splits = [0]
    i = 0
    # Drop a final voiced range that runs to the very end of the chunk
    # (presumably so it is picked up with the next chunk -- confirm).
    if (len(voices) > 0 and voices[-1][1] == len(chunk_sound_file)):
        del voices[-1]
    # Add a split at the end of a voiced range only when it lies more than
    # `threshold` after the previous split.
    for voice in voices:
        if (voice[1] > splits[i] + threshold):
            i += 1
            splits.append(voice[1])
    # Make sure the end of the last voiced range is always a split point.
    if (len(voices) > 0 and splits[-1] != voices[-1][1]):
        splits.append(voices[-1][1])
    end_chunk = splits[-1]
    print(splits)
    print("Split complete")
    for i in range(len(splits) - 1):
        out_file = ".//splitAudio//chunk{0}.wav".format(i)
        print("exporting", out_file)
        chunk_sound_file[splits[i]:splits[i+1]].export(out_file, format="wav")
        with sr.AudioFile(out_file) as source:
            audio = r.record(source)
        text = r.recognize_sphinx(audio)
        sub = pysrt.SubRipItem()
        sub.index = counter
        counter += 1
        sub.start.milliseconds = start_chunk + splits[i]
        sub.end.milliseconds = start_chunk + splits[i+1]
        sub.text = text
        sub_write_file.append(sub)
        # Saved after every cue, so a later failure keeps earlier results.
        sub_write_file.save(filename + '.srt', encoding='utf-8')
        print(text)
    return end_chunk
def export_to_srt(subtitulation): subtitles = db(db.subtitle.subtitulation_id == subtitulation.id).select( orderby=db.subtitle.starts) import pysrt import StringIO sio = StringIO.StringIO() mysrt = pysrt.SubRipFile(encoding=ENCODING) for i, subtitle in enumerate(subtitles): sri = pysrt.SubRipItem() if isinstance(subtitle.body, unicode): print "is unicode" sri.text = subtitle.body else: print "is not unicode" sri.text = unicode(subtitle.body, ENCODING) sri.start = pysrt.SubRipTime.from_time(subtitle.starts) sri.end = pysrt.SubRipTime.from_time(subtitle.ends) sri.index = i mysrt.append(sri) mysrt.write_into(sio) sio.seek(0) return sio
def merge_subtitles(sub1, sub2):
    """
    Merge two cues into a new SubRipItem.

    The result takes the smaller index, the earlier start and the later end
    of the two cues. Only the text of sub1 is kept. When the starts (or the
    ends) compare equal, the value from sub2 is used.
    """
    merged_index = min(sub1.index, sub2.index)

    # Earlier of the two start times.
    first_begins_earlier = (str_to_time_obj(str(sub1.start)) <
                            str_to_time_obj(str(sub2.start)))
    merged_start = str(sub1.start) if first_begins_earlier else str(sub2.start)

    # Later of the two end times.
    first_ends_later = (str_to_time_obj(str(sub1.end)) >
                        str_to_time_obj(str(sub2.end)))
    merged_end = str(sub1.end) if first_ends_later else str(sub2.end)

    return pysrt.SubRipItem(merged_index,
                            start=merged_start,
                            end=merged_end,
                            text=str(sub1.text))
def merge_sub(sub1, sub2, bar, driver):
    """Merge two subtitle tracks into one dual-language SubRipFile.

    Both tracks are converted to data frames by ``dataframe_sub`` (tagged
    "en" and "ru"), concatenated, and clustered on their ``start`` / ``end``
    columns with agglomerative clustering. Each cluster becomes one cue whose
    text joins the English and Russian parts.

    Args:
        sub1: first track, passed to ``dataframe_sub`` with tag "en".
        sub2: second track, passed to ``dataframe_sub`` with tag "ru".
        bar: progress budget; divided evenly over the clusters and added to
            ``progressBar['value']``.
        driver: forwarded unchanged to ``translate_sub``.

    Returns:
        The merged SubRipFile after ``clean_indexes()``.

    Reads the module-level GUI objects ``space_var``, ``clusters_auto_var``,
    ``clusters_manual_entry``, ``clusters_method_combobox``,
    ``translate_var``, ``root`` and ``progressBar``.
    """
    # Separator placed between the two language texts of a cue.
    if space_var.get() == 1:
        space_sub = '\n \n'
    else:
        space_sub = '\n'
    sub1_df = dataframe_sub(sub1, "en")
    sub2_df = dataframe_sub(sub2, "ru")
    df = pd.concat([sub1_df, sub2_df], axis=0)
    # NOTE(review): 'sum' and 'plus' are not read again in this function.
    df['sum'] = df[['start', 'end']].sum(axis=1)
    df['plus'] = (df['start'] + df['end']) / 2
    df = df.sort_values(by='start', ascending=True)
    # agglomerative clustering
    if clusters_auto_var.get() == 1:
        clusters_list = []
        # quality assessment using the silhouette score
        silhouette = []
        # Try 20 distance thresholds between 2000 and 10000 and remember the
        # cluster count and silhouette score of each.
        for i in np.linspace(0.2, 1, 20):
            root.update()
            threshold = float(i) * 10000
            clustering = AgglomerativeClustering(
                n_clusters=None,
                distance_threshold=threshold).fit(df[['start', 'end']])
            clusters = clustering.labels_
            clusters_list.append(len(pd.unique(clusters)))
            score = silhouette_score(df[['start', 'end']], clusters)
            silhouette.append(score)
        # Re-fit with the cluster count that scored best.
        max_silhouette = np.argmax(silhouette)
        clustering = AgglomerativeClustering(
            n_clusters=clusters_list[max_silhouette]).fit(df[['start', 'end']])
    else:
        # Manual mode: threshold and linkage come from the GUI widgets.
        threshold = float(clusters_manual_entry.get()) * 10000
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=threshold,
            linkage=clusters_method_combobox.get()).fit(df[['start', 'end']])
    clusters = clustering.labels_
    # add the found clusters
    df['cluster'] = clusters
    bar_subs = float(bar) / float(len(pd.unique(clusters)))
    # create the new subtitle file
    double_sub = pysrt.SubRipFile(encoding='utf-8')
    translate_list = pysrt.SubRipFile(encoding='utf-8')
    for n, i in enumerate(pd.unique(clusters)):
        root.update()
        progressBar['value'] += bar_subs
        df_en = df[(df['language'] == 'en') & (df['cluster'] == i)]
        df_ru = df[(df['language'] == 'ru') & (df['cluster'] == i)]
        # Collapse each language's rows of this cluster into one row.
        df_group_en = df_en.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        df_group_ru = df_ru.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        # Outer-merge the two rows, then join the texts with the separator
        # and concatenate the language tags.
        df_group = df_group_en.merge(
            df_group_ru,
            on=['cluster', 'text', 'start', 'end', 'language'],
            how='outer').groupby('cluster').agg({
                'text': space_sub.join,
                'start': 'first',
                'end': 'first',
                'language': ''.join
            })
        # start/end are passed to pysrt as plain ints (presumably
        # milliseconds -- confirm against dataframe_sub).
        sub = pysrt.SubRipItem(index=n + 1,
                               start=int(df_group.iloc[0]['start']),
                               end=int(df_group.iloc[0]['end']),
                               text=str(df_group.iloc[0]['text']))
        double_sub.append(sub)
        # A joined language tag of exactly 'en' means the cluster has no
        # Russian text; such cues are queued for translation.
        if translate_var.get() == 1 and df_group['language'].values == 'en':
            translate_list.append(sub)
    if translate_var.get() == 1 and translate_list:
        translate_sub(translate_list, bar, driver)
    # re-index the subtitles
    double_sub.clean_indexes()
    return double_sub
### CASO 4: the previus added line ends in a punctuation mark, no combination needed. Extra SRT lines should be created if prevText =='' or prevText[-1] in END_PUNCTUATION : prevTimeStart = line.start prevTimeEnd = line.start + {'seconds':ratioTime * duration} for newSent in range(1,numSentences): #Updating text and end from current line #print(count, " to srt2 ", lineAux[:firstPunctuationIdx+1]) print(count, " to srt2 ", arrStrings[newSent-1]) newIndex = lastAdded+1 #newLine = pysrt.SubRipItem(index=newIndex, start=prevEnd , end=prevEnd, text=lineAux[:firstPunctuationIdx+1]) newLine = pysrt.SubRipItem(index=newIndex, start= prevTimeStart, end=prevTimeEnd, text=arrStrings[newSent-1]) srtTransformed.append(newLine) newIndex+=1 #print(lineAux,"|", lineAux[firstPunctuationIdx+1:]) srtOriginal[count].text = arrStrings[newSent] srtOriginal[count].start = prevTimeEnd numFirstWords = len(nltk.word_tokenize(arrStrings[newSent])) duration = (line.end - srtOriginal[count].start).seconds#.to_time().to_seconds() ratioTime = float(numFirstWords)/float(numTotalWords) divtime = srtOriginal[count].start + {'seconds':ratioTime * duration} srtOriginal[count].end = divtime #prevTimeStart
def find_summary_regions(srt_filename, summarizer, duration, language,
                         bonusWords, stigmaWords, videonamepart):
    """Pick the subtitle regions that make up a summary of about ``duration``.

    The sentence count is first estimated as ``duration`` divided by the
    average cue duration, then adjusted one sentence at a time until the
    summary's total time crosses ``duration`` (or no further progress is
    possible). The selected cues are re-timed back to back from zero and
    written to ``videonamepart + ".srt"``.

    Args:
        srt_filename: path of the source SRT file.
        summarizer, language, bonusWords, stigmaWords: forwarded unchanged
            to ``summarize``.
        duration: target total duration of the summary.
        videonamepart: output path without the ``.srt`` extension.

    Returns:
        ``summary`` as returned by the last ``summarize`` call: a sequence
        of regions whose items 0 and 1 are the start and end.

    Raises:
        ZeroDivisionError: if the SRT file has no cues, or the average cue
            duration is zero.
    """
    srt_file = pysrt.open(srt_filename)

    # Average time each subtitle is shown.
    clipList = list(map(srt_item_to_range, srt_file))
    avg_subtitle_duration = total_duration_of_regions(clipList) / len(srt_file)

    # Initial estimate of how many sentences the summary needs.
    n_sentences = duration / avg_subtitle_duration
    print("nsentance : " + str(n_sentences))

    summary, summarizedSubtitles = summarize(srt_file, summarizer,
                                             n_sentences, language,
                                             bonusWords, stigmaWords)
    total_time = total_duration_of_regions(summary)
    print("total_time : " + str(total_time))

    if total_time < duration:
        # Summary is too short: add one sentence at a time until it reaches
        # the target, or stop when an extra sentence no longer adds time.
        prev_total_time = -1
        while total_time < duration:
            if prev_total_time == total_time:
                print("1 : Maximum summarization time reached")
                break
            print("1 : total_time : duration " + str(total_time) + " " +
                  str(duration))
            n_sentences += 1
            summary, summarizedSubtitles = summarize(srt_file, summarizer,
                                                     n_sentences, language,
                                                     bonusWords, stigmaWords)
            prev_total_time = total_time
            total_time = total_duration_of_regions(summary)
    else:
        # Summary is too long: remove one sentence at a time until it fits,
        # never going below two sentences.
        while total_time > duration:
            if n_sentences <= 2:
                print("2 : Minimum summarization time reached")
                break
            print("2 : total_time : duration " + str(total_time) +
                  str(duration))
            n_sentences -= 1
            summary, summarizedSubtitles = summarize(srt_file, summarizer,
                                                     n_sentences, language,
                                                     bonusWords, stigmaWords)
            total_time = total_duration_of_regions(summary)

    print("************ THis is summary array *********")
    print(summary)
    print("**********************************")
    print(
        "************************THis is summarizedSubtitles array *******************"
    )
    print(summarizedSubtitles)
    print("**********************************************************")

    # Re-time the selected cues back to back: each starts where the previous
    # one ended and keeps its original length.
    starting = 0
    sub_rip_file = pysrt.SubRipFile()
    for index, item in enumerate(summarizedSubtitles):
        newSubitem = pysrt.SubRipItem()
        newSubitem.index = index
        newSubitem.text = item.text
        # Separate name so the `duration` parameter is not overwritten.
        clip_duration = summary[index][1] - summary[index][0]
        ending = starting + clip_duration
        newSubitem.start.seconds = starting
        newSubitem.end.seconds = ending
        sub_rip_file.append(newSubitem)
        starting = ending
    print(sub_rip_file)

    path = videonamepart + ".srt"
    # The context manager closes the file; no explicit close() is needed.
    with open(path, "w+") as sf:
        for cue in sub_rip_file:
            sf.write(str(cue))
            sf.write("\n")

    return summary
# Build the tuple of name prefixes that mark a chat message as a translation.
# For every configured name a variant with " :" instead of ":" is added. The
# loop walks the original tuple while name_list is rebound to a longer one.
name_list = vtuber_list.vtuber_tl_list
for name in name_list:
    name_list = name_list + (name.replace(':', ' :'),)

# Add the single-letter tags "[a]" .. "[z]" to the name list.
for letter in range(ord('a'), ord('z') + 1):
    name_list = name_list + ('[' + chr(letter) + ']',)

# Read the chat log named by the first command-line argument (CSV with at
# least the columns 'message' and 'time_in_seconds') and turn every matching
# message into one subtitle.
with open(sys.argv[1]) as f:
    records = csv.DictReader(f)
    for row in records:
        msg = row['message']
        msg_lower = row['message'].lower()
        for tag in lang_dict[args.lang.lower()]:
            # A message matches when it starts with a language tag, or -- for
            # English with the colon option on -- when it starts with one of
            # the name prefixes and contains fewer than two colons.
            if msg_lower.startswith(tag) or (msg_lower.startswith(name_list) and args.lang.lower() == 'en' and args.colon and msg_lower.count(':') < 2):
                sub = pysrt.SubRipItem()
                sub.index = index
                sub_start = int(row['time_in_seconds']) - args.offset
                sub.start.seconds = sub_start
                sub.end.seconds = sub_start + args.duration
                # Remove the tag in its given, upper-case and title-case
                # spellings, anywhere in the message.
                sub.text = msg.replace(tag, '').replace(tag.upper(), '').replace(tag.title(), '')
                if sub.text.startswith(": "):
                    sub.text = sub.text.replace(": ", "", 1)
                sub.text = sub.text.strip()
                sub_file.append(sub)
                index += 1
                sub_count += 1
                # One subtitle per message: stop at the first matching tag.
                break

if not sub_count:
    print("No subtitles found")
del subs[i] if '\n' in subs[i].text: # Split the subtitle at the hyphen and format the list lines = [ line[1:] if line[0] == '-' else line for line in subs[i].text.split('\n') ] length_milli = 1000 * abs( float( (subs[i].end.seconds - subs[i].start.seconds - 60) % 60)) + float(subs[i].end.milliseconds - subs[i].start.milliseconds) interval_milli = int(length_milli / len(lines)) dummy = pysrt.SubRipItem( 0, start=str(subs[i].start), end=str(subs[i].end), text="" ) # Use this just to get the right formatting for the time dummy.shift( milliseconds=+interval_milli ) # Shift the dummy so its start time is now the end time we want for j in xrange(len(lines)): new_sub = pysrt.SubRipItem(0, start=str(subs[i].start), end=str(dummy.start), text=lines[j]) new_sub.shift(milliseconds=+(j * interval_milli)) subs.append(new_sub) del subs[i] subs.clean_indexes()
def process(self):
    """Re-time an SRT file from a forced-alignment JSON and save the result.

    Reads the alignment JSON at ``self.jsonfile`` (keys ``transcript`` and
    ``words``) and the SRT file at ``self.srtfile``, and writes a new SRT to
    ``self.outfile`` with one cue per transcript line.

    For each line, the start of the first successfully aligned word and the
    end of the last one become the cue times, converted with
    ``extract_time_tuple``. A line with no successfully aligned word keeps
    the times of the matching input cue.

    Raises:
        AssertionError: if the number of transcript lines differs from the
            number of input cues, or a transcript word does not match the
            next aligned word (case-insensitively).
    """
    gentle_file = self.jsonfile
    out_file = self.outfile
    srt_file = self.srtfile

    # Close the JSON file deterministically instead of leaking the handle.
    with open(gentle_file, 'r') as json_handle:
        g = json.load(json_handle)
    t = g['transcript']
    g_words = [
        w for w in g['words'] if w['case'] != 'not-found-in-transcript'
    ]
    sentences = t.split('\n')
    sentences = [sent.replace('-', ' ') for sent in sentences]
    inputsrt_elems = pysrt.open(srt_file)
    assert (len(inputsrt_elems) == len(sentences))

    # Word pattern compiled once instead of on every use.
    word_pattern = re.compile(r'(\w|\’\w|\'\w)+', re.UNICODE)

    srt_elems = pysrt.SubRipFile()
    counter = 0  # position of the next unconsumed entry in g_words
    for sent_i, sent in enumerate(sentences):
        if not isinstance(sent, type(u'')):
            sent = sent.decode('utf-8')
        start_time_found = False
        for cur_word in sent.split():
            # Tokens without any word match yield no iterations here.
            for w in word_pattern.finditer(cur_word):
                word = w.group()
                gentle_word = g_words[counter]
                clean_word = word_pattern.search(word).group()
                assert (clean_word.lower() == gentle_word['word'].lower())
                if gentle_word['case'] == 'success':
                    # Retain the first valid time boundary.
                    if not start_time_found:
                        start_time = gentle_word['start']
                        start_time_found = True
                    # Keep scanning until the last valid time boundary.
                    end_time = gentle_word['end']
                counter += 1

        if not start_time_found:
            # No aligned word in this line: reuse the input cue's timing.
            start_time = inputsrt_elems[sent_i].start
            end_time = inputsrt_elems[sent_i].end
        else:
            start_time = extract_time_tuple(start_time)
            end_time = extract_time_tuple(end_time)

        elem = pysrt.SubRipItem()
        elem.index = sent_i + 1
        elem.text = sent
        elem.start = start_time
        elem.end = end_time
        srt_elems.append(elem)

    srt_elems.save(out_file, encoding='utf-8')
features='html.parser') maindiv = soup.findAll("div", {"id": "show"})[0] basicdiv = maindiv.findAll("b") break except: # html = driver.find_element_by_tag_name('html') time.sleep(1) driver.refresh() submissing = False if html.find( "Sorry, there are no subtitle available for this video." ) != -1: file = pysrt.SubRipFile() sub = pysrt.SubRipItem(1, start='00:00:00,000', end='00:00:01,000', text="Sub was not found") file.append(sub) subtype = "F-" file.save("H:\#Everything Else\#Project Ashwini\SRT\\" + channelName + "\\" + subtype + channelName + "-" + str(videolinknum) + ".srt", encoding='utf-8') submissing = True print("Sub missing") elif len(basicdiv) <= 1: file = pysrt.SubRipFile() sub = pysrt.SubRipItem(1, start='00:00:00,000', end='00:00:01,000',