def json_to_srt(deepspeech_json, max_word_time=10, min_sub_time=1.5, max_sub_time=3):
    """Convert a DeepSpeech word-level transcript into a SubRipFile.

    Words are greedily grouped into subtitles spanning at most
    *max_sub_time* seconds from the subtitle's start; each subtitle is
    shown for at least *min_sub_time* seconds.  Words whose duration is
    *max_word_time* seconds or more are treated as noise and skipped.

    :param deepspeech_json: dict with a "words" list; each word dict has
        "word", "start_time" and "duration" (all times in seconds).
    :param max_word_time: skip words lasting at least this many seconds.
    :param min_sub_time: minimum display time of one subtitle (seconds).
    :param max_sub_time: maximum span of one subtitle (seconds).
    :return: a pysrt ``SubRipFile`` with 1-based sequential indexes.
    """
    index = 0
    subtitle = ""
    start_time = 0
    end_time = 0
    subtitles = SubRipFile()
    for word in deepspeech_json["words"]:
        word["end_time"] = word["start_time"] + word["duration"]
        if word["duration"] < max_word_time:
            if start_time + max_sub_time >= word["end_time"] and subtitle:
                # Word still fits inside the current subtitle window.
                subtitle += " "
                subtitle += word["word"]
                end_time = max(word["end_time"], start_time + min_sub_time)
            elif subtitle:
                # Window exceeded: flush the current subtitle.
                # Convert to milliseconds for pysrt.
                # BUG FIX: the original wrote ``index=++index`` — in Python
                # that is a double unary plus, not an increment, so every
                # item was appended with index 0.
                index += 1
                subtitles.append(
                    SubRipItem(index=index, start=int(start_time * 1000),
                               end=int(end_time * 1000), text=subtitle))
                subtitle = ""
            if not subtitle:
                # Start a new subtitle at this word.
                start_time = word["start_time"]
                subtitle += word["word"]
                end_time = max(word["end_time"], start_time + min_sub_time)
    if subtitle:
        # Flush the trailing, still-open subtitle.
        index += 1
        subtitles.append(SubRipItem(index=index, start=int(start_time * 1000),
                                    end=int(end_time * 1000), text=subtitle))
    return subtitles
def process(self, subs: SubRipFile, items: List[PgsSubtitleItem], post_process, confidence: int, max_width: int):
    """OCR *items* rendered into one composite image and append the
    recognized lines to *subs* as SubRipItems.

    Items whose OCR result is not accepted at *confidence* are returned
    so the caller can retry them (e.g. at a lower threshold).
    """
    composite = FullImage.from_items(items, self.gap, max_width)
    tess_kwargs = {'output_type': tess.Output.DICT, 'config': '--psm 11'}
    if self.pgs.language:
        tess_kwargs['lang'] = self.pgs.language.alpha3
    if self.omp_thread_limit:
        # Cap tesseract's OpenMP parallelism via its environment knob.
        os.environ['OMP_THREAD_LIMIT'] = str(self.omp_thread_limit)
    # cv2.imwrite(f'{subs.path}-{len(items)}-{confidence}.png', composite.data)
    tsv = TsvData(tess.image_to_data(composite.data, **tess_kwargs))
    rejected = []
    for item in items:
        if self.accept(tsv, item, confidence) is None:
            rejected.append(item)
            continue
        text = item.text
        if post_process:
            text = post_process(text)
        if text:
            subs.append(SubRipItem(0, item.start, item.end, text))
    return rejected
def to_srt(df, filename):
    """Write the rows of *df* (columns: begin, end, text) to *filename*
    as a UTF-8 SubRip subtitle file."""
    srt = SubRipFile(encoding='utf-8')
    for _, row in df.iterrows():
        srt.append(
            SubRipItem(0, convert_time(row['begin']),
                       convert_time(row['end']), row['text']))
    srt.save(filename)
def get_captions(client_name, clip_id):
    # Fetch the Granicus JSON caption feed for one clip, save it locally as
    # an .srt file, push that file to S3, and return (captions, s3_url).
    # Returns None on a bad HTTP status line, ([], '') on non-200 status.
    # Python 2 code: uses `print` statements and `unicode`.
    h = httplib2.Http()
    g_url = 'http://%s/JSON.php?clip_id=%s' % (
        client_name, clip_id)
    print "Fetching URL: %s" % g_url
    try:
        response, j = h.request(g_url)
    except httplib.BadStatusLine as exception:
        # Server answered with a malformed status line; give up on this clip.
        return None
    dirname = os.getcwd() + "/data/granicus/srt/%s/" % client_name
    filename = dirname + "%s.srt" % clip_id
    subs = SubRipFile()
    if response.get('status') == '200':
        captions = []
        try:
            j = json.loads(j, strict=False)[0]
        except ValueError:
            # Feed is not valid JSON: quote the bare keys
            # (e.g. `{ time: "` -> `{ "time": "`), strip backslashes, retry.
            ts = re.sub('([{,]\s+)([a-z]+)(: ")',
                lambda s: '%s"%s"%s' % (s.groups()[0], s.groups()[1], s.groups()[2]),
                j).replace("\\", "")
            try:
                j = json.loads(ts, strict=False)[0]
            except UnicodeDecodeError:
                ts = unicode(ts, errors='ignore')
                j = json.loads(ts, strict=False)[0]
            except:
                # NOTE(review): bare except leaves j = False, and the
                # `for item in j` below would then raise TypeError.
                j = False
        sub_count = 0
        for item in j:
            if item["type"] == "text":
                cap = item["text"]
                # Caption offset in seconds, rounded to milliseconds.
                offset = round(float(item["time"]), 3)
                captions.append({'time': offset, 'text': cap})
                # End time comes from the next caption's offset (helper).
                end = get_cap_end(j, sub_count)
                if end:
                    subtitle = SubRipItem(index=sub_count,
                        start=SubRipTime(seconds=offset),
                        end=SubRipTime(seconds=end), text=cap)
                    subs.append(subtitle)
            sub_count = sub_count + 1
        try:
            subs.save(path=filename, encoding="utf-8")
        except IOError:
            # Destination directory missing; create it and retry the save.
            p = subprocess.Popen('mkdir -p %s' % dirname, shell=True,
                stdout=subprocess.PIPE)
            t = p.wait()
            subs.save(path=filename, encoding="utf-8")
        s3_url = push_to_s3(filename, '%s/%s.srt' % (client_name, clip_id))
        return (captions, s3_url)
    else:
        return ([], '')
def get_captions(client_name, clip_id):
    # Variant of get_captions without the BadStatusLine guard: fetch the
    # Granicus JSON caption feed, save it as .srt, push to S3 and return
    # (captions, s3_url); ([], '') when the HTTP status is not 200.
    # Python 2 code: uses `print` statements and `unicode`.
    h = httplib2.Http()
    g_url = 'http://%s/JSON.php?clip_id=%s' % (
        client_name, clip_id)
    print "Fetching URL: %s" % g_url
    response, j = h.request(g_url)
    dirname = os.getcwd() + "/data/granicus/srt/%s/" % client_name
    filename = dirname + "%s.srt" % clip_id
    subs = SubRipFile()
    if response.get('status') == '200':
        captions = []
        try:
            j = json.loads(j, strict=False)[0]
        except ValueError:
            # Feed is not valid JSON: quote the bare keys and strip
            # backslashes, then parse again.
            ts = re.sub('([{,]\s+)([a-z]+)(: ")',
                lambda s: '%s"%s"%s' % (s.groups()[0], s.groups()[1], s.groups()[2]),
                j).replace("\\", "")
            try:
                j = json.loads(ts, strict=False)[0]
            except UnicodeDecodeError:
                ts = unicode(ts, errors='ignore')
                j = json.loads(ts, strict=False)[0]
            except:
                # NOTE(review): bare except leaves j = False; iterating it
                # below would raise TypeError.
                j = False
        sub_count = 0
        for item in j:
            if item["type"] == "text":
                cap = item["text"]
                offset = round(float(item["time"]), 3)
                captions.append({'time': offset, 'text': cap})
                end = get_cap_end(j, sub_count)
                if end:
                    subtitle = SubRipItem(index=sub_count,
                        start=SubRipTime(seconds=offset),
                        end=SubRipTime(seconds=end), text=cap)
                    subs.append(subtitle)
            sub_count = sub_count + 1
        try:
            subs.save(path=filename, encoding="utf-8")
        except IOError:
            # Destination directory missing; create it and retry the save.
            p = subprocess.Popen('mkdir -p %s' % dirname, shell=True,
                stdout=subprocess.PIPE)
            t = p.wait()
            subs.save(path=filename, encoding="utf-8")
        s3_url = push_to_s3(filename, '%s/%s.srt' % (client_name, clip_id))
        return (captions, s3_url)
    else:
        return ([], '')
def save(self, path):
    """Persist the elements to *path*.

    Paths ending in 'srt' are written as a SubRip subtitle file (requires
    the optional pysrt dependency); any other path gets a tab-separated
    table with an onset/text/duration header row.
    """
    if not path.endswith('srt'):
        # Plain TSV fallback: header row, then one row per element.
        with open(path, 'w') as handle:
            handle.write('onset\ttext\tduration\n')
            for entry in self._elements:
                handle.write('{}\t{}\t{}\n'.format(
                    entry.onset, entry.text, entry.duration))
        return
    verify_dependencies(['pysrt'])
    from pysrt import SubRipFile, SubRipItem
    from datetime import time
    srt_out = SubRipFile()
    for entry in self._elements:
        begin = time(*self._to_tup(entry.onset))
        finish = time(*self._to_tup(entry.onset + entry.duration))
        srt_out.append(SubRipItem(0, begin, finish, entry.text))
    srt_out.save(path)
def merge_subtitle(sub_a, sub_b, delta, encoding='utf-8'):
    """Merge two SRT tracks in different languages into one SubRipFile.

    Because the two tracks have independent timelines, every start/end
    time from either track becomes a cut point, and a new item is emitted
    whenever either track changes — so the bilingual lines do not always
    switch simultaneously; that is unavoidable with this approach.
    See https://github.com/byroot/pysrt/issues/17 and
    https://github.com/byroot/pysrt/issues/15 for background.

    :param sub_a: first track, e.g. SubRipFile.open(sub_a_path, encoding=encoding)
    :param sub_b: second track
    :param delta: minimum span length for an item to be emitted
    :return: the merged SubRipFile
    """
    merged = SubRipFile()
    boundaries = sorted(
        [s.start.ordinal for s in sub_a] + [s.end.ordinal for s in sub_a] +
        [s.start.ordinal for s in sub_b] + [s.end.ordinal for s in sub_b])
    cursor_a = cursor_b = 0
    # Walk consecutive boundary pairs; each wide-enough gap becomes an item.
    for lo, hi in zip(boundaries, boundaries[1:]):
        span_start = SubRipTime.from_ordinal(lo)
        span_end = SubRipTime.from_ordinal(hi)
        if (span_end - span_start) > delta:
            line_a, cursor_a = find_subtitle(sub_a, span_start, span_end, cursor_a)
            line_b, cursor_b = find_subtitle(sub_b, span_start, span_end, cursor_b)
            joined = join_lines(line_a, line_b)
            if len(joined) > 0:
                merged.append(SubRipItem(0, span_start, span_end, joined))
    merged.clean_indexes()
    return merged
def merge_subtitle(sub_a, sub_b, delta):
    """Interleave two subtitle tracks into a single SubRipFile.

    Cut points are all start/end ordinals from both tracks; each span
    longer than *delta* gets the joined text of whatever items from
    either track overlap it.
    """
    result = SubRipFile()
    cuts = []
    for track in (sub_a, sub_b):
        for sub in track:
            cuts.append(sub.start.ordinal)
            cuts.append(sub.end.ordinal)
    cuts.sort()
    pos_a = pos_b = 0
    idx = 1
    while idx < len(cuts):
        span_start = SubRipTime.from_ordinal(cuts[idx - 1])
        span_end = SubRipTime.from_ordinal(cuts[idx])
        if (span_end - span_start) > delta:
            line_a, pos_a = find_subtitle(sub_a, span_start, span_end, pos_a)
            line_b, pos_b = find_subtitle(sub_b, span_start, span_end, pos_b)
            merged_text = join_lines(line_a, line_b)
            if len(merged_text) > 0:
                result.append(SubRipItem(0, span_start, span_end, merged_text))
        idx += 1
    result.clean_indexes()
    return result
def merge_subtitle(sub_a, sub_b, delta):
    """Merge two subtitle tracks into one SubRipFile.

    All start/end times from both tracks are collected as cut points;
    every resulting span longer than *delta* receives the joined text of
    the items from each track that overlap it.
    """
    merged = SubRipFile()
    points = sorted(
        [it.start.ordinal for it in sub_a]
        + [it.end.ordinal for it in sub_a]
        + [it.start.ordinal for it in sub_b]
        + [it.end.ordinal for it in sub_b])
    cursor_a = cursor_b = 0
    # Consecutive cut-point pairs define the candidate spans.
    for lo, hi in zip(points, points[1:]):
        span_start = SubRipTime.from_ordinal(lo)
        span_end = SubRipTime.from_ordinal(hi)
        if (span_end - span_start) > delta:
            line_a, cursor_a = find_subtitle(sub_a, span_start, span_end, cursor_a)
            line_b, cursor_b = find_subtitle(sub_b, span_start, span_end, cursor_b)
            joined = join_lines(line_a, line_b)
            if len(joined) > 0:
                merged.append(SubRipItem(0, span_start, span_end, joined))
    merged.clean_indexes()
    return merged
# (Continuation of an irc_bot constructor call whose opening line is outside
# this view.)
chat_channel, chat_server[0], chat_server[1], twitchclient_version=twitchclient_version)
outsrt = SubRipFile()
text = ''
# Poll the IRC bot forever; each new batch of messages closes the previous
# subtitle (timed from `start` to now) and opens a new one.
while 1:
    raw_msg_list = bot.get_message()
    if len(raw_msg_list) > 0:
        if len(text) > 0:
            # Flush the chat collected since `start` as one subtitle item.
            end = SubRipTime.from_time(datetime.now())
            item = SubRipItem(0, start, end, text)
            outsrt.append(item)
        start = SubRipTime.from_time(datetime.now())
        text = ''
        timestamp = get_timestamp(timestamp_format)
        for item in raw_msg_list:
            if record_raw:
                log_add(raw_log_path, timestamp + ' ' + item + '\n')
            username, message = irc_bot.parse_user(item)
            if username != '':
                safe_print(chat_channel + " " + username + ": " + message)
                log_add(log_path, timestamp + ' ' + username + ': ' + message + '\n')
                text += username + ": " + message + '\n'
# NOTE(review): the `while 1` loop above has no break, so the save below is
# only reachable via an exception path — confirm intended shutdown behavior.
outsrt.clean_indexes()
outsrt.save(srt_log_path, encoding='utf-8')
srt = SubRipFile() # get all DisplaySets that contain an image print("Loading DisplaySets...") allsets = [ds for ds in tqdm(pgs.iter_displaysets())] print(f"Running OCR on {len(allsets)} DisplaySets and building SRT file...") subText = "" subStart = 0 subIndex = 0 for ds in tqdm(allsets): if ds.has_image: # get Palette Display Segment pds = ds.pds[0] # get Object Display Segment ods = ds.ods[0] # img = make_image(ods, pds) # subText = pytesseract.image_to_string(img) subStart = ods.presentation_timestamp else: startTime = SubRipTime(milliseconds=int(subStart)) endTime = SubRipTime( milliseconds=int(ds.end[0].presentation_timestamp)) srt.append(SubRipItem(subIndex, startTime, endTime, "subText")) subIndex += 1 print(f"Done. SRT file saved as {srtFile}") srt.save(srtFile, encoding='utf-8')
# Convert a comma-separated log ("<time>,message" lines) into SRT on stdout.
# A "- start <time>" line establishes the reference START_TIME; every later
# timestamp becomes an item offset relative to it.  Python 2 (`unicode`).
srt = SubRipFile(eol='\n', encoding='utf-8')
i = 1
for line in sublog:
    line = line.split(",", 1)
    if (line[0] and line[0][0] == '-'):
        # Control line: only the first "- start <time>" (while START_TIME
        # is still unset) is meaningful; all dash lines are skipped.
        if (START_TIME == None and line[0][:8] == '- start '):
            START_TIME = datetime.strptime(line[0], '- start ' + TIMEFORMAT + '\n')
        continue
    no = datetime.strptime(line[0], TIMEFORMAT) - START_TIME
    if (abs(no) > timedelta(1)):
        # Offsets beyond 24 hours cannot be represented in SRT; bail out.
        print("\nCan't go over a day in a subtitle! Delete non-used lines in" + \
            " log.\nLet there only be one '- start' line at the top of" + \
            " the log-file.")
        sys.exit(1)
    # SubRipTime ordinals are milliseconds.
    time = SubRipTime.from_ordinal(no.seconds*1000 + no.microseconds*0.001)
    # Each message is displayed for a fixed 30 seconds (30*1000 ms).
    item = SubRipItem(i, start=time, end=time + 30*1000,
        text=unicode(line[1], 'utf-8'))
    srt.append(item)
    i += 1
srt.clean_indexes()
#srt.save(path=sys.stdout)
for line in srt:
    sys.stdout.write(unicode(line).encode('utf-8'))
def handle_tracks(tracks, start, end, fps, srt_filename):
    """Convert automation tracks into an SRT "command" file.

    For every frame between *start* and *end* (at *fps*), Hue colour
    events become HUEn(h,s,v,transition) commands and DMX events become
    DMX1(...) channel dumps; each command is appended as a SubRipItem and
    the whole file is saved to *srt_filename*.
    """
    global XML_FILENAME, HUE_SAMPLING, DMX_SAMPLING, TRANSITION_TIME, DEBUG, VERBOSE
    track_list = []
    # NOTE(review): only the result of the LAST track survives this loop;
    # earlier handle_track_list() results are overwritten — confirm intended.
    for track in tracks:
        track_list = handle_track_list(track, start, end, fps)
    # print(track_list[3][0])
    # try:
    #     print(len(track_list[3]),len(track_list[3][0]),track_list[3][0][1:10],track_list[3][-1][1:10])
    # except:
    #     pass
    # srt_file = open(srt_filename,"w")
    # dmx_frame holds the current value of all 512 DMX channels.
    dmx_frame = zeros(512)
    prev_dmx_frame = zeros(512)
    # NOTE(review): prev_dmx_valid_frame is never used below.
    prev_dmx_valid_frame = zeros(512)
    subrip_file = SubRipFile(path=srt_filename)
    print(40 * "-")
    print("Processing frames")
    print(40 * "-")
    # print(track_list[3][1])
    # print(len(track_list[1]))
    if len(track_list[1]) > 0:  # If there isn't only an audio track
        # print(track_list[1][0])
        # print(track_list[1][0]!="audio")
        # print(len(track_list[1]) != 1 and track_list[1][0]!="audio")
        if (len(track_list[1]) != 1 or track_list[1][0] != "audio"):
            print("Number of lighting events: ", len(track_list[3][0]))
            frame_no = 0
            for i in range(len(track_list[3][0])):
                # frame_no = track_list[4][i]
                frame_no = i
                # Wall-clock time of this frame in seconds.
                t = i * (1.0 / float(fps))
                if VERBOSE:
                    print(40 * "-")
                    # print(frame_no,fps)
                    print("Frame %s / time %s seconds" % (frame_no, t))
                    print(40 * "-")
                hue_cmd = ""
                dmx_cmd = ""
                # for the bug, len(of track_list[0]) is greater than
                # len(track_list[3])
                for j in range(len(track_list[0])):
                    # print(track_list[1][j])
                    if track_list[1][j] != "audio":
                        name = track_list[0][j]
                        # NOTE(review): `type` shadows the builtin here.
                        type = track_list[1][j]
                        addr = track_list[2][j]
                        # print(name,type,addr)
                        # TODO: if frame_no = i as on line 181, the following line fails!
                        # [3][j] is out of range therefore j is the problem
                        try:
                            payload = track_list[3][j][i]
                        except Exception as e:
                            # NOTE(review): if this fails on the first
                            # iteration, `payload` is unbound below.
                            print(
                                'ERROR: could not get payload, len(of track_list[0]) is likely greater than \
len (track_list[3])')
                        # print(name, type, addr, payload)
                        # Convert Hue payload to hue command
                        if payload != "":
                            if addr[1:4].lower(
                            ) == "hue" and type == "OSCColor/floatarray":
                                if VERBOSE:
                                    print("hue", addr, payload)
                                r, g, b, a = 0, 0, 0, 0
                                try:
                                    payload_list = payload.split(",")
                                    # print(payload_list)
                                    if len(payload_list) == 3:
                                        r, g, b = payload_list
                                    elif len(payload_list) == 4:
                                        r, g, b, a = payload_list
                                except Exception as e:
                                    print(e)
                                # Scale HSV to the Hue API ranges
                                # (h: 0-65535, s/v: 0-254).
                                h, s, v = rgb_to_hsv(float(r), float(g), float(b))
                                h *= 65535.0
                                s *= 254.0
                                v *= 254.0
                                h = int(h)
                                s = int(s)
                                v = int(v)
                                # print("hue", addr, payload, h,s,v)
                                n = int(addr[4:])
                                # print("hue", n, h,s,v)
                                if len(hue_cmd) == 0:
                                    hue_cmd += "HUE%s(%s,%s,%s,%s)" % (
                                        n, h, s, v, TRANSITION_TIME)
                                else:
                                    hue_cmd += ";HUE%s(%s,%s,%s,%s)" % (
                                        n, h, s, v, TRANSITION_TIME)
                            # Convert single DMX channel to command
                            elif addr[1:4].lower(
                            ) == "dmx" and type == "OSCValue/float":
                                if VERBOSE:
                                    print("dmx value", addr, payload)
                                n = int(addr[4:])
                                if payload != "":
                                    dmx_frame[int(n)] = int(
                                        float(payload) * 254)
                            # Convert multiple DMX channels to command
                            elif addr[1:4].lower() == "dmx" and (
                                    type == "OSCColor/floatarray"
                                    or type == "OSCValue/standard"):
                                if VERBOSE:
                                    print("dmx colour", addr, payload)
                                n = int(addr[4:])
                                if payload != "":
                                    payload_list = payload.split(",")
                                    # Consecutive channels starting at n.
                                    for channel in payload_list:
                                        dmx_frame[int(n)] = int(
                                            float(channel) * 254)
                                        n += 1
                # Output HUE commands
                # hue_t = frame_no * (1.0/HUE_SAMPLING)
                if frame_no % fps == 0 and hue_cmd != "":
                    item = SubRipItem(frame_no, text=hue_cmd)
                    item.shift(seconds=t)
                    item.end.shift(seconds=1)
                    if VERBOSE:
                        print(item)
                    else:
                        # Progress marker: one 'h' per Hue command written.
                        print("h", end="")
                        stdout.flush()
                    subrip_file.append(item)
                    frame_no += 1
                # Output DMX command
                # Drop trailing zero channels to keep the command short.
                dmx_frame_trimmed = trim_zeros(dmx_frame, 'b').astype('uint8')
                # print("dmx_frame_trimmed before",dmx_frame_trimmed)
                # if len(dmx_frame_trimmed)==0:
                #     dmx_frame_trimmed = zeros(512)
                # print("dmx_frame_trimmed after",dmx_frame_trimmed)
                dmx_cmd = "DMX1" + str(tuple(dmx_frame_trimmed)[1:]).replace(
                    " ", "")
                if VERBOSE:
                    print('dmx_cmd to be written: ', dmx_cmd)
                # cmd = hue_cmd + ";" + dmx_cmd
                # Write a DMX item when the frame changed, or once per second.
                if (not array_equal(dmx_frame_trimmed, prev_dmx_frame)) or (frame_no % fps == 0):
                    # if frame_no % fps == 0 and dmx_cmd=="":
                    # if frame_no % fps == 0:
                    # print(dmx_cmd, prev_dmx_frame)
                    # Fix for and empty DMX command
                    # Usually found at the start of a treatment track
                    if dmx_cmd == "DMX1()":
                        # NOTE(review): `item` assigned here is immediately
                        # overwritten by the SubRipItem below.
                        item = dmx_cmd = "DMX1" + str(
                            tuple(zeros(512, dtype=int))[1:]).replace(" ", "")
                    item = SubRipItem(frame_no, text=dmx_cmd)
                    item.shift(seconds=t)
                    item.end.shift(seconds=1.0 / fps)
                    if VERBOSE:
                        print(item)
                    else:
                        # Progress marker: one 'd' per DMX command written.
                        print("d", end="")
                        stdout.flush()
                    subrip_file.append(item)
                    frame_no += 1
                    prev_dmx_frame = dmx_frame_trimmed
                # print(cmd)
                if VERBOSE:
                    print(40 * "-")
                # print(track_list[0][j], track_list[1][j], track_list[2][j], track_list[3][j][i])
    # print(frame)
    # j = 1
    # for frame in track:
    #     print(track_list[0][i] + " " +frame, end = " ")
    #     j += 1
    # print()
    encoding = "utf_8"
    subrip_file.save(srt_filename, encoding=encoding)
    print()
# Log Twitch chat into plain-text logs and an SRT file whose items are timed
# with wall-clock timestamps, so chat can be replayed alongside a recording.
srt_log_path = current_directory + '/comment_log/' + chat_channel + '.srt'
bot = irc_bot.irc_bot(username, oauth, chat_channel, chat_server[0],
    chat_server[1], twitchclient_version = twitchclient_version)
outsrt = SubRipFile()
text = ''
# Poll the IRC bot forever; each new batch of messages closes the previous
# subtitle (timed from `start` to now) and opens a new one.
while 1:
    raw_msg_list = bot.get_message()
    if len(raw_msg_list) > 0:
        if len(text) > 0:
            # Flush the chat collected since `start` as one subtitle item.
            end = SubRipTime.from_time(datetime.now())
            item = SubRipItem(0, start, end, text)
            outsrt.append(item)
        start = SubRipTime.from_time(datetime.now())
        text = ''
        timestamp = get_timestamp(timestamp_format)
        for item in raw_msg_list:
            if record_raw:
                log_add(raw_log_path, timestamp + ' ' + item + '\n')
            username, message = irc_bot.parse_user(item)
            if username != '':
                safe_print(chat_channel + " " + username + ": " + message)
                log_add(log_path, timestamp + ' ' + username + ': ' + message + '\n')
                text += username + ": " + message + '\n'
# NOTE(review): the `while 1` loop above has no break, so the save below is
# only reachable via an exception path — confirm intended shutdown behavior.
outsrt.clean_indexes()
outsrt.save(srt_log_path, encoding='utf-8')
def syncSrts(subs_L1, subs_L2):
    """Sync subs_L1 by subs_L2 timings and return a SubRipFile.

    For every item in subs_L2, the best-matching subs_L1 item (found by
    matchSubtitle) supplies the text, retimed to subs_L2's start/end.
    Returns (out, dupes, fixed, subs_L2_out): the synced L1 track, the
    count of L1 items reused for more than one L2 slot, the count of
    duplicated lines repaired by trySplitLine, and a normalized copy of
    the L2 track aligned index-for-index with `out`.
    """
    out = SubRipFile()
    subs_L2_out = SubRipFile()
    j = 0
    last_j = -1
    dupes = 0
    L2_ind = -1
    for L2_sub in subs_L2:
        L2_ind = L2_ind + 1
        start = L2_sub.start
        end = L2_sub.end
        j = matchSubtitle(subs_L1, start, end, max(last_j, 0))
        L1_sub = subs_L1[j] if (j > -1) else None
        if L1_sub is None:
            # No L1 match: fall back to the L2 text for this slot.
            text = L2_sub.text
            print("---- Missing: {}: {}".format(
                L2_sub.index, L2_sub.text.replace("\n", "[[NL]]")))
        else:
            text = L1_sub.text
            if j - 1 > last_j and last_j > -1:
                # we skipped a sub in L1_subs
                if isSubMatch(subs_L1[j - 1], subs_L2[L2_ind - 1].start,
                        subs_L2[L2_ind - 1].end):
                    # Skipped line belongs with the previous slot: merge it.
                    out[len(out) - 1].text = out[len(out) - 1].text + "\n" + subs_L1[j - 1].text
                elif isSubMatch(subs_L1[j - 1], start, end):
                    # Skipped line belongs with the current slot: prepend it.
                    text = subs_L1[j - 1].text + "\n" + text
                else:
                    # A sub line in L1 does not match any in L2
                    # We add it to synced L1, and add an empty one to subs L2
                    item = SubRipItem(0, subs_L1[j - 1].start,
                        subs_L1[j - 1].end, subs_L1[j - 1].text)
                    out.append(item)
                    item2 = SubRipItem(0, subs_L1[j - 1].start,
                        subs_L1[j - 1].end, " ")
                    subs_L2_out.append(item2)
        if j == last_j:
            # Same L1 item matched two consecutive L2 slots.
            dupes = dupes + 1
            #print("---- OOPS. {}: {} - {}".format(L2_sub.index, L2_sub.text.replace("\n",""), L1_sub.text.replace("\n","")))
        last_j = j
        item = SubRipItem(0, start, end, text)
        out.append(item)
        item2 = SubRipItem(0, start, end, L2_sub.text)
        subs_L2_out.append(item2)
    out.clean_indexes()
    subs_L2_out.clean_indexes()
    # Second pass: repair consecutive identical L1 lines that map to
    # different L2 lines by splitting them with trySplitLine.
    fixed = 0
    for i in range(1, len(out)):
        sub1 = out[i - 1].text
        sub2 = out[i].text
        if ((sub1 == sub2) and (subs_L2_out[i - 1].text != subs_L2_out[i].text)):
            if (trySplitLine(out, i, sub1)):
                fixed = fixed + 1
                # NOTE(review): this `i = i + 1` has no effect — `for` over
                # range() reassigns i on the next iteration; if the intent
                # was to skip the following pair, a while-loop is needed.
                i = i + 1
            else:
                print("---- Oy. {}: {} not fixed".format(
                    i, sub1.replace("\n", "[[NL]]")))
    return out, dupes, fixed, subs_L2_out