def generate_word_contexts(self, length):
    """Count, for every token, the tokens that occur near it in time.

    Resets ``self.word_contexts`` and ``self.len_windows`` and refills them
    from ``self.all_frames`` (calling ``self.full_tokens()`` first when the
    frame list is empty). Each frame that has tokens gets a context made of
    its own tokens plus the tokens of the contiguous neighbouring frames
    whose ``end``/``start`` fall inside the frame's span widened by
    ``length`` on both sides.

    :param length: window margin; converted with ``int(length) * 1000`` to a
        millisecond ordinal, so presumably expressed in seconds.
    :return: ``self.word_contexts``, a dict mapping each token to a dict of
        ``{context token: count}``.
    """
    self.word_contexts = {}
    self.len_windows = []
    margin = SubRipTime.from_ordinal(int(length) * 1000)  # ordinal is milliseconds

    if not self.all_frames:
        self.full_tokens()

    frames = self.all_frames
    for position, frame in enumerate(frames):
        begin, finish, own_tokens = frame["start"], frame["end"], frame["tokens"]
        window_start = begin - margin
        window_end = finish + margin

        if not own_tokens:
            # Nothing to count for a frame without tokens.
            continue

        # The context starts out as the frame's own tokens.
        context = own_tokens.copy()

        # Walk backwards while the earlier frames still end inside the window.
        earlier = position - 1
        while earlier >= 0 and frames[earlier]["end"] >= window_start:
            context.extend(frames[earlier]["tokens"])
            earlier -= 1

        # Walk forwards while the later frames still start inside the window.
        later = position + 1
        while later < len(frames) and frames[later]["start"] <= window_end:
            context.extend(frames[later]["tokens"])
            later += 1

        for token in own_tokens:
            # Context size excluding the token itself (window-length analysis).
            self.len_windows.append(len(context) - 1)

            counts = self.word_contexts.setdefault(token, {})
            for neighbour in context:
                counts[neighbour] = counts.get(neighbour, 0) + 1

            # The token was counted once as its own neighbour: undo that.
            counts[token] -= 1
            if counts[token] == 0:
                del counts[token]

    # When start/end are not correct the matrices are not symmetric.
    if not self.check_correct_start_end():
        self.correct_symmetry()

    return self.word_contexts
def __init__(
    self,
    index: int,
    media_path: MediaPath,
    pds: PaletteDefinitionSegment,
    ods: ObjectDefinitionSegment,
    wds: WindowDefinitionSegment,
):
    """Create a subtitle item from one palette/object/window segment triple.

    :param index: position of the item in the subtitle sequence.
    :param media_path: media path the item belongs to (stored as-is).
    :param pds: palette definition segment; its ``palettes`` feed the image.
    :param ods: object definition segment; supplies ``img_data`` and the
        ``presentation_timestamp`` used as the start time.
    :param wds: window definition segment (stored as-is).
    """
    start_time = SubRipTime.from_ordinal(ods.presentation_timestamp)
    rendered = PgsImage(ods.img_data, pds.palettes)

    # Identity and source segments.
    self.index = index
    self.media_path = media_path
    self.pds = pds
    self.ods = ods
    self.wds = wds

    # Timing: the end is not known at construction time.
    self.start = start_time
    self.end: Optional[SubRipTime] = None

    # Image built from the object data; text and place start out unset.
    self.image = rendered
    self.text: Optional[str] = None
    self.place: Optional[Tuple[int, int, int, int]] = None
def merge_subtitle(sub_a, sub_b, delta, encoding='utf-8'):
    """Merge two SRT subtitle tracks in different languages into one.

    The two files have different timelines, so the merged track starts a new
    cue whenever either input changes cue. As a result the two languages do
    not always switch at the same moment; this cannot be avoided.

    See https://github.com/byroot/pysrt/issues/17 and
    https://github.com/byroot/pysrt/issues/15

    :param sub_a: first track, e.g. obtained with
        ``SubRipFile.open(sub_a_path, encoding=encoding)``.
    :param sub_b: second track.
    :param delta: slices whose duration is not strictly greater than this
        value are skipped.
    :param encoding: not used by this function; kept so existing callers
        keep working.
    :return: a new ``SubRipFile`` holding the merged cues, re-indexed.
    """
    out = SubRipFile()

    # Every cue boundary of either track, in chronological order.
    intervals = sorted(
        ordinal
        for track in (sub_a, sub_b)
        for item in track
        for ordinal in (item.start.ordinal, item.end.ordinal)
    )

    # Positions handed to and returned by find_subtitle for each track.
    j = k = 0
    # ``range`` rather than ``xrange``: the latter only exists on Python 2.
    for i in range(1, len(intervals)):
        start = SubRipTime.from_ordinal(intervals[i - 1])
        end = SubRipTime.from_ordinal(intervals[i])

        if (end - start) > delta:
            text_a, j = find_subtitle(sub_a, start, end, j)
            text_b, k = find_subtitle(sub_b, start, end, k)

            text = join_lines(text_a, text_b)
            if text:
                # Index 0 is a placeholder; clean_indexes() renumbers below.
                out.append(SubRipItem(0, start, end, text))

    out.clean_indexes()
    return out
def merge_subtitle(sub_a, sub_b, delta, encoding='utf-8'):
    """Merge two SRT subtitle tracks in different languages into one.

    The two files have different timelines, so the merged track starts a new
    cue whenever either input changes cue. As a result the two languages do
    not always switch at the same moment; this cannot be avoided.

    See https://github.com/byroot/pysrt/issues/17 and
    https://github.com/byroot/pysrt/issues/15

    :param sub_a: first track, e.g. obtained with
        ``SubRipFile.open(sub_a_path, encoding=encoding)``.
    :param sub_b: second track.
    :param delta: slices whose duration is not strictly greater than this
        value are skipped.
    :param encoding: not used by this function; kept so existing callers
        keep working.
    :return: a new ``SubRipFile`` holding the merged cues, re-indexed.
    """
    out = SubRipFile()

    # Every cue boundary of either track, in chronological order.
    intervals = sorted(
        ordinal
        for track in (sub_a, sub_b)
        for item in track
        for ordinal in (item.start.ordinal, item.end.ordinal)
    )

    # Positions handed to and returned by find_subtitle for each track.
    j = k = 0
    # ``range`` rather than ``xrange``: the latter only exists on Python 2.
    for i in range(1, len(intervals)):
        start = SubRipTime.from_ordinal(intervals[i - 1])
        end = SubRipTime.from_ordinal(intervals[i])

        if (end - start) > delta:
            text_a, j = find_subtitle(sub_a, start, end, j)
            text_b, k = find_subtitle(sub_b, start, end, k)

            text = join_lines(text_a, text_b)
            if text:
                # Index 0 is a placeholder; clean_indexes() renumbers below.
                out.append(SubRipItem(0, start, end, text))

    out.clean_indexes()
    return out
def merge_subtitle(sub_a, sub_b, delta):
    """Merge two subtitle tracks into a single track.

    The timeline is cut at every start and end of every cue of both tracks.
    For each resulting slice longer than ``delta`` the text of both tracks
    is looked up with ``find_subtitle`` and combined with ``join_lines``;
    slices that produce no text are dropped.

    :param sub_a: first track (iterable of items with ``start``/``end``).
    :param sub_b: second track.
    :param delta: slices whose duration is not strictly greater than this
        value are skipped.
    :return: a new ``SubRipFile`` holding the merged cues, re-indexed.
    """
    out = SubRipFile()

    # Every cue boundary of either track, in chronological order.
    intervals = sorted(
        ordinal
        for track in (sub_a, sub_b)
        for item in track
        for ordinal in (item.start.ordinal, item.end.ordinal)
    )

    # Positions handed to and returned by find_subtitle for each track.
    j = k = 0
    # Pair each boundary with its successor. This replaces an index loop
    # built on ``xrange``, which only exists on Python 2.
    for lower, upper in zip(intervals, intervals[1:]):
        start = SubRipTime.from_ordinal(lower)
        end = SubRipTime.from_ordinal(upper)

        if (end - start) > delta:
            text_a, j = find_subtitle(sub_a, start, end, j)
            text_b, k = find_subtitle(sub_b, start, end, k)

            text = join_lines(text_a, text_b)
            if text:
                # Index 0 is a placeholder; clean_indexes() renumbers below.
                out.append(SubRipItem(0, start, end, text))

    out.clean_indexes()
    return out
def merge_subtitle(sub_a, sub_b, delta):
    """Combine two subtitle tracks into one ``SubRipFile``.

    All cue starts and ends of both tracks are sorted into one list of
    boundaries. Each pair of consecutive boundaries whose distance exceeds
    ``delta`` becomes a candidate cue. Its text is the ``join_lines``
    combination of what ``find_subtitle`` returns for each track.
    Candidates with empty text are discarded.

    :param sub_a: first track.
    :param sub_b: second track.
    :param delta: minimum duration (exclusive) for a slice to be kept.
    :return: the merged track, with indexes cleaned.
    """
    boundaries = sorted(
        [cue.start.ordinal for cue in sub_a]
        + [cue.end.ordinal for cue in sub_a]
        + [cue.start.ordinal for cue in sub_b]
        + [cue.end.ordinal for cue in sub_b]
    )

    merged = SubRipFile()
    pos_a = pos_b = 0  # positions passed to and returned by find_subtitle

    for previous, current in zip(boundaries, boundaries[1:]):
        start = SubRipTime.from_ordinal(previous)
        end = SubRipTime.from_ordinal(current)

        if not (end - start) > delta:
            continue

        text_a, pos_a = find_subtitle(sub_a, start, end, pos_a)
        text_b, pos_b = find_subtitle(sub_b, start, end, pos_b)
        combined = join_lines(text_a, text_b)
        if len(combined) == 0:
            continue

        # Index 0 is a placeholder; clean_indexes() renumbers afterwards.
        merged.append(SubRipItem(0, start, end, combined))

    merged.clean_indexes()
    return merged
def decode(cls, data: bytes, media_path: MediaPath):
    """Decode raw PGS data into a list of validated subtitle items.

    Display sets come from ``PgsReader.decode``. A display set without an
    image but with window segments closes the previous item: its end is set
    from the last window's presentation timestamp. Any other display set
    yields one ``PgsSubtitleItem`` per (pds, ods, wds) triple. When the
    previous item is still open and started no more than 10000 ordinal units
    before the new one (presumably milliseconds; confirm against
    ``SubRipTime``), it is closed one unit before the new item starts.

    :param data: raw bytes to decode.
    :param media_path: media path passed to the reader and to every item.
    :return: list of items, each already passed through ``validate()``.
    """
    items = []

    for display_set in PgsReader.decode(data, media_path):
        closes_previous = items and not display_set.has_image and display_set.wds
        if closes_previous:
            last_window = display_set.wds[-1]
            items[-1].end = SubRipTime.from_ordinal(last_window.presentation_timestamp)
            continue

        for pds, ods, wds in zip(display_set.pds, display_set.ods, display_set.wds):
            # The running index always equals the number of items so far.
            item = PgsSubtitleItem(len(items), media_path, pds, ods, wds)

            if items:
                previous = items[-1]
                if previous.end is None and previous.start + 10000 >= item.start:
                    # Never let the end fall before the item's own start.
                    previous.end = max(previous.start, item.start - 1)

            items.append(item)

    for item in items:
        item.validate()

    return items
def test_from_ordinal(self):
    """Check that 3600000 and one hour compare equal in both directions.

    Uses ``assertEqual``: the ``assertEquals`` alias is deprecated and was
    removed from ``unittest`` in Python 3.12.
    """
    self.assertEqual(SubRipTime.from_ordinal(3600000), {'hours': 1})
    self.assertEqual(SubRipTime(1), 3600000)
# Convert the timestamped log lines in ``sublog`` into SRT items and write
# them to stdout. Lines have the form "<timestamp>,<text>". Lines starting
# with '-' are control lines: the first "- start <timestamp>" line sets
# START_TIME and every other control line is skipped.
srt = SubRipFile(eol='\n', encoding='utf-8')
i = 1
for line in sublog:
    line = line.split(",", 1)
    if line[0].startswith('-'):
        # Identity test for None; only the first '- start' line is honoured.
        if START_TIME is None and line[0].startswith('- start '):
            START_TIME = datetime.strptime(line[0],
                                           '- start ' + TIMEFORMAT + '\n')
        continue

    # Offset of this entry from the start of the log.
    no = datetime.strptime(line[0], TIMEFORMAT) - START_TIME
    if abs(no) > timedelta(1):
        print("\nCan't go over a day in a subtitle! Delete non-used lines in"
              " log.\nLet there only be one '- start' line at the top of"
              " the log-file.")
        sys.exit(1)

    # NOTE(review): only the seconds/microseconds fields are used, so a
    # negative offset (days == -1) wraps around - confirm this is intended.
    time = SubRipTime.from_ordinal(no.seconds * 1000 + no.microseconds * 0.001)
    # Each subtitle ends 30 * 1000 ordinal units (milliseconds) after it starts.
    item = SubRipItem(i, start=time, end=time + 30 * 1000,
                      text=unicode(line[1], 'utf-8'))
    srt.append(item)
    i += 1

srt.clean_indexes()
# Alternative kept from the original: srt.save(path=sys.stdout)
for line in srt:
    sys.stdout.write(unicode(line).encode('utf-8'))
def test_from_ordinal(self):
    """Check that 3600000 and one hour compare equal in both directions."""
    one_hour_ordinal = 3600000

    # Built from an ordinal, compared against a mapping of time fields.
    built_from_ordinal = SubRipTime.from_ordinal(one_hour_ordinal)
    self.assertEqual(built_from_ordinal, {'hours': 1})

    # Built positionally with 1, compared against the plain ordinal.
    built_from_fields = SubRipTime(1)
    self.assertEqual(built_from_fields, one_hour_ordinal)