예제 #1
0
    def generate_word_contexts(self, length):
        """Count, for every token, the tokens that co-occur in its time window.

        ``self.word_contexts`` and ``self.len_windows`` are reset first. For
        each frame that has tokens, a context list is built from the frame's
        own tokens plus the tokens of adjacent frames: walking backwards while
        a frame's ``end`` is not before ``start - delta``, and walking forwards
        while a frame's ``start`` is not after ``end + delta``. Each walk stops
        at the first frame that falls outside the window.

        :param length: window margin; it is converted with
            ``int(length) * 1000`` before being handed to
            ``SubRipTime.from_ordinal`` (ordinals are milliseconds).
        :return: ``self.word_contexts``, a dict mapping each token to a dict
            ``{context_token: count}``.
        """
        self.word_contexts = {}
        self.len_windows = []
        # from_ordinal expects milliseconds.
        margin = SubRipTime.from_ordinal(int(length) * 1000)
        if not self.all_frames:
            self.full_tokens()

        for pos, frame in enumerate(self.all_frames):
            frame_start = frame["start"]
            frame_end = frame["end"]
            own_tokens = frame["tokens"]
            window_start = frame_start - margin
            window_end = frame_end + margin

            # Frames without tokens contribute no contexts of their own.
            if not own_tokens:
                continue

            # The context always begins with the frame's own tokens.
            context = own_tokens.copy()

            # Walk backwards over the preceding frames still inside the window.
            for k in range(pos - 1, -1, -1):
                earlier = self.all_frames[k]
                if not earlier["end"] >= window_start:
                    break
                context.extend(earlier["tokens"])

            # Walk forwards over the following frames still inside the window.
            for k in range(pos + 1, len(self.all_frames)):
                later = self.all_frames[k]
                if not later["start"] <= window_end:
                    break
                context.extend(later["tokens"])

            # One window-length sample is recorded per token of the frame;
            # the "- 1" leaves the token itself out of the count.
            window_size = len(context) - 1
            for token in own_tokens:
                self.len_windows.append(window_size)
                counts = self.word_contexts.setdefault(token, {})
                for neighbour in context:
                    counts[neighbour] = counts.get(neighbour, 0) + 1
                # The token appears once in its own context: remove that
                # self-occurrence, and drop the entry if nothing is left.
                counts[token] -= 1
                if counts[token] == 0:
                    del counts[token]

        # If start and end are not correct, the matrices are not symmetric.
        if not self.check_correct_start_end():
            self.correct_symmetry()

        return self.word_contexts
예제 #2
0
 def __init__(self, index: int, media_path: MediaPath,
              pds: PaletteDefinitionSegment, ods: ObjectDefinitionSegment, wds: WindowDefinitionSegment):
     """Create a subtitle item from one palette/object/window segment triple.

     :param index: position of the item, stored as-is.
     :param media_path: stored as-is on the instance.
     :param pds: palette segment; its ``palettes`` are used to build the image.
     :param ods: object segment; provides ``presentation_timestamp`` (the
         start time) and ``img_data`` (the image payload).
     :param wds: window segment, stored as-is.
     """
     # Identity and origin of the item.
     self.index = index
     self.media_path = media_path

     # Raw segments the item was built from.
     self.pds = pds
     self.ods = ods
     self.wds = wds

     # Timing: the start comes from the object segment, the end is not
     # known at construction time.
     self.start = SubRipTime.from_ordinal(ods.presentation_timestamp)
     self.end: Optional[SubRipTime] = None

     # Picture built from the object data and the palettes.
     self.image = PgsImage(ods.img_data, pds.palettes)

     # Filled in later; nothing in this constructor computes them.
     self.text: Optional[str] = None
     self.place: Optional[Tuple[int, int, int, int]] = None
예제 #3
0
def merge_subtitle(sub_a, sub_b, delta, encoding='utf-8'):
    """
    Merge two srt subtitle tracks written in different languages.

    The two files do not share a timeline, so the merged track starts a new
    entry every time either of the inputs changes. As a consequence the two
    languages do not always change at the same moment; this is unavoidable.

    See https://github.com/byroot/pysrt/issues/17

    https://github.com/byroot/pysrt/issues/15

    :param sub_a: first track, e.g. obtained with
        sub_a = SubRipFile.open(sub_a_path, encoding=encoding)
    :param sub_b: second track, iterated the same way as ``sub_a``.
    :param delta: minimum duration; a slice is kept only when
        ``end - start`` is strictly greater than ``delta``.
    :param encoding: not used by this function; kept so existing callers
        that pass it keep working.
    :return: a new SubRipFile holding the merged entries.
    """
    out = SubRipFile()

    # Every start and end of both tracks is a boundary of the merged timeline.
    intervals = [item.start.ordinal for item in sub_a]
    intervals.extend([item.end.ordinal for item in sub_a])
    intervals.extend([item.start.ordinal for item in sub_b])
    intervals.extend([item.end.ordinal for item in sub_b])
    intervals.sort()

    # j and k are the positions handed to and returned by find_subtitle for
    # each track (presumably search cursors - confirm against find_subtitle).
    j = k = 0
    # range() works on both Python 2 and Python 3; xrange() is Python 2 only.
    for i in range(1, len(intervals)):
        start = SubRipTime.from_ordinal(intervals[i - 1])
        end = SubRipTime.from_ordinal(intervals[i])

        # Slices that are not longer than delta are dropped.
        if (end - start) > delta:
            text_a, j = find_subtitle(sub_a, start, end, j)
            text_b, k = find_subtitle(sub_b, start, end, k)

            text = join_lines(text_a, text_b)
            if len(text) > 0:
                # Index 0 is a placeholder; clean_indexes() is called below.
                item = SubRipItem(0, start, end, text)
                out.append(item)

    out.clean_indexes()
    return out
예제 #4
0
def merge_subtitle(sub_a, sub_b, delta, encoding='utf-8'):
    """
    Merge two srt subtitle tracks written in different languages.

    The two files do not share a timeline, so the merged track starts a new
    entry every time either of the inputs changes. As a consequence the two
    languages do not always change at the same moment; this is unavoidable.

    See https://github.com/byroot/pysrt/issues/17

    https://github.com/byroot/pysrt/issues/15

    :param sub_a: first track, e.g. obtained with
        sub_a = SubRipFile.open(sub_a_path, encoding=encoding)
    :param sub_b: second track, iterated the same way as ``sub_a``.
    :param delta: minimum duration; a slice is kept only when
        ``end - start`` is strictly greater than ``delta``.
    :param encoding: not used by this function; kept so existing callers
        that pass it keep working.
    :return: a new SubRipFile holding the merged entries.
    """
    out = SubRipFile()

    # Collect every start/end ordinal of both tracks as timeline boundaries.
    intervals = []
    for track in (sub_a, sub_b):
        intervals.extend([item.start.ordinal for item in track])
        intervals.extend([item.end.ordinal for item in track])
    intervals.sort()

    # j and k are the positions handed to and returned by find_subtitle for
    # each track (presumably search cursors - confirm against find_subtitle).
    j = k = 0
    # range() replaces the Python-2-only xrange() so this runs on Python 3 too.
    for i in range(1, len(intervals)):
        start = SubRipTime.from_ordinal(intervals[i - 1])
        end = SubRipTime.from_ordinal(intervals[i])

        # Slices that are not longer than delta are dropped.
        if (end - start) > delta:
            text_a, j = find_subtitle(sub_a, start, end, j)
            text_b, k = find_subtitle(sub_b, start, end, k)

            text = join_lines(text_a, text_b)
            if len(text) > 0:
                # Index 0 is a placeholder; clean_indexes() is called below.
                out.append(SubRipItem(0, start, end, text))

    out.clean_indexes()
    return out
예제 #5
0
def merge_subtitle(sub_a, sub_b, delta):
    """Merge two subtitle tracks into a single SubRipFile.

    The start and end ordinals of every item in both tracks are sorted into
    one list of boundaries. Each pair of consecutive boundaries forms a
    slice; for slices longer than ``delta`` the text found in each track is
    joined and, when non-empty, appended as a new item.

    :param sub_a: first track (iterable of items with ``start`` / ``end``).
    :param sub_b: second track, same shape as ``sub_a``.
    :param delta: minimum duration; a slice is kept only when
        ``end - start`` is strictly greater than ``delta``.
    :return: a new SubRipFile holding the merged entries.
    """
    out = SubRipFile()
    intervals = [item.start.ordinal for item in sub_a]
    intervals.extend([item.end.ordinal for item in sub_a])
    intervals.extend([item.start.ordinal for item in sub_b])
    intervals.extend([item.end.ordinal for item in sub_b])
    intervals.sort()

    # j and k are the positions handed to and returned by find_subtitle for
    # each track (presumably search cursors - confirm against find_subtitle).
    j = k = 0
    # range() works on both Python 2 and Python 3; xrange() is Python 2 only.
    for i in range(1, len(intervals)):
        start = SubRipTime.from_ordinal(intervals[i - 1])
        end = SubRipTime.from_ordinal(intervals[i])

        # Slices that are not longer than delta are dropped.
        if (end - start) > delta:
            text_a, j = find_subtitle(sub_a, start, end, j)
            text_b, k = find_subtitle(sub_b, start, end, k)

            text = join_lines(text_a, text_b)
            if len(text) > 0:
                # Index 0 is a placeholder; clean_indexes() is called below.
                item = SubRipItem(0, start, end, text)
                out.append(item)

    out.clean_indexes()
    return out
예제 #6
0
def merge_subtitle(sub_a, sub_b, delta):
    """Combine two subtitle tracks into one SubRipFile.

    All start/end ordinals of both tracks are gathered and sorted; every pair
    of neighbouring ordinals delimits a slice of the merged timeline. A slice
    yields an item only if it lasts strictly longer than ``delta`` and the
    joined text of both tracks for that slice is not empty.

    :param sub_a: first track (iterable of items with ``start`` / ``end``).
    :param sub_b: second track, same shape as ``sub_a``.
    :param delta: duration a slice must exceed to be kept.
    :return: the merged SubRipFile.
    """
    merged = SubRipFile()

    # Gather the boundaries of both tracks, then order them in time.
    boundaries = []
    for track in (sub_a, sub_b):
        boundaries += [cue.start.ordinal for cue in track]
        boundaries += [cue.end.ordinal for cue in track]
    boundaries.sort()

    # Positions passed to and returned by find_subtitle, one per track.
    pos_a = pos_b = 0
    for left, right in zip(boundaries, boundaries[1:]):
        start = SubRipTime.from_ordinal(left)
        end = SubRipTime.from_ordinal(right)

        # Skip slices that do not exceed the minimum duration.
        if not (end - start) > delta:
            continue

        text_a, pos_a = find_subtitle(sub_a, start, end, pos_a)
        text_b, pos_b = find_subtitle(sub_b, start, end, pos_b)

        text = join_lines(text_a, text_b)
        if len(text) > 0:
            # Index 0 is a placeholder; clean_indexes() is called below.
            merged.append(SubRipItem(0, start, end, text))

    merged.clean_indexes()
    return merged
예제 #7
0
    def decode(cls, data: bytes, media_path: MediaPath):
        """Turn raw PGS data into a list of validated PgsSubtitleItem objects.

        The display sets come from ``PgsReader.decode(data, media_path)``.
        A display set without an image but with window segments closes the
        previous item: its end becomes the presentation timestamp of the last
        window segment. Any other display set produces one item per
        (pds, ods, wds) triple.

        :param data: raw bytes handed to ``PgsReader.decode``.
        :param media_path: passed to the reader and to every created item.
        :return: the list of items, after ``validate()`` was called on each.
        """
        subtitles = []
        for display_set in PgsReader.decode(data, media_path):
            # An image-less set with windows only terminates the last item.
            if subtitles and not display_set.has_image and display_set.wds:
                closing_time = display_set.wds[-1].presentation_timestamp
                subtitles[-1].end = SubRipTime.from_ordinal(closing_time)
                continue

            segments = zip(display_set.pds, display_set.ods, display_set.wds)
            for pds, ods, wds in segments:
                # The item index is its position in the result list.
                current = PgsSubtitleItem(len(subtitles), media_path, pds, ods, wds)
                if subtitles:
                    previous = subtitles[-1]
                    # A still-open previous item that started at most 10000
                    # (presumably milliseconds - TODO confirm) before this one
                    # is closed just before the new start, never before its
                    # own start.
                    if previous.end is None and previous.start + 10000 >= current.start:
                        previous.end = max(previous.start, current.start - 1)
                subtitles.append(current)

        for subtitle in subtitles:
            subtitle.validate()

        return subtitles
예제 #8
0
 def test_from_ordinal(self):
     """Check SubRipTime against 3600000, the ordinal of exactly one hour."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(SubRipTime.from_ordinal(3600000), {'hours': 1})
     self.assertEqual(SubRipTime(1), 3600000)
예제 #9
0
# Build an SRT file from the lines of `sublog`: every regular line becomes one
# subtitle item placed at its offset from START_TIME.
srt = SubRipFile(eol='\n', encoding='utf-8')
i = 1

for line in sublog:
    # Split on the first comma only: the part before it is parsed as a
    # timestamp below, the rest is used as the subtitle text.
    line = line.split(",", 1)
    if line[0] and line[0][0] == '-':
        # Lines starting with '-' never produce a subtitle. The first
        # '- start ' line seen while START_TIME is unset defines it.
        if START_TIME is None and line[0].startswith('- start '):
            START_TIME = datetime.strptime(line[0],
                                           '- start ' + TIMEFORMAT + '\n')
        continue

    # Offset of this entry from the reference time.
    no = datetime.strptime(line[0], TIMEFORMAT) - START_TIME
    if abs(no) > timedelta(1):
        print("\nCan't go over a day in a subtitle! Delete non-used lines in"
              " log.\nLet there only be one '- start' line at the top of"
              " the log-file.")
        sys.exit(1)

    # Ordinals are milliseconds: whole seconds plus the sub-second part.
    time = SubRipTime.from_ordinal(no.seconds*1000 + no.microseconds*0.001)

    # Each item ends 30*1000 ordinal units after its start.
    # NOTE(review): `unicode` exists only on Python 2 - confirm the target
    # interpreter before porting.
    item = SubRipItem(i, start=time, end=time + 30*1000,
                      text=unicode(line[1], 'utf-8'))
    srt.append(item)
    i += 1

srt.clean_indexes()
#srt.save(path=sys.stdout)

# Write every item to standard output as UTF-8 encoded bytes.
for line in srt:
    sys.stdout.write(unicode(line).encode('utf-8'))
예제 #10
0
 def test_from_ordinal(self):
     """Verify that one hour and the ordinal 3600000 compare equal both ways."""
     one_hour_ordinal = 3600000
     from_ordinal = SubRipTime.from_ordinal(one_hour_ordinal)
     # Ordinal -> time: must equal the one-hour mapping.
     self.assertEqual(from_ordinal, {'hours': 1})
     # Time -> ordinal: a one-hour SubRipTime must equal the raw ordinal.
     self.assertEqual(SubRipTime(1), one_hour_ordinal)