Пример #1
0
    def test_invalid_file(self):
        """Expect ``InvalidSubtitleTypeError`` for a ``py`` subtitle type.

        Two cases are exercised: a path ending in ``.py`` with no explicit
        type, and the valid srt fixture parsed with ``subtitle_type="py"``.
        """
        # Case 1: only the path is given.
        with self.assertRaises(InvalidSubtitleTypeError):
            parser.parse("test.py")

        # Case 2: valid fixture, but the type is overridden explicitly.
        valid_srt_path = PATH.format(test_type="valid", subtitle_type="srt")
        with self.assertRaises(InvalidSubtitleTypeError):
            parser.parse(valid_srt_path, subtitle_type="py")
Пример #2
0
def subtitles_parser(subtitle_file, splits=1, set_directory="."):
    """Join all subtitle texts of a file and hand them to ``generate_sets``.

    The file is first parsed without an explicit encoding. If reading the
    entries raises ``UnicodeDecodeError``, it is parsed again as ISO-8859-1.
    The texts are joined with single spaces and passed on together with
    ``splits`` and ``set_directory``.
    """
    def _collect_texts(entries):
        # Iterating the entries is what may raise UnicodeDecodeError.
        return [entry.text for entry in entries]

    entries = psparser.parse(subtitle_file)

    try:
        texts = _collect_texts(entries)
    except UnicodeDecodeError:
        fallback_entries = psparser.parse(subtitle_file, encoding="ISO-8859-1")
        texts = _collect_texts(fallback_entries)

    generate_sets(" ".join(texts), splits, set_directory)
Пример #3
0
def _get_raw_subtitles_for_video(video_path, mkv_streams):
    """Find and parse the subtitles that belong to ``video_path``.

    Lookup order:

    1. Files under the video's directory (walked recursively) whose
       extension is listed in ``_valid_sub_exts`` and whose episode number
       equals the episode number extracted from the video's file name.
    2. For ``.mkv`` videos only: every stream in ``mkv_streams[1]`` is
       extracted with ffmpeg into a temporary file next to the video,
       parsed, and removed again; the stream with the most entries wins.

    Args:
        video_path: Path of the video file.
        mkv_streams: Stream description; ``mkv_streams[1]`` is iterated as
            ``(lang, ext)`` pairs, one per subtitle stream.

    Returns:
        Whatever ``parser.parse`` returns for a matching external subtitle
        file, or a list of parsed entries for the best embedded stream, or
        ``None`` when nothing was found.
    """
    dir_name = dirname(video_path)
    # Kept under its own name: the walk loops below must not overwrite it,
    # because it is needed again to name the temporary extraction files.
    video_file_name = basename(video_path)
    video_ext = splitext(video_path)[1]
    target_episode_number = float(_get_episode_no(video_file_name))

    # First visit of this video: run the episode-number extraction over
    # every candidate subtitle file without comparing against the target,
    # so that all "failed to extract the episode number" issues are dealt
    # with upfront.
    if video_path not in _has_initialised_path:
        _has_initialised_path[video_path] = None
        for _, _, sub_file_names in walk(dir_name):
            for sub_file_name in sub_file_names:
                if splitext(sub_file_name)[1][1:] in _valid_sub_exts:
                    _get_episode_no(sub_file_name)

    # External subtitle files: the first one with a matching episode wins.
    for root, _, sub_file_names in walk(dir_name):
        for sub_file_name in sub_file_names:
            if splitext(sub_file_name)[1][1:] not in _valid_sub_exts:
                continue

            episode_number = float(_get_episode_no(sub_file_name))

            if episode_number == target_episode_number:
                return parser.parse(join(root, sub_file_name))

    if video_ext != '.mkv':
        return None

    # Embedded streams: keep the one that yields the most subtitle entries.
    best_subtitles = None
    best_count = 0

    for i, (_lang, ext) in enumerate(mkv_streams[1]):
        if ext == 'subrip':
            ext = 'srt'

        subtitle_path = join(dir_name, f'({i}) {video_file_name}.{ext}')
        # NOTE(review): the command is one string and shell=True is not
        # set; confirm this is only run on a platform where subprocess
        # accepts that form.
        run(f'{_get_ffmpeg()} -i "{video_path}" -y -map 0:s:{i} "{subtitle_path}"',
            stdout=PIPE,
            stderr=PIPE)
        # Materialise the entries before the temporary file is deleted.
        parsed_subtitles = list(parser.parse(subtitle_path))
        remove(subtitle_path)

        if len(parsed_subtitles) > best_count:
            best_subtitles = parsed_subtitles
            best_count = len(parsed_subtitles)

    return best_subtitles
Пример #4
0
    def test_invalid_timestamps(self, subtitle_type):
        """Pulling the first entry of the invalid-timestamps fixture must fail.

        ``InvalidTimestampError`` is expected from ``next()`` on the parse
        result; the ``parse`` call itself sits outside the assertion.
        """
        fixture = PATH.format(test_type="invalid_timestamps",
                              subtitle_type=subtitle_type)
        parsed = parser.parse(fixture, subtitle_type=subtitle_type, fps=24)

        with self.assertRaises(InvalidTimestampError):
            next(parsed)
Пример #5
0
def match_file_with_lang(sub_files, lang_iso):
    """Return the first file in ``sub_files`` that matches ``lang_iso``.

    Matching is attempted in two passes:

    1. By file name: the lower-cased base name contains the lower-cased
       ``lang_iso.name``, or the base name without its last extension ends
       with ``lang_iso.alpha_3``.
    2. By content: a sample of subtitle texts is run through ``langdetect``
       and the detected code is compared with ``lang_iso.alpha_2``.

    Args:
        sub_files: Iterable of subtitle file paths. It is iterated twice,
            so it must be re-iterable (e.g. a list).
        lang_iso: Object exposing ``name``, ``alpha_3`` and ``alpha_2``.

    Returns:
        The matching path, or ``None`` when no file matches.
    """
    import itertools

    def _sublist_of_generator(gen, start, end):
        """Return items ``start`` .. ``end - 1`` of ``gen``, best effort.

        A short or failing source yields whatever was collected so far.
        Only ``Exception`` is swallowed, so KeyboardInterrupt and
        SystemExit still propagate.
        """
        collected = []
        try:
            for item in itertools.islice(gen, start, end):
                collected.append(item)
        except Exception:
            pass

        return collected

    # Pass 1: match on the full language name and the alpha_3 code.
    for f in sub_files:
        sub_basename = os.path.basename(f).lower()
        if lang_iso.name.lower() in sub_basename:
            return f
        if sub_basename.rsplit('.', 1)[0].endswith(lang_iso.alpha_3):
            return f

    # Pass 2: match on the language detected in the subtitle text.
    for f in sub_files:
        # Sample entries 10..40 (up to 31), skipping the first ten entries.
        subtitles = [
            s.text.strip()
            for s in _sublist_of_generator(SubParser.parse(f), 10, 41)
        ]
        # langdetect raises on input without any detectable text, so a
        # file that is too short or has only empty entries is skipped.
        if not any(subtitles):
            continue
        detected_lang = langdetect.detect('. '.join(subtitles))
        if detected_lang == lang_iso.alpha_2:
            return f

    return None
Пример #6
0
def parse_subtitles(subtitle_file: str) -> List[Subtitle]:
    """Parse ``subtitle_file`` into a list of ``Subtitle`` objects.

    The encoding is guessed by chardet from the file's raw bytes and passed
    to the subtitle parser. Each entry's start and end are converted with
    ``_to_sec`` before the ``Subtitle`` is built.
    """
    with open(subtitle_file, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    detected_encoding = chardet.detect(raw_bytes)['encoding']

    result = []
    for entry in subparser.parse(subtitle_file, encoding=detected_encoding):
        result.append(
            Subtitle(entry.index, entry.text,
                     _to_sec(entry.start), _to_sec(entry.end)))
    return result
Пример #7
0
    def test_valid_subtitles(self, subtype):
        """Validate the first six entries of the ``valid`` fixture, in order."""
        path = PATH.format(test_type='valid', subtype=subtype)
        subtitles = parse(path, subtype=subtype, fps=24)

        # One row per entry: (text, clean, start, end, duration).
        # The row's position is the expected subtitle index.
        expected = (
            ('Subtitle', 'subtitle', (0, 0, 1), (0, 0, 2), 1000),
            ('- Subtitle', 'subtitle', (0, 0, 3), (0, 0, 3), 500),
            ('[Sound effect] Subtitle', 'subtitle', (0, 1, 5), (0, 1, 5), 250),
            ('<format>Subtitle</format>', 'subtitle',
             (1, 30, 0), (1, 35, 0), 300000),
            ('Multi line Subtitle', 'multi line subtitle',
             (2, 0, 0), (2, 11, 11), 671000),
            ('Subtitle', 'subtitle', (2, 20, 0), (3, 0, 0), 2400000),
        )

        for index, (text, clean, start, end, duration) in enumerate(expected):
            self.validate(sub=next(subtitles),
                          index=index,
                          text=text,
                          clean=clean,
                          start=start,
                          end=end,
                          duration=duration)
Пример #8
0
    def parse(filename: str) -> List[Subtitle]:
        """Load ``filename`` and return its entries as ``Subtitle`` objects.

        The encoding comes from ``SubtitleParser._detect_encoding``. The
        parsed entries are passed through ``formatting.clean`` and then
        mapped onto ``Subtitle(quote, start_time, end_time)``.
        """
        detected_encoding = SubtitleParser._detect_encoding(filename)
        cleaned_entries = formatting.clean(
            parser.parse(filename, encoding=detected_encoding))

        return [
            Subtitle(quote=entry.text,
                     start_time=entry.start,
                     end_time=entry.end)
            for entry in cleaned_entries
        ]
Пример #9
0
    def test_valid_subtitles(self, subtitle_type):
        """Check the first six entries of the ``valid`` fixture, in order."""
        path = PATH.format(test_type="valid", subtitle_type=subtitle_type)
        subtitles = parser.parse(path, subtitle_type=subtitle_type, fps=24)

        # One row per entry: (text, start, end, duration).
        # The row's position is the expected subtitle index.
        expected = (
            ("Subtitle", (0, 0, 1), (0, 0, 2), 1000),
            ("- Subtitle", (0, 0, 3), (0, 0, 3), 500),
            ("[Sound effect] Subtitle", (0, 1, 5), (0, 1, 5), 250),
            ("<format>Subtitle</format>", (1, 30, 0), (1, 35, 0), 300000),
            ("Multi line Subtitle", (2, 0, 0), (2, 11, 11), 671000),
            ("Subtitle", (2, 20, 0), (3, 0, 0), 2400000),
        )

        for index, (text, start, end, duration) in enumerate(expected):
            self._assert_subtitle(sub=next(subtitles),
                                  index=index,
                                  text=text,
                                  start=start,
                                  end=end,
                                  duration=duration)
Пример #10
0
def cli(sub_file, output_manifest_file):
    """Write a JSON clip manifest with one entry per subtitle in ``sub_file``.

    ``sub_file.name`` is parsed as the subtitle path. Each manifest entry
    holds the start time and length, both derived from ``get_microseconds``
    values divided by 1,000,000 (start rounded down, length rounded up),
    plus the subtitle text and a target file name built from the slugified
    text and the start time. The list is dumped to ``output_manifest_file``
    with sorted keys and 4-space indentation.
    """
    def _manifest_entry(entry):
        begin = get_microseconds(entry.start)
        finish = get_microseconds(entry.end)

        duration_seconds = math.ceil((finish - begin) / 1000_000)
        begin_seconds = math.floor(begin / 1000_000)

        return {
            "start_time": begin_seconds,
            "length": duration_seconds,
            "rename_to": f"{slugify(entry.text)}-{begin_seconds}.mp4",
            "title": entry.text,
        }

    manifest = [_manifest_entry(entry) for entry in parser.parse(sub_file.name)]

    json.dump(manifest, output_manifest_file, indent=4, sort_keys=True)
Пример #11
0
from pysubparser import parser
import freqlist
import MeCab


# Absolute path of the subtitle file to analyse; replace it with your own.
# In a normal string literal every backslash must be doubled, e.g.
# 'E:\\PythonProjects\\pythonProject1\\Anime frec list\\episode.ass'.
sub_file = 'E:\\PythonProjects\\pythonProject1\\Anime frec list\\[Kamigami] Barakamon - 01 [1280×720 x264 AAC Sub(Chs,Jap)].ass'
subtitles = parser.parse(sub_file)

wakati = MeCab.Tagger("-Owakati")

for subtitle in subtitles:
    # Tokenise the subtitle text with MeCab and split it on whitespace.
    line_splitted = wakati.parse(subtitle.text).split()
    # TODO: feed `line_splitted` into `freqlist`. The original line here was
    # an unfinished `freqlist.` attribute access, which is a SyntaxError and
    # kept the whole script from running.
    print(line_splitted)
Пример #12
0
    def test_invalid_file(self):
        """Expect ``InvalidSubtitleTypeError`` for a ``py`` subtitle type.

        Checked once with a ``.py`` path and no explicit type, and once with
        the valid srt fixture parsed with ``subtype='py'``.
        """
        with self.assertRaises(InvalidSubtitleTypeError):
            parse('test.py')

        valid_srt_path = PATH.format(test_type='valid', subtype='srt')
        with self.assertRaises(InvalidSubtitleTypeError):
            parse(valid_srt_path, subtype='py')
Пример #13
0
 def test_invalid_encoding(self):
     """Reading the ``invalid_encoding`` srt fixture as ASCII must fail."""
     fixture = PATH.format(test_type='invalid_encoding', subtype='srt')

     with self.assertRaises(UnicodeDecodeError):
         # list() consumes the whole parse result inside the assertion.
         list(parse(fixture, encoding='ascii'))
Пример #14
0
from pysubparser import parser
import jieba
import sys
import csv

word_count = {}

# Tokens that must not be counted. This is a set so the membership test in
# the counting loop below is O(1) rather than a list scan.
ignore_list = {'》', '《'}

# The second column of every row in Chinese.txt is added to the ignore set.
# `with` closes the file, and newline="" is how the csv module expects its
# input file to be opened.
# NOTE(review): no encoding is given, so the platform default is used;
# confirm the file's encoding and pass it explicitly.
with open("Chinese.txt", newline="") as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    for row in read_tsv:
        ignore_list.add(row[1])

# Count every segmented word of every subtitle in the files named on the
# command line.
for filename in sys.argv[1:]:
    for subtitle in parser.parse(filename):
        for word in jieba.cut(subtitle.text, cut_all=False):
            if word not in ignore_list:
                word_count[word] = word_count.get(word, 0) + 1

# Ascending by count; ties keep first-seen order because the sort is stable.
sorted_words = sorted(word_count.items(), key=lambda kv: kv[1])

# NOTE(review): [-101:-1] prints up to 100 words but leaves out the single
# most frequent one. Behavior is kept as found; confirm whether the top
# word is meant to be excluded.
for w, count in sorted_words[-101:-1]:
    print(w + " -> " + str(count))
Пример #15
0
 def setUp(self) -> None:
     """Parse the cleaners fixture and store the result on the instance."""
     fixture_path = "./tests/files/valid/cleaners.srt"
     self.subtitles = parse(fixture_path)
Пример #16
0
def getSeconds(timeObj):
    """Return the hour/minute/second part of ``timeObj`` as float seconds.

    Only ``hour``, ``minute`` and ``second`` are read; any sub-second
    component of ``timeObj`` is not included.
    """
    whole_minutes = timeObj.hour * 60 + timeObj.minute
    whole_seconds = whole_minutes * 60 + timeObj.second
    return float(whole_seconds)


def callSystem(time, index):
    """Print the shell pipeline that would schedule ``post.py`` through ``at``.

    The pipeline echoes ``python3 <filepath>/post.py <index>`` into
    ``at -M`` with ``time`` formatted as ``%H:%M %Y-%m-%d``. This is a dry
    run: the command is only printed, and the ``os.system`` call that would
    execute it is currently disabled.
    """
    when = time.strftime('%H:%M %Y-%m-%d')
    echo_part = f'echo "python3 {filepath}/post.py {index}"'
    at_part = f' | at -M {when}'
    print(echo_part + at_part)


# Read config.ini from the directory held in `filepath`.
config = configparser.ConfigParser()
config.read(f'{filepath}/config.ini')

filename = config['DEFAULT']['filename']
subtitlesGen = parser.parse(f'./{filename}')
# Materialise the generator so the last entry can be looked up by index.
subtitles = list(subtitlesGen)

startTime = datetime.datetime.strptime('00:00:00.000000', "%H:%M:%S.%f").time()
# The movie length is taken from the end time of the final subtitle.
endTime = subtitles[-1].end

TOTAL_SECONDS_IN_ONE_YEAR = float(60 * 60 * 24 * 365)
secondsInMovie = getSeconds(endTime)

# Number of seconds in a (365-day) year per second of movie time.
movieSecondMultiplier = TOTAL_SECONDS_IN_ONE_YEAR / secondsInMovie

#pseudoNow is sixty seconds in the future, to allow time for the script to run
Пример #17
0
import streamlit as st
import pandas as pd
import spacy

# Sidebar heading.
st.sidebar.title('Views')

# Free-text input widget; its return value is not used by this script.
st.text_input('Type a word',
              value='',
              max_chars=None,
              key=None,
              type='default')

from pysubparser import parser

# Parse the subtitle file (path relative to the working directory) and
# write the text of every entry to the page, one st.write call per entry.
subtitles = parser.parse('top gun-English.sub')

for subtitle in subtitles:
    st.write(subtitle.text)
Пример #18
0
 def __init__(self, path_to_subtitle_file):
     """Parse ``path_to_subtitle_file`` and keep the result on the instance."""
     parsed = parser.parse(path_to_subtitle_file)
     self._subtitles = parsed
Пример #19
0
    def test_srt_writer(self, subtitle_type):
        """Round-trip the ``valid`` fixture through the cleaners and writer.

        The fixture is parsed, passed through the ascii, lower_case,
        formatting and brackets cleaners (in that order), written to
        ``<path>.srt``, parsed again as srt, and its first six entries are
        checked.
        """
        path = PATH.format(test_type="valid", subtitle_type=subtitle_type)
        subtitles = parser.parse(path, subtitle_type=subtitle_type, fps=24)

        # Apply the cleaners innermost-first.
        for cleaner in (ascii, lower_case, formatting, brackets):
            subtitles = cleaner.clean(subtitles)

        new_path = f"{path}.srt"
        writer.write(subtitles, new_path)

        reparsed = parser.parse(new_path, subtitle_type="srt")

        # One row per entry: (text, start, end, duration).
        # The row's position is the expected subtitle index.
        expected = (
            ("subtitle", (0, 0, 1), (0, 0, 2), 1000),
            ("- subtitle", (0, 0, 3), (0, 0, 3), 500),
            ("subtitle", (0, 1, 5), (0, 1, 5), 250),
            ("subtitle", (1, 30, 0), (1, 35, 0), 300000),
            ("multi line subtitle", (2, 0, 0), (2, 11, 11), 671000),
            ("subtitle", (2, 20, 0), (3, 0, 0), 2400000),
        )

        for index, (text, start, end, duration) in enumerate(expected):
            self._assert_subtitle(
                sub=next(reparsed),
                index=index,
                text=text,
                start=start,
                end=end,
                duration=duration
            )
Пример #20
0
from pysubparser import parser

# Parse the episode's .ass subtitle file (path relative to the working
# directory).
subtitles = parser.parse(
    'Anime frec list//[Kamigami] Barakamon - 01 [1280×720 x264 AAC Sub(Chs,Jap)].ass'
)

# Print the text of every subtitle, one per line.
for text in (entry.text for entry in subtitles):
    print(text)
Пример #21
0
import configparser
#from os import path
import sys
from pysubparser import parser
import pathlib
import os

# Directory that contains this script; config.ini and the subtitle file are
# looked up relative to it.
filepath = pathlib.Path(__file__).parent.absolute()

config = configparser.ConfigParser()
config.read(filepath / 'config.ini')
filename = config['DEFAULT']['filename']

# The index of the subtitle to post is the first command-line argument.
# sys.exit replaces quit(), which is meant for interactive sessions. The
# message goes to stderr and the exit status is now non-zero, so a missing
# argument no longer looks like a successful run.
if len(sys.argv) < 2:
    sys.exit(f'usage: {sys.argv[0]} SUBTITLE_INDEX')
indexToSend = int(sys.argv[1])
subtitles = parser.parse(str(filepath / filename), 'srt')

# NOTE(review): `twitter` is not imported in the visible import block;
# confirm it is imported somewhere, otherwise this line raises NameError.
api = twitter.Api(consumer_key=config['DEFAULT']['consumer_key'],
                  consumer_secret=config['DEFAULT']['consumer_secret'],
                  access_token_key=config['DEFAULT']['access_token_key'],
                  access_token_secret=config['DEFAULT']['access_token_secret'])

# Text of the first subtitle whose index matches, or '' when none does.
subtitleToSend = next(
    (subtitle.text for subtitle in subtitles
     if subtitle.index == indexToSend),
    '',
)

api.PostUpdates(status=subtitleToSend)
Пример #22
0
 def test_invalid_encoding(self):
     """Reading the ``invalid_encoding`` srt fixture as ASCII must fail."""
     fixture = PATH.format(test_type="invalid_encoding",
                           subtitle_type="srt")

     with self.assertRaises(UnicodeDecodeError):
         # list() consumes the whole parse result inside the assertion.
         list(parser.parse(fixture, encoding="ascii"))
Пример #23
0
    for sub in seg:
        print(sub.text)

    print(seg[-1].end)
    print("------------------------------------")
    print("Segment duration: " +
          str((time_to_millis(seg[-1].end) - time_to_millis(seg[0].start)) /
              1000))
    print("====================================")


# Command line: <audio file (mp3)> <subtitle file>
audio_filename, subtitle_filename = sys.argv[1], sys.argv[2]

# Group the parsed subtitles into segments.
subtitles = parser.parse(subtitle_filename)
segments = get_segments(subtitles)

song = AudioSegment.from_mp3(audio_filename)

# Output location and naming for the exported clips; n is the number of
# the first segment.
folder, episode = "out/", "e01"
n = 1
for seg in segments:
    start = time_to_millis(seg[0].start) - 1000
    end = time_to_millis(seg[-1].end) + 1500

    cut = song[start:end]
    cut.export(folder + episode + "_seg" + str(n) + ".mp3", format="mp3")

    print("===== Segment " + str(n) + " ========")