예제 #1
0
def main(argv):
    """Render subtitles onto a video from command-line style arguments.

    Args:
        argv: Argument list laid out like ``sys.argv``: program name, input
            video path, input subtitle path and output video path.

    Returns:
        None. When ``argv`` does not hold exactly four items a usage message
        is printed and nothing is rendered.
    """
    if len(argv) != 4:
        # Parenthesised single-argument print works on Python 2 and 3 alike;
        # the former print statements were a SyntaxError on Python 3.
        print('usage:')
        print('    python subtitle.py input-video.mp4 input-subtitle.srt output-video.avi')
        return

    _, input_video_fname, input_subt_fname, output_video_fname = argv

    subt = Subtitle()
    subt.render(input_video_fname, input_subt_fname, output_video_fname)
    def get_subtitles(file_rows: list) -> list:
        """Pair timeline rows with subtitle-text rows.

        Walks ``file_rows`` remembering the latest row accepted by
        ``Subtitle.is_timeline`` and the latest row accepted by
        ``Subtitle.is_subtitle_string``. As soon as both are known a
        ``Subtitle`` is built from them and both are forgotten again.

        Returns:
            The list of ``Subtitle`` objects in the order they were completed.
        """
        subtitles = []
        pending_timeline = ''
        pending_text = ''
        for row in file_rows:
            if Subtitle.is_timeline(row):
                pending_timeline = row
            if Subtitle.is_subtitle_string(row):
                pending_text = row
            # Wait until both halves of an entry have been seen.
            if pending_timeline == '' or pending_text == '':
                continue
            subtitles.append(
                Subtitle(SubtitleString(pending_text), TimeLine(pending_timeline)))
            pending_timeline = ''
            pending_text = ''

        return subtitles
def search_movie_subtitle(movie, language):
    """Fetch the subscene.com listing for a movie and collect its subtitles.

    Args:
        movie (Movie): The movie to look up; only ``movie.slug()`` is used here.
        language (str): The subtitle language, appended to the listing URL.

    Returns:
        list: One ``Subtitle(title, url, lang)`` per ``<td class="a1">`` cell
        found on the page.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status`` when the
            listing request fails.
    """
    listing_url = 'https://subscene.com/subtitles/' + movie.slug() + '/' + language
    response = requests.get(listing_url)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    del response

    def _clean(node):
        # Text nodes are stringified, UTF-8 encoded and stripped.
        return str(node).encode('utf-8').strip()

    subtitles = []
    for cell in soup.findAll('td', attrs={'class': 'a1'}):
        # The title lives in the first <span> without a class attribute.
        classless_spans = cell.findAll('span', attrs={'class': None})
        title = _clean(classless_spans[0].contents[0])
        link = str(cell.find('a')['href']).strip()
        lang = _clean(cell.find('span').contents[0])
        subtitles.append(Subtitle(title, link, lang))
    return subtitles
예제 #4
0
  def test_html_scan(self):
    # Scan an HTML page after the data dump test has run, then compare the
    # resulting counters with the expected figures.
    self.test_dumpData()

    scanner = Subtitle(loglevel=logging.DEBUG)
    scanner.setLexiconFile(self.pkl)
    scanner.loadOldData()
    scanner.addFile('https://selenium-python.readthedocs.org/en/latest/index.html')
    scanner.parse()

    expected = dict(lex=4156, stem_lex=3104, words=127, stem_words=71, new_words=32)
    self.sub_assert(scanner, **expected)
    def get_timelines(input_file: str) -> list:
        """Return the rows of ``input_file`` accepted by ``Subtitle.is_timeline``.

        Rows are obtained from ``FileManager.get_file_rows`` and kept in order.
        """
        rows = FileManager.get_file_rows(input_file)
        return [row for row in rows if Subtitle.is_timeline(row)]
예제 #6
0
파일: main.py 프로젝트: xiaofud/srt_parser
def move_subtitles(subtitles, seconds, filename):
    """Shift every subtitle by ``seconds`` and save the result to ``filename``.

    All shifted timestamps are computed and validated before any subtitle is
    modified, so an out-of-range shift leaves the input untouched instead of
    leaving the first few subtitles shifted and the rest not.

    Args:
        subtitles: Iterable of ``Subtitle`` objects to shift in place.
        seconds: Offset added to both start and end time; may be negative.
        filename: Destination handed to ``Subtitle.to_srt_file``.

    Returns:
        None. If any shifted timestamp would be negative a message is printed
        and nothing is modified or written.
    """
    shifted = []
    for sub in subtitles:
        assert isinstance(sub, Subtitle)
        start_time = sub.get_start_second() + seconds
        end_time = sub.get_end_second() + seconds

        if start_time < 0 or end_time < 0:
            print("TIMESTAMP OUT OF RANGE!!!")
            return
        shifted.append((sub, start_time, end_time))

    # Every timestamp is valid: now it is safe to mutate the subtitles.
    for sub, start_time, end_time in shifted:
        sub.start_time = Subtitle.to_srt_timestamp(start_time)
        sub.end_time = Subtitle.to_srt_timestamp(end_time)
    Subtitle.to_srt_file(subtitles, filename)
    print('saved to', filename)
예제 #7
0
 def setupMediaExtractor(self, i):
     """Attach video, audio and (optionally) subtitle helpers to client ``i``.

     Stores a ``VideoCapturer`` and an ``AudioCapturer`` for the client's
     ``movie_name`` in the client entry, and a ``Subtitle`` as well when the
     entry's ``subtitle_file`` is not ``None``.
     """
     client = self.clients[i]
     movie_name = client['movie_name']

     video_extractor = VideoCapturer(movie_name)
     client['video_extractor'] = video_extractor
     fps, frame_count = video_extractor.fps, video_extractor.frame_count

     client['audio_extractor'] = AudioCapturer(movie_name, fps, frame_count)

     subtitle_file = client['subtitle_file']
     if subtitle_file is not None:
         client['subtitle'] = Subtitle(frame_count, fps, subtitle_file)
예제 #8
0
def load_subtitle(path, aqt_file):
    """Load all subtitles from an ``.aqt`` file.

    Args:
        path: Directory prefix; it is concatenated with ``aqt_file`` as-is,
            so it must already end with a path separator.
        aqt_file: File name appended to ``path``.

    Returns:
        list: One ``Subtitle(lines[1], lines[0], lines[2])`` per group
        returned by ``read_next_lines(f, 4)``, until that helper returns a
        falsy value.
    """
    full_path = '%s%s' % (path, aqt_file)
    # print() call instead of the Python 2 print statement so the module
    # also parses on Python 3.
    print("Loading %s" % full_path)
    subtitles = []
    with open(full_path, 'r') as f:
        while True:
            lines = read_next_lines(f, 4)
            if not lines:
                break
            subtitles.append(Subtitle(lines[1], lines[0], lines[2]))
    return subtitles
예제 #9
0
def scan_subtitle(path):
    """Build a ``Subtitle`` for the file at ``path`` from a guessit guess.

    Args:
        path: Path of an existing subtitle file.

    Returns:
        The object returned by ``Subtitle.fromguess(parent_path, guess)``,
        where ``parent_path`` is ``path`` with the file name removed (the
        trailing separator is kept).

    Raises:
        ValueError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise ValueError('Path does not exist')

    dirpath, filename = os.path.split(path)
    logger.info('Scanning subtitle %r in %r', filename, dirpath)

    # guess
    # str.strip(filename) treats its argument as a *set of characters* and
    # removes them from both ends, which can eat into the directory part
    # (e.g. 'rest/s.srt'.strip('s.srt') == 'est/'). Slice the file name off
    # the end instead.
    parent_path = path[:len(path) - len(filename)]
    subtitle = Subtitle.fromguess(parent_path, guessit(path))

    return subtitle
예제 #10
0
def main(in_subt, out_subt):
    """Parse a subtitle file, lemmatise its text and create a new subtitle.

    Args:
        in_subt: Path of the UTF-8 subtitle file to read; must not be empty.
        out_subt: Output target handed to ``Subtitle``; must not be empty.

    Exits the process with a message when the input file cannot be opened
    or read (``IOError``).
    """
    assert in_subt != ""
    assert out_subt != ""

    parser = Parser()
    normalizer = Normalizer()
    lemma_filter = Filter()

    try:
        # The context manager closes the file even when read() fails.
        with codecs.open(in_subt, 'r', encoding='utf8') as f:
            text = f.read()
    except IOError:
        sys.exit("The subtitle could not be found in the path you provided.")

    parser.parse(text)
    normalizer.normalize(parser.get_text())
    lemma_filter.clean_lemmas(normalizer.get_lemmas())

    new_sub = Subtitle(parser.get_indexes(), parser.get_times(),
                       parser.get_text(), lemma_filter.get_final_lemmas(),
                       lemma_filter.get_dict(), out_subt)
    new_sub.create_subtitle()
예제 #11
0
  def test_txt_scan(self):
    # Scan an .srt file after the data dump test has run, then check the
    # counters and the handling of character names.
    self.test_dumpData()

    scanner = Subtitle(loglevel=logging.DEBUG)
    scanner.setLexiconFile(self.pkl)
    scanner.loadOldData()
    scanner.addFile('../data/srt/Lord.of.War.eng.480p.SDHF-NORMTEAM.srt')
    scanner.parse()

    expected = dict(lex=4156, stem_lex=3104, words=1936, stem_words=943, new_words=518)
    self.sub_assert(scanner, **expected)

    character_names = ['Yuri', 'Simeon']
    # Each name must be present in the scanner's name set ...
    for name in character_names:
      assert name in scanner.nameSet
    # ... and its lowercase form must not be listed among the new words.
    for name in character_names:
      assert name.lower() not in scanner.newWords
예제 #12
0
    def fix(self):
        """Rewrite timeline rows and pass every row to the output file.

        For a row accepted by ``Subtitle.is_timeline`` that has a row three
        positions further on, the text before the row's first comma is joined
        by ``','`` with the text before the first comma of that later row,
        and a newline is appended. Every other row is written unchanged.
        """
        rows = FileManager.get_file_rows(self.input_filename)
        row_count = len(rows)

        for index, row in enumerate(rows):
            output_row = row
            if Subtitle.is_timeline(row) and index + 3 < row_count:
                own_head = row.split(',')[0]
                later_head = rows[index + 3].split(',')[0]
                output_row = own_head + ',' + later_head + '\n'
            HandleFunctions.save_str_to_file(self.output_filename, output_row)
예제 #13
0
    def __init__(self, logger=None, log_level=logging.INFO):
        """Set up logging, the ``Subtitle`` helper and the file-type tables.

        Args:
            logger: Logger to use. When ``None`` a logger named after the
                class is created through ``create_log``.
            log_level: Level passed to ``create_log``; unused when ``logger``
                is supplied.
        """
        if logger is not None:
            self.logger = logger
        else:
            self.logger = create_log(log_name=type(self).__name__, level=log_level)

        for banner in ("\n-----------------", "Begin to init", "\n-----------------"):
            self.logger.info(banner)

        self.sub = Subtitle(self.logger)
        self.sub.set_parse(True)

        self.files = dict()
        self.filetypes = ['srt', 'bak', 'm3u', 'txt']
        self.vediotypes = ['mkv', 'mp4', 'avi']
        self.rmsrt = False

        # Fall back to the bundled lexicon when none has been configured.
        if self.sub.lexicon_path is None:
            self.sub.set_lexicon_file("lexicon/lexicon.xlsx")
예제 #14
0
 def setUp(self):
   # Each test gets its own Subtitle instance created with DEBUG log level.
   self.sub = Subtitle(loglevel=logging.DEBUG)
예제 #15
0
 def read_next_subtitle(self):
     """Assemble one ``Subtitle`` from the reader's ``_read_*`` helpers.

     The identifier, the begin/end timestamps and the text are read in that
     order and stored on a fresh ``Subtitle``, which is returned.
     """
     entry = Subtitle()
     entry.identifier = self._read_identifier()
     begin, end = self._read_timestamps()
     entry.timestamp_begin = begin
     entry.timestamp_end = end
     entry.text = self._read_text()
     return entry
예제 #16
0
	def import_subtitle(self, file:Path) -> Subtitle:
		"""Return ``Subtitle.new`` for ``file`` and this object's ``imdbid``."""
		imported = Subtitle.new(file, self.imdbid)
		return imported
 def remove_timelines(input_file: str, output_file: str):
     """Pass every non-timeline row of ``input_file`` on to ``output_file``.

     Rows come from ``FileManager.get_file_rows``; rows accepted by
     ``Subtitle.is_timeline`` are skipped, all others are handed to
     ``HandleFunctions.save_str_to_file`` in their original order.
     """
     for row in FileManager.get_file_rows(input_file):
         if Subtitle.is_timeline(row):
             continue
         HandleFunctions.save_str_to_file(output_file, row)
예제 #18
0
def main(argv=None, log_ger=None):
    """Command-line entry point: parse options, scan subtitles and report.

    Args:
        argv: Option/argument list handed to ``getopt.getopt`` (program name
            already removed).
        log_ger: Logger to use. When ``None`` one is created with
            ``create_log(log_name="subtitle", level=logging.INFO)``.

    The process exits through ``sys.exit`` for ``--help``/``--version``, for
    the not-yet-offered ``--dir`` option, and with status 2 when option
    parsing fails.
    """
    if log_ger is None:
        log_ger = create_log(log_name="subtitle", level=logging.INFO)

    start_dtime = datetime.now()
    print()
    sub = Subtitle(log_ger)

    try:
        # NOTE: "checkup" and "file=" used to be written without a comma in
        # between, which Python silently joined into the single long option
        # "checkupfile=" so that neither --checkup nor --file was accepted.
        # "excel=" is registered because the handler below accepts --excel;
        # "pickle=" is kept so existing command lines still parse.
        opts, args = getopt.getopt(
            argv,
            "hvf:w:t:d:e:p:s:b:?lm:WDc",
            ["help", "version", "parse",
             "checkup", "file=", "word=",
             "type=", "dir=", "excel=", "pickle=",
             "limit=", "section=", "bigger="])
        log_ger.info("opts:{0};args:{1}".format(opts, args))
    except getopt.GetoptError as msg:
        print("error happened when get options!!! error:{0}".format(msg))
        usage()
        log_ger.error("getopt.GetoptError:{0}, exit!".format(msg))
        sys.exit(2)
    except Exception as msg:
        log_ger.error("error:{0}, exit!".format(msg))
        sys.exit(2)

    _is_lines_show = False
    _is_words_show = False
    sub_type = ""
    words_limit = None
    for opt, arg in opts:
        if opt in ("-?", "-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--version"):
            version()
            sys.exit()
        elif opt in ("-b", "--bigger"):
            sub.set_times_bigger(int(arg))
        elif opt in ("-c", "--checkup"):
            sub.checkup = True
        elif opt in ("-d", "--dir"):
            print("Sorry, -d --dir option still not offer")
            sys.exit()
        elif opt in ("-e", "--excel"):
            sub.set_lexicon_file(arg)
        elif opt in ("-s", "--section"):
            # Expected form is "start,end"; either side may be left empty.
            section = arg.split(',')
            if len(section) != 2:
                print("something wrong, with option -s --section:", arg)
                sys.exit()
            start, end = section
            if len(start) != 0:
                sub.set_start(int(start))
            if len(end) != 0:
                sub.set_end(int(end))
        elif opt in ('-f', "--file"):
            sub.add_file(arg)
        elif opt in ('-p', "--parse"):
            sub.set_parse(True)
        elif opt == '-D':
            log_ger.setLevel(logging.DEBUG)
            sub.set_logger(log_ger)
            sub.set_debug(True)
        elif opt in ("-w", "--word"):
            sub.add_word(arg)
            # Mostly used for testing, so do not write any output.
            sub.set_output(False)
        elif opt in ("-t", "--type"):
            sub_type = arg
            if sub_type not in ('save', 'scan', 'cloud'):
                usage()
                sys.exit()
        elif opt in ("-m", "--limit"):
            words_limit = int(arg)
            _is_words_show = True
        elif opt == '-l':
            # show lines
            _is_lines_show = True
        elif opt == '-W':
            # show words
            _is_words_show = True

    # Fall back to the bundled lexicon when -e/--excel was not given.
    if sub.lexicon_path is None:
        sub.set_lexicon_file("lexicon/lexicon.xlsx")
    sub.load_old_data()

    sub.add_files(args)

    sub.check_all(encode='utf-8')

    if _is_lines_show:
        sub.lines_show(words_limit)

    if _is_words_show:
        sub.words_show(words_limit)
    sub.show()

    if sub_type == 'save':
        sub.dump_data()
    elif sub_type == 'cloud':
        sub.cloud()

    print()
    end_dtime = datetime.now()
    timedelta = end_dtime - start_dtime
    print("Cost time: " + str(timedelta))
예제 #19
0
class TransferBak(object):
    """Prepare subtitle files for every video in a directory tree.

    The steps implemented by the methods below are:

    * save ``*.srt`` as ``*.srt.bak``
    * transfer (word-check) all ``*.srt.bak`` files
    * move the generated ``*_word.srt`` back to ``*.srt``
    * create ``*.m3u`` files for videos that have an ``.srt``
    * create and concatenate word.txt (``words`` is still a stub)
    """

    def __init__(self, logger=None, log_level=logging.INFO):
        """Set up logging, the ``Subtitle`` helper and the file-type tables.

        Args:
            logger: Logger to use. When ``None`` a logger named after the
                class is created through ``create_log``.
            log_level: Level passed to ``create_log``; unused when ``logger``
                is supplied.
        """
        classname = type(self).__name__
        if logger is None:
            self.logger = create_log(log_name=classname, level=log_level)
        else:
            self.logger = logger

        self.logger.info("\n-----------------")
        self.logger.info("Begin to init")
        self.logger.info("\n-----------------")

        self.sub = Subtitle(self.logger)
        self.sub.set_parse(True)
        # Maps a file extension to a set of (directory, file name) tuples.
        self.files = dict()
        self.filetypes = ['srt', 'bak', 'm3u']
        self.vediotypes = ['mkv', 'mp4', 'avi']
        self.rmsrt = False
        self.is_m3u_only = False

        if self.sub.lexicon_path is None:
            self.sub.set_lexicon_file("lexicon/lexicon.xlsx")

    def setrm(self):
        """Request removal of existing srt/m3u files before transferring."""
        self.rmsrt = True

    def set_debug(self):
        """Switch the logger and the ``Subtitle`` helper to debug mode."""
        self.logger.setLevel(logging.DEBUG)
        self.sub.set_logger(self.logger)
        self.sub.set_debug(True)

    def set_m3u_only(self):
        """Make ``transfer_paths`` only regenerate m3u files."""
        self.is_m3u_only = True

    def set_logger(self, logger):
        """Replace the logger used by this object."""
        self.logger = logger

    # save srt to srt.bak
    def srt_to_baks(self):
        """Rename every collected ``.srt`` file to its ``.srt.bak`` backup.

        Without any video file every srt is moved unconditionally. Otherwise
        an srt is moved only when a video with the same base name exists and
        the backup does not exist yet; a ``.en`` suffix on the base name is
        dropped when deriving the video and backup names.
        """
        for i in self.files["srt"]:
            # mv srt to srt.bak
            srt = os.path.join(i[0], i[1])
            bak = srt + ".bak"
            base = os.path.splitext(srt)[0]

            if self.get_vedio_num() <= 0:
                # no video at all
                print("move " + srt + " to " + bak)
                shutil.move(srt, bak)
            else:
                for postfix in self.vediotypes:
                    if base.endswith(".en"):
                        vedio = base[0:-3] + "." + postfix
                        bak = base[0:-3] + ".srt.bak"
                    else:
                        vedio = base + "." + postfix
                    if os.path.exists(vedio) and not os.path.exists(bak):
                        print("move " + srt + " to " + bak)
                        shutil.move(srt, bak)

    @staticmethod
    def _parse_streams(ffmpeg_output):
        """Parse the stream list printed by ``ffmpeg -i``.

        Args:
            ffmpeg_output: Decoded text that ffmpeg wrote to stderr.

        Returns:
            list: One dict per ``Stream #a:b`` line with the keys ``id``,
            ``type`` and ``filetype``, plus ``subhead`` (the parenthesised
            tag after the stream id) and ``title`` when present.

        Exits the process with status 2 when a line containing ``Stream``
        does not have the expected layout.
        """
        streams = []
        current = None
        for line in ffmpeg_output.split("\n"):
            if "Stream" in line:
                m = re.match(r" *Stream #(\d+):(\d+)(\(\w+\))?: (\w+): (\w+)", line)
                if not m:
                    print("err: reg not work on this sentence")
                    print(line)
                    sys.exit(2)
                if current is not None:
                    streams.append(current)
                current = {"id": m.group(2), "type": m.group(4), "filetype": m.group(5)}
                if m.group(3) is not None:
                    # subhead: the tag in parentheses after the stream id
                    current['subhead'] = m.group(3)
            elif current is not None:
                # Lines after a stream header may carry that stream's title.
                m = re.match(r" *title *: (.*)", line)
                if m:
                    current['title'] = m.group(1)
        # The final stream has no following header to flush it, so add it
        # here; previously it was silently dropped.
        if current is not None:
            streams.append(current)
        return streams

    # get srt.bak file from mkv!
    def baks_mkv(self):
        """Create missing ``.srt.bak`` files from the videos' subtitle streams.

        For every collected video without a ``.srt.bak`` the streams reported
        by ``ffmpeg -i`` are inspected. A subtitle stream whose title marks
        it as English/Chinese is converted (``ass``) or extracted with
        ``mkvextract`` (``subrip``). Videos for which no backup could be
        produced are listed at the end.
        """
        failedfiles = []
        print("------------")
        print('get srt.bak file from vedio!')

        for postfix in self.vediotypes:
            if postfix not in self.files:
                continue
            for fi, f in enumerate(self.files[postfix]):
                mkvfile = os.path.join(f[0], f[1])
                print("------------")
                print(fi, mkvfile)
                srtbak_path = os.path.splitext(mkvfile)[0] + ".srt.bak"
                if os.path.exists(srtbak_path):
                    print(srtbak_path, "exists")
                    print("------------")
                    continue
                print("1. Check streams in mkv!")
                # ffmpeg prints the stream description on stderr.
                res = subprocess.run(["ffmpeg",
                                      "-i", mkvfile],
                                     stderr=subprocess.PIPE)
                streams = self._parse_streams(res.stderr.decode("utf-8"))

                streams_df = pd.DataFrame(streams)
                print("streams_df:\n", streams_df)
                if len(streams_df) < 2 or len(streams_df[streams_df["type"] == "Subtitle"]) <= 0:
                    print("not have enough stream")
                    failedfiles.append(mkvfile)
                    continue

                print("2. get srt.bak file from mkv!")

                engchs = []
                if 'title' in streams_df:
                    criterion = streams_df['title']. \
                        map(lambda t: re.match(r"(英.中)|(英中)|(中英字幕)", str(t)) is not None)
                    engchs = streams_df[criterion]
                if len(engchs) > 0:
                    # todo: deal engchs
                    print("engchs:\n", engchs)

                    base = os.path.splitext(mkvfile)[0]
                    srt = base + ".srt"
                    filetype = str(engchs.iloc[0].filetype)

                    if 'ass' == filetype:
                        ass = base + ".ass"

                        # Context managers make sure both files are closed.
                        with open(ass) as ass_file:
                            srt_str = asstosrt.convert(ass_file)
                        with open(srt, "w") as srt_file:
                            srt_file.write(srt_str)

                        if os.path.exists(srt):
                            shutil.move(srt, srtbak_path)
                        else:
                            # todo: log err
                            print("error: {} failed to create!".format(srt))
                            failedfiles.append(mkvfile)
                    elif 'subrip' == filetype:
                        print("filetype:", filetype)
                        subprocess.run(
                            ["mkvextract", "tracks",
                             mkvfile,
                             "{0}:{1}".format(
                                 str(engchs.iloc[0].id),
                                 srtbak_path
                             )
                             ])
                else:
                    eng = []
                    # The column only exists when some stream carried a tag.
                    if 'subhead' in streams_df:
                        criterion = streams_df['subhead']. \
                            map(lambda t: re.match(r"eng", str(t)) is not None)
                        eng = streams_df[
                            criterion &
                            (streams_df["type"] == "Subtitle")
                            ]
                    if len(eng) > 0:
                        # todo: deal eng
                        print("eng:\n", eng)
                    else:
                        print('-------------')
                        print(f[1], "has no suitable subtitle stream!")
                        print(streams_df[streams_df["type"] == "Subtitle"])
                        failedfiles.append(mkvfile)
                        print('-------------')

        print("files failed to create .srt.bak:\n")
        for f in failedfiles:
            print(f)
        print('-------------')

    # create and cat word.txt
    def words(self):
        """Placeholder; not implemented yet."""
        pass

    def show_files(self):
        """Print how many files were collected for each extension."""
        for k, v in self.files.items():
            print(k + ": {}".format(len(v)))

    def check_types(self, path):
        """Walk ``path`` and rebuild ``self.files``.

        Every file whose name ends with one of the known subtitle or video
        extensions is stored as a ``(directory, file name)`` tuple under the
        first matching extension. The summary is printed afterwards.
        """
        self.files = dict()
        for root, dirs, files in os.walk(path):
            for file in files:
                for postfix in (self.filetypes + self.vediotypes):
                    if file.endswith("." + postfix):
                        if postfix not in self.files:
                            self.files[postfix] = set()
                        self.files[postfix].add((root, file))
                        break
        self.show_files()

    def rm_m3u(self, path):
        """Delete every collected ``.m3u`` file (``path`` is not used)."""
        if 'm3u' in self.files:
            for f in self.files['m3u']:
                p = os.path.join(f[0], f[1])
                os.remove(p)
                print(p + " was removed!")

    def rm_srt(self, path):
        """Delete srt files that have a backup, all m3u files and output.xlsx."""
        if 'srt' in self.files:
            for f in self.files['srt']:
                p = os.path.join(f[0], f[1])
                # Never delete the original srt when there is no .bak for it
                # (this has gone wrong before).
                if os.path.exists(p+".bak"):
                    os.remove(p)
                    print(p + " was removed!")
        self.rm_m3u(path)

        out = path+"/output.xlsx"
        if os.path.exists(out):
            os.remove(out)
            print(out + " was removed!")

    def get_vedio_num(self):
        """Return the number of collected video files of all known types."""
        num = 0
        for postfix in self.vediotypes:
            if postfix in self.files:
                num += len(self.files[postfix])
        return num

    def transfer_dir(self, path):
        """Run the whole backup / word-check / m3u workflow for ``path``."""
        self.logger.info("path: {}".format(path))
        self.check_types(path)
        if self.rmsrt:
            self.rm_srt(path)
            print("\nafter remove str and m3u files:")
            self.check_types(path)

        old_bak = set()
        if 'bak' in self.files:
            old_bak = copy.deepcopy(self.files["bak"])
        if self.get_vedio_num() > 0 and self.get_vedio_num() != len(old_bak):
            if "srt" in self.files and \
                    len(self.files["srt"]) > 0:
                self.srt_to_baks()
            print("------------")
            print("after move srt to bak")
            self.check_types(path)
            if "bak" in self.files:
                if self.get_vedio_num() > len(self.files["bak"]):
                    print("There is more vedio than bak!")
                    print("There is no enough .srt.bak!")
                    self.baks_mkv()
            elif self.get_vedio_num() > 0:
                print("There is more vedio than bak(0)!")
                print("There is no enough .srt.bak!")
                self.baks_mkv()

            self.check_types(path)
            if "bak" in self.files and len(self.files["bak"]) == len(old_bak):
                print("baks num get no increase!")
            if 'bak' in self.files:
                old_bak = copy.deepcopy(self.files["bak"])

        elif len(old_bak) > 0:
            print(".srt.bak exists!")
        elif self.get_vedio_num() <= 0:
            # Subtitles but no video: carry on anyway, the word list is
            # still useful for studying vocabulary.
            print("no .srt.bak at all!")
            if "srt" in self.files and \
                    len(self.files["srt"]) > 0:
                self.srt_to_baks()
            print("------------")
            print("after move srt to bak")
            self.check_types(path)
            # "bak" is absent when the directory holds no srt either, so do
            # not index the dict directly.
            if len(self.files.get("bak", ())) == len(old_bak):
                print("baks num get no increase!")
            if 'bak' in self.files:
                old_bak = copy.deepcopy(self.files["bak"])
        else:
            print("no .srt.bak at all!")

        srt_cre = []
        if "srt" in self.files:
            srt_set = set([os.path.join(x[0], x[1]) for x in self.files["srt"]])
        else:
            srt_set = set()

        if len(old_bak) > 0:
            for f in self.files["bak"]:
                bak = os.path.join(f[0], f[1])
                bak_base = os.path.splitext(bak)[0]  # will get *.srt
                if bak_base not in srt_set:
                    srt_cre.append(bak)

        self.sub.load_old_data()
        self.sub.add_files(srt_cre)
        self.sub.check_all(encode='utf-8')
        # Also put a copy of the word spreadsheet into the target folder.
        output_dir = "output/output.xlsx"
        if self.sub.words_len() > 0 and os.path.exists(output_dir):
            shutil.copy(output_dir, path)
            print("output.xlsx was copy to " + path)

        self.sub.show()

        failedfiles = []
        srt_files = []
        for f in srt_cre:
            base = os.path.splitext(f)[0]  # will get *.srt
            print("base:", base)
            wordsrt = base + "_word.srt"
            srt = base
            print(wordsrt)
            if os.path.exists(wordsrt):
                shutil.move(wordsrt, srt)
                srt_files.append(srt)
            else:
                # Record the single file that failed, not the whole list.
                failedfiles.append(f)

        # Report failures once, after the loop, without shadowing its variable.
        if failedfiles:
            print("files failed to create .srt_word.srt:\n")
            for failed in failedfiles:
                print(failed)
            print('-------------')

        print("srt files created:\n")
        for f in srt_files:
            print(f)
        print('-------------')

        self.get_m3u(path)

    def get_m3u(self, path):
        """Create an m3u for every video that has an srt but no m3u yet."""
        self.check_types(path)
        old_m3u = set()
        if 'm3u' in self.files:
            old_m3u = copy.deepcopy(self.files['m3u'])
        for postfix in self.vediotypes:
            if postfix in self.files:
                for f in self.files[postfix]:
                    vedio = os.path.join(f[0], f[1])
                    base = os.path.splitext(vedio)[0]  # will get * without .mkv
                    m3u = base + ".m3u"
                    srt = base + ".srt"

                    if not os.path.exists(m3u) and os.path.exists(srt):
                        print(vedio, "is getting m3u...")
                        set_m3u(vedio, self.logger)

        self.check_types(path)
        # Show both listings when no new m3u file appeared.
        if 'm3u' in self.files and len(old_m3u) >= len(self.files['m3u']):
            print("old_m3u:\n", old_m3u)
            print("new_m3u:\n", self.files['m3u'])

    def transfer_paths(self, paths):
        """Process every directory in ``paths``; files are rejected."""
        self.logger.info("paths:{}".format(paths))
        for p in paths:
            if os.path.isfile(p):
                print(
                    "should be dir, don't input file:" +
                    p)
            elif os.path.isdir(p):
                print("transfer dir:" + p)
                if self.is_m3u_only is True:
                    self.check_types(p)
                    self.rm_m3u(p)
                    self.get_m3u(p)
                else:
                    self.transfer_dir(p)
            else:
                print(p, "not exists!")
예제 #20
0
from turtle_graphics import TextMarker

# Create the game window and place it at a fixed position on the desktop.
screen = turtle.Screen()

width, height = screen.window_width(), screen.window_height()
canvas = screen.getcanvas()

# Tk geometry string: "<width>x<height>+<left>+<top>".
left, top = 100, 100
geom = '{}x{}+{}+{}'.format(width, height, left, top)
canvas.master.geometry(geom)

# Register the image as a turtle shape and use it for the default turtle.
screen.title("U.S. States Game |  0/50 ")
image = "blank_states_img.gif"
screen.addshape(image)
turtle.shape(image)
subtitle = Subtitle()
states = States()
states.new_game_memory()

# Main loop: refresh the screen, then ask the player for another state name.
game_is_on = True
while game_is_on:
    screen.update()
    sleep(0.1)
    # ADD TEXT TO SCREEN
    score = states.last_score()
    answer = screen.textinput(title=f"Guess a State | {score}/50 ",
                              prompt=" What's another state's name?\n")

    # BREAK OUT ON EXIT
    # NOTE(review): turtle's textinput returns None when the dialog is
    # cancelled, in which case answer.title() raises AttributeError.
    # NOTE(review): nothing visible here sets game_is_on to False or breaks
    # out of the loop after "Exit" -- confirm against the rest of the file.
    if answer.title() == "Exit":
        states.states_to_learn()
예제 #21
0
import sys
import os

from subtitle import Subtitle

# The dataset name is the first command-line argument. Fall back to an empty
# name when it is missing so that the usage check below can report the
# problem instead of this module failing with IndexError on import.
_DATASET = sys.argv[1] if len(sys.argv) > 1 else ''
SUB_PATH = "/media/data/mtriet/raw_video/%s/train" % _DATASET
FRAME_PATH = "/media/data/mtriet/dataset/scnn_%s_frames" % _DATASET

if __name__ == '__main__':
    # All three arguments are required: both pads are read unconditionally
    # below, so accepting only two arguments used to crash with IndexError
    # after the first output file had already been truncated.
    if len(sys.argv) != 4:
        print('command fb/bb front_pad rear_pad')
        sys.exit(0)
    front_pad, rear_pad = sys.argv[2], sys.argv[3]
    for frame_root, sub_folder, _ in os.walk(FRAME_PATH):
        for folder in sub_folder:
            print(folder)
            subtitles = Subtitle.load_subtitle(SUB_PATH, folder, False)

            # Write the padded subtitles next to the originals.
            with open("%s/%s_pad.aqt" % (SUB_PATH, folder), 'w') as f:
                for s in subtitles:
                    f.write(s.to_string(front_pad, rear_pad))
예제 #22
0
class TransferBak(object):
    """Scan a directory tree and keep only the English subtitle/text files.

    Workflow notes kept from the original author:
    # save srt to srt.bak
    # transfer all srt.bak files
    # mv wordch.srt to srt
    # create m3u from sub srt
    # create and cat word.txt
    """

    def __init__(self, logger=None, log_level=logging.INFO):
        """Set up logging, the Subtitle helper and the file-type tables.

        Args:
            logger: Logger to use. When None, one is created with
                create_log(), named after this class.
            log_level: Level handed to create_log() when no logger is given.
        """
        if logger is None:
            logger = create_log(log_name=type(self).__name__, level=log_level)
        self.logger = logger

        self.logger.info("\n-----------------")
        self.logger.info("Begin to init")
        self.logger.info("\n-----------------")

        self.sub = Subtitle(self.logger)
        self.sub.set_parse(True)
        # Maps a suffix key (e.g. 'srt', '.en.srt') to a set of (root, name).
        self.files = dict()
        self.filetypes = ['srt', 'bak', 'm3u', 'txt']
        self.vediotypes = ['mkv', 'mp4', 'avi']
        self.rmsrt = False

        if self.sub.lexicon_path is None:
            self.sub.set_lexicon_file("lexicon/lexicon.xlsx")

    def set_logger(self, logger):
        """Replace the logger used by this object."""
        self.logger = logger

    def show_files(self):
        """Print, for every suffix key, how many files were collected."""
        for k, v in self.files.items():
            print(k + ": {}".format(len(v)))

    def clear_no_en(self, path):
        """Index the files under ``path`` and drop the non-English ones.

        The tree is walked and every file is recorded in ``self.files`` under
        the first matching suffix from ``filetypes + vediotypes``; files
        ending in ".en.srt" are additionally recorded under ".en.srt".
        ``rm_not_en`` is only run when at least one ".en.srt" file was found.

        Args:
            path: Directory to process. A regular file or a missing path is
                reported on stdout and ignored.

        Returns:
            Always None.
        """
        if os.path.isfile(path):
            print("should be dir, don't input file:" + path)
            return None
        if not os.path.isdir(path):
            print(path, "not exists!")
            return None
        print("transfer dir:" + path)

        self.files = dict()
        known_suffixes = self.filetypes + self.vediotypes
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.endswith(".en.srt"):
                    self.files.setdefault(".en.srt", set()).add((root, file))
                for postfix in known_suffixes:
                    if file.endswith("." + postfix):
                        self.files.setdefault(postfix, set()).add((root, file))
                        break
        self.show_files()
        # Without any English subtitle there is nothing safe to delete.
        if not self.files.get(".en.srt"):
            return None
        self.rm_not_en(path)

    def rm_not_en(self, path):
        """Delete non-English .srt/.txt files and rename "*.en.srt" to "*.srt".

        Works on the entries previously collected in ``self.files``.

        Args:
            path: Unused; kept for interface compatibility.
        """
        srt_paths = [os.path.join(root, name)
                     for root, name in self.files.get('srt', ())]

        # Pass 1: delete every non-English subtitle. This must finish before
        # any rename: the set has no defined order, and renaming "x.en.srt"
        # onto "x.srt" first would let the stale "x.srt" entry delete the
        # English file that was just moved there.
        for p in srt_paths:
            if not p.endswith(".en.srt"):
                os.remove(p)
                print(p + " was removed!")

        # Pass 2: rename the English subtitles. Only the trailing suffix is
        # rewritten so a directory name containing ".en.srt" is left alone.
        for p in srt_paths:
            if p.endswith(".en.srt"):
                target = p[:-len(".en.srt")] + ".srt"
                print("move " + p + " to " + target)
                shutil.move(p, target)

        for root, name in self.files.get('txt', ()):
            p = os.path.join(root, name)
            if not p.endswith(".en.txt"):
                os.remove(p)
                print(p + " was removed!")
예제 #23
0
def main(argv=None, logger=None):
  """Command-line entry point: read options, run the Subtitle parser, report.

  Recognised options (see the getopt call below):
    -h/-?/--help, -v/--version   print usage / version and exit
    -c/--checkup                 set sub.checkup
    -p/--pickle FILE             lexicon file
    -f/--file FILE               add an input file
    -w/--word WORD               add a single word
    -t/--type word|scan          run type ('word' also dumps data at the end)
    -m/--limit N                 show words, limited to N
    -l, -W                       show lines / show words
    -D                           switch the logger to DEBUG
    -d/--dir                     not implemented, exits
  Remaining positional arguments are added as input files.

  Args:
    argv: Argument list for getopt, without the program name. Defaults to
      sys.argv[1:] when None.
    logger: Logger to use; when None one is created with createLog().
  """
  if logger is None:
    logger = createLog(logname="subtitle", level=logging.INFO)
  if argv is None:
    # getopt cannot parse None; fall back to the real command line.
    argv = sys.argv[1:]

  # The print calls use the parenthesised single-argument form so the
  # function is valid on both Python 2 and Python 3.
  startDtime = datetime.now()
  print("Start time: " + str(startDtime))
  print("")
  sub = Subtitle(logger)

  try:
    # Every long option needs its own list element: the former
    # '"checkup" "file="' was silently concatenated into "checkupfile=",
    # which disabled both --checkup and --file.
    opts, args = getopt.getopt(
      argv,
      "hvf:w:t:d:p:?lm:WDc",
      ["help", "version", "checkup", "file=", "word=", "type=", "dir=",
       "pickle=", "limit="])
    logger.info("opts:{0};args:{1}".format(opts, args))
  except getopt.GetoptError as msg:
    print("error happened when get options!!! error:{0}".format(msg))
    usage()
    logger.error("getopt.GetoptError:{0}, exit!".format(msg))
    sys.exit(2)
  except Exception as msg:
    logger.error("error:{0}, exit!".format(msg))
    sys.exit(2)

  _is_lines_show = False
  _is_words_show = False
  sub_type = ""
  words_limit = None
  for opt, arg in opts:
    if opt in ("-?", "-h", "--help"):
      usage()
      sys.exit()
    elif opt in ("-v", "--version"):
      version()
      sys.exit()
    elif opt in ("-c", "--checkup"):
      sub.checkup = True
    elif opt in ("-d", "--dir"):
      print("Sorry, -d --dir option still not offer")
      sys.exit()
    elif opt in ("-p", "--pickle"):
      sub.setLexiconFile(arg)
    elif opt in ('-f', "--file"):
      sub.addFile(arg)
    elif opt == '-D':
      logger.setLevel(logging.DEBUG)
      sub.setLogger(logger)
    elif opt in ("-w", "--word"):
      sub.addWord(arg)
    elif opt in ("-t", "--type"):
      sub_type = arg
      if sub_type not in ('word', 'scan'):
        usage()
        sys.exit()
    elif opt in ("-m", "--limit"):
      # A word limit implies that the words should be shown.
      words_limit = int(arg)
      _is_words_show = True
    elif opt == '-l':
      _is_lines_show = True
    elif opt == '-W':
      _is_words_show = True

  # Fall back to the default lexicon when -p/--pickle was not given.
  if sub.lexicon_path is None:
    sub.setLexiconFile("lexicon.pickle")
  sub.loadOldData()

  sub.addFiles(args)

  sub.parse()

  if _is_lines_show:
    sub.lines_show()

  if _is_words_show:
    sub.words_show(words_limit)
  sub.show()

  if sub_type == 'word':
    sub.dumpData()

  print("")
  endDtime = datetime.now()
  print("End time: " + str(endDtime))
  elapsed = endDtime - startDtime
  print("Cost time: " + str(elapsed))
예제 #24
0
    def parse(self, content):
        """Feed ``content`` to the analyser character by character and build subtitles.

        Every new token reported by ``self.analyser`` advances a small state
        machine: counter -> start timestamp -> arrow -> end timestamp -> text
        lines. A token whose value is a single newline closes the current
        block and appends a ``Subtitle`` to the result.

        Args:
            content: Indexable sequence of characters holding the SRT text.
                Empty content yields an empty list.

        Returns:
            The list of ``Subtitle`` objects built from completed blocks, or
            ``None`` when a token of an unexpected type is found.
        """
        # NOTE(review): self.state is not reset here, so parsing starts in
        # whatever state a previous call left behind - confirm this is wanted.
        # NOTE(review): a block is only appended when its closing newline
        # token arrives; a final block without one is not returned.
        subtitles = []
        number = None
        start_time = None
        end_time = None
        text = ""

        self.i = 0
        token_count = len(self.analyser.tokens)
        while self.i < len(content):
            # Read the current character inside the loop so that empty
            # content no longer raises IndexError before the bounds check.
            ch = content[self.i]
            move_cursor = self.analyser.read_char(ch)

            # Check whether the analyser produced a new token.
            if len(self.analyser.tokens) > token_count:
                token_count += 1

                new_token = self.analyser.tokens[-1]
                assert isinstance(new_token, SrtToken)
                # Dispatch on the current parser state.
                if self.state == SrtParser.start_state:
                    # A block must start with its counter.
                    if new_token.type != SrtToken.TYPE_COUNTER:
                        print("ERROR, TYPE COUNTER NEEDED BUT", new_token.type,
                              "FOUND")
                        return
                    print("COUNTER", new_token.value)
                    number = int(new_token.value)
                    # Move to the counter state.
                    self.state = SrtParser.counter_state

                elif self.state == SrtParser.counter_state:
                    if new_token.type != SrtToken.TYPE_TIMESTAMP:
                        print("ERROR, TYPE TIMESTAMP NEEDED BUT",
                              new_token.type, "FOUND")
                        return
                    print("START TIME", new_token.value)
                    start_time = new_token.value
                    # Move to the start-time state.
                    self.state = SrtParser.start_time_state

                elif self.state == SrtParser.start_time_state:
                    if new_token.type != SrtToken.TYPE_TIME_ARROW:
                        print("ERROR, TYPE ARROW NEEDED BUT", new_token.type,
                              "FOUND")
                        return
                    print(new_token.value)
                    # Move to the "-->" state.
                    self.state = SrtParser.arrow_state

                elif self.state == SrtParser.arrow_state:
                    if new_token.type != SrtToken.TYPE_TIMESTAMP:
                        print("ERROR, TYPE TIMESTAMP NEEDED BUT",
                              new_token.type, "FOUND")
                        return
                    print("END   TIME", new_token.value)
                    end_time = new_token.value
                    # Move to the end-time state.
                    self.state = SrtParser.end_time_state

                elif self.state == SrtParser.end_time_state or self.state == SrtParser.text_state:
                    # Any token other than a lone newline (a blank line) is
                    # subtitle text.
                    if new_token.value != "\n":
                        # Enter (or stay in) the text state.
                        self.state = SrtParser.text_state
                        print("TEXT:", new_token.value)
                        if not new_token.value.endswith("\n"):
                            new_token.value += "\n"
                        text += new_token.value
                    else:
                        # Blank line: go back to the start state.
                        self.state = SrtParser.start_state
                        print("------- END OF A BLOCK OF SUBTITLE -------")
                        # Build the Subtitle for the finished block.
                        subtitle = Subtitle(number, start_time, end_time, text)
                        subtitles.append(subtitle)

                        # Reset the per-block data.
                        number = None
                        start_time = end_time = None
                        text = ""

            # The analyser decides whether the cursor advances; when it does
            # not, the same character is offered again.
            if move_cursor:
                self.i += 1

        print("Tokens:")
        for token in self.analyser.tokens:
            print(token)
        return subtitles
예제 #25
0
	def subtitles(self) -> List[Path]:
		"""Wrap every entry of ``self.subtitles_path`` in a ``Subtitle``.

		Returns an empty list when listing the directory raises
		``FileNotFoundError``.
		"""
		collected = []
		try:
			for entry in self.subtitles_path.iterdir():
				collected.append(Subtitle(entry))
		except FileNotFoundError:
			return []
		return collected
예제 #26
0
class Sub_testCase(unittest.TestCase):
  '''unit test for Subtitle Class'''

  def __init__(self, *args, **kwargs):
    """Store the fixture paths shared by all tests."""
    unittest.TestCase.__init__(self, *args, **kwargs)
    self.pkl = "../data/test.pickle"
    self.fname = '../data/vocabulary/Vocabulary -juniorHighschool(chinese) .txt'

  def setUp(self):
    """Create a fresh Subtitle object for every test."""
    self.sub = Subtitle(loglevel=logging.DEBUG)

  def tearDown(self):
    """Remove the pickle file a test may have written."""
    if os.path.exists(self.pkl):
      os.remove(self.pkl)

  def sub_assert(self, sub, lex=None, stem_lex=None, words=None, stem_words=None, new_words=None):
    """Check the sizes of the collections held by ``sub``.

    ``lex`` and ``stem_lex`` are exact expected sizes; ``words``,
    ``stem_words`` and ``new_words`` are upper bounds. An argument left at
    None is not checked.
    """
    # Compare against None, not truthiness: an expected size of 0 is a real
    # expectation and used to be skipped silently by "if lex:".
    if lex is not None:
      self.assertEqual(len(sub.lexicon), lex)
    if stem_lex is not None:
      self.assertEqual(len(sub.stem_lexicon), stem_lex)
    if words is not None:
      self.assertLessEqual(len(sub.wordSet), words)
    if stem_words is not None:
      self.assertLessEqual(len(sub.stem_newWords), stem_words)
    if new_words is not None:
      self.assertLessEqual(len(sub.newWords), new_words)

  def test_addWord(self):
    """Words added one by one only show up in the counters after parse()."""
    self.sub.setLexiconFile(self.pkl)
    self.sub.addWord('eyes')
    self.assertIs(self.sub.newWords, None)
    self.sub_assert(self.sub, lex=0, stem_lex=0)

    self.sub.parse()
    self.assertIs(type(self.sub.newWords), Counter)
    self.sub_assert(self.sub, lex=0, stem_lex=0, words=1, stem_words=1, new_words=1)

    # Adding words without parsing must not change the counters yet.
    self.sub.addWords(['anymore', 'sold'])
    self.sub_assert(self.sub, lex=0, stem_lex=0, words=1, stem_words=1, new_words=1)
    self.sub.parse()
    self.sub_assert(self.sub, lex=0, stem_lex=0, words=2, stem_words=2, new_words=2)

  def test_dumpData(self):
    """Parse three vocabulary files and dump the result to the pickle."""
    self.sub.setLexiconFile(self.pkl)
    self.sub.addFile(self.fname)
    self.sub.addFile('../data/vocabulary/Vocabulary -highschool(chinese).txt')
    self.sub.addFile('../data/vocabulary/Vocabulary-cet-4 (chinese).txt')
    self.sub.parse()
    self.sub.dumpData()
    self.sub_assert(self.sub, lex=0, stem_lex=0, words=4156, stem_words=4156, new_words=4156)

  def test_html_scan(self):
    """Scan a web page against the lexicon produced by test_dumpData."""
    self.test_dumpData()

    sub_scan = Subtitle(loglevel=logging.DEBUG)
    sub_scan.setLexiconFile(self.pkl)
    sub_scan.loadOldData()
    sub_scan.addFile('https://selenium-python.readthedocs.org/en/latest/index.html')
    sub_scan.parse()
    self.sub_assert(sub_scan, lex=4156, stem_lex=3104, words=127, stem_words=71, new_words=32)

  def test_txt_scan(self):
    """Scan an .srt file against the lexicon produced by test_dumpData."""
    self.test_dumpData()

    sub_scan = Subtitle(loglevel=logging.DEBUG)
    sub_scan.setLexiconFile(self.pkl)
    sub_scan.loadOldData()
    sub_scan.addFile('../data/srt/Lord.of.War.eng.480p.SDHF-NORMTEAM.srt')
    sub_scan.parse()
    self.sub_assert(sub_scan, lex=4156, stem_lex=3104, words=1936, stem_words=943, new_words=518)

    # Character names must be recognised as names and kept out of the
    # new-word list. unittest assertions are used because plain assert
    # statements are stripped when Python runs with -O.
    names_mv = ['Yuri', 'Simeon']
    for n in names_mv:
      self.assertIn(n, sub_scan.nameSet)
    for n in names_mv:
      self.assertNotIn(n.lower(), sub_scan.newWords)

  def test_word(self):
    """Parse a single vocabulary file without loading an old lexicon."""
    self.sub.setLexiconFile(self.pkl)
    self.sub.addFile(self.fname)
    self.sub.parse()

    self.sub_assert(self.sub, lex=0, stem_lex=0, words=1599, stem_words=1599, new_words=1599)
예제 #27
0
	def new_ost_subtitle(self, lang:str=Constants.OST_LANG, filtr:Callable[[dict], bool]=lambda filtr : True) -> Optional[Subtitle]:
		"""Delegate to ``Subtitle.ost_new`` for this object's ``imdbid``.

		``lang`` and ``filtr`` are forwarded unchanged; the default ``filtr``
		accepts everything.
		"""
		created = Subtitle.ost_new(self.imdbid, lang, filtr)
		return created
import util

# Check the argument count before anything indexes sys.argv; previously the
# path constants raised IndexError before this usage line could be shown.
if len(sys.argv) < 3:
    print('fb pad=True/False')
    sys.exit(1)

FRAME_PATH = "/media/data/mtriet/dataset/scnn_%s_frames" % sys.argv[1]
SUB_PATH = "/media/data/mtriet/raw_video/%s/train" % (sys.argv[1])
WINDOW_SIZE = [16, 32, 64, 128, 256, 512]
OVERLAP_RATE = 0.75
CLASSES = [[], []]  # 0, 1

# Slide a window of each size over the frames of every folder and intersect
# it with the frame range of the current subtitle.
for window_size in WINDOW_SIZE:
    for frame_root, sub_folder, sub_files in os.walk(FRAME_PATH):
        for folder in sub_folder:
            # NOTE(review): sys.argv[2] is a string, so "False" is truthy if
            # load_subtitle() only tests truthiness - confirm against it.
            subtitles = Subtitle.load_subtitle(SUB_PATH, folder, sys.argv[2])
            frames = sorted(os.listdir(frame_root + '/' + folder))
            sub_index = 0
            # The step is the non-overlapping part of the window.
            for begin_pivot in range(
                    1,
                    len(frames) - window_size,
                    int(window_size *
                        (1 - OVERLAP_RATE))):  # ignore last few frames
                # Advance to the next subtitle once the window starts past
                # the end of the current one (never beyond the last one).
                if (begin_pivot > subtitles[sub_index].end) and (
                        sub_index < len(subtitles) - 1):
                    sub_index += 1

                end_pivot = min(begin_pivot + window_size, len(frames))
                segment = range(begin_pivot, begin_pivot + window_size + 1)
                sub_range = subtitles[sub_index].get_range()
                intersection = np.intersect1d(segment, sub_range)
예제 #29
0
                                 '-p',
                                 dest='path_folder',
                                 type=str,
                                 required=True,
                                 help='Path folder to find .str files')
        self.parser.add_argument('--overwrite_file',
                                 '--o',
                                 '-o',
                                 dest='overwrite_file',
                                 type=str,
                                 required=True,
                                 help='Overwrite the file')
        self.args = self.parser.parse_args()

    def get_params(self):
        """Return the parsed command-line arguments stored in ``self.args``."""
        parsed = self.args
        return parsed


if __name__ == '__main__':
    # Process every .srt file found under the folder given on the command line.
    Credits()
    params = Parameters().get_params()
    files = HandleFiles(params.path_folder, ".srt").find_files()
    for file in files:
        print(f'File: "{file}"')
        sub = Subtitle(file)
        try:
            print(params.overwrite_file)
            sub.run(params.overwrite_file)
        except Exception as exc:
            # Best effort: keep going with the remaining files, but report the
            # failure instead of hiding it. Unlike the former bare "except:",
            # this lets KeyboardInterrupt and SystemExit propagate.
            print(f'Failed to process "{file}": {exc}')