Exemplo n.º 1
0
def force_utf8_and_filter_duplicates(subtitles):
    """Convert every subtitle's content and drop near-duplicate entries.

    Each entry of ``subtitles`` is a dict.  Its ``'content'`` is passed
    through ``guess_locale_and_convert`` and the entry is updated in place:
    ``'lang'`` and ``'content'`` are overwritten with the second and third
    items of the returned triple.

    Two entries count as duplicates when they have the same ``'extension'``
    and ``'lang'`` and their contents score above 0.9.  The earliest entry
    of each duplicate group is the one that survives.

    Returns:
        A new list holding the surviving entries in their original order.
    """
    # Imported once per call rather than once per compared pair.
    import difflib

    log_debug('Trying to filter duplicated subtitles...')

    for s in subtitles:
        _, s['lang'], s['content'] = guess_locale_and_convert(s['content'])

    dup_tag = [False] * len(subtitles)
    for i, sa in enumerate(subtitles):
        if dup_tag[i]:
            continue
        for j in range(i + 1, len(subtitles)):
            if dup_tag[j]:
                # Already marked by an earlier entry; nothing left to decide.
                continue
            sb = subtitles[j]
            if sa['extension'] != sb['extension'] or sa['lang'] != sb['lang']:
                continue
            # NOTE(review): real_quick_ratio() is only an upper bound on the
            # true similarity ratio, so this check can flag entries that are
            # not actual duplicates.  Kept as-is to preserve behaviour.
            similarity = difflib.SequenceMatcher(
                None, sa['content'], sb['content']).real_quick_ratio()
            log_debug('Similarity is {0}.'.format(similarity))
            if similarity > 0.9:
                dup_tag[j] = True
    # TODO: reserve longer subtitles
    subtitles = [s for s, is_dup in zip(subtitles, dup_tag) if not is_dup]
    log_debug('{0} subtitle(s) reserved after duplicates filtering.'.format(
        len(subtitles)))
    # Hand the filtered list back; rebinding the local name alone would
    # leave the caller with the unfiltered list.
    return subtitles
Exemplo n.º 2
0
 def parse_local_subtitles(self):
     """Collect the subtitle sources found in the mplayer probe output.

     Reads ``self.__raw_info['mplayer']`` and stores the result in
     ``self.__info['subtitle']`` (a ``defaultdict(bool)``):

     * ``'embed'``    -- language values of the embedded subtitle tracks;
     * ``'external'`` -- the external subtitle file names;
     * ``'vobsub'``   -- ``True`` when a VobSub id is present.

     Side effects: every external subtitle file whose detected encoding is
     neither ``'utf_8'`` nor ``'ascii'`` is rewritten in place with the
     converted content, and the matching mplayer arguments are registered
     through ``self.add_arg``.
     """
     info = self.__info
     raw = self.__raw_info['mplayer']

     info['subtitle'] = defaultdict(bool)
     if raw['ID_SUBTITLE_ID']:
         # TODO: extract subtitles and combine to a bi-lingual sub
         # ffmpeg -i Seinfeld.2x01.The_Ex-Girlfriend.xvid-TLF.mkv -vn -an -scodec srt sub.srt
         info['subtitle']['embed'] = []
         for i in raw['ID_SUBTITLE_ID']:
             info['subtitle']['embed'] += raw['ID_SID_{0}_LANG'.format(i)]
     if raw['ID_FILE_SUB_ID']:
         info['subtitle']['external'] = raw['ID_FILE_SUB_FILENAME']
         log_debug('Converting the external subtitles to UTF-8...')
         from charset import guess_locale_and_convert
         for subfile in raw['ID_FILE_SUB_FILENAME']:
             # open in binary mode because we don't know the encoding
             with open(subfile, 'r+b') as f:
                 s = f.read()
                 enc, _, s = guess_locale_and_convert(s)
                 if enc not in ('utf_8', 'ascii'):
                     f.seek(0)
                     f.write(s)
                     # The converted data can be shorter than the original
                     # bytes; cut the file here so no stale tail remains.
                     f.truncate()
         self.add_arg('-subcp utf8')
     if raw['ID_VOBSUB_ID']:
         info['subtitle']['vobsub'] = True
         unrar = which('unrar')
         if unrar:
             self.add_arg('-unrarexec {0}'.format(unrar))
Exemplo n.º 3
0
    def parse_local_subtitles(self):
        """Fill ``self.__info['subtitle']`` from the raw mplayer information.

        The raw data comes from ``self.__raw_info['mplayer']``.  Three kinds
        of subtitle source are handled, each only when its id entry is
        truthy:

        * embedded tracks: their ``ID_SID_<id>_LANG`` values are accumulated
          under ``'embed'``;
        * external files: the file names are stored under ``'external'``,
          each file not detected as ``'utf_8'``/``'ascii'`` is overwritten
          with its converted content, and ``-subcp utf8`` is added to the
          arguments;
        * VobSub: ``'vobsub'`` is set to ``True`` and, when ``which`` finds
          an ``unrar`` executable, ``-unrarexec <path>`` is added.
        """
        info = self.__info
        raw = self.__raw_info['mplayer']

        info['subtitle'] = defaultdict(bool)
        if raw['ID_SUBTITLE_ID']:
            # TODO: extract subtitles and combine to a bi-lingual sub
            # ffmpeg -i Seinfeld.2x01.The_Ex-Girlfriend.xvid-TLF.mkv -vn -an -scodec srt sub.srt
            info['subtitle']['embed'] = []
            for i in raw['ID_SUBTITLE_ID']:
                info['subtitle']['embed'] += raw['ID_SID_{0}_LANG'.format(i)]
        if raw['ID_FILE_SUB_ID']:
            info['subtitle']['external'] = raw['ID_FILE_SUB_FILENAME']
            log_debug('Converting the external subtitles to UTF-8...')
            from charset import guess_locale_and_convert
            for subfile in raw['ID_FILE_SUB_FILENAME']:
                # open in binary mode because we don't know the encoding
                with open(subfile, 'r+b') as f:
                    s = f.read()
                    enc, _, s = guess_locale_and_convert(s)
                    if enc not in ('utf_8', 'ascii'):
                        f.seek(0)
                        f.write(s)
                        # Without truncating, a conversion that shrinks the
                        # data would leave old bytes after the new content.
                        f.truncate()
            self.add_arg('-subcp utf8')
        if raw['ID_VOBSUB_ID']:
            info['subtitle']['vobsub'] = True
            unrar = which('unrar')
            if unrar:
                self.add_arg('-unrarexec {0}'.format(unrar))
Exemplo n.º 4
0
def force_utf8_and_filter_duplicates(subtitles):
    """Re-encode subtitle contents, then filter out likely duplicates.

    For every dict in ``subtitles`` the ``'content'`` value is handed to
    ``guess_locale_and_convert``; the second and third items of its result
    replace the dict's ``'lang'`` and ``'content'`` (the dicts are mutated
    in place).

    A later entry is discarded when an earlier, still-kept entry has the
    same ``'extension'`` and ``'lang'`` and the similarity score of the two
    contents is above 0.9.

    Returns:
        A new list of the entries that were kept, in input order.
    """
    # One import per call instead of one per compared pair.
    import difflib

    log_debug('Trying to filter duplicated subtitles...')

    for s in subtitles:
        _, s['lang'], s['content'] = guess_locale_and_convert(s['content'])

    total = len(subtitles)
    dup_tag = [False] * total
    for i, sa in enumerate(subtitles):
        if dup_tag[i]:
            continue
        for j in range(i + 1, total):
            if dup_tag[j]:
                # Already tagged through an earlier entry; skip the compare.
                continue
            sb = subtitles[j]
            if sa['extension'] != sb['extension'] or sa['lang'] != sb['lang']:
                continue
            # NOTE(review): real_quick_ratio() only gives an upper bound on
            # the real ratio, so unrelated contents may exceed the
            # threshold.  Left unchanged to preserve behaviour.
            matcher = difflib.SequenceMatcher(None, sa['content'], sb['content'])
            similarity = matcher.real_quick_ratio()
            log_debug('Similarity is {0}.'.format(similarity))
            if similarity > 0.9:
                dup_tag[j] = True
    # TODO: reserve longer subtitles
    subtitles = [s for s, is_dup in zip(subtitles, dup_tag) if not is_dup]
    log_debug('{0} subtitle(s) reserved after duplicates filtering.'.format(len(subtitles)))
    # Return the filtered list; otherwise the filtering result is lost when
    # the local name goes out of scope.
    return subtitles