예제 #1
0
def make_indexing(sminames, output_name, movie_dir, verbose = False):
    """Build a word-level index over one or more SMI subtitle files.

    For every English subtitle sentence, tokenize it and emit one row per
    token, recording the sentence, its timing, and its source file.  The
    result is saved as a CSV in ``movie_dir`` and also returned.

    Parameters
    ----------
    sminames : str or list of str
        SMI file name(s), relative to ``movie_dir``.  A single string is
        accepted and treated as a one-element list.
    output_name : str
        Base name for the output CSV (path built via get_save_path).
    movie_dir : str
        Directory containing the SMI files.
    verbose : bool, optional
        Unused in this function; kept for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        One row per (token, subtitle) occurrence.

    Raises
    ------
    NameError
        If a file has no detectable English subtitle signal.
    """
    # Accept a single file name for convenience.
    # (was: type(sminames) == type('subtitle.smi') — use isinstance instead)
    if isinstance(sminames, str):
        sminames = [sminames]

    indexing_result = []

    for sminame in sminames:
        smi = smipy.Smi(os.path.join(movie_dir, sminame))
        subtitles = smi.subtitles
        print('Processing {}'.format(sminame))

        # 'T'/'F' flag recording whether subtitle timing sync was verified.
        sinc = 'T' if smi.sinc_verification else 'F'

        if smi.eng_signal is None:
            raise NameError('The english subtitle signal is not detected in {}'.format(sminame))
        if smi.kor_signal is None:
            warnings.warn('The korean subtitle signal is not detected')

        # File name without its extension identifies the video.
        video_code = ''.join(sminame.split('.')[:-1])

        for ind, sub in enumerate(subtitles):
            eng_sentence = sub['eng']
            start_time = sub['start']
            end_time = sub['end']

            if eng_sentence is None:
                continue

            token_list = to_tokens(eng_sentence)
            # Hoisted out of the per-word loop: invariant per sentence.
            tokens_joined = ' '.join(token_list)

            for word in token_list:
                indexing_result.append(dict(
                    der_word=word, subtitle_ind=ind, eng_sent=eng_sentence,
                    video_code=video_code,
                    start=start_time, end=end_time, tokens=tokens_joined,
                    movie_dir=movie_dir, sinc=sinc
                ))

    df = pd.DataFrame(indexing_result)
    df.index.name = 'index'

    ####    Saving results
    df_path = get_save_path(dir=movie_dir, name=output_name, format = '.csv')
    df.to_csv(df_path, encoding=CSV_ENCODING)
    return df
예제 #2
0
def make_subtitle(movie_name, directory, overlap=1):
    """Stitch per-clip transcripts into a single .smi subtitle file.

    Finds clips named '<movie_name>_*.mp4' and their '*.txt' JSON
    transcripts in ``directory``, computes each clip's absolute time
    window from the clip durations (consecutive clips overlap by
    ``overlap`` seconds), and writes '<movie_name>.smi' into ``directory``.

    Parameters
    ----------
    movie_name : str
        Common prefix of the clip/transcript files (before the '_').
    directory : str
        Folder containing the clips and transcripts.
    overlap : int or float, optional
        Seconds of overlap between consecutive clips (default 1).
    """
    format_m = '.mp4'
    print('making subtitle {}'.format(movie_name))

    prefix = movie_name + '_'
    all_names = [n for n in os.listdir(directory) if n.startswith(prefix)]
    clip_names = [n for n in all_names if n.endswith(format_m)]
    text_names = [n for n in all_names if n.endswith('.txt')]
    clip_names.sort()
    # BUGFIX: os.listdir order is arbitrary — the transcripts must be sorted
    # the same way as the clips so text_names[ind] lines up with times[ind].
    text_names.sort()

    durations = []
    for cn in clip_names:
        clip = VideoFileClip(os.path.join(directory, cn))
        durations.append(clip.duration)
        # BUGFIX: release the underlying ffmpeg reader; previously every
        # clip's handle was leaked.
        clip.close()

    # Absolute (start, end) in milliseconds for each clip.  Each clip after
    # the first starts `overlap` seconds before its own content is new, so
    # the accumulated overlap is subtracted.
    times = []
    for ind, dur in enumerate(durations):
        start_time = round(sum(durations[:ind]) - (ind) * overlap, 2) + overlap
        end_time = start_time + dur - overlap
        # -2 ms keeps consecutive entries from touching exactly.
        times.append((int(start_time * 1000), int(end_time * 1000) - 2))

    subs = []
    for ind, tn in enumerate(text_names):
        # BUGFIX: close the transcript file (was a leaked open() handle).
        with open(os.path.join(directory, tn), 'r') as f:
            txt = json.loads(f.read())['text']
        s, e = times[ind]
        subs.append(dict(start=s, end=e, eng=txt, kor=''))

    smi = smipy.Smi()
    smi.from_sentences(subs)

    smitext = smi.export()
    smi_path = os.path.join(directory, movie_name.split('.')[0] + '.smi')
    print('saving subtitle {}'.format(smi_path))
    with open(smi_path, 'w') as f:
        f.write(smitext)
예제 #3
0
import os
import smipy
import naverapi
from os.path import join as osj
import numpy as np

# Batch-translate the English lines of every .smi file in ./friends to
# Korean via the Naver Papago API, `bundle` subtitles per request.

threshold_ms = 20000  # NOTE(review): unused in the visible portion of this script

directory = './friends'
smi_names = [n for n in os.listdir(directory) if n[-4:] == '.smi']

bundle = 10  # subtitles sent per translation request

for n in smi_names:
    print(n)
    smi = smipy.Smi(osj(directory, n))
    smi.kor_signal = 'KRCC'  # class tag used for the Korean track in the SMI
    sub_len = len(smi.subtitles)
    bundle_len = int(np.ceil(sub_len / bundle))  # number of batches, last may be short

    for i_bundle in range(bundle_len):
        start = i_bundle * bundle
        end = start + bundle  # slicing past the end is safe in Python
        st = smi.subtitles[start:end]
        st = [s['eng'] for s in st]
        engtext = '\n'.join(st)
        tr = naverapi.papago_translate(engtext)
        if tr is not None:
            # NOTE(review): splits on the two-character sequence backslash-n
            # ('\\n'), not a real newline — presumably the API returns
            # escaped newlines; verify against papago_translate's output.
            tr_list = tr.split('\\n')
            print(len(st), len(tr_list))
            # NOTE(review): the loop body is missing — this snippet appears
            # truncated; presumably it writes each `kor` back onto
            # smi.subtitles[start + ind]. Confirm against the original file.
            for ind, kor in enumerate(tr_list):
예제 #4
0
    # Fragment of a larger function/script (enclosing header not visible;
    # `directory` comes from the outer scope).  Pairs each English .smi with
    # its Korean counterpart ('<base>_k.smi') and dumps both to CSV.
    engsmis = [n for n in os.listdir(directory) if '_k' not in n and n[-4:] == '.smi']

    # NOTE(review): kornames is only referenced by the commented-out code
    # below; the active loop reconstructs each Korean name from engname.
    kornames = [n for n in os.listdir(directory) if '_k' in n and n[-4:] == '.smi']
    # for n in engsmis:
    #     print(n)
    #     text = open(os.path.join(directory,n), 'r', encoding='utf-8').read()
    #     open(n, 'w').write(text.encode('cp949', errors = 'replace').decode('cp949'))
    # for n in kornames:
    #     print(n)
    #     text = open(os.path.join(directory,n), 'r', encoding = 'euc-kr').read()
    #     open(n, 'w').write(text.encode('cp949', errors = 'replace').decode('cp949'))

    print(engsmis)
    for engname in engsmis:
        # Korean sibling: '<base>_k.smi'.  Assumes it exists on disk —
        # smipy.Smi will presumably fail otherwise; TODO confirm.
        korname = engname.split('.')[0] + '_k' + '.smi'
        smieng = smipy.Smi(os.path.join(directory, engname))
        smikor = smipy.Smi(os.path.join(directory, korname))

        smieng.to_csv(engname, dest=directory)
        smikor.to_csv(korname, dest = directory)

        # engsubtitles = []

        # for sub in smieng.subtitles:
        #     line = sub['kor'] + sub['eng']
        #     line = line.strip()
        #     engsubtitles.append(dict(start=sub['start'], end=sub['end'], kor='', eng=line))
        #
        # allsent = []
        # allsent.extend(engsubtitles)
        # allsent.extend(smikor.subtitles)
예제 #5
0
def make_word_list(originals, original_to_derivative,original_to_meaning, smi_df_lists, output_name,
                   before_window = 20000, after_window = 10000, output_dir ='./'):
    """Build a draft vocabulary book from word-index data.

    For every derivative form of every original word, collect each
    occurrence row from the index DataFrames (as produced by
    make_indexing), attach the neighboring subtitle lines within
    before_window/after_window milliseconds plus the word's meaning, and
    save the result and an occurrence-count summary as CSVs.

    Parameters
    ----------
    originals : iterable of str
        Base (dictionary-form) words.
    original_to_derivative : dict
        Maps each original word to a list of derivative forms.
    original_to_meaning : dict
        Maps each original word to its meaning text.
    smi_df_lists : DataFrame, str, or list of these
        Index data, or CSV file name(s) relative to output_dir.
    output_name : str
        Base name for the two output CSVs ('<name>' and '<name>_abs').
    before_window, after_window : int, optional
        Milliseconds of subtitle context before/after each occurrence.
    output_dir : str, optional
        Directory for reading input CSVs and writing output CSVs.
    """
    Author = 'YB'  # NOTE(review): unused local; kept as-is
    #####################################
    #
    #   Build a vocabulary-book draft from the word list
    #
    #####################################

    #### indexing of word bag
    ori_to_index = {}
    for index, original in enumerate(originals):
        ori_to_index[original] = index

    der_to_index = {}

    # Every derivative shares its original word's index.
    for index, original in enumerate(originals):
        ders = original_to_derivative[original]
        for der in ders:
            der_to_index[der] = index

    # Flat list of all derivatives, plus reverse map derivative -> original.
    derivative_to_ori = {}
    derivatives = []

    for k, v in original_to_derivative.items():
        for der in v:
            derivatives.append(der)
            derivative_to_ori[der] = k

    # Per-original occurrence counters: overall, and Big Bang Theory only
    # (video codes starting with 'BB').
    ori_counter = {}
    for ori in originals:
        ori_counter[ori] = 0

    ori_counter_bigbang = {}
    for ori in originals:
        ori_counter_bigbang[ori] = 0

    # Wrap a single DataFrame/path into a list
    if not type(smi_df_lists) == type([]):
        smi_df_lists = [smi_df_lists]

    # #df structure (columns produced by make_indexing):
    # indexing_result.append(dict(
    #     der_word=word, subtitle_ind=ind, eng_sent=eng_sentence,
    #     video_code=''.join(sminame.split('.')[:-1]),
    #     start=start_time, end=end_time, tokens=' '.join(token_list),
    #     movie_dir=movie_dir, sinc=smi.sinc_verification
    # ))

    res_list = []
    for df in smi_df_lists:
        # Path mode: df is a CSV file name (with or without '.csv')
        if type(df) == type('path.csv'):
            if df[-4:] == '.csv':
                df = df[:-4]
            df = pd.read_csv(os.path.join(output_dir, df + '.csv'), encoding = CSV_ENCODING)


        for derivative in derivatives:
            # All occurrences of this exact derivative form.
            df_sel = df[df['der_word'] == derivative]
            # indexing_result.append(dict(
            #     der_word=word, subtitle_ind=ind, eng_sent=eng_sentence,
            #     video_code=''.join(sminame.split('.')[:-1]),
            #     start=start_time, end=end_time, tokens=' '.join(token_list),
            #     movie_dir=movie_dir, sinc=smi.sinc_verification
            # ))

            for _i in range(len(df_sel)):
                row = df_sel.iloc[_i]
                d = dict(row)

                ind = der_to_index[derivative]
                ori = derivative_to_ori[derivative]
                d['word_ind'] = der_to_index[derivative]
                d['ori_word'] = derivative_to_ori[derivative]
                # NOTE(review): clip_index starts at 10 — magic offset;
                # presumably reserved for downstream clip-code numbering.
                d['clip_index'] = ori_counter[ori] + 10
                ori_counter[ori] +=1

                if d['video_code'][:2] == 'BB':
                    ori_counter_bigbang[ori] +=1

                # pick neighbors of sentences
                smipath = os.path.join(d['movie_dir'], d['video_code'] + '.smi')
                smi = smipy.Smi(smipath)
                neighbor_start = d['start'] - before_window
                neighbor_end = d['end'] + after_window
                cut_list, cut_ind = smi.slice(start_time=neighbor_start, end_time=neighbor_end)
                # NOTE(review): bare except — silently treats any failure
                # (not just a missing subtitle_ind) as "no neighbors".
                try:
                    _senti = cut_ind.index(d['subtitle_ind'])
                    before_list = cut_list[:_senti]
                    after_list = cut_list[_senti + 1:]
                except:
                    before_list = []
                    after_list = []

                before_text = '\n'.join([sent['eng'] for sent in before_list])
                after_text = '\n'.join([sent['eng'] for sent in after_list])
                _before_kr = '\n'.join([sent['kor'] for sent in before_list])
                _after_kr = '\n'.join([sent['kor'] for sent in after_list])

                d['_before_no'] = len(before_list)
                d['_after_no'] = len(after_list)

                d['before_text'] = before_text
                d['after_text'] = after_text

                d['_before_kor'] = _before_kr
                d['_after_kor'] = _after_kr

                d['word_meaning'] = original_to_meaning[ori]

                # Manual verification flag, to be flipped by a human later.
                d['verify'] = 'F'

                # NOTE(review): cap keyed on the Big Bang counter only, yet
                # it drops rows from ANY show once that counter exceeds 10
                # — confirm this is intended.
                if ori_counter_bigbang[ori] > 10:
                    pass
                else:
                    res_list.append(d)
        # NOTE(review): rebuilt on every outer iteration from the
        # cumulative res_list; only the final build is saved below.
        df = pd.DataFrame.from_dict(res_list)

    #abstract result: per-original occurrence counts
    abstract_result = []
    for ind, ori in enumerate(originals):
        no_occur = ori_counter[ori]
        abstract_result.append(dict(ori_word = ori, occurance = no_occur))
    df_abs = pd.DataFrame.from_dict(abstract_result)

    #Saving df

    out_path = get_save_path(dir = output_dir, name = output_name, format = '.csv')
    out_path_abs = get_save_path(dir=output_dir, name=output_name + '_abs', format='.csv')

    df.to_csv(out_path, encoding= CSV_ENCODING)
    df_abs.to_csv(out_path_abs, encoding = CSV_ENCODING)
예제 #6
0
def make_clip(words_path, title, out_dir = './clips', pad = 2000, encoding = 'utf-8'):
    """Cut a subtitled video clip for every row of a word-list CSV.

    For each word occurrence, works out the clip window from the
    occurrence's neighboring subtitle lines, exports a matching .smi and
    an .mp4 subclip (cut from the source .mkv) into out_dir, and writes a
    final per-clip CSV named after ``title``.

    Parameters
    ----------
    words_path : str
        Path to the word-list CSV produced by make_word_list.
    title : str
        Base name for the output CSV.
    out_dir : str, optional
        Destination directory for clips, .smi files, and the CSV.
    pad : int, optional
        Milliseconds of padding added before/after each clip window.
    encoding : str, optional
        Encoding tried first when reading the CSV.
    """
    ##########################################
    #
    #   clip
    #   TODO: reorganize the variable ordering / indexing below!!
    #
    ##########################################
    # NOTE(review): bare except — falls back to the project default
    # encoding on ANY read failure, not just a codec error.
    try:
        worddf = pd.read_csv(words_path, encoding = encoding)
    except:
        worddf = pd.read_csv(words_path, encoding = CSV_ENCODING)

    clip_result_list = []
    for i in range(len(worddf)):

        row = dict(worddf.iloc[i])

        ori = row['ori_word']
        der = row['der_word']
        clip_index = row['clip_index']
        start_time = row['start'] - pad
        # NOTE(review): only 'end' is unwrapped with .item() (numpy scalar
        # -> Python int); 'start' is used directly — verify both types.
        end_time = row['end'].item() + pad
        word_meaning = row['word_meaning']
        movie_dir = row['movie_dir']
        # First two chars of video_code identify the series, the rest the
        # season/episode (e.g. 'BB501' -> Big Bang, season 5, ep 01).
        video_code = row['video_code']

        if video_code[:2] == 'BB' :
            video_name = 'Big Bang Theory'
        elif video_code[:2] == 'SI':
            video_name = 'Silicon Valley'
        elif video_code[:2] == 'FR':
            video_name = 'Friends'
        else:
            video_name = ''

        video_name = video_name + ' ' + 'Season {} Ep{}'.format(video_code[2], video_code[3:])



        eng_sent = row['eng_sent']
        word_ind = row['word_ind']

        # Position (last match) of the derivative word within the sentence;
        # -1 when not found.
        word_loc = -1
        for _i, _w in enumerate(eng_sent.split(' ')):
            if der in ''.join(to_tokens(_w)):
                word_loc = _i

        # 7-digit clip code derived from word index and clip index.
        clip_code = '{}'.format(11000000 + clip_index + int(row['word_ind']) * 1000)
        clip_code = clip_code[-7:]

        smipath = os.path.join(row['movie_dir'], row['video_code'] + '.smi')
        smi = smipy.Smi(smipath)


        # Determine movie start/end time from the before/after context lines.
        # The str-type test doubles as a NaN check: pandas reads empty CSV
        # cells as float NaN, not ''.
        if type(row['before_text']) == type('sometext'):
            before_no = len(row['before_text'].strip().split('\n'))
        else:
            before_no = 0

        if type(row['after_text']) == type('sometext'):
            after_no = len(row['after_text'].strip().split('\n'))
        else:
            after_no = 0


        sub_index = row['subtitle_ind']
        kor_sent = smi.subtitles[sub_index]['kor']

        # Clip window spans the context lines, padded on both sides.
        sub_slice = smi.subtitles[sub_index - before_no:sub_index + 1 + after_no]
        clip_start = sub_slice[0]['start'] - pad
        clip_end = sub_slice[-1]['end'] + pad

        # Occurrence timing relative to the clip start.
        sent_start = row['start'] - clip_start
        sent_end = row['end'] - clip_start

        # for debugging, lines of neighbors
        # NOTE(review): when after_no == 0, sub_slice[-after_no:] is the
        # whole slice, not empty — harmless here since these two variables
        # are unused below, but wrong if ever revived.
        before_list = sub_slice[:before_no]
        after_list = sub_slice[-after_no:]

        whole_eng = '\n'.join([sent['eng'] for sent in sub_slice])
        whole_kor = '\n'.join([sent['kor'] for sent in sub_slice])
        # before_text = '\n'.join([sent['eng'] for sent in before_list])
        # after_text = '\n'.join([sent['eng'] for sent in after_list])
        # _before_kr = '\n'.join([sent['kor'] for sent in before_list])
        # _after_kr = '\n'.join([sent['kor'] for sent in after_list])

        # export smi
        cliptxt = smi.export(clip_start, clip_end, slice_manual=sub_slice)
        with open(os.path.join(out_dir, clip_code + '.smi'), 'w') as f:
            f.write(cliptxt)

        # export sliced video (source is assumed to be a .mkv next to the
        # .smi).  NOTE(review): bare except skips the row on ANY failure.
        try:
            ffmpeg_extract_subclip(filename = os.path.join(movie_dir, video_code + '.mkv'),
                               t1 =clip_start / 1000, t2 =clip_end / 1000,
                               targetname=os.path.join(out_dir, clip_code + '.mp4'))
        except:
            print('error in {}'.format(video_code))
            continue
        # export final db for app
        d = dict(word_ind = word_ind, ori_word = ori, clip_code = clip_code,
                 eng_sent =eng_sent, kor_sent = kor_sent,
                 sent_start = sent_start, sent_end = sent_end, word_loc = word_loc, line_loc = before_no, whole_eng = whole_eng,
                 whole_kor = whole_kor, word_meaning = word_meaning, video_name = video_name,
                 _v_s = clip_start, _v_e = clip_end)
        clip_result_list.append(d)
    df = pd.DataFrame.from_dict(clip_result_list)
    df.to_csv(get_save_path(dir = out_dir, name = title, format = '.csv'), encoding = CSV_ENCODING)
예제 #7
0
import re
import os
import codecs

import smipy

# Smoke test: parse a local SMI file with debug output enabled, then
# export it to CSV under the same base name.
parsed = smipy.Smi('./testbb501.smi', debug=True)
parsed.to_csv(title='testbb501')