Exemplo n.º 1
0
def spacer():
    """Build and return a ChatSpace instance with default settings."""
    instance = ChatSpace()
    return instance
Exemplo n.º 2
0
def main():
    """Load the NSMC reviews, fix their word spacing, and save to CSV.

    Bug fix: the original bound the loaded frame to ``data`` but then
    used an undefined name ``df``, which raised NameError at runtime.
    """
    df = load_nsmc_data()
    spacer = ChatSpace()
    # Re-space every review with the shared ChatSpace model.
    df['review'] = spacing(df['review'], spacer)
    df.to_csv('spacing_nsmc_data.csv')
Exemplo n.º 3
0
def spacer():
    """Return a ChatSpace instance built without the JIT-compiled model."""
    model = ChatSpace(from_jit=False)
    return model
from tqdm import tqdm
import torch
from metric import correct_sum
from chatspace import ChatSpace

# Module-level spacing model, shared by the evaluation code below.
spacer = ChatSpace()


def evaluate(model, data_loader, metrics, device, tokenizer=None):
    """Run *model* over *data_loader* without gradients and accumulate metrics.

    NOTE(review): this block is truncated in the source — the body of the
    ``for metric in metrics`` loop is cut off after the ``if`` header, so
    the metric accumulation and return value are not visible here.
    """
    # Force inference mode (disables dropout / batch-norm updates).
    if model.training:
        model.eval()

    summary = {metric: 0 for metric in metrics}
    num_correct_elms = 0

    for step, mb in tqdm(enumerate(data_loader),
                         desc='steps',
                         total=len(data_loader)):
        # Each mini-batch is (encoder input, decoder input, decoder target).
        enc_input, dec_input, dec_output = map(lambda elm: elm.to(device), mb)

        with torch.no_grad():
            y_pred = model(enc_input, dec_input)

            # Periodically print a decoded sample for manual inspection.
            if step % 1000 == 0:
                decoding_from_result(enc_input, y_pred, dec_output, tokenizer)

            # Flatten predictions/targets for token-level metric computation.
            y_pred = y_pred.reshape(-1, y_pred.size(-1))
            dec_output = dec_output.view(-1).long()

            for metric in metrics:
                # NOTE(review): `is` tests object identity, not equality —
                # this should almost certainly be `metric == 'acc'`.
                if metric is 'acc':
Exemplo n.º 5
0
def spacer():
    """Create a ChatSpace model, skipping the JIT-compiled variant."""
    return ChatSpace(from_jit=False)
Exemplo n.º 6
0
def gSTT_file(file_path):
    """Transcribe a Korean audio file with Google STT and write the text to disk.

    Loads the file at 16 kHz, calibrates the recognizer against a short
    ambient-noise sample, splits the signal into voiced segments, sends each
    segment to Google speech recognition, spell-checks and re-spaces the
    result, and appends the joined transcript to ``tmp_ASR/<filename>.txt``.

    Fixes over the original: ``np.float`` (removed in NumPy >= 1.24) replaced
    with ``float``; the output file is now closed via ``with``; the bare
    ``except:`` is narrowed; ten redundant chained ``.replace()`` calls are
    collapsed into one; the ``sapcer`` typo is corrected.
    """
    recognizer = sr.Recognizer()
    spacer = ChatSpace()
    sig, fs = librosa.load(file_path,
                           mono=False,
                           sr=16000,
                           duration=None,
                           dtype=float)  # np.float alias was removed in NumPy 1.24
    # Keep the first channel and drop the initial 9000 samples.
    sig = sig[0, 9000:]
    filename = file_path.split('/')[-1]

    # Use the first 0.7 s of audio as an ambient-noise profile.
    sf.write(filename + 'tmp_noise_file.wav',
             sig[:int(fs * 0.7)],
             samplerate=16000,
             subtype='PCM_16')
    noise_sig = sr.AudioFile(filename + 'tmp_noise_file.wav')
    with noise_sig as source:
        recognizer.adjust_for_ambient_noise(source, duration=0.2)
    os.remove(filename + 'tmp_noise_file.wav')

    print(file_path)
    VoiceInterval = signalSegmentation(sig, fs)

    Out_file_name_txt = "{}/{}.txt".format(
        os.path.dirname(os.path.realpath(__file__)) + '/tmp_ASR', filename)
    if os.path.exists(Out_file_name_txt):
        os.remove(Out_file_name_txt)

    All_texts = []
    # `with` guarantees the output file is flushed and closed
    # (the original left the handle open).
    with open(Out_file_name_txt, encoding="utf-8", mode="a") as f:
        for seg_i in range(len(VoiceInterval)):
            strt_i = int(VoiceInterval[seg_i][0])
            end_i = int(VoiceInterval[seg_i][1])
            seg_sig = sig[strt_i:end_i, ]

            sf.write(filename + 'tmp_file.wav',
                     seg_sig,
                     samplerate=16000,
                     subtype='PCM_16')
            seg_audio = sr.AudioFile(filename + 'tmp_file.wav')
            with seg_audio as source:
                audio = recognizer.record(source)
            os.remove(filename + 'tmp_file.wav')

            try:
                ASR_result = recognizer.recognize_google(audio,
                                                         language="ko-KR",
                                                         show_all=True)
            except Exception:  # narrowed from bare except; keep best-effort
                ASR_result = None

            if not ASR_result:
                # Empty / failed recognition: segment contributes nothing.
                ASR_result = ' '
            else:
                text = ASR_result['alternative'][0]['transcript']

                # Spell-check the raw transcript.
                text = spell_checker.check(text)
                text = text.as_dict()['checked']

                text = ' '.join(str(x) for x in text)
                # Strip every space; the original chained ten .replace()
                # calls, but removing single spaces already removes all runs.
                text = text.replace(" ", "")

                # Re-insert word spacing with ChatSpace.
                text = spacer.space(text, batch_size=1)
                All_texts.append(text)

        One_text = (' '.join(str(x) for x in All_texts)) + ' ' + str(0)
        print(One_text, file=f)
Exemplo n.º 7
0
 def __init__(self, tokenizer='tc'):
     """Set up the spacing model, morphological taggers, and tokenizer choice."""
     self.spacer = ChatSpace()
     self.mecab = Mecab()
     self.tc_tagger = Tagger()
     self.retokenize = RegexpTokenizer("[\w]+")
     self.tokenizer = tokenizer
Exemplo n.º 8
0
def spacer():
    """Create a ChatSpace model that reads its vocabulary as UTF-8."""
    instance = ChatSpace(encoding="utf-8")
    return instance
Exemplo n.º 9
0
def spacer():
    """Build a ChatSpace model from the JIT-compiled variant."""
    model = ChatSpace(from_jit=True)
    return model
Exemplo n.º 10
0
import torch
import numpy as np
from chatspace import ChatSpace

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from konlpy.tag import Okt


okt = Okt()


def tokenizer_morphs(doc):
    """Tokenize *doc* into morphemes using the shared Okt analyzer."""
    return okt.morphs(doc)


# Spacing model pinned to the first CUDA device. The original built a
# default (CPU) instance and immediately overwrote it — a dead store that
# loaded the model twice; the redundant first assignment is removed.
spacer = ChatSpace(device=torch.device('cuda:0'))

# Class-index -> emotion-label mapping for the sentiment classifier.
i2senti = {
    0: 'joy',
    1: 'interest',
    2: 'anger',
    3: 'admiration',
    4: 'sadness',
    5: 'surprise',
    6: 'fear',
    7: 'disgust'
}

Exemplo n.º 11
0
def spacer(request):
    """Fixture-style factory: build ChatSpace on the device named by the param."""
    target = torch.device(request.param)
    return ChatSpace(device=target)
 def spacer(self, comment, batch_size=1):
     """Space-correct *comment* with a fresh ChatSpace model.

     Custom vocabulary entries from ``self.user_word`` are honored.
     """
     model = ChatSpace()
     return model.space(comment,
                        batch_size=batch_size,
                        custom_vocab=self.user_word)
Exemplo n.º 13
0
 def __init__(self):
     """Create and hold the wrapped ChatSpace instance."""
     self.inst = ChatSpace()
Exemplo n.º 14
0
def spacer(text):
    """Return *text* with corrected word spacing via ChatSpace."""
    return ChatSpace().space(text)
Exemplo n.º 15
0
    def spacefix(self, x):
        """Space-correct the texts in *x*, then split each into a word list."""
        model = ChatSpace()
        spaced = model.space(x, batch_size=64)
        return pd.Series(spaced).str.split(' ').tolist()