def spacer():
    """Build and return a ChatSpace instance with default settings."""
    instance = ChatSpace()
    return instance
def main():
    """Load the NSMC dataset, respace its review texts, and save to CSV.

    Side effects: writes 'spacing_nsmc_data.csv' to the working directory.
    """
    # BUG FIX: original bound the loaded frame to `data` but then used an
    # undefined name `df`, raising NameError — bind the result to `df`.
    df = load_nsmc_data()
    # test()
    spacer = ChatSpace()
    df['review'] = spacing(df['review'], spacer)
    df.to_csv('spacing_nsmc_data.csv')
def spacer():
    """Return a ChatSpace spacer constructed without the JIT model."""
    model = ChatSpace(from_jit=False)
    return model
from tqdm import tqdm import torch from metric import correct_sum from chatspace import ChatSpace spacer = ChatSpace() def evaluate(model, data_loader, metrics, device, tokenizer=None): if model.training: model.eval() summary = {metric: 0 for metric in metrics} num_correct_elms = 0 for step, mb in tqdm(enumerate(data_loader), desc='steps', total=len(data_loader)): enc_input, dec_input, dec_output = map(lambda elm: elm.to(device), mb) with torch.no_grad(): y_pred = model(enc_input, dec_input) if step % 1000 == 0: decoding_from_result(enc_input, y_pred, dec_output, tokenizer) y_pred = y_pred.reshape(-1, y_pred.size(-1)) dec_output = dec_output.view(-1).long() for metric in metrics: if metric is 'acc':
def spacer():
    """Return a non-JIT ChatSpace spacer that uses UTF-8 encoding."""
    chat_space = ChatSpace(from_jit=False, encoding="utf-8")
    return chat_space
def gSTT_file(file_path):
    # Run Google speech recognition over an audio file, voice segment by
    # segment; spell-check and respace each Korean transcript; append the
    # joined result (plus a trailing "0") to a per-file text under ./tmp_ASR.
    # NOTE(review): the source was flattened onto one line — the indentation
    # below is a best-effort reconstruction; confirm against the original.
    r = sr.Recognizer()
    sapcer = ChatSpace()  # NOTE(review): name is a typo of "spacer" in the original
    # NOTE(review): np.float is removed in NumPy >= 1.24 — presumably plain
    # `float` was meant; verify against the pinned NumPy version.
    sig, fs = librosa.load(file_path, mono=False, sr=16000, duration=None, dtype=np.float)
    #sig = np.mean(sig, 0)
    sig = sig[0, 9000:]  # first channel only, skipping the initial 9000 samples
    filename = file_path.split('/')[-1]
    # Write a short leading clip to a temp WAV to estimate ambient noise.
    sf.write(filename + 'tmp_noise_file.wav', sig[:int(fs * 0.7)], samplerate=16000, subtype='PCM_16')
    noise_sig = sr.AudioFile(filename + 'tmp_noise_file.wav')
    with noise_sig as source:
        r.adjust_for_ambient_noise(source, duration=0.2)
    os.remove(filename + 'tmp_noise_file.wav')
    print(file_path)
    #(vad, VoiceInterval) = stVAD(sig, fs, nFFT=512, win_length=0.032, hop_length=0.008)
    VoiceInterval = signalSegmentation(sig, fs)  # presumably (start, end) sample pairs — verify
    Out_file_name_txt = "{}/{}.txt".format(
        os.path.dirname(os.path.realpath(__file__)) + '/tmp_ASR', filename)
    # Start from a fresh output file for this input.
    if os.path.exists(Out_file_name_txt):
        os.remove(Out_file_name_txt)
    f = open(Out_file_name_txt, encoding="utf-8", mode="a")  # NOTE(review): never closed explicitly
    All_texts = []
    for seg_i in range(len(VoiceInterval)):
        strt_i = int(VoiceInterval[seg_i][0])
        end_i = int(VoiceInterval[seg_i][1])
        seg_sig = sig[strt_i:end_i, ]
        # Round-trip each voiced segment through a temp WAV for the recognizer.
        sf.write(filename + 'tmp_file.wav', seg_sig, samplerate=16000, subtype='PCM_16')
        seg_sig = sr.AudioFile(filename + 'tmp_file.wav')
        with seg_sig as source:
            audio = r.record(source)
        os.remove(filename + 'tmp_file.wav')
        try:
            ASR_result = r.recognize_google(audio, language="ko-KR", show_all=True)
        except:  # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit
            ASR_result = None
        if not ASR_result:
            ASR_result = ' '  # NOTE(review): sentinel is never appended below — confirm intent
        else:
            ASR_result_text = ASR_result['alternative'][0]['transcript']
            ASR_result_text = spell_checker.check(ASR_result_text)
            ASR_result_text = ASR_result_text.as_dict()['checked']
            ASR_result_text = (' '.join(str(x) for x in ASR_result_text))
            # NOTE(review): these repeated replaces most likely targeted distinct
            # Unicode whitespace characters that were normalized to ASCII spaces
            # when this file was flattened — confirm against the original bytes.
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = ASR_result_text.replace(" ", "")
            ASR_result_text = sapcer.space(ASR_result_text, batch_size=1)
            All_texts.append(ASR_result_text)  # NOTE(review): nesting inside `else` reconstructed — verify
    One_text = (' '.join(str(x) for x in All_texts)) + ' ' + str(0)
    print(One_text, file=f)
def __init__(self, tokenizer='tc'):
    """Set up the spacer, morphological taggers and a word-level tokenizer.

    tokenizer: which tagger backend to prefer downstream (default 'tc').
    """
    self.tokenizer = tokenizer
    self.spacer = ChatSpace()
    self.mecab = Mecab()
    self.tc_tagger = Tagger()
    # Split on runs of word characters.
    self.retokenize = RegexpTokenizer("[\w]+")
def spacer():
    """Construct a ChatSpace spacer that decodes with UTF-8."""
    chat_space = ChatSpace(encoding="utf-8")
    return chat_space
def spacer():
    """Return a ChatSpace instance loaded from its JIT-compiled model."""
    jit_spacer = ChatSpace(from_jit=True)
    return jit_spacer
import torch
import numpy as np
from chatspace import ChatSpace
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from konlpy.tag import Okt

okt = Okt()


def tokenizer_morphs(doc):
    """Tokenize a document into morphemes using Okt."""
    return okt.morphs(doc)


# FIX: the original built a default (CPU) ChatSpace and immediately rebound
# the name to a CUDA instance, loading the model twice and discarding the
# first — construct the spacer once, directly on the GPU device.
spacer = ChatSpace(device=torch.device('cuda:0'))

# Sentiment class index -> label name.
i2senti = {
    0: 'joy',
    1: 'interest',
    2: 'anger',
    3: 'admiration',
    4: 'sadness',
    5: 'surprise',
    6: 'fear',
    7: 'disgust',
}
def spacer(request):
    """Fixture-style factory: build a ChatSpace on the device named by the request param."""
    return ChatSpace(device=torch.device(request.param))
def spacer(self, comment, batch_size=1):
    """Respace `comment` with ChatSpace using the instance's custom vocabulary."""
    engine = ChatSpace()
    spaced = engine.space(comment, batch_size=batch_size, custom_vocab=self.user_word)
    return spaced
def __init__(self):
    """Create and hold the ChatSpace instance used for spacing."""
    self.inst = ChatSpace()
def spacer(text):
    """Return `text` with its spacing corrected by a fresh ChatSpace instance."""
    return ChatSpace().space(text)
def spacefix(self, x):
    """Respace the texts in `x`, then split each result on single spaces.

    Returns a list of token lists, one per input text.
    """
    engine = ChatSpace()
    respaced = engine.space(x, batch_size=64)
    tokens = pd.Series(respaced).str.split(' ').tolist()
    return tokens