# Script: build the index-matrix input from the "audio_stem_join" variable and
# store it as "inputs/audio_stem".
import dataop as dop

stem_text = dop.load_var("audio_stem_join")

# Only the first element of the stored dictionary object is used.
# 34910 -- note left by the original author; presumably the dictionary size
# (TODO confirm).
stem_lookup = dop.load_var("dicts/audio_stem_all_dict")[0]

dop.save_var(
    dop.create_indexMatrix(stem_text, stem_lookup, 5000),
    "inputs/audio_stem",
)

# Disabled variant kept from the original script (same steps for "audio_join"):
#a = dop.load_var("audio_join")
#d = dop.load_var("dicts/audio_alternatives_dict")
#d = d[0]
#inp = dop.create_indexMatrix(a,d,5)
#dop.save_var(inp,"inputs/audio")
# Script: collapse every entry of "audio_alternatives" into a single string
# with all digit characters removed, and store the list as "audio_join".
import re

import dataop as dop

# Raw string on purpose: "\d" inside a plain string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
_DIGIT = re.compile(r"\d")

audio = dop.load_var("audio_alternatives")

# Every alternative is prefixed with one space before concatenation, so the
# text of a non-empty slide starts with a space and an empty slide gives "".
# str.join replaces the repeated `+=` concatenation of the original loop.
new = [
    _DIGIT.sub("", "".join(" " + alt for alt in slide))
    for slide in audio
]

dop.save_var(new, "audio_join")
# Script: one-hot encode the "label" variable, shuffle it together with
# "inputs/audio_stem" using one random permutation, and save a test / train
# split. The permutation itself is saved as "rp".
from math import floor

import numpy as np

import dataop as dop

inp = np.array(dop.load_var("inputs/audio_stem"))
labels = np.array(dop.load_var("label"))

# Both arrays are reordered with the same permutation below. If there were
# fewer inputs than labels, rows would be dropped silently and the split
# point would no longer correspond to the data, so fail loudly instead.
if len(inp) != len(labels):
    raise ValueError(
        "inputs/audio_stem has %d rows but label has %d entries"
        % (len(inp), len(labels))
    )

num_classes = max(labels)

# Size of the test split: one tenth of the samples, rounded down.
p = floor(len(labels) / 10)

# One-hot encoding: label k sets column k - 1, i.e. labels are treated as
# 1-based. Assumes "label" is a flat sequence of integers.
out = np.zeros([len(labels), num_classes])
out[np.arange(len(labels)), labels - 1] = 1

# Shuffle inputs and targets with the same permutation so rows stay aligned.
rp = np.random.permutation(len(inp))
inp = inp[rp]
out = out[rp]

dop.save_var(rp.tolist(), "rp")
# The first p shuffled rows form the test split, the remainder the train split.
dop.save_var(inp[:p].tolist(), "inputs/audio_stem_tst")
dop.save_var(out[:p].tolist(), "inputs/audio_stem_tst_l")
dop.save_var(inp[p:].tolist(), "inputs/audio_stem_tr")
dop.save_var(out[p:].tolist(), "inputs/audio_stem_tr_l")
import dataop as dop import numpy as np d = dop.load_var("dicts/audio_stem_all_dict") a = np.array(list(d[2].values())) occ2num = dict() for u in set(a): occ2num[str(u)] = int(np.sum(a >= u)) dop.save_var(occ2num, "occ2num_audio_stem") I = dop.tfidf( dop.create_freq_input(dop.load_var("audio_stem_all"), d[0], size=occ2num[str(15)])) I, _ = dop.normalize_lin(I, []) I = I[:, np.sum(I != 0, 0) > 0] dop.save_np(I, "inputs/audio_stem_tfidf_norm") #---ocr--- d = dop.load_var("dicts/ocr_3_dict") a = np.array(list(d[2].values())) occ2num = dict() for u in set(a): occ2num[str(u)] = int(np.sum(a >= u)) dop.save_var(occ2num, "occ2num_ocr") I = dop.tfidf( dop.create_freq_input(dop.load_var("audio_stem_all"), d[0],
# Script: build and store index-matrix inputs for three text variables.
import dataop as dop


def _export_index_matrix(text_name, dict_name, out_name, *args, **kwargs):
    """Load a text variable, run it through edit_text, index it, and save it.

    Only the first element of the object stored under ``dict_name`` is handed
    to ``dop.create_indexMatrix``; any extra positional or keyword arguments
    are forwarded to that call unchanged.
    """
    text = dop.edit_text(dop.load_var(text_name))
    lookup = dop.load_var(dict_name)[0]
    matrix = dop.create_indexMatrix(text, lookup, *args, **kwargs)
    dop.save_var(matrix, out_name)


#--audio_stem--
_export_index_matrix(
    "audio_stem_all", "dicts/audio_stem_all_dict", "inputs/audio_stem", 50000
)

#--ocr--
_export_index_matrix(
    "ocr", "dicts/ocr_1_dict", "inputs/ocr_char", char_level=True
)

#--audio_all--
_export_index_matrix("audio_join", "dicts/audio_join_dict", "inputs/audio_all")