def EstimateChord(idx, dnnmodel, todir=False):
    """Estimate chord label sequences for the HCQT files selected by ``idx``.

    Runs the trained CNN feature extractor followed by the NBLSTM-CRF
    decoder on each selected HCQT file and writes the decoded labels to
    ``.lab`` files.

    Parameters
    ----------
    idx : array-like
        Index (or boolean mask) selecting files from the HCQT directory.
    dnnmodel : str
        Path of the trained CNN feature-extractor model to load.
    todir : bool
        If True, write results under ``const.PATH_ESTIMATE_CROSS/<album>/``
        using the source file name; otherwise write sequentially numbered
        files under ``const.PATH_ESTIMATE``.
    """
    dnn = networks.FullCNNFeatExtractor()
    dnn.load(dnnmodel)
    dnn.to_gpu(0)
    decoder = networks.NBLSTMCRF()
    decoder.load()
    decoder.to_gpu(0)
    cqtfilelist = np.array(find_files(const.PATH_HCQT, ext="npy"))[idx]

    # Inference only: disable training mode and gradient bookkeeping.
    chainer.config.train = False
    chainer.config.enable_backprop = False

    # enumerate() replaces the original hand-maintained `i` counter.
    for i, cqtfile in enumerate(cqtfilelist):
        # Embed() stacks +/-1 context frames around each spectrogram frame.
        cqt = utils.Embed(utils.PreprocessSpec(np.load(cqtfile)), 1)
        chroma = dnn.GetFeature(cp.asarray(cqt)).data
        path = decoder.argmax(chroma)
        feat = cp.asnumpy(chroma)
        if todir:
            # NOTE(review): path handling assumes "/" separators (POSIX).
            fname = cqtfile.split("/")[-1] + ".lab"
            alb = cqtfile.split("/")[-2]
            utils.SaveEstimatedLabelsFramewise(
                path, const.PATH_ESTIMATE_CROSS + alb + "/" + fname, feat)
        else:
            utils.SaveEstimatedLabelsFramewise(
                path, const.PATH_ESTIMATE + "%03d.lab" % i, feat)
def TrainConvnetExtractor(trainidx, epoch=20, saveas="convnet.model"): cqtfilelist = np.array(find_files(const.PATH_MIDIHCQT, ext="npz"))[trainidx] #midifilelist = find_files(const.PATH_MIDI,ext="mid")[:filecnt] config.train = True config.enable_backprop = True convnet = networks.FullCNNFeatExtractor() model = networks.ConvnetPredictor(convnet) model.to_gpu(0) opt = optimizers.AdaDelta() opt.setup(model) print("train set length: %d" % trainidx.size) print("start epochs...") S = [] T = [] for cqtfile in cqtfilelist: dat = np.load(cqtfile) spec = utils.PreprocessSpec(dat["spec"])[:const.CQT_H, :, :] targ = GetConvnetTargetFromPianoroll(dat["target"]).astype(np.int32) assert (spec.shape[1] == targ.shape[0]) S.append(spec) T.append(targ) S = np.concatenate(S, axis=1) T = np.concatenate(T, axis=0) for ep in range(epoch): sum_loss = 0 assert (S.shape[1] == T.shape[0]) randidx = np.random.randint(0, S.shape[1] - const.CONV_TRAIN_SEQLEN - 1, S.shape[1] // const.CONV_TRAIN_SEQLEN * 4) for i in range(0, randidx.size - const.CONV_TRAIN_BATCH, const.CONV_TRAIN_BATCH): x_batch = np.stack([ S[:, randidx[j]:randidx[j] + const.CONV_TRAIN_SEQLEN, :] for j in range(i, i + const.CONV_TRAIN_BATCH) ]) t_batch = np.stack([ T[randidx[j]:randidx[j] + const.CONV_TRAIN_SEQLEN, :] for j in range(i, i + const.CONV_TRAIN_BATCH) ]) x_in = cp.asarray(x_batch) t_in = cp.asarray(t_batch) model.cleargrads() loss = model(x_in, t_in) loss.backward() opt.update() sum_loss += loss.data convnet.save(saveas) print("epoch: %d/%d loss:%.04f" % (ep + 1, epoch, sum_loss / const.CONV_TRAIN_BATCH)) convnet.save(saveas)
type=str, default="nblstm_crf.model", action="store")
args = parser.parse_args()

# Every audio file under this directory gets a chord estimate.
audio_list = find_files("Datas/audios_estimation")

for audiofile in audio_list:
    # NOTE(review): assumes "/" path separators (POSIX).
    fname = audiofile.split("/")[-1]
    print("Processing: %s" % fname)
    #load audio
    y,sr = load(audiofile,sr=C.SR)
    #extract Harmonic-CQT from audio
    fmin = note_to_hz("C1")
    # One CQT per harmonic h, with fmin scaled by (h+1); stacked to
    # shape (C.CQT_H, frames, bins).
    hcqt = np.stack([np.abs(cqt(y,sr=C.SR,hop_length=C.H,n_bins=C.BIN_CNT,bins_per_octave=C.OCT_BIN,fmin=fmin*(h+1),filter_scale=2,tuning=None)).T.astype(np.float32) for h in range(C.CQT_H)])
    #extract feature using trained CNN extractor
    cnn_feat_extractor = N.FullCNNFeatExtractor()
    cnn_feat_extractor.load(args.f)
    feat = cnn_feat_extractor.GetFeature(U.PreprocessSpec(hcqt)).data
    #decode label sequence
    decoder = N.NBLSTMCRF()
    decoder.load(args.d)
    labels = decoder.argmax(feat)
    #convert into .lab file
    labfile = os.path.join("Datas/labs_estimated",fname+".lab")
    U.SaveEstimatedLabelsFramewise(labels,labfile,feat)
def TrainNStepCRF(idx, epoch=20, augment=0, featmodel=const.DEFAULT_CONVNETFILE,
                  path_blstm="blstm.model", savefile="nblstm_crf.model"):
    """Train the NBLSTM-CRF chord decoder on precomputed CNN features.

    Parameters
    ----------
    idx : array-like or None
        Indices selecting the training subset of HCQT/label file pairs;
        None uses every file.
    epoch : int
        Number of training epochs.
    augment : int
        If > 0, each snippet is pitch-shifted by a random amount in
        ``[0, augment)`` via ``shift_data``.
    featmodel : str
        Path of the trained CNN feature-extractor model.
    path_blstm : str
        Path of pretrained BLSTM weights loaded into the decoder.
    savefile : str
        Destination path for the trained decoder weights.
    """
    cqtfilelist = np.array(find_files(const.PATH_HCQT, ext="npy"))
    chordlablist = np.array(
        find_files(const.PATH_CHORDLAB, ext=["lab", "chords"]))
    if idx is not None:
        cqtfilelist = cqtfilelist[idx]
        chordlablist = chordlablist[idx]

    # Feature extraction runs in pure inference mode.
    chainer.config.train = False
    chainer.config.enable_backprop = False
    dnn = networks.FullCNNFeatExtractor()
    dnn.load(featmodel)
    dnn.to_gpu(0)
    rnn = networks.NBLSTMCRF()
    rnn.blstm.load(path_blstm)
    rnn.to_gpu(0)
    opt = optimizers.MomentumSGD()
    opt.setup(rnn)
    #opt.add_hook(optimizer.WeightDecay(0.001))

    # Precompute CNN features and frame-wise labels per song, truncating
    # both to their common length.
    X = []
    T = []
    for cqtfile, labfile in zip(cqtfilelist, chordlablist):
        cqt = utils.Embed(utils.PreprocessSpec(np.load(cqtfile)), 1)
        feature = cp.asnumpy(dnn.GetFeature(cp.asarray(cqt)).data)
        lab = utils.LoadLabelArr(labfile)
        min_sz = min([feature.shape[0], lab.shape[0]])
        X.append(feature[:min_sz, :])
        T.append(lab[:min_sz])
    sizes = np.array([x.shape[0] for x in X], dtype="int32")

    print("start epoch:")
    # Decoder training: gradients on.
    # NOTE(review): config.train stays False here — presumably intentional
    # (keeps dropout off while training the CRF); confirm.
    chainer.config.train = False
    chainer.config.enable_backprop = True
    # (removed unused `last_loss = np.inf` from the original)
    for ep in range(epoch):
        sum_loss = 0.0
        # Random song ids; oversamples the data ~8x per epoch.
        rand_songid = np.random.randint(
            len(X), size=np.sum(sizes) // const.DECODER_TRAIN_SEQLEN * 8)
        for i in range(0, rand_songid.size, const.DECODER_TRAIN_BATCH):
            xbatch = []
            tbatch = []
            for songid in rand_songid[i:i + const.DECODER_TRAIN_BATCH]:
                seq_len = sizes[songid]
                # Random snippet start (renamed from `idx`, which shadowed
                # the function parameter).
                offset = np.random.randint(
                    seq_len - const.DECODER_TRAIN_SEQLEN - 1)
                x_snip = X[songid][offset:offset + const.DECODER_TRAIN_SEQLEN, :]
                t_snip = T[songid][offset:offset + const.DECODER_TRAIN_SEQLEN]
                if augment > 0:
                    shift = np.random.randint(augment)
                    x_snip, t_snip = shift_data(x_snip, t_snip, shift)
                xbatch.append(Variable(cp.asarray(x_snip)))
                tbatch.append(Variable(cp.asarray(t_snip)))
            rnn.cleargrads()
            # opt.update(loss_func, *args) computes rnn's loss and steps.
            opt.update(rnn, xbatch, tbatch)
            sum_loss += rnn.loss.data
        # NOTE(review): 12800.0 is a hard-coded normalizer for the report;
        # presumably matches the expected snippet count — confirm.
        print("epoch %d/%d loss=%.3f" % (ep + 1, epoch, sum_loss / 12800.0))
        # Checkpoint after every epoch.
        rnn.save(savefile)
cqt(wav, sr=C.SR, hop_length=C.H, n_bins=144,
    bins_per_octave=24, filter_scale=2,
    tuning=None)).T.astype(np.float32)), size=1)
#dat = np.load("/media/wuyiming/TOSHIBA EXT/midihcqt_12/000005.npy")
#dat_24 = np.load("/media/wuyiming/TOSHIBA EXT/midihcqt_24/000005.npz")
#spec_dnn = U.Embed(U.PreprocessSpec(dat_24["spec"]),size=7)

# Keep only the first 250 frames for the comparison below.
spec = spec[:, :250, :]
spec_dnn = spec_dnn[:250, :]

# Two trained feature extractors to compare: full CNN vs. plain DNN.
cnn = networks.FullCNNFeatExtractor()
cnn.load("fullcnn_crossentropy_6000.model")
deepchroma = networks.FeatureDNN()
deepchroma.load(
    "/home/wuyiming/Projects/TranscriptionChordRecognition/dnn3500.model")

# Columns 12:24 of the feature are taken as chroma — presumably one of
# several 12-dim blocks in the feature vector; confirm against networks.py.
chroma_cnn = cnn.GetFeature(spec).data[:, 12:24].T
chroma_dnn = deepchroma.GetFeature(spec_dnn).data[:, 12:24].T

# Baseline: log-compressed chroma straight from the CQT.
chroma = np.log(
    1 + chroma_cqt(wav, sr=C.SR, hop_length=C.H, bins_per_octave=24)[:, :250])

# Ground-truth target derived from the synchronized MIDI pianoroll.
target = chromatemplate.GetConvnetTargetFromPianoroll(
    U.GetPianoroll(
        "/media/wuyiming/TOSHIBA EXT/AIST.RWC-MDB-P-2001.SMF_SYNC/RM-P051.SMF_SYNC.MID"
    ))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun  6 13:13:37 2018

@author: wuyiming
"""

import networks as N
from librosa.core import cqt, load, note_to_hz
import const as C
import numpy as np

# Trained CNN feature extractor used downstream on the HCQT.
cnn = N.FullCNNFeatExtractor()
cnn.load("fullcnn_crossentropy_6000.model")

y, sr = load("audio.wav", sr=C.SR)

# Lowest CQT fundamental; harmonic h uses fmin * (h + 1).
# (Fixed the duplicated assignment: was `fmin = fmin = note_to_hz("C1")`.)
fmin = note_to_hz("C1")

# Harmonic CQT: one magnitude CQT per harmonic, stacked to shape
# (C.CQT_H, frames, C.BIN_CNT).
spec = np.stack([
    np.abs(
        cqt(y, sr=C.SR, hop_length=C.H, n_bins=C.BIN_CNT,
            bins_per_octave=C.OCT_BIN, fmin=fmin * (h + 1),
            filter_scale=2, tuning=None)).T.astype(np.float32)
    for h in range(C.CQT_H)
])