# --- Adversarial-image preprocessing script fragment ---
# NOTE(review): relies on names defined elsewhere in this file:
# csv_path, img_dir, channels, config, img_size, means, std, and the
# project models Encoder / Decoder — TODO confirm they are in scope.

# Load the image index: one row per image with its ground-truth label.
data = pd.read_csv(csv_path)
paths = [os.path.join(img_dir, p) for p in data["ImageId"].values]
labels = data["TrueLabel"].values

# Build the autoencoder and restore its weights.  Checkpoints are loaded
# onto the CPU first, then the models are moved to the best device.
encoder = Encoder(channels, out_ch=2048)
decoder = Decoder(2048, channels)
encoder.load_state_dict(torch.load(config["encoder"], map_location="cpu"))
decoder.load_state_dict(torch.load(config["decoder"], map_location="cpu"))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)
encoder.eval()
decoder.eval()

x_adv = []

# The resize/normalize pipeline is identical for every image, so build it
# once here instead of re-creating it on each loop iteration.
norm = Compose([
    Resize(img_size, img_size, always_apply=True),
    Normalize(mean=means, std=std, always_apply=True),
])

with torch.no_grad():
    bar = tqdm.tqdm(paths)
    for path in bar:
        filename = os.path.basename(path)
        # The original f-string carried no placeholder, leaving `filename`
        # unused — interpolate it so the progress bar names the file.
        bar.set_description(f"processing:{filename}")
        # OpenCV reads BGR; convert to RGB so the normalization statistics
        # (means/std) are applied to the channels they were computed for.
        image = cv2.imread(path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        h, w = image.shape[:2]
        norm_data = norm(image=image)
        # NOTE(review): the visible chunk ends here; norm_data, h, w,
        # labels and x_adv are presumably consumed by code below — confirm.
def inference(checkpoint_file, text):
    """Synthesize speech for `text` from models stored in `checkpoint_file`.

    Builds the encoder / attention-decoder / postnet stack described by the
    module-level hyper-parameters `hp`, restores their weights from the
    checkpoint, autoregressively decodes a mel spectrogram, maps it to a
    linear spectrogram with the postnet, and writes the reconstructed audio
    to ``demo.wav``.

    Args:
        checkpoint_file: path to a torch checkpoint containing 'encoder',
            'decoder' and 'postnet' state dicts.
        text: input text to synthesize.
    """
    ds = tiny_words(max_text_length=hp.max_text_length,
                    max_audio_length=hp.max_audio_length,
                    max_dataset_size=args.data_size)
    print(ds.texts)  # debug output: dataset texts

    # --- prepare input: text -> padded index tensor of shape (1, T_text) ---
    indexes = indexes_from_text(ds.lang, text)
    indexes.append(EOT_token)
    padded_indexes = pad_indexes(indexes, hp.max_text_length, PAD_token)
    texts_v = Variable(torch.from_numpy(padded_indexes))
    texts_v = texts_v.unsqueeze(0)
    if hp.use_cuda:
        texts_v = texts_v.cuda()

    # --- build the model stack (architecture comes entirely from hp) ---
    encoder = Encoder(ds.lang.num_chars, hp.embedding_dim,
                      hp.encoder_bank_k, hp.encoder_bank_ck,
                      hp.encoder_proj_dims, hp.encoder_highway_layers,
                      hp.encoder_highway_units, hp.encoder_gru_units,
                      dropout=hp.dropout, use_cuda=hp.use_cuda)
    decoder = AttnDecoder(hp.max_text_length, hp.attn_gru_hidden_size,
                          hp.n_mels, hp.rf, hp.decoder_gru_hidden_size,
                          hp.decoder_gru_layers, dropout=hp.dropout,
                          use_cuda=hp.use_cuda)
    postnet = PostNet(hp.n_mels, 1 + hp.n_fft // 2,
                      hp.post_bank_k, hp.post_bank_ck,
                      hp.post_proj_dims, hp.post_highway_layers,
                      hp.post_highway_units, hp.post_gru_units,
                      use_cuda=hp.use_cuda)
    # Inference only: disable dropout / switch to eval-mode statistics.
    encoder.eval()
    decoder.eval()
    postnet.eval()
    if hp.use_cuda:
        encoder.cuda()
        decoder.cuda()
        postnet.cuda()

    # Load weights onto the CPU first so a GPU-saved checkpoint still loads
    # on a CPU-only machine; load_state_dict copies into the (possibly CUDA)
    # model parameters regardless.
    checkpoint = torch.load(checkpoint_file, map_location="cpu")
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])
    postnet.load_state_dict(checkpoint['postnet'])

    # No gradients are needed for synthesis; without this the autoregressive
    # loop below would accumulate autograd history for every decoder step.
    with torch.no_grad():
        encoder_out = encoder(texts_v)

        # All-zero <GO> frame seeds the autoregressive decoder.
        GO_frame = np.zeros((1, hp.n_mels))
        decoder_in = Variable(torch.from_numpy(GO_frame).float())
        if hp.use_cuda:
            decoder_in = decoder_in.cuda()

        h, hs = decoder.init_hiddens(1)
        decoder_outs = []
        for t in range(int(hp.max_audio_length / hp.rf)):
            decoder_out, h, hs, _ = decoder(decoder_in, h, hs, encoder_out)
            decoder_outs.append(decoder_out)
            # Feed the last predicted frame back in as the next input.
            decoder_in = decoder_out[:, -1, :].contiguous()
        # (batch_size, T, n_mels)
        decoder_outs = torch.cat(decoder_outs, 1)

        # Postnet maps the mel prediction to a linear-frequency spectrogram.
        post_out = postnet(decoder_outs)
    s = post_out[0].cpu().data.numpy()

    print("Reconstructing wav...")
    # Magnitudes cannot be negative; clamp before the power-law sharpening.
    s = np.where(s < 0, 0, s)
    wav = spectrogram2wav(s**hp.power)
    write("demo.wav", hp.sr, wav)