embed1 = Embedding(inpt, n_symbols, embed1_dim, random_state)
inp_proj, inpgate_proj = GRUFork([embed1], [embed1_dim], h1_dim, random_state)


def step(inp_t, inpgate_t, h1_tm1, h2_tm1, h3_tm1):
    # 3-layer stacked GRU: each layer's output is forked into the
    # input and gate projections of the layer above
    h1 = GRU(inp_t, inpgate_t, h1_tm1, h1_dim, h1_dim, random_state)
    h1_t, h1gate_t = GRUFork([h1], [h1_dim], h2_dim, random_state)
    h2 = GRU(h1_t, h1gate_t, h2_tm1, h2_dim, h2_dim, random_state)
    h2_t, h2gate_t = GRUFork([h2], [h2_dim], h3_dim, random_state)
    h3 = GRU(h2_t, h2gate_t, h3_tm1, h3_dim, h3_dim, random_state)
    return h1, h2, h3

h1, h2, h3 = scan(step, [inp_proj, inpgate_proj], [init_h1, init_h2, init_h3])
# last-timestep states, used to carry the hidden state across minibatches
final_h1, final_h2, final_h3 = [ni(h1, -1), ni(h2, -1), ni(h3, -1)]

pred = Linear([h3], [h3_dim], out_dim, random_state)
cost = tf.reduce_mean(categorical_crossentropy(softmax(pred), target))
# cost in bits rather than nats: scale by log2(e) ~= 1.4427
# cost = cost * 1.44269504089

params = tf.trainable_variables()
print_network(params)
grads = tf.gradients(cost, params)
grads = [tf.clip_by_value(grad, -grad_clip, grad_clip) for grad in grads]
opt = tf.train.AdamOptimizer(learning_rate)
updates = opt.apply_gradients(zip(grads, params))


def _loop(itr, sess, inits=None, do_updates=True):
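# For context, a loose sketch of the two helpers this model leans on. This
# is an assumption about their behavior, not the library's actual
# implementation: `scan` unrolls a step function over time-major
# (time, batch, features) sequences via tf.scan, and `ni` slices out a
# single timestep.
def scan_sketch(step, sequences, outputs_info):
    def wrap(prev_states, inp_slices):
        out = step(*(list(inp_slices) + list(prev_states)))
        return out if isinstance(out, tuple) else (out,)
    outs = tf.scan(wrap, tuple(sequences), initializer=tuple(outputs_info))
    return outs if len(outs) > 1 else outs[0]


def ni_sketch(tensor, index):
    # e.g. ni(h3, -1) -> hidden state at the final timestep
    return tensor[index]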
inp_proj, inpgate_proj = GRUFork([oh_i], [n_symbols], enc_h1_dim, random_state)


def enc_step(inp_t, inpgate_t, inpmask_t, h1_tm1):
    # masked GRU encoder step: padded timesteps leave the state unchanged
    enc_h1 = GRU(inp_t, inpgate_t, h1_tm1, enc_h1_dim, enc_h1_dim,
                 random_state, mask=inpmask_t)
    return enc_h1

enc_h1 = scan(enc_step, [inp_proj, inpgate_proj, inpt_mask], [init_enc_h1])
final_enc_h1 = ni(enc_h1, -1)

# Kick off decoder dynamics from the final encoder state
init_dec_h1 = tanh(
    Linear([final_enc_h1], [enc_h1_dim], dec_h1_dim, random_state))

oh_target = OneHot(target, n_symbols)
# prepend 0, then slice off last timestep
shift_target = shift(oh_target)
# shift mask the same way? but use 1 to mark as active
# shift_target_mask = shift(target_mask, fill_value=1.)
out_proj, outgate_proj = GRUFork([shift_target], [n_symbols], dec_h1_dim,
                                 random_state)
# Just add in at each timestep - no easy broadcast target here without some work
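# Sketch of the `shift` helper used above, assuming the behavior the
# comments describe (prepend a fill timestep, drop the last one). This is
# standard teacher forcing: the decoder sees target t-1 when predicting
# target t.
def shift_sketch(x, fill_value=0.):
    # x: (time, batch, features), time-major
    pad = fill_value * tf.ones_like(x[:1])
    return tf.concat(0, [pad, x[:-1]])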
init_h2 = tf.placeholder(tf.float32, [batch_size, h_dim])
note_embed = Multiembedding(note_inpt, n_note_symbols, note_embed_dim,
                            random_state)
inp_proj, inpgate_proj = GRUFork([note_embed], [n_notes * note_embed_dim],
                                 h_dim, random_state)


def step(inp_t, inpgate_t, h1_tm1, h2_tm1):
    # two parallel GRUs over the same projected input (not stacked)
    h1 = GRU(inp_t, inpgate_t, h1_tm1, h_dim, h_dim, random_state)
    h2 = GRU(inp_t, inpgate_t, h2_tm1, h_dim, h_dim, random_state)
    return h1, h2

h1, h2 = scan(step, [inp_proj, inpgate_proj], [init_h1, init_h2])
final_h1 = ni(h1, -1)
final_h2 = ni(h2, -1)

target_note_embed = Multiembedding(note_target, n_note_symbols,
                                   note_embed_dim, random_state)
target_note_masked = Automask(target_note_embed, n_notes)

costs = []
note_preds = []
duration_preds = []
for i in range(n_notes):
    # note i is predicted from both hidden streams plus the embeddings of
    # the notes already decided at this timestep (masked by Automask)
    note_pred = Linear([h1, h2, target_note_masked[i]],
                       [h_dim, h_dim, n_notes * note_embed_dim],
                       note_out_dims[i], random_state,
                       weight_norm=False)
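# Hypothetical sketch of `Automask`, inferred from how it is used here:
# entry i zeroes the embeddings of notes i and above, so note i's
# prediction conditions only on the notes already decided at that
# timestep (a NADE-style autoregressive mask within the timestep).
def automask_sketch(multi_embed, n_masks):
    # multi_embed: (..., n_masks * embed_dim) concatenated embeddings
    embed_dim = multi_embed.get_shape().as_list()[-1] // n_masks
    masked = []
    for i in range(n_masks):
        keep = [1.] * (i * embed_dim) + [0.] * ((n_masks - i) * embed_dim)
        masked.append(multi_embed * tf.constant(keep))
    return masked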
# note and duration embeddings for every voice, concatenated into one
# scan input
scan_inp = tf.concat(2, [duration_embed, note_embed])
scan_inp_dim = n_notes * duration_embed_dim + n_notes * note_embed_dim


def step(inp_t, h1_tm1):
    h1_t_proj, h1gate_t_proj = RNNFork([inp_t], [scan_inp_dim], h_dim,
                                       random_state,
                                       weight_norm=weight_norm_middle)
    h1_t = RNN(h1_t_proj, h1gate_t_proj, h1_tm1, h_dim, h_dim, random_state)
    return h1_t

h1_f = scan(step, [scan_inp], [init_h1])
h1 = h1_f
final_h1 = ni(h1, -1)

target_note_embed = Multiembedding(note_target, n_note_symbols,
                                   note_embed_dim, random_state,
                                   name=name_note_emb,
                                   share_all=share_all_embeddings)
target_note_masked = Automask(target_note_embed, n_notes)
target_duration_embed = Multiembedding(duration_target, n_duration_symbols,
                                       duration_embed_dim, random_state,
                                       name=name_dur_emb,
                                       share_all=share_all_embeddings)
target_duration_masked = Automask(target_duration_embed, n_notes)
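# Rough sketch of `Multiembedding` (assumed behavior, and assuming
# random_state is a numpy RandomState): embed each discrete column of the
# input with a lookup table -- one shared table when share_all is set,
# otherwise one per column -- and concatenate along the feature axis,
# giving the (time, batch, n_notes * embed_dim) shapes used above.
def multiembedding_sketch(indices, n_symbols, embed_dim, random_state,
                          name=None, share_all=False):
    # indices: (time, batch, n_cols) integer symbol ids
    n_cols = indices.get_shape().as_list()[-1]
    n_tables = 1 if share_all else n_cols
    tables = [tf.Variable(random_state.randn(n_symbols, embed_dim).astype(
                          "float32")) for _ in range(n_tables)]
    cols = [tf.nn.embedding_lookup(tables[0 if share_all else i],
                                   indices[:, :, i]) for i in range(n_cols)]
    return tf.concat(2, cols)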