def test_train_graph_build(batch_size, replication_factor, num_ipus, num_io_tiles):
    """ tests building the deep-voice training graph for different batch sizes,
    replication factors, numbers of IPUs and numbers of I/O tiles """
    builder = popart.Builder()
    conf = get_test_conf(batch_size=batch_size,
                         replication_factor=replication_factor,
                         num_ipus=num_ipus,
                         num_io_tiles=num_io_tiles)
    conf = conf_utils.set_model_conf(conf, print_model_conf=False)
    deep_voice_model_inputs = create_inputs_for_training(builder, conf)

    deep_voice_model = PopartDeepVoice(conf, builder, for_inference=False)
    main_outputs, aux_outputs, name_to_tensor = deep_voice_model(
        deep_voice_model_inputs["text_input"],
        deep_voice_model_inputs["mel_spec_input"],
        deep_voice_model_inputs["speaker_id"])

    # checking that all outputs exist
    assert len(main_outputs) == 3
    assert "mel_spec_output" in main_outputs
    assert "mag_spec_output" in main_outputs
    assert "done_flag_output" in main_outputs

    assert len(aux_outputs) == 2
    assert "attention_scores_arrays" in aux_outputs
    assert "speaker_embedding_matrix" in aux_outputs

    # checking that all output shapes are correct
    assert_lists_equal(
        builder.getTensorShape(main_outputs["mel_spec_output"]),
        [conf.samples_per_device, conf.mel_bands, conf.max_spectrogram_length])
    assert_lists_equal(
        builder.getTensorShape(main_outputs["mag_spec_output"]),
        [conf.samples_per_device, conf.n_fft // 2 + 1, conf.max_spectrogram_length])
    assert_lists_equal(
        builder.getTensorShape(main_outputs["done_flag_output"]),
        [conf.samples_per_device, 1, conf.max_spectrogram_length])
    for att_dist in aux_outputs["attention_scores_arrays"]:
        assert_lists_equal(
            builder.getTensorShape(att_dist),
            [conf.samples_per_device, conf.max_text_sequence_length,
             conf.max_spectrogram_length])
    assert_lists_equal(
        builder.getTensorShape(aux_outputs["speaker_embedding_matrix"]),
        [conf.num_speakers, conf.speaker_embedding_dim])
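
# The test above takes its sweep parameters as arguments, so it is presumably
# driven by a pytest parameterization elsewhere in the file. A minimal sketch
# of such a driver is shown below, assuming pytest is available; the parameter
# tuples are illustrative only, not the project's actual test matrix.
#
# import pytest
#
# @pytest.mark.parametrize(
#     "batch_size, replication_factor, num_ipus, num_io_tiles",
#     [
#         (2, 1, 1, 0),   # single IPU, no replication
#         (2, 2, 2, 0),   # graph replicated across two IPUs
#         (2, 1, 1, 32),  # single IPU with some tiles reserved for I/O
#     ])
# def test_train_graph_build(batch_size, replication_factor,
#                            num_ipus, num_io_tiles):
#     ...  # body as above
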
def create_model_and_dataflow_for_training(builder, conf, inputs, anchor_mode='train'):
    """ builds the deep-voice model, loss function and dataflow for training """

    def temporal_slice(tensor, start, end):
        """ slices tensors along the temporal (last) dimension """
        tensor_shape = builder.getTensorShape(tensor)
        slice_starts = builder.aiOnnx.constant(
            np.array([0, 0, start]).astype('int32'), 'spec_slice_starts')
        slice_ends = builder.aiOnnx.constant(
            np.array([tensor_shape[0], tensor_shape[1], end]).astype('int32'),
            'spec_slice_ends')
        return builder.aiOnnx.slice([tensor, slice_starts, slice_ends])

    def type_cast(tensor, in_type, out_type):
        if in_type != out_type:
            return builder.aiOnnx.cast([tensor], out_type)
        else:
            return tensor

    def get_attention_mask(g=0.2):
        """ returns attention mask for guided attention """
        attention_mask = np.zeros(
            (conf.max_text_sequence_length, conf.max_spectrogram_length),
            dtype=conf.precision)
        for n in range(conf.max_text_sequence_length):
            for t in range(conf.max_spectrogram_length):
                attention_mask[n, t] = 1 - np.exp(
                    -(n / conf.max_text_sequence_length -
                      t / conf.max_spectrogram_length) ** 2 / (2 * g * g))
        attention_mask = builder.aiOnnx.constant(attention_mask, 'attention_mask')
        return attention_mask

    def get_done_mask(done_labels, num_timesteps):
        """ returns done mask for spectrogram loss computation """
        done_labels_sliced = temporal_slice(done_labels, 1, num_timesteps)
        done_mask = builder.aiOnnx.add([
            builder.aiOnnx.constant(np.array(1.0).astype(np.float32)),
            builder.aiOnnx.neg([done_labels_sliced])
        ])
        return done_mask

    deep_voice_model = PopartDeepVoice(conf, builder, for_inference=False)

    main_outputs, aux_outputs, name_to_tensor = deep_voice_model(
        inputs["text_input"],
        inputs["mel_spec_input"],
        inputs["speaker_id"])

    num_timesteps = builder.getTensorShape(inputs["mel_spec_input"])[-1]
    float_type = _get_popart_type(conf.precision)

    # type cast tensors before loss computation (in case of doing experiments with FP16)
    mel_input_fp32_cast = type_cast(
        temporal_slice(inputs["mel_spec_input"], 1, num_timesteps),
        float_type, 'FLOAT')
    mel_output_fp32_cast = type_cast(
        temporal_slice(main_outputs["mel_spec_output"], 0, num_timesteps - 1),
        float_type, 'FLOAT')
    mag_spec_input_fp32_cast = type_cast(
        temporal_slice(inputs["mag_spec_input"], 1, num_timesteps),
        float_type, 'FLOAT')
    mag_spec_output_fp32_cast = type_cast(
        temporal_slice(main_outputs["mag_spec_output"], 0, num_timesteps - 1),
        float_type, 'FLOAT')
    done_flag_output_fp32_cast = type_cast(main_outputs["done_flag_output"],
                                           float_type, 'FLOAT')
    done_labels_fp32_cast = type_cast(inputs["done_labels"], 'INT32', 'FLOAT')

    done_mask = get_done_mask(done_labels_fp32_cast, num_timesteps)

    # mel-spectrogram reconstruction loss for decoder
    mel_spec_l1_loss = builder.aiGraphcore.l1loss(
        [builder.aiOnnx.mul([
            done_mask,
            builder.aiOnnx.add([mel_output_fp32_cast,
                                builder.aiOnnx.neg([mel_input_fp32_cast])])
        ])],
        1.0, reduction=popart.ReductionType.Mean)

    # linear-scale spectrogram loss for converter
    mag_spec_l1_loss = builder.aiGraphcore.l1loss(
        [builder.aiOnnx.mul([
            done_mask,
            builder.aiOnnx.add([mag_spec_output_fp32_cast,
                                builder.aiOnnx.neg([mag_spec_input_fp32_cast])])
        ])],
        1.0, reduction=popart.ReductionType.Mean)

    # loss for done-flags
    done_flag_loss = builder.aiGraphcore.l1loss(
        [builder.aiOnnx.add([done_flag_output_fp32_cast,
                             builder.aiOnnx.neg([done_labels_fp32_cast])])],
        1.0, reduction=popart.ReductionType.Mean)

    total_loss = builder.aiOnnx.add([mel_spec_l1_loss, mag_spec_l1_loss])
    total_loss = builder.aiOnnx.add([total_loss, done_flag_loss])

    # add desired output tensors
    builder.addOutputTensor(main_outputs["mel_spec_output"])
    builder.addOutputTensor(main_outputs["mag_spec_output"])
    builder.addOutputTensor(aux_outputs["speaker_embedding_matrix"])
    for attention_distribution in aux_outputs["attention_scores_arrays"]:
        builder.addOutputTensor(attention_distribution)

    anchor_types_dict = {
        mel_spec_l1_loss: popart.AnchorReturnType("ALL"),
        mag_spec_l1_loss: popart.AnchorReturnType("ALL"),
        done_flag_loss: popart.AnchorReturnType("ALL"),
    }

    loss_dict = {
        "mel_spec_l1_loss": mel_spec_l1_loss,
        "mag_spec_l1_loss": mag_spec_l1_loss,
        "done_flag_loss": done_flag_loss
    }

    if conf.use_guided_attention:
        attention_mask = get_attention_mask(g=conf.guided_attention_g)
        masked_attention = builder.aiOnnx.mul(
            [attention_mask, aux_outputs["attention_scores_arrays"][0]])
        for attention_distribution in aux_outputs["attention_scores_arrays"][1:]:
            masked_attention = builder.aiOnnx.add([
                masked_attention,
                builder.aiOnnx.mul([attention_mask, attention_distribution])
            ])
        attention_loss = builder.aiGraphcore.l1loss(
            [masked_attention], 1.0, reduction=popart.ReductionType.Mean)
        anchor_types_dict[attention_loss] = popart.AnchorReturnType("ALL")
        loss_dict["attention_loss"] = attention_loss
        total_loss = builder.aiOnnx.add([total_loss, attention_loss])

    loss_dict["total_loss"] = total_loss

    if anchor_mode == 'inference':
        anchor_types_dict[aux_outputs["speaker_embedding_matrix"]] = \
            popart.AnchorReturnType("ALL")
        for attention_distribution in aux_outputs["attention_scores_arrays"]:
            anchor_types_dict[attention_distribution] = popart.AnchorReturnType("ALL")
        anchor_types_dict[main_outputs["mel_spec_output"]] = \
            popart.AnchorReturnType("ALL")
        anchor_types_dict[main_outputs["mag_spec_output"]] = \
            popart.AnchorReturnType("ALL")

    proto = builder.getModelProto()
    dataflow = popart.DataFlow(conf.batches_per_step, anchor_types_dict)

    return proto, loss_dict, dataflow, main_outputs, aux_outputs, name_to_tensor
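
# A minimal sketch of how the artifacts returned above could be wired into a
# popart.TrainingSession. The helper name build_training_session, the
# IPU-model device choice and the ConstSGD learning rate are illustrative
# assumptions, not the project's actual training setup (which may also depend
# on the installed PopART SDK version).
#
# def build_training_session(builder, conf, inputs):
#     proto, loss_dict, dataflow, main_outputs, aux_outputs, name_to_tensor = \
#         create_model_and_dataflow_for_training(builder, conf, inputs)
#
#     # illustrative: run on the IPU model so the sketch needs no hardware
#     device = popart.DeviceManager().createIpuModelDevice({"numIPUs": 1})
#
#     session = popart.TrainingSession(
#         fnModel=proto,
#         loss=loss_dict["total_loss"],      # optimize the combined loss
#         optimizer=popart.ConstSGD(0.01),   # illustrative learning rate
#         dataFlow=dataflow,
#         deviceInfo=device)
#
#     session.prepareDevice()    # compile the graph for the device
#     session.weightsFromHost()  # copy initial weights to the device
#     anchors = session.initAnchorArrays()
#     return session, anchors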