Пример #1
0
def test_train_graph_build(batch_size, replication_factor, num_ipus,
                           num_io_tiles):
    """ builds the deep-voice training graph and verifies the names and shapes of its outputs """
    builder = popart.Builder()
    conf = conf_utils.set_model_conf(
        get_test_conf(batch_size=batch_size,
                      replication_factor=replication_factor,
                      num_ipus=num_ipus,
                      num_io_tiles=num_io_tiles),
        print_model_conf=False)

    model_inputs = create_inputs_for_training(builder, conf)
    model = PopartDeepVoice(conf, builder, for_inference=False)

    # the third return value (name -> tensor mapping) is not inspected here
    main_outputs, aux_outputs, _ = model(model_inputs["text_input"],
                                         model_inputs["mel_spec_input"],
                                         model_inputs["speaker_id"])

    # every expected output must be present, and nothing else
    main_names = ("mel_spec_output", "mag_spec_output", "done_flag_output")
    aux_names = ("attention_scores_arrays", "speaker_embedding_matrix")
    for outputs, names in ((main_outputs, main_names),
                           (aux_outputs, aux_names)):
        assert len(outputs) == len(names)
        for name in names:
            assert name in outputs

    # main outputs: [samples per device, channels, spectrogram frames]
    n_samples = conf.samples_per_device
    n_frames = conf.max_spectrogram_length
    main_shapes = (
        ("mel_spec_output", [n_samples, conf.mel_bands, n_frames]),
        ("mag_spec_output", [n_samples, conf.n_fft // 2 + 1, n_frames]),
        ("done_flag_output", [n_samples, 1, n_frames]),
    )
    for name, shape in main_shapes:
        assert_lists_equal(builder.getTensorShape(main_outputs[name]), shape)

    # one attention-score tensor per entry of the auxiliary list
    attention_shape = [n_samples, conf.max_text_sequence_length, n_frames]
    for attention_scores in aux_outputs["attention_scores_arrays"]:
        assert_lists_equal(builder.getTensorShape(attention_scores),
                           attention_shape)

    assert_lists_equal(
        builder.getTensorShape(aux_outputs["speaker_embedding_matrix"]),
        [conf.num_speakers, conf.speaker_embedding_dim])
Пример #2
0
def create_model_and_dataflow_for_training(builder,
                                           conf,
                                           inputs,
                                           anchor_mode='train'):
    """ builds the deep-voice model, loss function and dataflow for training

    Args:
        builder: popart builder that all graph operations are added to.
        conf: configuration object. This function reads ``precision``,
            ``max_text_sequence_length``, ``max_spectrogram_length``,
            ``use_guided_attention``, ``guided_attention_g`` and
            ``batches_per_step`` from it (the model constructor may read more).
        inputs: mapping with the tensors ``text_input``, ``mel_spec_input``,
            ``mag_spec_input``, ``speaker_id`` and ``done_labels``.
        anchor_mode: when equal to ``'inference'`` the spectrogram outputs,
            the attention scores and the speaker embedding matrix are anchored
            in addition to the losses; any other value anchors only the losses.

    Returns:
        Tuple ``(proto, loss_dict, dataflow, main_outputs, aux_outputs,
        name_to_tensor)`` where ``loss_dict`` maps loss names (including
        ``"total_loss"``) to loss tensors and the last three items are the
        values returned by the model call.
    """
    def temporal_slice(tensor, start, end):
        """ slices tensors along the temporal (last) dimension """
        # NOTE: starts/ends are built with exactly three entries, so the
        # tensor is expected to be three-dimensional.
        tensor_shape = builder.getTensorShape(tensor)
        slice_starts = builder.aiOnnx.constant(
            np.array([0, 0, start]).astype('int32'), 'spec_slice_starts')
        slice_ends = builder.aiOnnx.constant(
            np.array([tensor_shape[0], tensor_shape[1], end]).astype('int32'),
            'spec_slice_ends')
        return builder.aiOnnx.slice([tensor, slice_starts, slice_ends])

    def type_cast(tensor, in_type, out_type):
        """ casts tensor to out_type, skipping the cast op when the types already match """
        if in_type == out_type:
            return tensor
        return builder.aiOnnx.cast([tensor], out_type)

    def get_attention_mask(g=0.2):
        """ returns attention mask for guided attention

        Entry (n, t) is 1 - exp(-(n / N - t / T)**2 / (2 * g**2)) with
        N = conf.max_text_sequence_length and T = conf.max_spectrogram_length.
        """
        # Relative positions as a column (text) and a row (spectrogram) so
        # that broadcasting yields the full (N, T) grid without Python loops.
        text_pos = (np.arange(conf.max_text_sequence_length) /
                    conf.max_text_sequence_length)[:, np.newaxis]
        spec_pos = (np.arange(conf.max_spectrogram_length) /
                    conf.max_spectrogram_length)[np.newaxis, :]
        attention_mask = (
            1 - np.exp(-(text_pos - spec_pos)**2 / (2 * g * g))).astype(
                conf.precision)
        return builder.aiOnnx.constant(attention_mask, 'attention_mask')

    def get_done_mask(done_labels, num_timesteps):
        """ returns done mask (1 - done_labels[..., 1:num_timesteps]) for spectrogram loss computation """
        done_labels_sliced = temporal_slice(done_labels, 1, num_timesteps)
        return builder.aiOnnx.add([
            builder.aiOnnx.constant(np.array(1.0).astype(np.float32)),
            builder.aiOnnx.neg([done_labels_sliced])
        ])

    def l1_difference_loss(prediction, target, mask=None):
        """ returns mean-reduced l1 loss of (prediction - target), multiplied by mask when given """
        difference = builder.aiOnnx.add(
            [prediction, builder.aiOnnx.neg([target])])
        if mask is not None:
            difference = builder.aiOnnx.mul([mask, difference])
        return builder.aiGraphcore.l1loss([difference],
                                          1.0,
                                          reduction=popart.ReductionType.Mean)

    deep_voice_model = PopartDeepVoice(conf, builder, for_inference=False)

    main_outputs, aux_outputs, name_to_tensor = deep_voice_model(
        inputs["text_input"], inputs["mel_spec_input"], inputs["speaker_id"])

    num_timesteps = builder.getTensorShape(inputs["mel_spec_input"])[-1]
    float_type = _get_popart_type(conf.precision)

    # type cast tensors before loss computation (in case of doing experiments with FP16)
    # Targets use frames 1..T-1 while predictions use frames 0..T-2, i.e. the
    # output at step t is compared against the input at step t + 1.
    mel_input_fp32_cast = type_cast(
        temporal_slice(inputs["mel_spec_input"], 1, num_timesteps), float_type,
        'FLOAT')
    mel_output_fp32_cast = type_cast(
        temporal_slice(main_outputs["mel_spec_output"], 0, num_timesteps - 1),
        float_type, 'FLOAT')

    mag_spec_input_fp32_cast = type_cast(
        temporal_slice(inputs["mag_spec_input"], 1, num_timesteps), float_type,
        'FLOAT')
    mag_spec_output_fp32_cast = type_cast(
        temporal_slice(main_outputs["mag_spec_output"], 0, num_timesteps - 1),
        float_type, 'FLOAT')

    done_flag_output_fp32_cast = type_cast(main_outputs["done_flag_output"],
                                           float_type, 'FLOAT')
    done_labels_fp32_cast = type_cast(inputs["done_labels"], 'INT32', 'FLOAT')

    done_mask = get_done_mask(done_labels_fp32_cast, num_timesteps)

    # mel-spectrogram reconstruction loss for decoder
    mel_spec_l1_loss = l1_difference_loss(mel_output_fp32_cast,
                                          mel_input_fp32_cast,
                                          mask=done_mask)

    # linear-scale spectrogram loss for converter
    mag_spec_l1_loss = l1_difference_loss(mag_spec_output_fp32_cast,
                                          mag_spec_input_fp32_cast,
                                          mask=done_mask)

    # loss for done-flags (not masked, computed over all timesteps)
    done_flag_loss = l1_difference_loss(done_flag_output_fp32_cast,
                                        done_labels_fp32_cast)

    total_loss = builder.aiOnnx.add([mel_spec_l1_loss, mag_spec_l1_loss])
    total_loss = builder.aiOnnx.add([total_loss, done_flag_loss])

    # add desired output tensors
    builder.addOutputTensor(main_outputs["mel_spec_output"])
    builder.addOutputTensor(main_outputs["mag_spec_output"])
    builder.addOutputTensor(aux_outputs["speaker_embedding_matrix"])
    for attention_distribution in aux_outputs["attention_scores_arrays"]:
        builder.addOutputTensor(attention_distribution)

    anchor_types_dict = {
        mel_spec_l1_loss: popart.AnchorReturnType("ALL"),
        mag_spec_l1_loss: popart.AnchorReturnType("ALL"),
        done_flag_loss: popart.AnchorReturnType("ALL"),
    }
    loss_dict = {
        "mel_spec_l1_loss": mel_spec_l1_loss,
        "mag_spec_l1_loss": mag_spec_l1_loss,
        "done_flag_loss": done_flag_loss
    }

    if conf.use_guided_attention:
        # sum of the mask-weighted attention scores over all attention arrays
        attention_mask = get_attention_mask(g=conf.guided_attention_g)
        attention_arrays = aux_outputs["attention_scores_arrays"]
        masked_attention = builder.aiOnnx.mul(
            [attention_mask, attention_arrays[0]])
        for attention_distribution in attention_arrays[1:]:
            masked_attention = builder.aiOnnx.add([
                masked_attention,
                builder.aiOnnx.mul([attention_mask, attention_distribution])
            ])
        attention_loss = builder.aiGraphcore.l1loss(
            [masked_attention], 1.0, reduction=popart.ReductionType.Mean)
        anchor_types_dict[attention_loss] = popart.AnchorReturnType("ALL")
        loss_dict["attention_loss"] = attention_loss
        total_loss = builder.aiOnnx.add([total_loss, attention_loss])

    loss_dict["total_loss"] = total_loss

    if anchor_mode == 'inference':
        # additionally return the model outputs to the host
        anchor_types_dict[aux_outputs[
            "speaker_embedding_matrix"]] = popart.AnchorReturnType("ALL")
        for attention_distribution in aux_outputs["attention_scores_arrays"]:
            anchor_types_dict[
                attention_distribution] = popart.AnchorReturnType("ALL")
        anchor_types_dict[
            main_outputs["mel_spec_output"]] = popart.AnchorReturnType("ALL")
        anchor_types_dict[
            main_outputs["mag_spec_output"]] = popart.AnchorReturnType("ALL")

    proto = builder.getModelProto()
    dataflow = popart.DataFlow(conf.batches_per_step, anchor_types_dict)

    return proto, loss_dict, dataflow, main_outputs, aux_outputs, name_to_tensor