Example #1
    def _extract_audio_features(self, audio_path):
        if self.feature_type == "MFCC":
            return tools.calculate_mfcc(audio_path)
        
        if self.feature_type == "Pros":
            return tools.extract_prosodic_features(audio_path)
        
        if self.feature_type == "MFCC+Pros":
            mfcc_vectors = tools.calculate_mfcc(audio_path)
            pros_vectors = tools.extract_prosodic_features(audio_path)
            mfcc_vectors, pros_vectors = tools.shorten(mfcc_vectors, pros_vectors)
            return np.concatenate((mfcc_vectors, pros_vectors), axis=1)
        
        if self.feature_type == "Spectro":
            return tools.calculate_spectrogram(audio_path)
        
        if self.feature_type == "Spectro+Pros":
            spectr_vectors = tools.calculate_spectrogram(audio_path)
            pros_vectors = tools.extract_prosodic_features(audio_path)
            spectr_vectors, pros_vectors = tools.shorten(spectr_vectors, pros_vectors)
            return np.concatenate((spectr_vectors, pros_vectors), axis=1)

        # Unknown feature type
        print(f"ERROR: unknown feature type '{self.feature_type}' in the 'extract_audio_features' call!")
        print(f"Possible values: {self.supported_features}.")
        exit(-1)
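The helper tools.shorten is not shown in these examples; judging by how it is called, it appears to truncate two feature arrays to a common number of frames (optionally given explicitly) so they can be paired or concatenated. A minimal sketch under that assumption, with made-up feature dimensions:

import numpy as np

def shorten(a, b, length=None):
    # Hypothetical stand-in for tools.shorten: cut both arrays to a common
    # number of frames so they can be combined frame-by-frame.
    if length is None:
        length = min(len(a), len(b))
    return a[:length], b[:length]

# Example: combine MFCC and prosodic frames of slightly different lengths,
# as in the "MFCC+Pros" branch above (the feature dimensions are invented).
mfcc = np.random.rand(203, 26)
pros = np.random.rand(200, 4)
mfcc, pros = shorten(mfcc, pros)
combined = np.concatenate((mfcc, pros), axis=1)
print(combined.shape)  # (200, 30)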
Example #2
    def _align_vector_lengths(self, audio_features, text_features):
        # NOTE: at this point audio is 20fps and text is 10fps
        min_len = min(len(audio_features), 2 * len(text_features))
        # make sure the length is even
        if min_len % 2 == 1:
            min_len -= 1

        audio_features, text_features = tools.shorten(audio_features, text_features, min_len)
        # The transcriptions were created with half the audio sampling rate
        # So the text vector should contain half as many elements 
        text_features = text_features[:int(min_len/2)] 

        # upsample the text so that it aligns with the audio
        cols = np.linspace(0, text_features.shape[0], endpoint=False, num=text_features.shape[0] * 2, dtype=int)
        text_features = text_features[cols, :]

        return audio_features, text_features
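The upsampling at the end of _align_vector_lengths repeats every 10-fps text frame twice so it lines up with the 20-fps audio. A toy run of the same np.linspace indexing trick, with made-up data:

import numpy as np

# Four 10-fps text frames with one-dimensional features.
text_features = np.arange(4).reshape(4, 1)

# Same indexing as above: 8 evenly spaced positions in [0, 4), truncated
# to integers, which yields each frame index exactly twice.
cols = np.linspace(0, text_features.shape[0], endpoint=False,
                   num=text_features.shape[0] * 2, dtype=int)
print(cols)                             # [0 0 1 1 2 2 3 3]
print(text_features[cols, :].ravel())   # [0 0 1 1 2 2 3 3]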
Example #3
def _encode_vectors(audio_filename, gesture_filename, text_filename,
                    embedding_model, mode, args, augment_with_context):
    """
    Extract features from a given pair of audio and motion files.
    To be used by "_save_data_as_sequences" and "_save_dataset" functions.

    Args:
        audio_filename:        file name for an audio file (.wav)
        gesture_filename:      file name for a motion file (.bvh)
        text_filename:         file name with the text transcript (.json)
        embedding_model:       the embedding model to encode the text with
        mode:                  dataset type ('train', 'dev' or 'test')
        args:                  see the 'create_dataset' function for details
        augment_with_context:  if True, the data sequences will be augmented with future/past context;
                               intended use: True if the data will be used for training,
                                             False if it will be used for validation/testing

    Returns:
        input_vectors  [N, T, D] : speech features
        text_vectors             : text features
        output_vectors [N, T, D] : motion features
    """
    debug = False

    if mode == 'test':
        seq_length = 0
    elif mode == 'train' or mode == "train_mirrored":
        seq_length = args.seq_len
    elif mode == 'dev':
        seq_length = 5 * args.seq_len
    else:
        print(
            f"ERROR: Unknown dataset type '{mode}'! Possible values: 'train', 'train_mirrored', 'dev' and 'test'."
        )
        exit(-1)

    # Step 1: Vectorize speech, with features of 'n_inputs' dimensions, time steps of 0.01s
    # and a window length of 0.025s => results in an array of 100 x 'n_inputs' per second of audio

    if args.feature_type == "MFCC":
        input_vectors = tools.calculate_mfcc(audio_filename)

    elif args.feature_type == "Pros":
        input_vectors = tools.extract_prosodic_features(audio_filename)

    elif args.feature_type == "GeMAPS":
        input_vectors = tools.extract_gemaps_features(audio_filename)

    elif args.feature_type == "MFCC+Pros":
        mfcc_vectors = tools.calculate_mfcc(audio_filename)
        pros_vectors = tools.extract_prosodic_features(audio_filename)
        mfcc_vectors, pros_vectors = tools.shorten(mfcc_vectors, pros_vectors)
        input_vectors = np.concatenate((mfcc_vectors, pros_vectors), axis=1)

    elif args.feature_type == "Spectro":
        input_vectors = tools.calculate_spectrogram(audio_filename)

    elif args.feature_type == "Spectro+Pros":
        spectr_vectors = tools.calculate_spectrogram(audio_filename)
        pros_vectors = tools.extract_prosodic_features(audio_filename)
        spectr_vectors, pros_vectors = tools.shorten(spectr_vectors, pros_vectors)
        input_vectors = np.concatenate((spectr_vectors, pros_vectors), axis=1)

    else:
        print(f"ERROR: Unknown feature type '{args.feature_type}'!")
        exit(-1)

    # Step 2: Read BVH
    ges_str = np.load(gesture_filename)
    output_vectors = ges_str['clips']

    # Subsample motion (from 60 fps to 20 fps)
    output_vectors = output_vectors[0::3]

    # Step 3: Obtain text transcription:
    if isinstance(embedding_model, BertEmbedding):
        text_encoding = encode_json_transcript_with_bert(
            text_filename, embedding_model)
    elif isinstance(embedding_model, FastText):
        text_encoding = encode_json_transcript_with_fasttext(
            text_filename, embedding_model)
    else:
        print(f"ERROR: Unsupported embedding model '{type(embedding_model).__name__}'!")
        exit(-1)

    if debug:
        print(input_vectors.shape)
        print(output_vectors.shape)
        print(text_encoding.shape)

    # Step 4: Align vector length
    min_len = min(len(input_vectors), len(output_vectors),
                  2 * len(text_encoding))

    # make sure the length is even
    if min_len % 2 == 1:
        min_len -= 1
    input_vectors, output_vectors = tools.shorten(input_vectors,
                                                  output_vectors, min_len)
    text_encoding = text_encoding[:int(min_len / 2)]

    if debug:
        print(input_vectors.shape)
        print(output_vectors.shape)
        print(text_encoding.shape)

    if not augment_with_context:
        return input_vectors, text_encoding, output_vectors

    # Create a list of sequences with a fixed past and future context length
    # (overlap them to use the data more efficiently)
    # TODO: make sure the alignment holds
    start_ind = args.past_context
    seq_step = 10  # overlap of sequences: 0.5s

    # Test if the context length is appropriate
    assert args.past_context % 2 == 0
    assert args.future_context % 2 == 0
    assert seq_step % 2 == 0

    n_reserved_inds = seq_length + args.future_context

    stop_ind = input_vectors.shape[0] - n_reserved_inds
    input_vectors_final = np.array([
        input_vectors[i - args.past_context:i + n_reserved_inds]
        for i in range(start_ind, stop_ind, seq_step)
    ])

    stop_ind = output_vectors.shape[0] - n_reserved_inds
    output_vectors_final = np.array([
        output_vectors[i - args.past_context:i + n_reserved_inds]
        for i in range(start_ind, stop_ind, seq_step)
    ])

    # The text was sampled at half the audio sampling rate,
    # so 1 frame of text corresponds to 2 frames of audio
    stop_ind = text_encoding.shape[0] - n_reserved_inds // 2
    text_vectors_final = np.array([
        text_encoding[i - args.past_context // 2:i + n_reserved_inds // 2]
        for i in range(start_ind // 2, stop_ind, seq_step // 2)
    ])

    if debug:
        print(input_vectors_final.shape)
        print(output_vectors_final.shape)
        print(text_vectors_final.shape)

    return input_vectors_final, text_vectors_final, output_vectors_final
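The context augmentation at the end of _encode_vectors is a sliding-window slice: every window carries past_context frames of history, seq_len frames of payload and future_context frames of lookahead, and consecutive windows overlap because the step is smaller than the window length. A self-contained toy version with invented parameter values standing in for args.past_context, args.future_context, args.seq_len and the hard-coded seq_step:

import numpy as np

# Made-up values for illustration only.
past_context, future_context, seq_length, seq_step = 4, 2, 10, 10
frames = np.arange(100).reshape(100, 1)  # pretend 100 audio frames at 20 fps

n_reserved_inds = seq_length + future_context
stop_ind = frames.shape[0] - n_reserved_inds
windows = np.array([
    frames[i - past_context:i + n_reserved_inds]
    for i in range(past_context, stop_ind, seq_step)
])
# Each window is past_context + seq_length + future_context = 16 frames long;
# consecutive windows start 10 frames apart, so they overlap by 6 frames.
print(windows.shape)  # (9, 16, 1)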