Example #1
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """

        try:
            eaf = Eaf(file_name)
            videos = [
                os.path.basename(
                    urlparse(media_descriptors['MEDIA_URL']).path)
                for media_descriptors in eaf.media_descriptors
                if media_descriptors['MIME_TYPE'] == 'video/mpeg'
            ]
            duration = self.find_max_duration(videos)
            if duration == 0.0:
                print("Duration could not be determined.", file=sys.stderr)
            else:
                annotation_values = self.create_annotation_values(file_name)
                self.add_new_annotations(eaf, annotation_values, duration)
                eaf.to_file(self.output_dir + os.sep +
                            os.path.basename(urlparse(file_name).path),
                            pretty=True)
        except IOError:
            print("The EAF %s could not be processed." % file_name,
                  file=sys.stderr)
Example #2
def get_elan_tier_attributes(input_eafs_files):
    """
    Iterates a directory of ELAN files and compiles info about all the files' tiers:
    unique tier types, unique tier names, and the maximum number of tiers.
    """
    # Use sets internally for easy uniqueness, convert to lists when done
    _tier_types: Set[str] = set()
    _tier_names: Set[str] = set()
    _tier_max_count: int = 0
    for file_ in input_eafs_files:
        input_eaf = Eaf(file_)
        for tier_type in list(input_eaf.get_linguistic_type_names()):
            _tier_types.add(tier_type)
            tier_ids: List[str] = input_eaf.get_tier_ids_for_linguistic_type(
                tier_type)
            for tier_id in tier_ids:
                _tier_names.add(tier_id)
        # count the number of tiers, use the max from all files
        tier_count = len(list(input_eaf.get_tier_names()))
        if tier_count > _tier_max_count:
            _tier_max_count = tier_count
    tier_types = list(_tier_types)
    tier_names = list(_tier_names)
    tier_max_count = _tier_max_count
    return (tier_types, tier_names, tier_max_count)
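A minimal usage sketch for get_elan_tier_attributes, assuming a directory of .eaf files (the corpus path below is a placeholder):

import glob

input_eafs_files = glob.glob('corpus/*.eaf')  # hypothetical corpus directory
tier_types, tier_names, tier_max_count = get_elan_tier_attributes(input_eafs_files)
print(tier_types, tier_names, tier_max_count)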
Example #3
def update_ui(file_paths: List[Path], ui):
    """
    Iterates the given ELAN files and updates the UI options with info about the files' tiers:
    unique tier types, unique tier names, and the maximum number of tiers.
    """
    # Use sets internally for easy uniqueness, convert to lists when done
    _tier_types: Set[str] = set(ui['data']['tier_type']['options'])
    _tier_names: Set[str] = set(ui['data']['tier_name']['options'])
    tier_max_count = 0

    print('**** ui data')
    print(ui['data'])

    print('**** _tier_types')
    print(_tier_types)

    eaf_paths = [p for p in file_paths if f'{p}'.endswith('.eaf')]
    for eaf_path in eaf_paths:
        input_eaf = Eaf(eaf_path)
        for tier_type in list(input_eaf.get_linguistic_type_names()):
            _tier_types.add(tier_type)
            tier_ids: List[str] = input_eaf.get_tier_ids_for_linguistic_type(
                tier_type)
            for tier_id in tier_ids:
                _tier_names.add(tier_id)
        # count the number of tiers, use the max from all files
        tier_count = len(list(input_eaf.get_tier_names()))
        if tier_count > tier_max_count:
            tier_max_count = tier_count

    ui['data']['tier_type']['options'] = list(_tier_types)
    ui['data']['tier_name']['options'] = list(_tier_names)
    ui['data']['tier_order']['options'] = [i for i in range(tier_max_count)]
    return ui
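update_ui expects a ui mapping with nested option lists; a minimal sketch of that shape, with placeholder values, matching the lookups in the code above (the file path is hypothetical):

from pathlib import Path

ui = {
    'data': {
        'tier_type': {'options': []},
        'tier_name': {'options': []},
        'tier_order': {'options': []},
    }
}
ui = update_ui([Path('corpus/example.eaf')], ui)  # hypothetical input file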
Example #4
def main() -> None:
    """
    A command line utility to silence the audio files in a given directory 
    
    Usage: python3 silence_audio.py [-h] -c CORPUS [-s SILENCE_TIER] [-o]
    """

    global silence_mono
    global silence_stereo
    global do_not_publish

    parser = argparse.ArgumentParser(
        description=
        "This script will silence a wave file based on annotations in an Elan tier"
    )
    parser.add_argument('-c',
                        '--corpus',
                        help='Directory of audio and eaf files',
                        type=str,
                        required=True)
    parser.add_argument(
        '-s',
        '--silence_tier',
        help='Silence audio when annotations are found on this tier',
        type=str,
        default='Silence')
    parser.add_argument('-o',
                        '--overwrite',
                        help='Write over existing files',
                        action="store_true")
    arguments = parser.parse_args()

    silence_mono = 0
    silence_stereo = [0, 0]
    suffix = 'S'
    # Look for .eaf files in the passed arguments.corpus dir.
    # To search recursively, use:
    #     glob.iglob(arguments.corpus + '/**/*.eaf', recursive=True)

    for file_path in glob.iglob(arguments.corpus + '/*.eaf'):
        eaf_file = Eaf(file_path)
        names = eaf_file.get_tier_names()

        # Check for existence of silence tier
        if arguments.silence_tier in names:

            print("Have tier %s in %s" % (arguments.silence_tier, file_path))

            basename, extension = os.path.splitext(file_path)

            input = basename + ".wav"
            if arguments.overwrite:
                output = basename + ".wav"
            else:
                output = basename + suffix + ".wav"

            silence_audio(eaf_file, output)
Example #5
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """
        try:
            print("File: " + file_name, file=sys.stderr)
            eaf = Eaf(file_name)
            eaf.add_lexicon_ref(LEXICON_REF, NAME, TYPE, URL, LEXICON_ID,
                                LEXICON_NAME, DATCAT_ID, DATCAT_NAME)

            # Remove old referred lexicon
            if LINGUISTIC_TYPE_ID in eaf.linguistic_types:
                if "LEXICON_REF" in eaf.linguistic_types[LINGUISTIC_TYPE_ID]:
                    old_lexicon_ref = eaf.linguistic_types[LINGUISTIC_TYPE_ID][
                        "LEXICON_REF"]
                    del eaf.lexicon_refs[old_lexicon_ref]

            eaf.linguistic_types[LINGUISTIC_TYPE_ID][
                "LEXICON_REF"] = LEXICON_REF

            if self.output_dir is not None:
                eaf.to_file(self.output_dir + os.sep +
                            os.path.basename(urlparse(file_name).path),
                            pretty=True)
            else:
                eaf.to_file(file_name, pretty=True)
        except Exception:
            print("The EAF %s could not be processed." % file_name,
                  file=sys.stderr)
            print(sys.exc_info()[0])
Example #6
def process_eaf(input_elan_file: str, tier_name: str) -> List[dict]:
    """
    Method to process a particular tier in an eaf file (ELAN Annotation Format). It stores the transcriptions in the 
    following format:
                    {'speaker_id': <speaker_id>,
                    'audio_file_name': <file_name>,
                    'transcript': <transcription_label>,
                    'start_ms': <start_time_in_milliseconds>,
                    'stop_ms': <stop_time_in_milliseconds>}
                    
    :param input_elan_file: name of the input ELAN file
    :param tier_name: name of the ELAN tier to process. These tiers are nodes from the tree structure in the .eaf file.
    :return: a list of dictionaries, where each dictionary is an annotation
    """
    # Get paths to files
    input_directory, full_file_name = os.path.split(input_elan_file)
    file_name, extension = os.path.splitext(full_file_name)

    input_eaf = Eaf(input_elan_file)

    # Look for wav file matching the eaf file in same directory
    if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        print("WAV file found for " + file_name, file=sys.stderr)
    else:
        raise ValueError(
            f"WAV file not found for {full_file_name}. "
            f"Please put it next to the eaf file in {input_directory}.")

    # Get annotations and parameters (things like speaker id) on the target tier
    annotations = sorted(input_eaf.get_annotation_data_for_tier(tier_name))
    parameters = input_eaf.get_parameters_for_tier(tier_name)
    speaker_id = parameters.get("PARTICIPANT", "")

    annotations_data = []

    for annotation in annotations:
        start = annotation[0]
        end = annotation[1]
        annotation = annotation[2]

        # print("processing annotation: " + annotation, start, end)
        obj = {
            "audio_file_name": f"{file_name}.wav",
            "transcript": annotation,
            "start_ms": start,
            "stop_ms": end
        }
        if "PARTICIPANT" in parameters:
            obj["speaker_id"] = speaker_id
        annotations_data.append(obj)

    return annotations_data
Example #7
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """
        try:
            print("File: " + file_name, file=sys.stderr)
            eaf = Eaf(file_name)
            eaf.add_lexicon_ref(LEXICON_REF, NAME, TYPE, URL,
                                LEXICON_ID, LEXICON_NAME, DATCAT_ID, DATCAT_NAME)

            # Remove old referred lexicon
            if LINGUISTIC_TYPE_ID in eaf.linguistic_types:
                if "LEXICON_REF" in eaf.linguistic_types[LINGUISTIC_TYPE_ID]:
                    old_lexicon_ref = eaf.linguistic_types[LINGUISTIC_TYPE_ID]["LEXICON_REF"]
                    del eaf.lexicon_refs[old_lexicon_ref]

            eaf.linguistic_types[LINGUISTIC_TYPE_ID]["LEXICON_REF"] = LEXICON_REF

            if self.output_dir is not None:
                eaf.to_file(self.output_dir + os.sep + os.path.basename(urlparse(file_name).path), pretty=True)
            else:
                eaf.to_file(file_name, pretty=True)
        except Exception:
            print("The EAF %s could not be processed." % file_name, file=sys.stderr)
            print(sys.exc_info()[0])
Example #8
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """
        try:
            eaf = Eaf(file_name)
            self.process_eaf(eaf, file_name)
            eaf.to_file(self.output_dir + os.sep + os.path.basename(urlparse(file_name).path), pretty=True)
        except IOError:
            print("The EAF %s could not be processed." % file_name, file=sys.stderr)
            print(sys.exc_info()[0])
Example #9
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """
        try:
            eaf = Eaf(file_name)
            self.process_eaf(eaf, file_name)
            eaf.to_file(self.output_dir + os.sep + os.path.basename(urlparse(file_name).path), pretty=True)
        except IOError:
            print("The EAF %s could not be processed." % file_name, file=sys.stderr)
            print(sys.exc_info()[0])
Example #10
def make_elan(source_parent_dir, target_parent_dir):
    """
    Make elan files based on filenames of wav files
    Written for the TIDIGITS corpus, so some things are specific to the name formats of that corpus
    """

    for dirname, dirnames, filenames in os.walk(source_parent_dir):

        # print path to all subdirectories first.
        for subdirname in dirnames:
            print(os.path.join(dirname, subdirname))

        # print path to all filenames.
        for filename in filenames:
            if '.wav' in filename:
                parent, gender, child = dirname.split(os.path.sep)
                basename, ext = os.path.splitext(os.path.basename(filename))
                print(parent, gender, child, filename)

                source_path = os.path.join(source_parent_dir, gender, child)
                target_path = os.path.join(target_parent_dir, gender, child)

                if not os.path.exists(target_path):
                    print(target_path)
                    os.makedirs(target_path)

                # Audio file duration - use this as end timeslot
                duration = int(librosa.get_duration(filename=os.path.join(source_path, filename))*1000)

                # Make file annotation from filename (minus the suffix)
                annotation = " ".join([char for char in basename[:-1]])
                # These are specific to the TIDIGITS naming convention
                annotation = annotation.replace("o", "oh")
                annotation = annotation.replace("z", "zero")

                text = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)

                print(filename, duration, annotation, text)

                # Make elan
                output_eaf = Eaf()
                output_eaf.add_tier('tx')
                output_eaf.insert_annotation('tx', 0, duration, text)
                output_eaf.add_linked_file(os.path.join(target_path, f'{basename}.wav'))

                output_eaf.to_file(os.path.join(target_path, f'{basename}.eaf'))
Example #11
def explore_elan_files(elan_paths):
    """
    A function to explore the tiers of ELAN files.
    """

    for elan_path in elan_paths:
        print(elan_path)
        eafob = Eaf(elan_path)
        tier_names = eafob.get_tier_names()
        for tier in tier_names:
            print("\t", tier)
            try:
                for annotation in eafob.get_annotation_data_for_tier(tier):
                    print("\t\t", annotation)
            except KeyError:
                continue

        input()
Example #12
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """

        gloss_append_lingtype = "gloss-append"

        try:
            eaf = Eaf(file_name)
            eaf.add_linguistic_type(gloss_append_lingtype, constraints="Symbolic_Association")
            gloss_tiers = self.find_gloss_tiers(eaf)
            self.add_gloss_tier_children(eaf, gloss_tiers, gloss_append_lingtype, file_name)
            eaf.to_file(self.output_dir + os.sep + os.path.basename(urlparse(file_name).path), pretty=True)
        except IOError:
            print("The EAF %s could not be processed." % file_name, file=sys.stderr)
            print(sys.exc_info()[0])
Example #13
def make_elans(spreadsheet: str, source: str, target: str):
    """
    Make ELAN files based on filenames of WAV files
    :param spreadsheet: Path and file name of the spreadsheet containing WAV filenames and matching annotations
    :param source: Directory name of folder containing WAV audio files
    :param  target: Directory name to save EAF files into
    """

    # Read spreadsheet data and convert to JSON format
    print('Loading data from spreadsheet')
    annotations = get_annotations(spreadsheet)

    # Process each file
    print('Processing WAVs')
    for _, _, filenames in os.walk(source):

        for filename in filenames:
            if '.wav' in filename:
                basename, ext = os.path.splitext(os.path.basename(filename))

                # Get audio file duration - use this as the EAF annotation's end timeslot
                duration = int(
                    librosa.get_duration(
                        filename=os.path.join(source, filename)) * 1000)

                # Get annotation from the source data matching on filename
                annotation = get_annotation(annotations, filename)

                # Add any annotation cleaning here
                # annotation = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)

                print(filename, duration, annotation)

                # Make EAF file
                output_eaf = Eaf()
                output_eaf.add_tier('tx')
                output_eaf.insert_annotation('tx', 0, duration, annotation)
                output_eaf.add_linked_file(
                    os.path.join(target, f'{basename}.wav'))
                output_eaf.to_file(os.path.join(target, f'{basename}.eaf'))
    print('>>> Done')
Example #14
def save_tier_info(input_eaf: Eaf = None,
                   file_name: str = '',
                   tier_types: List = [],
                   corpus_tiers_file: str = 'corpus_tiers.json'):
    tiers = []
    for tier_type in tier_types:
        tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
        tiers.append({tier_type: tier_names})
    file_data = {"file": file_name, "tiers": tiers}
    corpus_tiers = load_json_file(corpus_tiers_file)
    corpus_tiers.append(file_data)
    write_data_to_json_file(data=corpus_tiers, output=corpus_tiers_file)
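A usage sketch for save_tier_info, assuming load_json_file and write_data_to_json_file are the project's JSON helpers and that corpus_tiers.json is readable by them (file names here are placeholders):

input_eaf = Eaf('corpus/example.eaf')
tier_types = list(input_eaf.get_linguistic_type_names())
save_tier_info(input_eaf=input_eaf,
               file_name='example.eaf',
               tier_types=tier_types,
               corpus_tiers_file='corpus_tiers.json')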
Example #15
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """

        try:
            eaf = Eaf(file_name)
            videos = [os.path.basename(urlparse(media_descriptors['MEDIA_URL']).path)
                      for media_descriptors in eaf.media_descriptors
                      if media_descriptors['MIME_TYPE'] == 'video/mpeg']
            duration = self.find_max_duration(videos)
            if duration == 0.0:
                print("Duration could not be determined.", file=sys.stderr)
            else:
                annotation_values = self.create_annotation_values(file_name)
                self.add_new_annotations(eaf, annotation_values, duration)
                eaf.to_file(self.output_dir + os.sep + os.path.basename(urlparse(file_name).path), pretty=True)
        except IOError:
            print("The EAF %s could not be processed." % file_name, file=sys.stderr)
Example #16
def read_eaf(ie):
    # Get paths to files
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)

    input_eaf = Eaf(ie)

    # I want the media in the same folder as the eaf. error if not found
    # We could also parse the linked media.. let try this later
    # files = input_eaf.get_linked_files()

    # look for wav file matching the eaf file
    if os.path.isfile(os.path.join(inDir, basename + ".wav")):
        print("WAV file found for " + basename, file=sys.stderr)
    else:
        raise ValueError('Eeeek! WAV file not found for ' + basename +
                         '. Please put it next to the eaf file in ' + inDir)

    # Get annotations and params (things like speaker id) on the target tier
    annotations = sorted(input_eaf.get_annotation_data_for_tier(tier))
    params = input_eaf.get_parameters_for_tier(tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']

    for ann in annotations:
        start = ann[0]
        end = ann[1]
        annotation = ann[2]

        # print('processing annotation: ' + annotation, start, end)
        obj = {
            'audioFileName': basename + ".wav",
            'transcript': annotation,
            'startMs': start,
            'stopMs': end
        }
        if 'PARTICIPANT' in params:
            obj["speakerId"] = speaker_id
        annotations_data.append(obj)
Example #17
File: Praat.py Project: khoidt/pympi
    def to_eaf(self, skipempty=True, pointlength=0.1):
        """Convert the object to an pympi.Elan.Eaf object

        :param int pointlength: Length of respective interval from points in
                                seconds
        :param bool skipempty: Skip the empty annotations
        :returns: :class:`pympi.Elan.Eaf` object
        :raises ImportError: If the Eaf module can't be loaded.
        :raises ValueError: If the pointlength is not strictly positive.
        """
        from pympi.Elan import Eaf
        eaf_out = Eaf()
        if pointlength <= 0:
            raise ValueError('Pointlength should be strictly positive')
        for tier in self.get_tiers():
            eaf_out.add_tier(tier.name)
            for ann in tier.get_intervals(True):
                if tier.tier_type == 'TextTier':
                    ann = (ann[0], ann[0]+pointlength, ann[1])
                if ann[2].strip() or not skipempty:
                    eaf_out.add_annotation(tier.name, int(round(ann[0]*1000)),
                                           int(round(ann[1]*1000)), ann[2])
        return eaf_out
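A usage sketch for the conversion above, assuming a Praat TextGrid file on disk (the file names are placeholders):

from pympi.Praat import TextGrid

tg = TextGrid('recording.TextGrid')
eaf = tg.to_eaf(skipempty=True, pointlength=0.1)
eaf.to_file('recording.eaf', pretty=True)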
Example #18
def make_elans(input_dir: str, output_dir: str, copy_wavs: bool):
    """
    Make ELAN files based on filenames of WAV files and annotation from matching text file
    :param input_dir: Directory name of folder containing TXT and WAV audio files
    :param  output_dir: Directory name to save EAF files into
    :param copy_wavs: Setting whether or not to copy the WAV file to the output dir
    """
    # Process each file
    for _, _, filenames in os.walk(input_dir):

        for filename in filenames:
            if '.wav' in filename:
                basename, ext = os.path.splitext(os.path.basename(filename))
                print(basename)

                # Get audio file duration - use this as the EAF annotation's end timeslot
                duration = int(
                    librosa.get_duration(
                        filename=os.path.join(input_dir, filename)) * 1000)

                # Get annotation from the text file matching on file basename
                annotation = get_annotation(input_dir, basename)

                # Add any annotation cleaning here
                # annotation = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)

                print(duration, annotation)

                # Make EAF file
                output_eaf = Eaf()
                # output_eaf.add_tier('default')
                output_eaf.insert_annotation('default', 0, duration,
                                             annotation)
                output_eaf.add_linked_file(
                    os.path.join(output_dir, f'{basename}.wav'))
                output_eaf.to_file(os.path.join(output_dir, f'{basename}.eaf'))

                # Copy WAV?
                if copy_wavs:
                    shutil.copyfile(os.path.join(input_dir, filename),
                                    os.path.join(output_dir, filename))
    print('>>> Done')
Example #19
    def toEaf(self, filepath):
        """
        Write to eaf

        filepath -- Filepath to write to - for stdout"""
        try:
            from pympi.Elan import Eaf
        except ImportError:
            warnings.warn('toEaf: Please install the pympi.Elan.Eaf module f' +
                          'from the pympi package found at https://github.co' +
                          'm/dopefishh/pympi')
            return 1
        eafOut = Eaf()
        for tier in self.tiers:
            eafOut.addTier(tier)
            for annotation in self.tiers[tier].intervals:
                eafOut.insertAnnotation(tier, int(annotation[0] * 1000),
                                        int(annotation[1] * 1000),
                                        annotation[2])
        eafOut.tofile(filepath)
        return 0
Example #20
    def toEaf(self, filepath):
        """
        Write to eaf

        filepath -- Filepath to write to - for stdout"""
        try:
            from pympi.Elan import Eaf
        except ImportError:
            warnings.warn('toEaf: Please install the pympi.Elan.Eaf module f' +
                          'from the pympi package found at https://github.co' +
                          'm/dopefishh/pympi')
            return 1
        eafOut = Eaf()
        for tier in self.tiers:
            eafOut.addTier(tier)
            for annotation in self.tiers[tier].intervals:
                eafOut.insertAnnotation(tier, int(annotation[0]*1000),
                                        int(annotation[1]*1000), annotation[2])
        eafOut.tofile(filepath)
        return 0
Example #21
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """
        try:
            eaf = Eaf(file_name)
            lex_ref = "signbank-lexicon-ref"
            eaf.add_lexicon_ref(lex_ref, "NGT-Signbank", "Signbank", "https://signbank.science.ru.nl/",
                                "NGT", "NGT", "Annotation Id Gloss", "Annotation Id Gloss")
            eaf.linguistic_types["gloss"]["LEXICON_REF"] = lex_ref

            eaf.to_file(self.output_dir + os.sep + os.path.basename(urlparse(file_name).path), pretty=True)
        except IOError:
            print("The EAF %s could not be processed." % file_name, file=sys.stderr)
            print(sys.exc_info()[0])
Example #22
    def to_eaf(self, pointlength=0.1):
        """Convert the object to an pympi.Elan.Eaf object

        :param int pointlength: Length of respective interval from points in
                                seconds
        :returns: :class:`pympi.Elan.Eaf` object
        :raises ImportError: If the Eaf module can't be loaded.
        :raises ValueError: If the pointlength is not strictly positive.
        """
        from pympi.Elan import Eaf
        eaf_out = Eaf()
        if pointlength <= 0:
            raise ValueError('Pointlength should be strictly positive')
        for tier in self.get_tiers():
            eaf_out.add_tier(tier.name)
            for ann in tier.get_intervals(True):
                if tier.tier_type == 'TextTier':
                    ann = (ann[0], ann[0] + pointlength, ann[1])
                eaf_out.insert_annotation(tier.name, int(round(ann[0] * 1000)),
                                          int(round(ann[1] * 1000)), ann[2])
        return eaf_out
Example #23
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """

        gloss_append_lingtype = "gloss-append"

        try:
            eaf = Eaf(file_name)
            eaf.add_linguistic_type(gloss_append_lingtype,
                                    constraints="Symbolic_Association")
            gloss_tiers = self.find_gloss_tiers(eaf)
            self.add_gloss_tier_children(eaf, gloss_tiers,
                                         gloss_append_lingtype, file_name)
            eaf.to_file(self.output_dir + os.sep +
                        os.path.basename(urlparse(file_name).path),
                        pretty=True)
        except IOError:
            print("The EAF %s could not be processed." % file_name,
                  file=sys.stderr)
            print(sys.exc_info()[0])
Example #24
def import_eaf_file(eaf_paths, context, add_annotation, tmp_dir):
    """
    Import handler for processing all .wav and .eaf files.

    :param wav_paths: List of string paths to Wave files.
    :param eaf_paths: List of string paths to Elan files.
    """
    """
    Import handler for processing all .eaf files.

    Method to process a particular tier in an eaf file (ELAN Annotation Format). It stores the transcriptions in the 
    following format:
                    {'speaker_id': <speaker_id>,
                    'audio_file_name': <file_name>,
                    'transcript': <transcription_label>,
                    'start_ms': <start_time_in_milliseconds>,
                    'stop_ms': <stop_time_in_milliseconds>}

    :param eaf_paths: List of string paths to Elan files.
    :return: a list of dictionaries, where each dictionary is an annotation
    """
    tier_order = context['tier_order']
    tier_name = context['tier_name']
    tier_type = context['tier_type']
    punctuation_to_collapse_by = context['punctuation_to_collapse_by']
    punctuation_to_explode_by = context['punctuation_to_explode_by']
    # Convert dirty words and tokens from str to set, split by '\n'
    special_cases = set(context['special_cases'].splitlines())
    translation_tags = set(context['translation_tags'].splitlines())

    for input_elan_file in eaf_paths:
        # Get paths to files
        input_directory, full_file_name = os.path.split(input_elan_file)
        file_name, extension = os.path.splitext(full_file_name)

        input_eaf = Eaf(input_elan_file)
        tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
        tier_names: List[str] = list(input_eaf.get_tier_names())

        # TODO: Check if this is necessary? It is possible to process transcription and audio file separately.
        # # Look for wav file matching the eaf file in same directory
        # if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        #     print("WAV file found for " + file_name, file=sys.stderr)
        # else:
        #     raise ValueError(f"WAV file not found for {full_file_name}. "
        #                     f"Please put it next to the eaf file in {input_directory}.")

        # Get annotations and parameters (things like speaker id) on the target tier
        annotations: List[Tuple[str, str, str]] = []
        annotation_data: List[dict] = []

        # Determine tier_name
        # First try using tier order to get tier name
        if tier_order:
            # Watch out for files that may not have this many tiers
            # tier_order is 1-index but List indexing is 0-index
            try:
                tier_name = tier_names[tier_order - 1]
                print(
                    f"using tier order {tier_order} to get tier name {tier_name}"
                )
            except IndexError:
                print("couldn't find a tier")
                pass
        else:
            # else use tier type to get a tier name
            if tier_type in tier_types:
                print(f"found tier type {tier_type}")
                tier_names = input_eaf.get_tier_ids_for_linguistic_type(
                    tier_type)
                tier_name = tier_names[0]
                if tier_name:
                    print(f"found tier name {tier_name}")
            else:
                print("tier type not found in this file")

        if tier_name in tier_names:
            print(f"using tier name {tier_name}")
            annotations = input_eaf.get_annotation_data_for_tier(tier_name)
        else:
            pass  # TODO: Alert user of a skip due to missing tier_name in file

        if annotations:
            annotations = sorted(annotations)
            parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(
                tier_name)
            speaker_id: str = parameters.get("PARTICIPANT", "")

        for annotation in annotations:
            start = annotation[0]
            end = annotation[1]
            annotation = annotation[2]

            utterance = {
                "audio_file_name": f"{file_name}.wav",
                "transcript": annotation,
                "start_ms": start,
                "stop_ms": end
            }
            # TODO: re-enable later
            # if "PARTICIPANT" in parameters:
            #     obj["speaker_id"] = speaker_id

            utterance_cleaned = clean_json_utterance(
                utterance=utterance,
                punctuation_to_collapse_by=punctuation_to_collapse_by,
                punctuation_to_explode_by=punctuation_to_explode_by,
                special_cases=special_cases,
                translation_tags=translation_tags,
                remove_english=False,
                use_langid=False)
            add_annotation(file_name, utterance_cleaned)
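A sketch of the context mapping that import_eaf_file reads from; the keys mirror the lookups in the code above and the values are placeholders:

context = {
    'tier_order': 0,                   # 0 means "do not select by order"
    'tier_name': 'Phrase',
    'tier_type': 'default-lt',
    'punctuation_to_collapse_by': '',
    'punctuation_to_explode_by': '',
    'special_cases': '',               # newline-separated words
    'translation_tags': '',            # newline-separated tags
}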
Example #25
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """

        gloss_lingtypes = ["Gloss Child", "Gloss Adult"]
        external_ref = "ecv_ref"
        ecv_name = "ASL Signbank lexicon"

        try:
            eaf = Eaf(file_name)

            # Add linguistic types
            for lingtype in gloss_lingtypes:
                eaf.add_linguistic_type(lingtype, constraints=None)

            # Add linguistic types to tiers
            gloss_tiers = self.find_gloss_tiers(eaf)
            for tier in gloss_tiers:
                if "Adult" in tier:
                    eaf.tiers[tier][2]['LINGUISTIC_TYPE_REF'] = "Gloss Adult"
                elif "Child" in tier:
                    eaf.tiers[tier][2]['LINGUISTIC_TYPE_REF'] = "Gloss Child"

            # Add an ECV external reference
            eaf.add_external_ref(external_ref, "ecv", "http://applejack.science.ru.nl/asl-signbank/static/ecv/asl.ecv")

            # Add a Controlled Vocabulary
            eaf.add_controlled_vocabulary(ecv_name, external_ref)

            # Add the CV to linguistic types
            for lingtype in gloss_lingtypes:
                eaf.linguistic_types[lingtype]['CONTROLLED_VOCABULARY_REF'] = ecv_name

            eaf.to_file(self.output_dir + os.sep + os.path.basename(urlparse(file_name).path), pretty=True)
        except IOError:
            print("The EAF %s could not be processed." % file_name, file=sys.stderr)
            print(sys.exc_info()[0])
Example #26
def main():
    """
    File 1 has the utterance and utterance translation
    File 2 has the gloss
    File 3 is the destination
    """
    # Input files
    file_1 = 'input/file-1.eaf'
    file_2 = 'input/file-2.eaf'
    file_3 = 'input/new.eaf'

    # Tier names
    utterance_id_source_tier = "A_phrase-segnum-en"
    utterance_id_target_tier = "utterance_id"
    utterance_source_tier = "DDD_Transcription-txt-qaa-fonipa-x-eib"
    utterance_target_tier = "utterance"
    utterance_translation_source_tier = "DDD_Translation-gls-en"
    utterance_translation_target_tier = "utterance_translation"
    word_source_tier = "A_word-txt-qaa-fonipa-x-eib"
    word_target_tier = "grammatical_words"
    morph_source_tier = "A_morph-txt-qaa-fonipa-x-eib"
    gloss_source_tier = "A_morph-gls-en"
    gloss_target_tier = "gloss"

    # Set up the eaf objects
    eaf_1 = Eaf(file_1)
    eaf_2 = Eaf(file_2)
    eaf_3 = Eaf()

    # Remove default tier and copy media
    eaf_3.remove_tier("default")
    # eaf_3 = copy_media(eaf_1, eaf_3)
    """
    Copy annotation number tier from file 2
    tier-type default-lt
    <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
    """
    print("Copying annotation numbers from file 2")
    utterance_id_type_params = {
        'LINGUISTIC_TYPE_ID': 'default-lt',
        'TIME_ALIGNABLE': 'true'
    }
    utterance_id_tier_params = {
        'LINGUISTIC_TYPE_REF': 'default-lt',
        'TIER_ID': utterance_id_target_tier
    }
    _tier_copy(source_eaf=eaf_2,
               target_eaf=eaf_3,
               source_tier_name=utterance_id_source_tier,
               target_tier_name=utterance_id_target_tier,
               override_params=utterance_id_tier_params)
    """
    Copy utterance tier from file 1
    LINGUISTIC_TYPE_REF="Blank"
    <LINGUISTIC_TYPE CONSTRAINTS="Symbolic_Association" GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="Blank" TIME_ALIGNABLE="false"/>
    """
    print("Copying utterance tier from file 1")
    blank_type_params = {
        'LINGUISTIC_TYPE_ID': 'Blank',
        'CONSTRAINTS': 'Symbolic_Association',
        'TIME_ALIGNABLE': 'false'
    }
    eaf_3.add_linguistic_type('Blank', param_dict=blank_type_params)
    utterance_tier_params = {
        'LINGUISTIC_TYPE_REF': 'Blank',
        'PARENT_REF': utterance_id_target_tier,
        'TIER_ID': utterance_target_tier
    }
    _tier_copy_to_ref(source_eaf=eaf_1,
                      target_eaf=eaf_3,
                      source_tier_name=utterance_source_tier,
                      target_tier_name=utterance_target_tier,
                      target_parent_tier_name=utterance_id_target_tier,
                      override_params=utterance_tier_params)
    """
    Copy utterance translation tier from file 1
    LINGUISTIC_TYPE_REF="Blank"
    <LINGUISTIC_TYPE CONSTRAINTS="Symbolic_Association" GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="Blank" TIME_ALIGNABLE="false"/>
    <TIER LINGUISTIC_TYPE_REF="Blank" PARENT_REF="utterance" PARTICIPANT="DDD" TIER_ID="utterance_translation">    
    """
    print("Copying utterance translation tier from file 1")
    utterance_translation_tier_params = {
        'LINGUISTIC_TYPE_REF': 'Blank',
        'PARENT_REF': utterance_target_tier,
        'TIER_ID': utterance_translation_target_tier
    }
    _ref_tier_copy(source_eaf=eaf_1,
                   target_eaf=eaf_3,
                   source_tier_name=utterance_translation_source_tier,
                   target_tier_name=utterance_translation_target_tier,
                   target_parent_tier_name=utterance_target_tier,
                   override_params=utterance_translation_tier_params)
    """
    Copy the word tier from file 2
        <LINGUISTIC_TYPE CONSTRAINTS="Symbolic_Subdivision" GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="word" TIME_ALIGNABLE="false"/>
        <TIER DEFAULT_LOCALE="qaa-fonipa-x-eib" LINGUISTIC_TYPE_REF="word" PARENT_REF="A_phrase-segnum-en" PARTICIPANT="DDD" TIER_ID="A_word-txt-qaa-fonipa-x-eib">

    """
    print("Copying word tier from file 2")
    word_type_params = {
        'LINGUISTIC_TYPE_ID': 'word',
        'CONSTRAINTS': 'Symbolic_Subdivision',
        'TIME_ALIGNABLE': 'false'
    }
    eaf_3.add_linguistic_type('word', param_dict=word_type_params)

    word_tier_params = {
        'LINGUISTIC_TYPE_REF': 'word',
        'PARENT_REF': utterance_target_tier,
        'TIER_ID': word_target_tier
    }

    _copy_symbolic_subdivision_tier(
        source_eaf=eaf_2,
        target_eaf=eaf_3,
        source_tier_name=word_source_tier,
        target_tier_name=word_target_tier,
        target_parent_tier_name=utterance_id_target_tier,
        override_params=word_tier_params)
    """
    Get all the annotations from -2 gloss tier (gloss_source_tier A_morph-gls-en)
    Join the glosses with "-" so there is a 1:1 match with word annotations
    <LINGUISTIC_TYPE CONSTRAINTS="Symbolic_Association" GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="Blank" TIME_ALIGNABLE="false"/>
    <TIER LINGUISTIC_TYPE_REF="Blank" PARENT_REF="grammatical_words" TIER_ID="gloss">
    """
    print("Epic battle with words to get glosses from file 2")
    gloss_tier_params = {
        'LINGUISTIC_TYPE_REF': 'Blank',
        'PARENT_REF': word_target_tier,
        'TIER_ID': gloss_target_tier
    }
    # None of the pympi methods will suit this task, so let's do it manually.
    # Get all the data
    eaf_2_tiers = eaf_2.tiers
    eaf_2_timeslots = eaf_2.timeslots
    # A tier is of the form: {tier_name -> (aligned_annotations, reference_annotations, attributes, ordinal)},
    # Word and gloss tiers are ref_annotations, the second item in the tiers dict. See docs for more info about format.
    word_tier = eaf_2_tiers[word_source_tier][1]
    morph_tier = eaf_2_tiers[morph_source_tier][1]
    gloss_tier = eaf_2_tiers[gloss_source_tier][1]

    # Each reference annotation is of the form: [{id -> (reference, value, previous, svg_ref)}].
    # Start at the top of the hierarchy
    utterance_id_tier = eaf_2_tiers[utterance_id_source_tier][0]

    new_dict = dict()
    # For each utterance, get the words. For each word, get the glosses. Merge glosses for each word
    for utterance_id, utterance in utterance_id_tier.items():
        utt_start = eaf_2_timeslots[utterance[0]]
        utt_end = eaf_2_timeslots[utterance[1]]
        word_gloss: List[Union[int, List[str]]] = []
        for word_id, word in word_tier.items():
            if word[0] == utterance_id:
                glosses = []
                # Find morphs of this word...
                for morph_id, morph in morph_tier.items():
                    # ...by filtering on morph parents id matching the word id
                    if morph[0] == word_id:
                        for gloss_id, gloss in gloss_tier.items():
                            if gloss[0] == morph_id:
                                glosses.append(gloss[1])
                # Join glosses for this word with a dash
                word_gloss.append([word[1], '-'.join(glosses)])
        # Now, work out word duration (it is an even division of parent utterance duration)
        # Make this value the first item in the data list eg [word_duration, [word, gloss], [word, gloss], ...]
        num_segments = len(word_gloss)
        utt_dur = utt_end - utt_start
        word_dur = int(utt_dur / num_segments)
        word_gloss = [utt_start, word_dur] + word_gloss
        print("word gloss", word_gloss)
        new_dict[utterance_id] = word_gloss

    # Having worked all that out, now we can add a ref annotation tier.
    # but parent seems to now bubble all the way to the top.
    eaf_3.add_tier(gloss_target_tier,
                   ling='Blank',
                   parent=word_target_tier,
                   tier_dict=gloss_tier_params)
    # And some annotations
    for ann_id, annotation in new_dict.items():
        utt_start = annotation[0]
        word_dur = annotation[1]
        count = 0

        for ann in annotation[2:]:
            word_start = utt_start + word_dur * count
            id_tier = gloss_target_tier
            tier2 = word_target_tier
            value = ann[1]
            prev = None
            svg = None

            for aid, (ref_id, _value, _prev,
                      _) in eaf_3.tiers[tier2][1].items():
                if ann[0] == _value:
                    new_aid = eaf_3.generate_annotation_id()
                    eaf_3.tiers[id_tier][1][new_aid] = (aid, value, prev, svg)

            count = count + 1

    # Save the new file
    print("Saving object to file")
    eaf_3.to_file(file_3)
Example #27
    def test_overlapping_utters(self, prep_org_data):
        tier1 = "rf"
        tier2 = "rf@MN"
        eaf_path = prep_org_data / "Marys_Yirlinkirrkirr.eaf"
        eaf = Eaf(str(eaf_path))
Example #28
def process_eaf(input_elan_file: str = '',
                tier_order: int = 0,
                tier_name: str = '',
                tier_type: str = '',
                corpus_tiers_file: str = '') -> List[dict]:
    """
    Method to process a particular tier in an eaf file (ELAN Annotation Format).
    Transcriptions are read from an elan file tier.
    Tiers are nodes from the tree structure in the .eaf file.
    The tier to read from is determined by tier order (eg top tier would be order 1),
    tier type (eg default-lt) or tier name (eg Phrase).
    If tier type is used, the first tier matching this type is used.
    Elan can have multiple tiers of same type, future work would support reading data
    from multiple tiers of the selected type.

    It stores the transcriptions in the following format:
                    {'speaker_id': <speaker_id>,
                    'audio_file_name': <file_name>,
                    'transcript': <transcription_label>,
                    'start_ms': <start_time_in_milliseconds>,
                    'stop_ms': <stop_time_in_milliseconds>}

    :param input_elan_file: name of input elan file
    :param tier_order: index of the elan tier to process
    :param tier_type:  type of the elan tier to process
    :param tier_name:  name of the elan tier to process
    :return: a list of dictionaries, where each dictionary is an annotation
    """

    print(
        f"processing eaf {input_elan_file} using {tier_order} {tier_type} {tier_name}"
    )

    # Get paths to files
    input_directory, full_file_name = os.path.split(input_elan_file)
    file_name, extension = os.path.splitext(full_file_name)

    # Look for wav file matching the eaf file in same directory
    if os.path.isfile(os.path.join(input_directory, file_name + ".wav")):
        print("WAV file found for " + file_name, file=sys.stderr)
    else:
        raise ValueError(
            f"WAV file not found for {full_file_name}. "
            f"Please put it next to the eaf file in {input_directory}.")

    # Get tier data from Elan file
    input_eaf = Eaf(input_elan_file)
    tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
    tier_names: List[str] = list(input_eaf.get_tier_names())

    # Keep this data handy for future corpus analysis
    # save_tier_info(input_eaf=input_eaf,
    #               tier_types=tier_types,
    #               file_name=file_name,
    #               corpus_tiers_file=corpus_tiers_file)

    # Get annotations and parameters (things like speaker id) on the target tier
    annotations: List[Tuple[str, str, str]] = []
    annotations_data: List[dict] = []

    # First try using tier order to get tier name
    if tier_order:
        # Watch out for files that may not have this many tiers
        # tier_order is 1-index but List indexing is 0-index
        try:
            tier_name = tier_names[tier_order - 1]
            print(
                f"using tier order {tier_order} to get tier name {tier_name}")
        except IndexError:
            print("couldn't find a tier")
            pass
    else:
        # else use tier type to get a tier name
        if tier_type in tier_types:
            print(f"found tier type {tier_type}")
            tier_names = input_eaf.get_tier_ids_for_linguistic_type(tier_type)
            tier_name = tier_names[0]
            if tier_name:
                print(f"found tier name {tier_name}")
        else:
            print("tier type not found in this file")

    if tier_name in tier_names:
        print(f"using tier name {tier_name}")
        annotations = input_eaf.get_annotation_data_for_tier(tier_name)

    if annotations:
        print(f"annotations {annotations}")
        annotations = sorted(annotations)
        parameters: Dict[str,
                         str] = input_eaf.get_parameters_for_tier(tier_name)
        print(f"parameters {parameters}")
        speaker_id: str = parameters.get("PARTICIPANT", "")

    for annotation in annotations:
        start: str = annotation[0]
        end: str = annotation[1]
        annotation_text: str = annotation[2]
        print(f"annotation {annotation} {start} {end}")
        obj = {
            "audio_file_name": f"{file_name}.wav",
            "transcript": annotation_text,
            "start_ms": start,
            "stop_ms": end
        }
        if "PARTICIPANT" in parameters:
            obj["speaker_id"] = speaker_id
        annotations_data.append(obj)

    return annotations_data
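A usage sketch for process_eaf, assuming an ELAN file with a matching WAV file next to it and a tier named 'Phrase' (both names are placeholders):

annotations = process_eaf(input_elan_file='corpus/example.eaf',
                          tier_name='Phrase')
for utterance in annotations:
    print(utterance['start_ms'], utterance['stop_ms'], utterance['transcript'])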
Example #29
def read_eaf(ie):

    if verbose:
        print("input file is", ie)

    input_eaf = Eaf(ie)

    # Check if the tiers we have been given exist
    tier_names = list(input_eaf.get_tier_names())
    if verbose:
        print("tier_names", tier_names, file=sys.stderr)

    # Are we working by slice_tier name or order?
    if slice_tier != "default":
        if verbose:
            print("using slice_tier by name:", slice_tier, file=sys.stderr)
    else:

        # Sanity check that the slice_tier num is not greater than the num of tiers
        if tier_order > len(tier_names):
            print("Error: tier number is greater than the number of tiers",
                  file=sys.stderr)
            return False
        if verbose:
            print("using slice_tier by number:",
                  tier_names[tier_order - 1],
                  file=sys.stderr)

    if slice_tier not in tier_names:
        print('Error: missing slice_tier ' + slice_tier, file=sys.stderr)
        return False

    if silence_tier not in tier_names:
        if verbose:
            print('silence tier not found: ' + silence_tier, file=sys.stderr)

    # get the input audio file
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)

    # we can write out mp3 or whatever, still require wav input
    ia = os.path.join(inDir, basename + ".wav")
    input_audio = AudioSegment.from_wav(ia)

    # We can pass in an arg for a ref tier that has silence labels
    check_silence_ref_tier = False
    if silence_tier in tier_names:
        silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
        if silence_tier_info.get("PARENT_REF") == tier:
            check_silence_ref_tier = True

    # Get annotation values, start and end times, and speaker id
    if text_tier not in tier_names:
        print('Error: missing text tier')
        return False

    annotations = sorted(input_eaf.get_annotation_data_for_tier(text_tier))

    params = input_eaf.get_parameters_for_tier(text_tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']

    annotations_data = []
    i = 0
    for ann in annotations:
        skip = False
        ref_annotation = []
        start = ann[0]
        end = ann[1]
        # output new values, not the original clip start end times
        clip_start = 0
        clip_end = ann[1] - ann[0]
        annotation = ann[2]

        # Check for annotations labelled with a particular symbol on the main tier
        if annotation == silence_marker:
            skip = True

        # Check for existence of an annotation in ref tier to silence
        # Annotation value doesn't matter
        if check_silence_ref_tier:
            ref_annotation = input_eaf.get_ref_annotation_at_time(
                silence_tier, start)
            if len(ref_annotation) > 0:
                skip = True

        if skip is True:
            print('skipping annotation: ' + annotation, start, end)
        else:
            print('processing annotation: ' + annotation, start, end)
            # build the output audio/text filename
            fname = basename + "_" + str(i)
            if name_with_annotation:
                fname = slugify(annotation)

            if prefix != '':
                fname = prefix + '_' + fname
            obj = {
                'audioFileName': os.path.join(".", fname + ".wav"),
                'transcript': annotation,
                'startMs': clip_start,
                'stopMs': clip_end
            }
            if 'PARTICIPANT' in params:
                obj["speakerId"] = speaker_id
            annotations_data.append(obj)
            split_audio_by_start_end(input_audio, start, end, fname)
            write_text(annotation, fname)
            i += 1
    # output the json data for the next step in kaldi pipeline
    write_json(annotations_data)

    if verbose:
        print(annotations_data)
Example #30
    parser = argparse.ArgumentParser(description='Converts an EAF file into a Kaldi data dir')
    parser.add_argument('eaf', help='Input EAF file')
    parser.add_argument('data', help='Output data directory')
    parser.add_argument('--skip-tiers', help='Comma-separated list of tiers to skip.')
    parser.add_argument('--spk-tier', action='store_true',
                        help='Each tier is one speaker, otherwise each segment is new speaker.')

    args = parser.parse_args()

    eaf_path = Path(args.eaf)
    data_path = Path(args.data)
    spk_tier = args.spk_tier

    data_path.mkdir(exist_ok=True)

    eaf = Eaf(str(eaf_path))

    segments = []
    tier_names = list(eaf.tiers.keys())
    if args.skip_tiers:
        for t in args.skip_tiers.split(','):
            tier_names.remove(t)

    num = 1
    for t in tier_names:
        for id, s in eaf.tiers[t][0].items():
            start = eaf.timeslots[s[0]]
            end = eaf.timeslots[s[1]]
            text = s[2].strip()
            if spk_tier:
                spk = t + '_'
Example #31
def read_eaf(ie, tier, silence_tier, silence_marker, json_data, output_text_dir, output_audio_dir):

    input_eaf = Eaf(ie)

    # Check if the tiers we have been given exist
    tier_names = input_eaf.get_tier_names()
    if tier not in tier_names:
        print('missing tier: ' + tier, file=sys.stderr)
        return False
    if silence_tier not in tier_names:
        print('missing silence tier: ' + silence_tier, file=sys.stderr)

    # get the input audio file
    inDir, name = os.path.split(ie)
    basename, ext = os.path.splitext(name)
    ia = os.path.join(inDir, basename + ".wav")
    input_audio = AudioSegment.from_wav(ia)

    # We can pass in an arg for a ref tier that has silence labels
    check_silence_ref_tier = False
    if silence_tier in tier_names:
        silence_tier_info = input_eaf.get_parameters_for_tier(silence_tier)
        if silence_tier_info.get("PARENT_REF") == tier:
            check_silence_ref_tier = True

    # Get annotation values, start and end times, and speaker id
    annotations = sorted(input_eaf.get_annotation_data_for_tier(tier))
    params = input_eaf.get_parameters_for_tier(tier)
    if 'PARTICIPANT' in params:
        speaker_id = params['PARTICIPANT']

    i = 0
    for ann in annotations:
        skip = False
        start = ann[0]
        end = ann[1]
        # output new values, not the original clip start end times
        clip_start = 0
        clip_end = ann[1] - ann[0]
        annotation = ann[2]

        # Check for annotations labelled with a particular symbol on the main tier
        if annotation == silence_marker:
            skip = True

        # Check for existence of an annotation in ref tier to silence
        # Annotation value doesn't matter
        if check_silence_ref_tier and len(input_eaf.get_ref_annotation_at_time(silence_tier, start)):
            skip = True

        if skip is True:
            # print('skipping annotation: ' + annotation, start, end)
            print("skipping" + str(i))
        else:
            print("processing" + str(i))
            # print('processing annotation: ' + annotation, start, end)
            # build the output audio/text filename
            fname = basename + "_" + str(i)
            obj = {
                'audioFileName': os.path.join(".", fname + ".wav"),
                'transcript': annotation,
                'startMs': clip_start,
                'stopMs': clip_end
            }
            if 'PARTICIPANT' in params:
                obj["speakerId"] = speaker_id
            json_data.append(obj)
            split_audio_by_start_end(input_audio, start, end, fname, ".wav", output_audio_dir)
            write_text(annotation, fname, ".txt", output_text_dir)
            i += 1
Example #32
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('words_ctm', type=Path)
    parser.add_argument('seg2tier', type=Path)
    parser.add_argument('eaf_in', type=Path)
    parser.add_argument('eaf_out', type=Path)
    parser.add_argument('--phones-ctm', default=None, type=Path)

    args = parser.parse_args()

    words_ctm_path = args.words_ctm
    seg2tier_path = args.seg2tier
    eaf_in_path = args.eaf_in
    eaf_out_path = args.eaf_out

    eaf = Eaf(str(eaf_in_path))

    seg2tier = {}
    with open(seg2tier_path) as f:
        for l in f:
            tok = l.strip().split()
            seg2tier[tok[0]] = tok[1]

    tiers = read_ctm(words_ctm_path, seg2tier, eaf)

    for tier in tiers.keys():
        part = eaf.tiers[tier][2]['PARTICIPANT']
        t = eaf.add_tier('{}_words'.format(tier), parent='tier', part=part, ann='Clarin-PL-service')

    for tier, segs in tiers.items():
        for seg in segs:
Example #33
    def process_file(self, file_name):
        """
        Processes one file.

        :param file_name:
        :return:
        """

        gloss_lingtypes = ["Gloss Child", "Gloss Adult"]
        external_ref = "ecv_ref"
        ecv_name = "ASL Signbank lexicon"

        try:
            eaf = Eaf(file_name)

            # Add linguistic types
            for lingtype in gloss_lingtypes:
                eaf.add_linguistic_type(lingtype, constraints=None)

            # Add linguistic types to tiers
            gloss_tiers = self.find_gloss_tiers(eaf)
            for tier in gloss_tiers:
                if "Adult" in tier:
                    eaf.tiers[tier][2]['LINGUISTIC_TYPE_REF'] = "Gloss Adult"
                elif "Child" in tier:
                    eaf.tiers[tier][2]['LINGUISTIC_TYPE_REF'] = "Gloss Child"

            # Add an ECV external reference
            eaf.add_external_ref(
                external_ref, "ecv",
                "http://applejack.science.ru.nl/asl-signbank/static/ecv/asl.ecv"
            )

            # Add a Controlled Vocabulary
            eaf.add_controlled_vocabulary(ecv_name, external_ref)

            # Add the CV to linguistic types
            for lingtype in gloss_lingtypes:
                eaf.linguistic_types[lingtype][
                    'CONTROLLED_VOCABULARY_REF'] = ecv_name

            eaf.to_file(self.output_dir + os.sep +
                        os.path.basename(urlparse(file_name).path),
                        pretty=True)
        except IOError:
            print("The EAF %s could not be processed." % file_name,
                  file=sys.stderr)
            print(sys.exc_info()[0])
Example #34
def make_elans(input_dir: str, output_dir: str, copy_wavs: bool):
    """
    Make ELAN files based on filenames of WAV files and annotation from matching text file
    :param input_dir: Directory name of folder containing TXT and WAV audio files
    :param  output_dir: Directory name to save EAF files into
    :param copy_wavs: Setting whether or not to copy the WAV file to the output dir
    """
    # Process each file
    files = glob.glob(f'{input_dir}/**/*.txt', recursive=True)
    print(files)

    for filename in files:

        filepath, ext = os.path.splitext(filename)
        basename = os.path.splitext(os.path.basename(filepath))[0]
        subdirname = os.path.basename(os.path.dirname(filepath))

        sex = subdirname[0]
        participant = subdirname[1:]

        # SEX :== m | f
        # SPEAKER_ID :== <INITIALS><DIGIT>
        # INITIALS :== speaker initials, 3 letters
        # DIGIT :== number 0-9 to differentiate speakers with identical initials

        # print(filename)     # input/dr1/fmem0/sa2.txt
        # print(filepath)     # input/dr1/fmem0/sa2
        # print(subdirname)   # fmem0
        # print(basename)     # sa2
        # print(ext)          # .txt

        # Alternative: get the audio file duration and use it as the EAF annotation's end timeslot
        # duration = int(librosa.get_duration(filename=os.path.join(input_dir, filename))*1000)

        # Get annotation from the text file matching on file basename
        with open(filename, 'r', encoding='utf-8') as text_file:
            annotation = text_file.read()
        annotation_split = annotation.split()
        # the first two tokens are start/end positions in audio samples
        start = int(annotation_split[0])
        end = int(annotation_split[1])
        # convert audio samples to milliseconds, assuming a 16 kHz sample rate
        start = int(start / 16000 * 1000)
        end = int(end / 16000 * 1000)
        annotation_text = " ".join(annotation_split[2:])

        # Add any annotation cleaning here
        # annotation = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), annotation)

        print(start, end, annotation_text)

        # Make EAF file
        output_eaf = Eaf()
        output_eaf.add_tier('default', part=participant)
        output_eaf.add_annotation('default', start, end, annotation_text)
        output_eaf.add_linked_file(
            os.path.join(output_dir, f'{subdirname}-{basename}.wav'))
        output_eaf.to_file(
            os.path.join(output_dir, f'{subdirname}-{basename}.eaf'))

        # Copy the WAV to the output dir if requested
        if copy_wavs:
            shutil.copyfile(
                f'{filepath}.wav',
                os.path.join(output_dir, f'{subdirname}-{basename}.wav'))

    print('>>> Done')
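
# A minimal invocation sketch; the paths and the copy_wavs flag here are illustrative only.
if __name__ == '__main__':
    make_elans(input_dir='input', output_dir='output', copy_wavs=True)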
Example #35
def import_eaf_file(eaf_paths: List[str], context: Dict[str, str],
                    reset_annotations: Callable, add_annotation: Callable,
                    tmp_dir):
    """
    Import handler for processing .eaf files.

    :param eaf_paths: List of string paths to Elan files.
    :param context: The settings that will be used to process data from the Elan files.
    :param reset_annotations: Callback to wipe all annotations that have been previously read.
        Settings such as the tier type/name/order will determine which annotations are read
        into the dataset _annotation_store. When settings are changed—
        (Wait, what? Users change their minds?? OMG yes.)
        —reset_annotations will reset dataset _annotation_store to {}, ready for annotations derived from the new
        settings to be added. Without this, changing settings will result in annotations derived from application
        of new settings being appended to previous annotations.
    :param add_annotation: Callback to append an annotation from selected tier
    :param tmp_dir: Honestly, no idea...
    """

    tier_order = context['tier_order']
    tier_name = context['tier_name']
    tier_type = context['tier_type']
    punctuation_to_collapse_by = context['punctuation_to_collapse_by']
    punctuation_to_explode_by = context['punctuation_to_explode_by']
    special_cases = set(context['special_cases'].splitlines())
    translation_tags = set(context['translation_tags'].splitlines())

    reset_annotations()

    for input_elan_file in eaf_paths:
        # Get paths to files
        input_directory, full_file_name = os.path.split(input_elan_file)
        file_name, extension = os.path.splitext(full_file_name)

        input_eaf = Eaf(input_elan_file)
        tier_types: List[str] = list(input_eaf.get_linguistic_type_names())
        tier_names: List[str] = list(input_eaf.get_tier_names())

        # Get annotations and parameters (things like speaker id) on the target tier
        annotations: List[Tuple[str, str, str]] = []
        annotation_data: List[dict] = []

        # Try using tier_order. Watch out for mixed types: empty str if not selected, int if selected
        if isinstance(tier_order, int):
            try:
                tier_name = tier_names[tier_order]
                print(f"using tier order {tier_order} to get tier name {tier_name}")
            except IndexError:
                print("couldn't find a tier")
        else:
            # else use tier type to get a tier name
            if tier_type in tier_types:
                print(f"found tier type {tier_type}")
                tier_names = input_eaf.get_tier_ids_for_linguistic_type(
                    tier_type)
                tier_name = tier_names[0]
                if tier_name:
                    print(f"found tier name {tier_name}")
            else:
                print("tier type not found in this file")

        if tier_name in tier_names:
            print(f"using tier name {tier_name}")
            annotations = input_eaf.get_annotation_data_for_tier(tier_name)
        else:
            pass  # TODO: Alert user of a skip due to missing tier_name in file

        if annotations:
            annotations = sorted(annotations)
            parameters: Dict[str, str] = input_eaf.get_parameters_for_tier(
                tier_name)
            speaker_id: str = parameters.get("PARTICIPANT", "")

        for annotation in annotations:
            start = annotation[0]
            end = annotation[1]
            transcript = annotation[2]

            utterance = {
                "audio_file_name": f"{file_name}.wav",
                "transcript": transcript,
                "start_ms": start,
                "stop_ms": end,
                "speaker_id": speaker_id
            }

            utterance_cleaned = clean_json_utterance(
                utterance=utterance,
                punctuation_to_collapse_by=punctuation_to_collapse_by,
                punctuation_to_explode_by=punctuation_to_explode_by,
                special_cases=special_cases,
                translation_tags=translation_tags,
                remove_english=False,
                use_langid=False)
            add_annotation(file_name, utterance_cleaned)
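
# A minimal usage sketch (hypothetical callbacks and settings; the context keys mirror the
# ones read by import_eaf_file above, and all values are illustrative only).
annotation_store = {}


def reset_annotations():
    annotation_store.clear()


def add_annotation(file_name, utterance):
    annotation_store.setdefault(file_name, []).append(utterance)


context = {
    'tier_order': '',                 # empty string means: select the tier by type/name instead of position
    'tier_name': 'Phrase',            # hypothetical tier name
    'tier_type': 'default-lt',        # hypothetical tier type
    'punctuation_to_collapse_by': '',
    'punctuation_to_explode_by': '',
    'special_cases': '',
    'translation_tags': '',
}

import_eaf_file(['recording.eaf'], context, reset_annotations, add_annotation, tmp_dir=None)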
Example #36
    # write audio
    with wave.open(output, 'wb') as audio:
        samples.shape = params.nframes if num_channels == 1 else (
            params.nframes * 2)
        audio.setparams(params)
        audio.writeframesraw(samples)

    print("Silenced {} intervals ({:.1f}s)".format(
        len(annotations), num_samples / params.framerate))


# look for .eaf files in the passed corpus dir
# (use the recursive glob below to also search subdirectories)
# for fpath in glob.iglob(corpus + '/**/*.eaf', recursive=True):
for fpath in glob.iglob(corpus + '/*.eaf'):
    print(fpath)
    eaffile = Eaf(fpath)
    names = eaffile.get_tier_names()
    # print(names)

    # check for the existence of the silence tier
    if DO_NOT_PUBLISH in names:
        print("have tier %s in %s" % (DO_NOT_PUBLISH, fpath))

        basename, extn = os.path.splitext(fpath)

        input = basename + ".wav"
        if overwrite == 'yes':
            output = basename + ".wav"
        else:
            output = basename + SUFFIX + ".wav"