Example #1
    def save(self):
        # Drop unused time slots before serializing.
        self.Eaf.clean_time_slots()
        # Remove any stale backup left over from an earlier save.
        try:
            os.remove(self.path + '.bak')
        except OSError:
            pass
        # to_eaf() renames the existing file to '<path>.bak' before writing,
        # so a fresh backup appears here; remove it to keep only the new file.
        Elan.to_eaf(self.path, self.Eaf, pretty=True)
        os.remove(self.path + '.bak')
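The method refers to self.path and self.Eaf, which the snippet leaves undefined. A minimal sketch of the wrapper it presumably belongs to, with the class name and constructor assumed:

# Hypothetical context for the save() method above; the class name and
# constructor are assumptions, not the original code.
import os
from pympi import Elan

class EafFile:
    def __init__(self, path):
        self.path = path            # location of the .eaf file on disk
        self.Eaf = Elan.Eaf(path)   # parsed pympi Eaf object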
Example #2
def process_eaf(file_path):
    dir_path, file_name = os.path.split(file_path)
    recording_id, _ = os.path.splitext(file_name)
    base_path = os.path.join(dir_path, recording_id)
    print('Recording:', recording_id)

    speakers = set()

    with open(base_path + '.segments', 'w') as segments_file, \
         open(base_path + '.text', 'w') as text_file, \
         open(base_path + '.utt2spk', 'w') as utt2spk_file, \
         open(base_path + '.speakers', 'w') as speakers_file:

        eaf = Elan.Eaf(file_path)
        for tier in eaf.get_tier_names():
            speaker = normalize_speaker(tier)
            speakers.add(speaker)
            print('Speaker:', speaker)
            utterances = words_to_utterances(
                eaf.get_annotation_data_for_tier(tier))
            for start, end, words in utterances:
                utterance_id = UTT_FORMAT.format(speaker=speaker,
                                                 recording=recording_id,
                                                 start=start,
                                                 end=end)
                # segments file to automatically split
                # a large recording file into utterances
                print('{utterance} {recording} {start} {end}'.format(
                    utterance=utterance_id,
                    recording=recording_id,
                    start=start,
                    end=end),
                      file=segments_file)
                # utterance -> text
                print('{utterance} {words}'.format(utterance=utterance_id,
                                                   words=words),
                      file=text_file)
                # utterance -> speaker
                print('{utterance} {speaker}'.format(utterance=utterance_id,
                                                     speaker=speaker),
                      file=utt2spk_file)
        # speakers present in this recording
        for speaker in sorted(speakers):
            print(speaker, file=speakers_file)
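The snippet relies on normalize_speaker, words_to_utterances, and UTT_FORMAT defined elsewhere in its repository. A minimal sketch of what they might look like, purely assumptions to make the example self-contained:

# Hypothetical support definitions for the snippet above; the real
# implementations live elsewhere in the source repository.
UTT_FORMAT = '{speaker}-{recording}-{start:08d}-{end:08d}'  # assumed ID scheme

def normalize_speaker(tier_name):
    # Assumption: tier names double as speaker labels.
    return tier_name.strip().lower().replace(' ', '_')

def words_to_utterances(annotations, max_gap=500):
    # Assumption: consecutive word annotations are merged into
    # (start, end, text) utterances, breaking at pauses > max_gap ms.
    utterance, utterances = [], []
    for start, end, word in sorted(annotations):
        if utterance and start - utterance[-1][1] > max_gap:
            utterances.append((utterance[0][0], utterance[-1][1],
                               ' '.join(w for _, _, w in utterance)))
            utterance = []
        utterance.append((start, end, word))
    if utterance:
        utterances.append((utterance[0][0], utterance[-1][1],
                           ' '.join(w for _, _, w in utterance)))
    return utterances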
Example #3
# Intended for converting Plenary Sessions of the Parliament of Finland, Downloadable
# Version 1 EAF files into a form suitable for speaker verification testing.
#
# Replaces the original EAF file (original is moved out of the way by appending ".bak" to the filename).
import os
import argparse
import pympi.Elan as e

MIN_DURATION = 3000
MAX_DURATION = 25000

parser = argparse.ArgumentParser()
parser.add_argument('--file_path', type=str)
args = parser.parse_args()

eaf = e.Eaf(args.file_path)

# The dataset's MEDIA_DESCRIPTORs only define MEDIA_URL, which points at the original
# mp4 video, but we need the path to the wav files for the extraction step.
linked_path, ext = os.path.splitext(args.file_path)
for linked in eaf.get_linked_files():
    eaf.remove_linked_files(linked['MEDIA_URL'])
eaf.add_linked_file(linked_path + '.wav',
                    relpath=os.path.basename(linked_path) + '.wav',
                    mimetype='audio/wav')

# Try to group word annotations into sentences, only keep sentences < MAX_DURATION.
for tid, (anno, _, _, _) in eaf.tiers.items():
    utterances = []
    utterance = []
    utterance_str = []
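The snippet is truncated above. A hypothetical sketch of how the grouping loop might continue, using only pympi calls that exist (get_annotation_data_for_tier, add_tier, add_annotation, to_eaf); the gap threshold and output tier names are invented:

# Hypothetical continuation of the truncated loop above. MAX_GAP, the
# output tier names, and the final save are assumptions, not the original.
MAX_GAP = 500  # ms of silence taken to end an utterance (assumed)

for tid in list(eaf.get_tier_names()):
    words = sorted(eaf.get_annotation_data_for_tier(tid))
    utterances, current = [], []
    for start, end, value in words:
        if current and start - current[-1][1] > MAX_GAP:
            utterances.append(current)
            current = []
        current.append((start, end, value))
    if current:
        utterances.append(current)

    out_tier = tid + '_utt'
    eaf.add_tier(out_tier)
    for utt in utterances:
        u_start, u_end = utt[0][0], utt[-1][1]
        if MIN_DURATION <= u_end - u_start <= MAX_DURATION:
            eaf.add_annotation(out_tier, u_start, u_end,
                               ' '.join(v for _, _, v in utt))

# The header comment says the original file is replaced (old copy kept as
# '.bak'), which to_eaf() does automatically when the target file exists.
e.to_eaf(args.file_path, eaf, pretty=True)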
Example #4
def convert(filename):
    eaf = Elan.Eaf(filename)

    utts = []

    def translate_pos(pos, dep, gloss):
        if '?' in pos:
            print('Warning: removing ? from "%s"' % pos, file=sys.stderr)
            pos = pos.replace('?', '')
        elif pos not in POS_TABLE:
            print('Warning: unknown PoS tag "%s"' % pos, file=sys.stderr)
        if pos[:2] == 'VB': return 'VERB'
        if pos == 'G':
            return GESTURES_POS.get(gloss + '[G]', 'X')
        if pos == 'PEK':
            return 'DET' if dep == 'det' else 'PRON'
        return POS_TABLE.get(pos, 'X')

    def utt_to_conllu(utt):
        base = utt[0]['index']

        def process_sign(sign):
            return [
                str(sign['index'] - base + 1), sign['gloss'], '_',
                translate_pos(sign['pos'], sign['dep'],
                              sign['gloss']), sign['pos'], '_',
                str(0 if sign['head'] == 0 else sign['head'] - base + 1),
                sign['dep'], '_', '_'
            ]

        return list(map(process_sign, utt))

    for signer in (1, 2):

        def get_annotation_from_hand(tier, hand):
            return [
                (hand, ) + t
                for t in eaf.get_annotation_data_for_tier('%s_%s S%d' %
                                                          (tier, hand, signer))
            ]

        def get_annotation(tier):
            return get_annotation_from_hand(tier, 'DH') + \
                   get_annotation_from_hand(tier, 'NonDH')

        #ann_glosses = get_annotation('Glosa')
        ann_index = get_annotation('Index')
        ann_dep = get_annotation('UD')
        ann_head = get_annotation('Link')

        slots = defaultdict(dict)

        for hand, t0, t1, i, gloss_pos in ann_index:
            try:
                slots[(hand, t0, t1)]['index'] = int(i)
            except ValueError:
                print('Warning: invalid index "%s"' % i, file=sys.stderr)
            gloss, pos = parse_gloss(gloss_pos)
            slots[(hand, t0, t1)]['gloss'] = gloss
            slots[(hand, t0, t1)]['pos'] = pos
            slots[(hand, t0, t1)]['t0'] = t0

        for hand, t0, t1, i, gloss in ann_head:
            try:
                slots[(hand, t0, t1)]['head'] = int(i)
            except ValueError:
                print('Warning: invalid head "%s" at index %d' %
                      (i, slots[(hand, t0, t1)]['index']),
                      file=sys.stderr)

        for hand, t0, t1, dep, gloss in ann_dep:
            if dep:
                # hack to fix typo
                if dep == 'reparandium': dep = 'reparandum'
                slots[(hand, t0, t1)]['dep'] = dep

        children = defaultdict(list)
        signs = {}
        roots = []

        for (hand, t0, t1), sign in slots.items():
            try:
                index = sign['index']
                dep = sign['dep']
                head = sign['head']
                gloss = sign['gloss']
                pos = sign['pos']
                children[head].append(index)
                signs[index] = sign
                if head == 0:
                    roots.append(sign)
            except KeyError:
                pass

        roots.sort(key=lambda sign: sign['index'])

        def get_flat_tree(index):
            return [signs[index]] + sum(
                [get_flat_tree(child) for child in children[index]], [])

        for root in roots:
            utt = get_flat_tree(root['index'])
            utt.sort(key=lambda sign: sign['index'])
            indexes = [sign['index'] for sign in utt]
            expected = set(range(min(indexes), max(indexes) + 1))
            missing = expected - set(indexes)
            root_nonzero = [
                sign for sign in utt
                if sign['dep'] == 'root' and sign['head'] != 0
            ]
            nonroot_zero = [
                sign for sign in utt
                if sign['dep'] != 'root' and sign['head'] == 0
            ]

            for sign in root_nonzero:
                print('Warning: sign %d is "root" but has head %d' %
                      (sign['index'], sign['head']),
                      file=sys.stderr)
            for sign in nonroot_zero:
                print('Warning: sign %d is "%s" but has head 0' %
                      (sign['index'], sign['dep']),
                      file=sys.stderr)
            if missing:
                print('Warning: signs %d and %d are connected to each other'
                      ' but the following signs between them are not: %s' %
                      (min(indexes), max(indexes), ', '.join(
                          map(str, sorted(missing)))),
                      file=sys.stderr)
            if missing or root_nonzero or nonroot_zero:
                continue
            utts.append(utt)

        #print('%d trees, %d signs' % (len(roots), len(signs)), file=sys.stderr)

    print('%s: %d trees with %d signs' %
          (filename, len(utts), sum(map(len, utts))),
          file=sys.stderr)
    return [utt_to_conllu(utt) for utt in utts]
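convert() assumes module-level names (Elan, sys, defaultdict, POS_TABLE, GESTURES_POS, parse_gloss) that the scraped snippet omits. Hypothetical stubs, all of them assumptions, just to show the expected shapes:

# Hypothetical stubs for names the snippet assumes; the real definitions
# live elsewhere in the source repository.
import sys
from collections import defaultdict
from pympi import Elan

POS_TABLE = {'NN': 'NOUN', 'JJ': 'ADJ', 'AB': 'ADV'}  # assumed subset
GESTURES_POS = {}  # assumed mapping: 'GLOSS[G]' -> UD PoS tag

def parse_gloss(gloss_pos):
    # Assumption: annotation values look like 'GLOSS@pos'.
    gloss, _, pos = gloss_pos.partition('@')
    return gloss, pos or 'X'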
Example #5
        "type": ['simultaneous']
    }

    # Last-moment addition, not tested yet.
    # Comment out if it causes errors.
    try:
        os.makedirs(target["op_folder"] + '/' + target["stu_part"])
    except FileExistsError:
        pass

    print(
        "\nSTART___________________________________________________________________________________________"
    )

    # Extract tier 3 annotations from the eaf file
    eaf_file = Elan.Eaf(files["eaf"])
    t3_annots = eaf_file.get_annotation_data_for_tier(target["tier"])

    # Selectively split the audio file based on co-laughs
    print("Writing to - " + target["op_folder"] + '/' + target["stu_part"])

    audio = AudioSegment.from_wav(files['audio'])
    num_simul = 0
    for i in range(len(t3_annots)):
        (start, end, laugh_type) = (t3_annots[i][0], t3_annots[i][1],
                                    t3_annots[i][2])
        try:
            if (laugh_type in target["type"]):
                laugh = laugh_type.replace('/', '_')
                file_name = str(target["file_prefix"] + str(start / 1000) +
                                ' - ' + str(end / 1000) + ' - ' + laugh +
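The snippet breaks off mid-statement; a hypothetical completion follows, using only pydub calls that exist (millisecond slicing and export). The snippet also assumes "from pydub import AudioSegment" and "os"; the file-name suffix and the exception handler are invented:

                # Hypothetical completion; restates the truncated assignment.
                file_name = str(target["file_prefix"] + str(start / 1000) +
                                ' - ' + str(end / 1000) + ' - ' + laugh +
                                '.wav')
                # pydub AudioSegments slice by milliseconds.
                audio[start:end].export(
                    target["op_folder"] + '/' + target["stu_part"] + '/' +
                    file_name, format='wav')
                num_simul += 1
        except Exception as err:  # assumed: original handler unknown
            print(err)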
Example #6
import os
import sys
from pympi import Elan


def add_tiers(filename):
    # Names of the controlled vocabulary / linguistic type specifications.
    # Arbitrary.
    cv = 'ud_dep'
    lang = 'und'
    lingtype = 'ud_lingtype'
    lingtype_cv = lingtype  # 'ud_lingtype_cv'

    # This list will be inserted as a controlled vocabulary.
    ud_deps = [
        'amod', 'advmod', 'advcl', 'acl', 'case', 'auxpass', 'aux', 'appos',
        'ccomp', 'cc', 'remnant', 'punct', 'root', 'reparandum', 'nsubjpass',
        'nsubj', 'parataxis', 'nummod', 'xcomp', 'vocative', 'dobj',
        'dislocated', 'csubj', 'cop', 'conj', 'compound', 'discourse', 'det',
        'dep', 'csubjpass', 'goeswith', 'iobj', 'expl', 'foreign', 'mwe',
        'name', 'list', 'mark', 'neg', 'nmod', 'acl:relcl'
    ]

    eaf = Elan.Eaf(filename)

    if 'swl' not in eaf.languages:
        eaf.add_language('swl', 'swl', 'Swedish Sign Language')

    # Add a controlled vocabulary for the UD labels.
    #eaf.add_controlled_vocabulary(cv)
    #eaf.add_cv_description(cv, lang, 'UD dependency labels')
    #for i,dep in enumerate(ud_deps):
    #    eaf.add_cv_entry(cv, 'cveid%d' % i, [(dep, lang, dep)])

    # Add a lingtype which ensures that the annotations are aligned with the
    # respective gloss tier.
    eaf.add_linguistic_type(lingtype,
                            constraints='Symbolic_Association',
                            timealignable=False)

    # Add another lingtype for the UD labels controlled vocabulary.
    #eaf.add_linguistic_type(lingtype_cv, param_dict={
    #    'LINGUISTIC_TYPE_ID': lingtype_cv,
    #    'TIME_ALIGNABLE': 'false',
    #    'GRAPHIC_REFERENCES': 'false',
    #    'CONTROLLED_VOCABULARY_REF': cv})

    for signer in (1, 2):

        def get_glosses(hand):
            return [(hand, ) + t
                    for t in eaf.get_annotation_data_for_tier('Glosa_%s S%d' %
                                                              (hand, signer))]

        # Get a list of glosses for both hands of this signer.
        glosses = get_glosses('DH') + get_glosses('NonDH')
        glosses.sort(key=lambda t: t[1])

        # Add the necessary tiers for this signer.
        for ud_part in ('Index', 'UD', 'Link'):
            for hand in ('DH', 'NonDH'):
                tier = '%s_%s S%d' % (ud_part, hand, signer)
                ref_tier = 'Glosa_%s S%d' % (hand, signer)
                eaf.add_tier(tier,
                             ling=lingtype_cv if ud_part == 'UD' else lingtype,
                             parent=ref_tier)

        # Enumerate each one of the merged DH+NonDH glosses and write the
        # index to the Index tier.
        for i, (hand, t1, t2, _) in enumerate(glosses):
            tier = 'Index_%s S%d' % (hand, signer)
            ref_tier = 'Glosa_%s S%d' % (hand, signer)
            eaf.add_ref_annotation(tier,
                                   ref_tier, (t1 + t2) // 2,
                                   value=str(i + 1))

    return eaf


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('Usage: add_tiers.py output-directory source-directory/*.eaf')
        sys.exit(1)
    out = sys.argv[1]
    for filename in sys.argv[2:]:
        print('Converting %s...' % filename)
        eaf = add_tiers(filename)
        base = os.path.splitext(os.path.basename(filename))[0]
        target = os.path.join(out, base + '_UD.eaf')
        Elan.to_eaf(target, eaf)