def save(self):
    """Write ``self.Eaf`` to ``self.path`` without leaving a ``.bak`` file behind.

    Time slots are cleaned on the Eaf object first. Any ``.bak`` file that
    already sits next to the target is removed before writing, and the one
    present after writing is removed as well.

    Raises:
        OSError: if a backup file exists after writing but cannot be deleted.
    """
    backup_path = self.path + '.bak'

    self.Eaf.clean_time_slots()

    # Best effort: a stale backup from an earlier run may or may not exist.
    try:
        os.remove(backup_path)
    except OSError:
        pass

    Elan.to_eaf(self.path, self.Eaf, pretty=True)

    # NOTE(review): to_eaf presumably only leaves a '.bak' copy when it
    # overwrote an existing file - confirm against pympi. Removing it
    # unconditionally raised OSError on the first save to a new path, so
    # only delete the backup when there actually is one.
    if os.path.exists(backup_path):
        os.remove(backup_path)
def save(self):
    """Write ``self.Eaf`` to ``self.path`` without leaving a ``.bak`` file behind.

    Time slots are cleaned on the Eaf object first. Any ``.bak`` file that
    already sits next to the target is removed before writing, and the one
    present after writing is removed as well.

    Raises:
        OSError: if a backup file exists after writing but cannot be deleted.
    """
    backup_path = self.path + '.bak'

    self.Eaf.clean_time_slots()

    # Best effort: a stale backup from an earlier run may or may not exist.
    try:
        os.remove(backup_path)
    except OSError:
        pass

    Elan.to_eaf(self.path, self.Eaf, pretty=True)

    # NOTE(review): to_eaf presumably only leaves a '.bak' copy when it
    # overwrote an existing file - confirm against pympi. Removing it
    # unconditionally raised OSError on the first save to a new path, so
    # only delete the backup when there actually is one.
    if os.path.exists(backup_path):
        os.remove(backup_path)
def process_eaf(file_path):
    """Export one EAF recording as four sidecar text files.

    The files are written next to ``file_path`` and share its base name:

    * ``.segments`` - ``utterance recording start end`` per utterance
    * ``.text``     - ``utterance words`` per utterance
    * ``.utt2spk``  - ``utterance speaker`` per utterance
    * ``.speakers`` - the sorted, de-duplicated speakers, one per line

    Every tier is mapped to a speaker with ``normalize_speaker`` and its
    annotations are grouped by ``words_to_utterances``. Progress is printed
    to standard output.
    """
    directory, basename = os.path.split(file_path)
    recording_id = os.path.splitext(basename)[0]
    stem = os.path.join(directory, recording_id)
    print('Recording:', recording_id)

    seen_speakers = set()
    with open(stem + '.segments', 'w') as segments_file, \
            open(stem + '.text', 'w') as text_file, \
            open(stem + '.utt2spk', 'w') as utt2spk_file, \
            open(stem + '.speakers', 'w') as speakers_file:
        eaf = Elan.Eaf(file_path)

        for tier_name in eaf.get_tier_names():
            speaker = normalize_speaker(tier_name)
            seen_speakers.add(speaker)
            print('Speaker:', speaker)

            annotations = eaf.get_annotation_data_for_tier(tier_name)
            for start, end, words in words_to_utterances(annotations):
                fields = dict(speaker=speaker, recording=recording_id,
                              start=start, end=end)
                fields['utterance'] = UTT_FORMAT.format(**fields)

                # segments: lets a large recording be split automatically
                # into its utterances
                segment_line = '{utterance} {recording} {start} {end}'.format(
                    **fields)
                print(segment_line, file=segments_file)

                # utterance -> text
                text_line = '{utterance} {words}'.format(
                    utterance=fields['utterance'], words=words)
                print(text_line, file=text_file)

                # utterance -> speaker
                speaker_line = '{utterance} {speaker}'.format(**fields)
                print(speaker_line, file=utt2spk_file)

        # speakers present in this recording
        for name in sorted(seen_speakers):
            print(name, file=speakers_file)
# Intended for converting Plenary Sessions of the Parliament of Finland, Downloadable Version 1 EAF # files into suitable form for use in speaker verification testing. # # Replaces the original EAF file (original is moved out of the way by appending ".bak" to the filename). import os import argparse import pympi.Elan as e MIN_DURATION = 3000 MAX_DURATION = 25000 parser = argparse.ArgumentParser() parser.add_argument('--file_path', type=str) args = parser.parse_args() eaf = e.Eaf(args.file_path) # Dataset MEDIA_DESCRIPTORs only have MEDIA_URL defined to the original mp4 video, # however we need path to the wav files for the extraction step. linked_path, ext = os.path.splitext(args.file_path) for linked in eaf.get_linked_files(): eaf.remove_linked_files(linked['MEDIA_URL']) eaf.add_linked_file(linked_path + '.wav', relpath=os.path.basename(linked_path) + '.wav', mimetype='audio/wav') # Try to group word annotations into sentences, only keep sentences < MAX_DURATION. for tid, (anno, _, _, _) in eaf.tiers.items(): utterances = [] utterance = [] utterance_str = []
def convert(filename):
    """Turn the dependency annotations of one EAF file into row-based trees.

    For each of the two signers, the ``Index``, ``UD`` and ``Link`` tiers of
    both hands (``DH`` and ``NonDH``) are read and merged per annotation slot.
    Slots carrying a complete set of fields become signs; signs are linked
    into trees through their head index, and every tree rooted at a sign with
    head 0 becomes one utterance.

    An utterance is dropped, with a warning on stderr, when

    * its sign indexes are not contiguous,
    * a sign is labelled ``root`` but has a non-zero head, or
    * a sign has head 0 but is not labelled ``root``.

    A summary line is printed to stderr before returning.

    Args:
        filename: path of the EAF file to read.

    Returns:
        A list of utterances. Each utterance is a list of ten-item string
        rows, one per sign: position within the utterance, gloss, ``_``,
        translated PoS, original PoS, ``_``, head position within the
        utterance (``0`` for the root), dependency label, ``_``, ``_``.
    """
    eaf = Elan.Eaf(filename)
    utts = []

    def translate_pos(pos, dep, gloss):
        """Map a corpus PoS tag to the tag written in the fourth column."""
        if '?' in pos:
            print('Warning: removing ? from "%s"' % pos, file=sys.stderr)
            pos = pos.replace('?', '')
        elif pos not in POS_TABLE:
            print('Warning: unknown PoS tag "%s"' % pos, file=sys.stderr)
        if pos[:2] == 'VB':
            return 'VERB'
        if pos == 'G':
            return GESTURES_POS.get(gloss + '[G]', 'X')
        if pos == 'PEK':
            return 'DET' if dep == 'det' else 'PRON'
        return POS_TABLE.get(pos, 'X')

    def utt_to_conllu(utt):
        """Render one utterance (signs sorted by index) as ten-column rows."""
        # Indexes are file-global; rows are renumbered from 1 per utterance.
        base = utt[0]['index']

        def process_sign(sign):
            return [
                str(sign['index'] - base + 1),
                sign['gloss'],
                '_',
                translate_pos(sign['pos'], sign['dep'], sign['gloss']),
                sign['pos'],
                '_',
                str(0 if sign['head'] == 0 else sign['head'] - base + 1),
                sign['dep'],
                '_',
                '_'
            ]

        return list(map(process_sign, utt))

    # A sign is only usable when every one of these fields was annotated.
    required_fields = ('index', 'dep', 'head', 'gloss', 'pos')

    for signer in (1, 2):
        def get_annotation_from_hand(tier, hand):
            """Annotations of one tier/hand, each prefixed with the hand."""
            return [
                (hand, ) + t
                for t in eaf.get_annotation_data_for_tier(
                    '%s_%s S%d' % (tier, hand, signer))
            ]

        def get_annotation(tier):
            """Annotations of one tier for both hands, dominant hand first."""
            return get_annotation_from_hand(tier, 'DH') + \
                get_annotation_from_hand(tier, 'NonDH')

        ann_index = get_annotation('Index')
        ann_dep = get_annotation('UD')
        ann_head = get_annotation('Link')

        # One dict per (hand, start, end) slot, filled from the three tiers.
        slots = defaultdict(dict)

        for hand, t0, t1, i, gloss_pos in ann_index:
            key = (hand, t0, t1)
            try:
                slots[key]['index'] = int(i)
            except ValueError:
                print('Warning: invalid index "%s"' % i, file=sys.stderr)
            gloss, pos = parse_gloss(gloss_pos)
            slots[key]['gloss'] = gloss
            slots[key]['pos'] = pos
            slots[key]['t0'] = t0

        for hand, t0, t1, i, gloss in ann_head:
            key = (hand, t0, t1)
            try:
                slots[key]['head'] = int(i)
            except ValueError:
                # The slot may have no usable index (missing or invalid Index
                # annotation). Look it up safely so that reporting the bad
                # head cannot itself raise KeyError and abort the conversion.
                known_index = slots.get(key, {}).get('index', '?')
                print('Warning: invalid head "%s" at index %s' %
                      (i, known_index),
                      file=sys.stderr)

        for hand, t0, t1, dep, gloss in ann_dep:
            if dep:
                # hack to fix typo
                if dep == 'reparandium':
                    dep = 'reparandum'
                slots[(hand, t0, t1)]['dep'] = dep

        children = defaultdict(list)
        signs = {}
        roots = []
        for sign in slots.values():
            # Incompletely annotated slots are silently ignored.
            if any(field not in sign for field in required_fields):
                continue
            index = sign['index']
            head = sign['head']
            children[head].append(index)
            signs[index] = sign
            if head == 0:
                roots.append(sign)

        roots.sort(key=lambda sign: sign['index'])

        def get_flat_tree(index):
            """Return the sign at ``index`` followed by all its descendants."""
            flat = [signs[index]]
            for child in children[index]:
                flat.extend(get_flat_tree(child))
            return flat

        for root in roots:
            utt = get_flat_tree(root['index'])
            utt.sort(key=lambda sign: sign['index'])

            indexes = [sign['index'] for sign in utt]
            expected = set(range(min(indexes), max(indexes) + 1))
            missing = expected - set(indexes)
            root_nonzero = [
                sign for sign in utt
                if sign['dep'] == 'root' and sign['head'] != 0
            ]
            nonroot_zero = [
                sign for sign in utt
                if sign['dep'] != 'root' and sign['head'] == 0
            ]

            for sign in root_nonzero:
                print('Warning: sign %d is "root" but has index %d' %
                      (sign['index'], sign['head']),
                      file=sys.stderr)
            for sign in nonroot_zero:
                print('Warning: sign %d is "%s" but has index 0' %
                      (sign['index'], sign['dep']),
                      file=sys.stderr)
            if missing:
                print('Warning: signs %d and %d are connected to each other'
                      ' but not to the following signs between them: %s' %
                      (min(indexes), max(indexes), ', '.join(
                          map(str, sorted(missing)))),
                      file=sys.stderr)

            # Only well-formed trees are exported.
            if missing or root_nonzero or nonroot_zero:
                continue
            utts.append(utt)

    print('%s: %d trees with %d signs' %
          (filename, len(utts), sum(map(len, utts))),
          file=sys.stderr)
    return [utt_to_conllu(utt) for utt in utts]
"type": ['simultaneous'] } # Last moment addidion, didn't test yet # Comment if causes errors try: os.makedirs(target["op_folder"] + '/' + target["stu_part"]) except FileExistsError: pass print( "\nSTART___________________________________________________________________________________________" ) # Extract tier 3 annotations from the eaf file eaf_file = Elan.Eaf(files["eaf"]) t3_annots = eaf_file.get_annotation_data_for_tier(target["tier"]) # Selective split the audio file based on co-laughs print("Writing to - " + target["op_folder"] + '/' + target["stu_part"]) audio = AudioSegment.from_wav(files['audio']) num_simul = 0 for i in range(len(t3_annots)): (start, end, laugh_type) = (t3_annots[i][0], t3_annots[i][1], t3_annots[i][2]) try: if (laugh_type in target["type"]): laugh = laugh_type.replace('/', '_') file_name = str(target["file_prefix"] + str(start / 1000) + ' - ' + str(end / 1000) + ' - ' + laugh +
def add_tiers(filename):
    """Load an EAF file and add the dependency-annotation tiers to it.

    For each of the two signers and each hand (``DH``, ``NonDH``) the tiers
    ``Index``, ``UD`` and ``Link`` are created as children of that hand's
    ``Glosa`` tier. The glosses of both hands are then merged, ordered by
    start time, and numbered from 1; each number is written as a reference
    annotation on the ``Index`` tier of the hand the gloss belongs to.

    The language ``swl`` is registered on the file if it is not present.

    Returns the modified Eaf object; nothing is written to disk here.
    """
    # Names of the controlled vocabulary / linguistic type specifications.
    # Arbitrary. ``cv``, ``lang`` and ``ud_deps`` only matter for the
    # controlled-vocabulary setup, which is currently switched off.
    cv = 'ud_dep'
    lang = 'und'
    lingtype = 'ud_lingtype'
    lingtype_cv = lingtype  # 'ud_lingtype_cv'

    # Labels intended for the (disabled) controlled vocabulary.
    ud_deps = [
        'amod', 'advmod', 'advcl', 'acl', 'case', 'auxpass', 'aux', 'appos',
        'ccomp', 'cc', 'remnant', 'punct', 'root', 'reparandum', 'nsubjpass',
        'nsubj', 'parataxis', 'nummod', 'xcomp', 'vocative', 'dobj',
        'dislocated', 'csubj', 'cop', 'conj', 'compound', 'discourse', 'det',
        'dep', 'csubjpass', 'goeswith', 'iobj', 'expl', 'foreign', 'mwe',
        'name', 'list', 'mark', 'neg', 'nmod', 'acl:relcl'
    ]

    eaf = Elan.Eaf(filename)

    if 'swl' not in eaf.languages:
        eaf.add_language('swl', 'swl', 'Swedish Sign Language')

    # Disabled: registering ``cv`` as a controlled vocabulary holding the
    # ``ud_deps`` labels, plus a dedicated linguistic type referencing it.

    # Linguistic type that keeps the new annotations aligned with the
    # respective gloss tier.
    eaf.add_linguistic_type(lingtype,
                            constraints='Symbolic_Association',
                            timealignable=False)

    hands = ('DH', 'NonDH')
    for signer in (1, 2):
        # Collect the glosses of both hands, tagged with their hand, and
        # order them by start time.
        merged = []
        for hand in hands:
            gloss_tier = 'Glosa_%s S%d' % (hand, signer)
            for annotation in eaf.get_annotation_data_for_tier(gloss_tier):
                merged.append((hand, ) + annotation)
        merged.sort(key=lambda entry: entry[1])

        # Create the new tiers underneath the matching gloss tier.
        for ud_part in ('Index', 'UD', 'Link'):
            ling = lingtype_cv if ud_part == 'UD' else lingtype
            for hand in hands:
                eaf.add_tier('%s_%s S%d' % (ud_part, hand, signer),
                             ling=ling,
                             parent='Glosa_%s S%d' % (hand, signer))

        # Number the merged glosses and store each number on the Index tier
        # of its hand, anchored at the midpoint of the gloss.
        for number, (hand, begin, end, _) in enumerate(merged, 1):
            eaf.add_ref_annotation('Index_%s S%d' % (hand, signer),
                                   'Glosa_%s S%d' % (hand, signer),
                                   (begin + end) // 2,
                                   value=str(number))

    return eaf
for hand in ('DH', 'NonDH'): tier = '%s_%s S%d' % (ud_part, hand, signer) ref_tier = 'Glosa_%s S%d' % (hand, signer) eaf.add_tier(tier, ling=lingtype_cv if ud_part == 'UD' else lingtype, parent=ref_tier) # Enumerate each one of the merged DH+NonDH glosses and write the # index to the Index tier. for i, (hand, t1, t2, _) in enumerate(glosses): tier = 'Index_%s S%d' % (hand, signer) ref_tier = 'Glosa_%s S%d' % (hand, signer) eaf.add_ref_annotation(tier, ref_tier, (t1 + t2) // 2, value=str(i + 1)) return eaf if __name__ == '__main__': if len(sys.argv) < 3: print 'Usage: output-directory source-directory/*.eaf' sys.exit(1) out = sys.argv[1] for filename in sys.argv[2:]: print 'Converting %s...' % filename eaf = add_tiers(filename) base = os.path.splitext(os.path.basename(filename))[0] target = os.path.join(out, base + '_UD.eaf') Elan.to_eaf(target, eaf)