def process_file(self, file_name):
    """
    Attach the gloss linguistic types and the ASL Signbank ECV to one EAF.

    The file is loaded, the "Gloss Child" and "Gloss Adult" linguistic types
    are added, each gloss tier whose name contains "Adult" or "Child" is
    pointed at the matching type, and both types are linked to the externally
    referenced controlled vocabulary. The result is written to
    ``self.output_dir`` under the base name of ``file_name``.

    :param file_name: location of the EAF file to process
    :return: None; an IOError is reported on the console instead of raised
    """
    lingtype_ids = ["Gloss Child", "Gloss Adult"]
    ecv_ref_id = "ecv_ref"
    cv_id = "ASL Signbank lexicon"

    try:
        eaf = Eaf(file_name)

        # Register both gloss linguistic types (no stereotype constraint).
        for lingtype_id in lingtype_ids:
            eaf.add_linguistic_type(lingtype_id, constraints=None)

        # Point every gloss tier at the linguistic type named in its id.
        # "Adult" is tested first; tiers naming neither are left untouched.
        for tier_id in self.find_gloss_tiers(eaf):
            if "Adult" in tier_id:
                assigned_type = "Gloss Adult"
            elif "Child" in tier_id:
                assigned_type = "Gloss Child"
            else:
                continue
            eaf.tiers[tier_id][2]['LINGUISTIC_TYPE_REF'] = assigned_type

        # Declare the external ECV and a controlled vocabulary backed by it.
        eaf.add_external_ref(
            ecv_ref_id,
            "ecv",
            "http://applejack.science.ru.nl/asl-signbank/static/ecv/asl.ecv",
        )
        eaf.add_controlled_vocabulary(cv_id, ecv_ref_id)

        # Link the controlled vocabulary to both gloss linguistic types.
        for lingtype_id in lingtype_ids:
            eaf.linguistic_types[lingtype_id]['CONTROLLED_VOCABULARY_REF'] = cv_id

        # Only the path component of file_name contributes to the output name.
        output_name = os.path.basename(urlparse(file_name).path)
        eaf.to_file(self.output_dir + os.sep + output_name, pretty=True)
    except IOError as error:
        print("The EAF %s could not be processed." % file_name, file=sys.stderr)
        print(type(error))
def process_file(self, file_name):
    """
    Add "gloss-append" child tiers to the gloss tiers of one EAF file.

    The file is loaded, a "gloss-append" linguistic type with the
    "Symbolic_Association" constraint is added, and the gloss tiers found by
    ``self.find_gloss_tiers`` are handed to ``self.add_gloss_tier_children``.
    The result is written to ``self.output_dir`` under the base name of
    ``file_name``.

    :param file_name: location of the EAF file to process
    :return: None; an IOError is reported on the console instead of raised
    """
    child_lingtype = "gloss-append"

    try:
        eaf = Eaf(file_name)
        eaf.add_linguistic_type(child_lingtype,
                                constraints="Symbolic_Association")

        parent_tiers = self.find_gloss_tiers(eaf)
        self.add_gloss_tier_children(eaf, parent_tiers, child_lingtype,
                                     file_name)

        # Only the path component of file_name contributes to the output name.
        output_name = os.path.basename(urlparse(file_name).path)
        eaf.to_file(self.output_dir + os.sep + output_name, pretty=True)
    except IOError as error:
        print("The EAF %s could not be processed." % file_name, file=sys.stderr)
        print(type(error))
def process_file(self, file_name):
    """
    Set up gloss linguistic types and the ASL Signbank ECV in one EAF file.

    Steps: load the file, add the "Gloss Child" / "Gloss Adult" linguistic
    types, assign them to the gloss tiers by tier name, register the external
    ECV reference with a controlled vocabulary that uses it, link that
    vocabulary to both types, and save the file into ``self.output_dir``.

    :param file_name: location of the EAF file to process
    :return: None; an IOError is reported on the console instead of raised
    """
    type_names = ["Gloss Child", "Gloss Adult"]
    ref_name = "ecv_ref"
    vocabulary_name = "ASL Signbank lexicon"
    # Checked in this order, so "Adult" wins when a tier name contains both.
    type_for_marker = (("Adult", "Gloss Adult"), ("Child", "Gloss Child"))

    try:
        eaf = Eaf(file_name)

        # Add linguistic types
        for type_name in type_names:
            eaf.add_linguistic_type(type_name, constraints=None)

        # Add linguistic types to tiers
        matched_tiers = self.find_gloss_tiers(eaf)
        for tier_name in matched_tiers:
            for marker, type_name in type_for_marker:
                if marker in tier_name:
                    eaf.tiers[tier_name][2]['LINGUISTIC_TYPE_REF'] = type_name
                    break

        # Add an ECV external reference
        ecv_url = "http://applejack.science.ru.nl/asl-signbank/static/ecv/asl.ecv"
        eaf.add_external_ref(ref_name, "ecv", ecv_url)

        # Add a Controlled Vocabulary
        eaf.add_controlled_vocabulary(vocabulary_name, ref_name)

        # Add the CV to linguistic types
        for type_name in type_names:
            type_attributes = eaf.linguistic_types[type_name]
            type_attributes['CONTROLLED_VOCABULARY_REF'] = vocabulary_name

        # Save under the base name of the path component of file_name.
        base_name = os.path.basename(urlparse(file_name).path)
        eaf.to_file(self.output_dir + os.sep + base_name, pretty=True)
    except IOError as error:
        print("The EAF %s could not be processed." % file_name, file=sys.stderr)
        print(type(error))
def _children_by_parent(ref_annotations):
    """
    Group reference annotations by the annotation they refer to.

    :param ref_annotations: dict of the form
        {annotation_id: (reference, value, previous, svg_ref)}
    :return: dict mapping each referenced (parent) annotation id to a list of
        (annotation_id, value) pairs, kept in the order of ``ref_annotations``
    """
    grouped = {}
    for annotation_id, annotation in ref_annotations.items():
        grouped.setdefault(annotation[0], []).append(
            (annotation_id, annotation[1]))
    return grouped


def _collect_word_glosses(eaf, utterance_tier_name, word_tier_name,
                          morph_tier_name, gloss_tier_name):
    """
    Build, per utterance, the list of words with their dash-joined glosses.

    A tier is of the form
    {tier_name -> (aligned_annotations, reference_annotations, attributes,
    ordinal)}. The utterance tier is read from the aligned annotations; the
    word, morph and gloss tiers are read from the reference annotations.

    :param eaf: Eaf object to read from
    :param utterance_tier_name: tier holding the time-aligned utterances
    :param word_tier_name: tier whose annotations refer to utterances
    :param morph_tier_name: tier whose annotations refer to words
    :param gloss_tier_name: tier whose annotations refer to morphs
    :return: dict mapping utterance annotation id to
        [utt_start, word_dur, [word, gloss], [word, gloss], ...]
    """
    tiers = eaf.tiers
    timeslots = eaf.timeslots

    # Index each level once instead of rescanning whole tiers per utterance.
    words_by_utterance = _children_by_parent(tiers[word_tier_name][1])
    morphs_by_word = _children_by_parent(tiers[morph_tier_name][1])
    glosses_by_morph = _children_by_parent(tiers[gloss_tier_name][1])

    # Start at the top of the hierarchy
    utterance_tier = tiers[utterance_tier_name][0]

    word_glosses = {}
    for utterance_id, utterance in utterance_tier.items():
        utt_start = timeslots[utterance[0]]
        utt_end = timeslots[utterance[1]]

        pairs = []
        for word_id, word_value in words_by_utterance.get(utterance_id, []):
            # Glosses of all morphs of this word, joined with a dash so there
            # is a 1:1 match between word annotations and gloss annotations.
            glosses = [
                gloss_value
                for morph_id, _morph_value in morphs_by_word.get(word_id, [])
                for _gloss_id, gloss_value in glosses_by_morph.get(morph_id, [])
            ]
            pairs.append([word_value, '-'.join(glosses)])

        # Word duration is an even division of the parent utterance duration.
        # An utterance without words gets a duration of 0 instead of raising
        # ZeroDivisionError.
        num_segments = len(pairs)
        utt_dur = utt_end - utt_start
        word_dur = int(utt_dur / num_segments) if num_segments else 0

        entry = [utt_start, word_dur] + pairs
        print("word gloss", entry)
        word_glosses[utterance_id] = entry
    return word_glosses


def main():
    """
    Merge tiers from two EAF files into a new EAF file.

    File 1 has the utterance and utterance translation
    File 2 has the gloss
    File 3 is the destination

    The destination receives, in this order: the utterance id tier (file 2),
    the utterance and utterance translation tiers (file 1), the word tier
    (file 2) and a gloss tier built from the file 2 morph glosses, one
    dash-joined gloss per word.

    :return: None; the merged document is written to ``input/new.eaf``
    """
    # Input files
    file_1 = 'input/file-1.eaf'
    file_2 = 'input/file-2.eaf'
    file_3 = 'input/new.eaf'

    # Tier names
    utterance_id_source_tier = "A_phrase-segnum-en"
    utterance_id_target_tier = "utterance_id"
    utterance_source_tier = "DDD_Transcription-txt-qaa-fonipa-x-eib"
    utterance_target_tier = "utterance"
    utterance_translation_source_tier = "DDD_Translation-gls-en"
    utterance_translation_target_tier = "utterance_translation"
    word_source_tier = "A_word-txt-qaa-fonipa-x-eib"
    word_target_tier = "grammatical_words"
    morph_source_tier = "A_morph-txt-qaa-fonipa-x-eib"
    gloss_source_tier = "A_morph-gls-en"
    gloss_target_tier = "gloss"

    # Set up the eaf objects
    eaf_1 = Eaf(file_1)
    eaf_2 = Eaf(file_2)
    eaf_3 = Eaf()

    # Remove default tier. Media is not copied yet:
    # eaf_3 = copy_media(eaf_1, eaf_3)
    eaf_3.remove_tier("default")

    # Copy annotation number tier from file 2
    # tier-type default-lt
    # <LINGUISTIC_TYPE GRAPHIC_REFERENCES="false"
    #   LINGUISTIC_TYPE_ID="default-lt" TIME_ALIGNABLE="true"/>
    # NOTE(review): no linguistic type is added for this tier; presumably a
    # fresh Eaf() already provides "default-lt" - confirm.
    print("Copying annotation numbers from file 2")
    utterance_id_tier_params = {
        'LINGUISTIC_TYPE_REF': 'default-lt',
        'TIER_ID': utterance_id_target_tier
    }
    _tier_copy(source_eaf=eaf_2,
               target_eaf=eaf_3,
               source_tier_name=utterance_id_source_tier,
               target_tier_name=utterance_id_target_tier,
               override_params=utterance_id_tier_params)

    # Copy utterance tier from file 1
    # LINGUISTIC_TYPE_REF="Blank"
    # <LINGUISTIC_TYPE CONSTRAINTS="Symbolic_Association"
    #   GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="Blank"
    #   TIME_ALIGNABLE="false"/>
    print("Copying utterance tier from file 1")
    blank_type_params = {
        'LINGUISTIC_TYPE_ID': 'Blank',
        'CONSTRAINTS': 'Symbolic_Association',
        'TIME_ALIGNABLE': 'false'
    }
    eaf_3.add_linguistic_type('Blank', param_dict=blank_type_params)
    utterance_tier_params = {
        'LINGUISTIC_TYPE_REF': 'Blank',
        'PARENT_REF': utterance_id_target_tier,
        'TIER_ID': utterance_target_tier
    }
    _tier_copy_to_ref(source_eaf=eaf_1,
                      target_eaf=eaf_3,
                      source_tier_name=utterance_source_tier,
                      target_tier_name=utterance_target_tier,
                      target_parent_tier_name=utterance_id_target_tier,
                      override_params=utterance_tier_params)

    # Copy utterance translation tier from file 1
    # LINGUISTIC_TYPE_REF="Blank"
    # <TIER LINGUISTIC_TYPE_REF="Blank" PARENT_REF="utterance"
    #   PARTICIPANT="DDD" TIER_ID="utterance_translation">
    print("Copying utterance translation tier from file 1")
    utterance_translation_tier_params = {
        'LINGUISTIC_TYPE_REF': 'Blank',
        'PARENT_REF': utterance_target_tier,
        'TIER_ID': utterance_translation_target_tier
    }
    _ref_tier_copy(source_eaf=eaf_1,
                   target_eaf=eaf_3,
                   source_tier_name=utterance_translation_source_tier,
                   target_tier_name=utterance_translation_target_tier,
                   target_parent_tier_name=utterance_target_tier,
                   override_params=utterance_translation_tier_params)

    # Copy the word tier from file 2
    # <LINGUISTIC_TYPE CONSTRAINTS="Symbolic_Subdivision"
    #   GRAPHIC_REFERENCES="false" LINGUISTIC_TYPE_ID="word"
    #   TIME_ALIGNABLE="false"/>
    # <TIER DEFAULT_LOCALE="qaa-fonipa-x-eib" LINGUISTIC_TYPE_REF="word"
    #   PARENT_REF="A_phrase-segnum-en" PARTICIPANT="DDD"
    #   TIER_ID="A_word-txt-qaa-fonipa-x-eib">
    print("Copying word tier from file 2")
    word_type_params = {
        'LINGUISTIC_TYPE_ID': 'word',
        'CONSTRAINTS': 'Symbolic_Subdivision',
        'TIME_ALIGNABLE': 'false'
    }
    eaf_3.add_linguistic_type('word', param_dict=word_type_params)
    word_tier_params = {
        'LINGUISTIC_TYPE_REF': 'word',
        'PARENT_REF': utterance_target_tier,
        'TIER_ID': word_target_tier
    }
    # NOTE(review): PARENT_REF above names the utterance tier while
    # target_parent_tier_name below names the utterance id tier - confirm
    # this difference is intended.
    _copy_symbolic_subdivision_tier(
        source_eaf=eaf_2,
        target_eaf=eaf_3,
        source_tier_name=word_source_tier,
        target_tier_name=word_target_tier,
        target_parent_tier_name=utterance_id_target_tier,
        override_params=word_tier_params)

    # Get all the annotations from the file 2 gloss tier
    # (gloss_source_tier A_morph-gls-en).
    # Join the glosses with "-" so there is a 1:1 match with word annotations
    # <TIER LINGUISTIC_TYPE_REF="Blank" PARENT_REF="grammatical_words"
    #   TIER_ID="gloss">
    print("Epic battle with words to get glosses from file 2")
    gloss_tier_params = {
        'LINGUISTIC_TYPE_REF': 'Blank',
        'PARENT_REF': word_target_tier,
        'TIER_ID': gloss_target_tier
    }

    # None of the pympi methods will suit this task, so let's do it manually.
    # For each utterance, get the words. For each word, get the glosses.
    new_dict = _collect_word_glosses(eaf_2,
                                     utterance_id_source_tier,
                                     word_source_tier,
                                     morph_source_tier,
                                     gloss_source_tier)

    # Having worked all that out, now we can add a ref annotation tier.
    # but parent seems to now bubble all the way to the top.
    eaf_3.add_tier(gloss_target_tier,
                   ling='Blank',
                   parent=word_target_tier,
                   tier_dict=gloss_tier_params)

    # And some annotations: each gloss becomes a reference annotation of the
    # form (reference, value, previous, svg_ref) pointing at a word.
    # NOTE(review): words are matched on annotation text only, so a word
    # value occurring several times in the target word tier receives a gloss
    # annotation for every (word, match) pair - confirm this is intended.
    for entry in new_dict.values():
        # The first two items are utt_start and word_dur; the rest are
        # [word, gloss] pairs.
        for word_value, gloss_value in entry[2:]:
            target_words = eaf_3.tiers[word_target_tier][1]
            for aid, (_ref_id, _value, _prev, _svg) in target_words.items():
                if word_value == _value:
                    new_aid = eaf_3.generate_annotation_id()
                    eaf_3.tiers[gloss_target_tier][1][new_aid] = (
                        aid, gloss_value, None, None)

    # Save the new file
    print("Saving object to file")
    eaf_3.to_file(file_3)