def add_elan(self,
             annotator: Annotator,
             eaf_path: Union[str, Path],
             selected_tiers: Optional[List[str]] = None,
             use_tier_as_annotation: bool = False):
    """
    Load the annotations of an ELAN (.eaf) file into the Continuum.

    Parameters
    ----------
    annotator: Annotator
        Identifier of the annotator who produced the ELAN file.
    eaf_path: `Path` or str
        Location of the .eaf (ELAN) file.
    selected_tiers: optional list of str
        When given, only the tiers named in this list are imported.
    use_tier_as_annotation: optional bool
        When True, every imported interval is labelled with the name of
        its tier instead of with the annotation value it carries.
    """
    from pympi import Eaf

    elan_doc = Eaf(eaf_path)
    for tier in elan_doc.get_tier_names():
        # Skip tiers that were filtered out by the caller.
        if not (selected_tiers is None or tier in selected_tiers):
            continue
        for onset, offset, text in elan_doc.get_annotation_data_for_tier(tier):
            label = tier if use_tier_as_annotation else text
            self.add(annotator, Segment(onset, offset), label)
def __init__(self, path_to_file):
    """
    Open the ELAN file at ``path_to_file`` and populate the derived lists.

    The path is kept on ``self.path`` and the parsed document on
    ``self.Eaf``. Time slots are cleaned before the tier, annotation and
    participant loaders run, in that order.
    """
    self.path = path_to_file
    self.Eaf = Eaf(path_to_file)
    self.Eaf.clean_time_slots()

    # Loader order is kept exactly as in the original call sequence.
    self.load_tiers()
    self.load_annotation_data()
    self.load_participants()
def convert(f_i, f_o=None):
    """
    Convert an ELAN transcript into a FoLiA document with Mystem annotations.

    f_i: input file name/path without extension (str); '.eaf' is appended.
    f_o: output file name/path without extension (str); '.folia.xml' is
         appended and its basename is used as the FoLiA document id.
         Defaults to ``f_i``.

    The resulting document is written to ``f_o + '.folia.xml'``.
    """
    doc_i = Eaf(''.join([f_i, '.eaf']))
    if not f_o:
        f_o = f_i

    # https://pynlpl.readthedocs.io/en/latest/folia.html#editing-folia
    # https://pynlpl.readthedocs.io/en/latest/folia.html#adding-structure
    # https://pynlpl.readthedocs.io/en/latest/folia.html#structure-annotation-types
    print(os.path.basename(f_o))
    doc_o = folia.Document(id=os.path.basename(f_o))

    # https://github.com/proycon/folia/blob/master/foliatools/conllu2folia.py
    doc_o.declare(folia.LemmaAnnotation, set=SET_LEMMA_MYSTEM, annotator="Mystem")
    doc_o.declare(folia.PosAnnotation, set=SET_POS_MYSTEM, annotator="Mystem")
    doc_o.declare(folia.PosAnnotation, set=SET_POS, annotator="BiRCh group")
    doc_o.declare(folia.SyntacticUnit, set=SET_SU, annotator="BiRCh group")

    speech = doc_o.append(folia.Speech)
    for aa in create_conversation(get_aas(doc_i)):
        utterance = speech.append(folia.Utterance,
                                  id=aa[0],
                                  speaker=aa[1],
                                  begintime=aa[2],
                                  endtime=aa[3])
        # The utterance starts with the upper-cased speaker label as a word.
        utterance.append(folia.Word, '{}:'.format(aa[1].upper()))

        for w in get_tokens(aa[4]):
            # handle visibility of tokens in the form of tags
            if len(w) > 1 and w[0] == '<' and w[1] != '$':
                w = '<$' + w[1:]
            token = utterance.append(folia.Word, w)

            if not is_token_mystem(w):
                continue
            analysis_mystem = m.analyze(w)[0]['analysis']
            if not analysis_mystem:
                continue

            # Only the first Mystem analysis is used.
            best = analysis_mystem[0]
            # mystem's lexeme -> lemma annotation (???)
            if 'lex' in best:
                token.append(folia.LemmaAnnotation,
                             cls=best['lex'],
                             set=SET_LEMMA_MYSTEM)
            if 'gr' in best:
                pos_plus = best['gr'].strip()
                pos, features = analyze_mystem_gr(pos_plus)
                an_pos = token.append(folia.PosAnnotation,
                                      head=pos,
                                      cls=pos_plus,
                                      set=SET_POS_MYSTEM)
                # https://pynlpl.readthedocs.io/en/latest/folia.html#features
                an_pos.append(folia.Feature, subset='all', cls=features)

    # Save under the requested output name; the original built this path
    # from f_i, so an explicit f_o was silently ignored.
    doc_o.save(''.join([f_o, '.folia.xml']))
def parse_eaf(eaffile):
    """
    Summarise every tier of an ELAN file by the first annotation it holds.

    For each tier, the first entry of ``tier[0]`` (the first element of the
    tier record stored in ``eaf.tiers``) is read; its first two fields are
    looked up in ``eaf.timeslots`` and its third field is taken as the text.
    Each tier record is also printed, as before.

    Returns a dict mapping tier name to
    ``{"name", "startTime", "endTime", "text"}``. Tiers whose ``tier[0]`` is
    empty are skipped instead of raising.
    """
    eaf = Eaf(eaffile)
    res = {}
    # .items() and print(...) with a single argument behave the same on
    # Python 2 and Python 3, unlike the original iteritems()/print statements.
    for k, v in eaf.tiers.items():
        print("Item")
        print(k)
        print(v)
        annotations = v[0]
        if not annotations:
            # Nothing to summarise for this tier.
            continue
        info = next(iter(annotations.values()))
        res[k] = {
            "name": k,
            "startTime": eaf.timeslots[info[0]],
            "endTime": eaf.timeslots[info[1]],
            "text": info[2],
        }
    return res
def convert(f_i, f_o=None):
    """
    Convert an ELAN transcript into a FoLiA document with morphology.

    f_i: input (ELAN) file (full path, with extension) (str)
    f_o: output (FoLiA) file (full path, with extension) (str).
         Defaults to ``f_i`` with its extension replaced by '.folia.xml'.

    The FoLiA document id is the output file's basename up to its first dot.
    Every annotation is attributed to a single "Mystem+" processor.
    """
    doc_i = Eaf(f_i)
    if not f_o:
        # splitext only strips a real extension of the last path component;
        # rpartition('.') returned '' for extension-less inputs and cut at
        # dots inside directory names.
        f_o = os.path.splitext(f_i)[0] + '.folia.xml'

    # https://foliapy.readthedocs.io/en/latest/folia.html#editing-folia
    # https://foliapy.readthedocs.io/en/latest/folia.html#adding-structure
    # https://foliapy.readthedocs.io/en/latest/folia.html#structure-annotation-types
    id_doc_o = os.path.basename(f_o).partition('.')[0]
    print(id_doc_o)
    doc_o = folia.Document(id=id_doc_o)

    # https://github.com/proycon/folia/blob/master/foliatools/conllu2folia.py
    # https://foliapy.readthedocs.io/en/latest/folia.html#declarations
    # https://foliapy.readthedocs.io/en/latest/folia.html#provenance-information
    # processor_mystem as a single processor for all annotation performed
    # by this script
    processor_mystem = doc_o.declare(folia.LemmaAnnotation,
                                     set=SET_LEMMA,
                                     processor=folia.Processor(name="Mystem+"))
    doc_o.declare(folia.PosAnnotation, set=SET_POS, processor=processor_mystem)
    doc_o.declare(folia.Description, processor=processor_mystem)
    doc_o.declare(folia.Comment, processor=processor_mystem)
    doc_o.declare(folia.Utterance, processor=processor_mystem)
    doc_o.declare(folia.Word, processor=processor_mystem)
    doc_o.declare(folia.Hiddenword)

    # folia.Speech cannot be declared as an annotation type
    speech = doc_o.append(folia.Speech)
    for aa in create_conversation(get_aas(doc_i)):
        utterance = speech.append(folia.Utterance,
                                  id=aa[0],
                                  speaker=aa[1],
                                  begintime=aa[2],
                                  endtime=aa[3],
                                  processor=processor_mystem)
        # The utterance starts with the upper-cased speaker label as a word.
        utterance.append(folia.Word,
                         '{}:'.format(aa[1].upper()),
                         processor=processor_mystem)

        # aa[4]: utterance text
        tokens = get_tokens(aa[4])
        last_index = len(tokens) - 1
        for i, t in enumerate(tokens):
            # The two preceding tokens (None where missing) are handed to the
            # morphological analysis as left context.
            pre_t = [
                tokens[i - 2] if i > 1 else None,
                tokens[i - 1] if i > 0 else None,
            ]
            token = utterance.append(folia.Word, t, processor=processor_mystem)

            # The analysed string also includes the following token, if any.
            if i < last_index:
                t = ' '.join([t, tokens[i + 1]])
            lemma, pos, features = analyze_morphology(pre_t, t)

            if lemma:
                token.append(folia.LemmaAnnotation,
                             cls=lemma,
                             set=SET_LEMMA,
                             processor=processor_mystem)
            if pos:
                an_pos = token.append(folia.PosAnnotation,
                                      cls=pos,
                                      set=SET_POS,
                                      processor=processor_mystem)
                # Features hang off the POS annotation created just above, so
                # they are only added when that annotation exists.
                if features:
                    # https://foliapy.readthedocs.io/en/latest/folia.html#features
                    an_pos.append(folia.Description,
                                  value=features.replace('=', ','),
                                  processor=processor_mystem)
                    an_pos.append(folia.Comment,
                                  value=' '.join(['Mystem+ features:', features]),
                                  processor=processor_mystem)

    doc_o.save(f_o)
class ElanObject:
    """
    Convenience wrapper around a pympi ``Eaf`` document.

    Attributes set by the constructor:
      path             -- the path the document was loaded from
      Eaf              -- the underlying ``Eaf`` object
      tiers_lst        -- ``Tier`` wrappers sorted by their ``ordinal``
      annot_data_lst   -- annotation tuples of top-level tiers with the tier
                          name appended, sorted by their first field
      participants_lst -- title-cased PARTICIPANT values, without duplicates
    """

    def __init__(self, path_to_file):
        """Load the ELAN file at ``path_to_file`` and build the derived lists."""
        self.path = path_to_file
        self.Eaf = Eaf(path_to_file)
        self.Eaf.clean_time_slots()
        self.load_tiers()
        self.load_annotation_data()
        self.load_participants()

    def load_participants(self):
        """Collect the distinct title-cased PARTICIPANT attribute of every tier."""
        participants_lst = []
        for tier_obj in self.tiers_lst:
            try:
                p_title = tier_obj.attributes['PARTICIPANT'].title()
            except KeyError:
                # Tier has no PARTICIPANT attribute.
                continue
            if p_title not in participants_lst:
                participants_lst.append(p_title)
        self.participants_lst = participants_lst

    def load_tiers(self):
        """Rebuild ``tiers_lst`` from the document's tiers, ordered by ordinal."""
        tiers_lst = [
            Tier(tier_name, tier_data)
            for tier_name, tier_data in self.Eaf.tiers.items()
        ]
        self.tiers_lst = sorted(tiers_lst, key=lambda data: data.ordinal)

    def load_annotation_data(self):
        """Gather the annotations of all top-level tiers, sorted by first field."""
        annot_data_lst = []
        for tier_obj in self.tiers_lst:
            if not tier_obj.top_level:
                continue
            for annot_data in self.Eaf.get_annotation_data_for_tier(tier_obj.name):
                # Append the tier name so each tuple records where it came from.
                annot_data_lst.append(annot_data + (tier_obj.name,))
        self.annot_data_lst = sorted(annot_data_lst, key=lambda data: data[0])

    def get_tier_obj_by_name(self, tier_name):
        """Return the first tier whose ``name`` equals ``tier_name``, else None."""
        return next(
            (tier_obj for tier_obj in self.tiers_lst if tier_obj.name == tier_name),
            None,
        )

    def add_extra_tags(self, parent_tier_name, start, end, value, typ):
        """
        Write ``value`` on the child tier of ``parent_tier_name`` chosen by ``typ``.

        ``typ`` is 'annotation' or 'standartization'; any other value makes the
        call a no-op returning None. The child tier is created on first use.
        An annotation found at the midpoint of ``start``/``end`` on that tier
        is removed before the new one is added.
        """
        if typ == 'annotation':
            tier_name = parent_tier_name + '_annotation'
            ling = 'tokenz_and_annot'
        elif typ == 'standartization':
            tier_name = parent_tier_name + '_standartization'
            ling = 'stndz_clause'
        else:
            return None

        if self.get_tier_obj_by_name(tier_name) is None:
            self.Eaf.add_tier(tier_name, ling=ling, parent=parent_tier_name,
                              locale=None, part=None, ann=None,
                              language=None, tier_dict=None)
            # Keep tiers_lst in sync with the newly created tier.
            self.load_tiers()

        try:
            self.Eaf.remove_annotation(tier_name, (start + end) / 2, clean=True)
        except KeyError:
            pass
        self.Eaf.add_annotation(tier_name, start, end, value, svg_ref=None)

    def _remove_backup(self):
        """Delete ``<path>.bak`` if it exists; a missing file is not an error."""
        try:
            os.remove(self.path + '.bak')
        except OSError:
            pass

    def save(self):
        """
        Write the document back to ``self.path`` without leaving a ``.bak`` file.

        A ``.bak`` file is removed both before and after writing. The second
        removal is now tolerant of a missing file, like the first one, so a
        successful save no longer fails when no backup was produced.
        """
        self.Eaf.clean_time_slots()
        self._remove_backup()
        Elan.to_eaf(self.path, self.Eaf, pretty=True)
        self._remove_backup()
def to_eaf(  # pylint: disable=too-many-arguments, too-many-locals, too-complex
        self,
        to_filepath=None,
        eafobj=None,
        linked_media_filepath=None,
        author="rennet.{}".format(rennet_version),
        annotinfo_fn=lambda label: EAFAnnotationInfo(tier_name=str(label)),
):
    """
    Export the labels as ELAN annotations on a pympi ``Eaf`` object.

    Parameters
    ----------
    to_filepath: optional path
        When given, the resulting ``Eaf`` is also written to this path.
    eafobj: optional Eaf
        Existing ``Eaf`` to add to. When None, a new one is created with
        ``author`` and with empty annotations, tiers and timeslots.
    linked_media_filepath: optional path
        File to register as a linked file. A failure to add it is reported
        as a ``RuntimeWarning`` and does not abort the export.
    author: str
        Author recorded on a newly created ``Eaf``.
    annotinfo_fn: callable
        Maps each label to an ``EAFAnnotationInfo``.

    Returns
    -------
    The ``Eaf`` object that was filled.

    Raises
    ------
    AssertionError: if ``annotinfo_fn`` returns something that is not an
        ``EAFAnnotationInfo``.
    ValueError: if two annotations of the same time-slot share a tier name.
    """
    labels = np.array([annotinfo_fn(label) for label in self.labels])
    assert all(
        isinstance(l, EAFAnnotationInfo) for l in labels
    ), "`annotinfo_fn` should return an `EafAnnotationInfo` object for each label"

    # flatten everything
    with self.samplerate_as(1000):  # pympi only supports milliseconds
        se, label_idx = self._flattened_indices()

    # EAF requires integers as starts and ends.
    # `np.int` was only an alias of the builtin `int` and no longer exists in
    # current NumPy; `np.dtype(int)` is the same comparison target.
    # IDEA: Warn rounding?
    if se.dtype != np.dtype(int):
        se = np.rint(se).astype(int)

    if eafobj is None:
        eaf = Eaf(author=author)
        # Start from a completely empty document.
        eaf.annotations = OrderedDict()
        eaf.tiers = OrderedDict()
        eaf.timeslots = OrderedDict()
    else:
        eaf = eafobj

    if linked_media_filepath is not None:
        try:
            eaf.add_linked_file(abspath(linked_media_filepath))
        except Exception:  # pylint: disable=broad-except
            # Best effort: report and carry on, but let KeyboardInterrupt and
            # SystemExit propagate (the original bare except swallowed them).
            warnings.warn(
                RuntimeWarning(
                    "Provided file was not added as linked file due to `pympi` errors. "
                    + "Provided File:\n{}\nError:\n{}".format(
                        linked_media_filepath, sys.exc_info())))

    for (start, end), lix in zip(se, label_idx):
        curr_seen_tier_names = set()
        if lix:
            for ann in labels[lix, ...]:
                if ann.tier_name not in eaf.tiers:
                    # FIXME: handle different participant and annotator for same tier_name
                    eaf.add_tier(ann.tier_name,
                                 part=ann.participant,
                                 ann=ann.annotator)

                if ann.tier_name in curr_seen_tier_names:
                    raise ValueError(
                        "Duplicate annotations on the same tier in "
                        "the same time-slot is not valid in ELAN.\n"
                        "Found at time-slot {} ms \n{}".format(
                            (start, end),
                            "\n".join(map(str, labels[lix, ...])),  # pylint: disable=bad-builtin
                        ))

                eaf.add_annotation(
                    ann.tier_name,
                    start,
                    end,
                    value=ann.content,
                )
                curr_seen_tier_names.add(ann.tier_name)

    if to_filepath is not None:
        eaf.to_file(abspath(to_filepath))

    return eaf
def from_eaf(cls, filepath, tiers=(), **kwargs):  # pylint: disable=too-many-locals
    """ Create instance of SequenceLabels from an ELAN annotation file.

    NOTE: Not all features of ELAN files are supported. For example:
    - Only the aligned annotations are read.
    - No attempt is made to read any external references, linked files, etc.
    - Multi-level tier hierarchies are not respected.

    NOTE: Annotations of duration <= 0 will be ignored.

    Parameters
    ----------
    filepath: path to the ELAN file
    tiers: list or tuple of strings or unary callable
        list or tuple of tier names (as strings) to specify what tiers to be read.
        By default, this is an empty tuple (or list), and all tiers will be read.
        If it is an unary callable (i.e. taking one argument), the function
        will be used as the predicate to filter which tiers should be kept.
    kwargs: unused, present for proper sub-classing citizenship

    Returns
    -------
    - if callee `cls` is not `SequenceLabels` (probably a child class):
        starts_ends: list of `(start, end)` pairs, one per annotation read.
        labels: list of EafAnnotationInfo objects, one per annotation read.
        samplerate: int (1000, due to limits of `pympi`), the samplerate
        **kwargs: passed through keyword arguments `**kwargs`
    - else:
        instance of SequenceLabels

    Raises
    ------
    TypeError: if `tiers` is neither a tuple, a list, nor a callable.
    KeyError: if any of the specified tier names are not available in the given file.
    RuntimeError: if no tiers are found, or if all tiers are empty
    """
    filepath = abspath(filepath)
    eaf = Eaf(file_path=filepath)

    # FIXME: Check if the each element is a string, and support py2 as well.
    if not (isinstance(tiers, (tuple, list)) or callable(tiers)):
        raise TypeError(
            "`tiers` should be a tuple or a list of strings, or a predicate function, got: {}"
            .format(tiers))

    # Warnings about empty tiers only make sense when the caller selected
    # tiers, either by name or through a predicate.
    warnemptytier = True
    if callable(tiers):
        tiers = tuple(name for name in eaf.get_tier_names() if tiers(name))
    else:
        tiers = tuple(tiers)
        if not tiers:  # read all tiers
            tiers = tuple(eaf.get_tier_names())  # method returns dict_keys
            warnemptytier = False

    if not tiers:
        raise RuntimeError(
            "No tiers found in the given file:\n{}".format(filepath))

    starts_ends = []
    labels = []
    samplerate = 1000  # NOTE: pympi only supports annotations in milliseconds

    for tier in tiers:
        annots = eaf.get_annotation_data_for_tier(tier)
        if not annots:
            if warnemptytier:
                warnings.warn(
                    RuntimeWarning(
                        "No annotations found for tier: {} in file\n{}.".
                        format(tier, filepath)))
            continue

        # filter away annotations that are <= zero duration long
        n_rawannots = len(annots)
        valid = [a for a in annots if a[1] > a[0]]
        n_ignored = n_rawannots - len(valid)

        if warnemptytier and n_ignored:
            msg = "IGNORED {} zero- or negative-duration ".format(n_ignored)
            msg += "annotations of {} annotations ".format(n_rawannots)
            msg += "in tier {} in file\n{}".format(tier, filepath)
            warnings.warn(RuntimeWarning(msg))

        if not valid:
            # Every annotation was filtered out. The original indexed into the
            # empty transposed list here and raised IndexError.
            continue

        starts_ends.extend((a[0], a[1]) for a in valid)

        attrs = eaf.tiers[tier][2]  # tier attributes
        annotator = attrs.get('ANNOTATOR', "")
        participant = attrs.get('PARTICIPANT', "")
        # Everything after start and end (symbolic associations, etc.) is
        # kept as the content tuple.
        labels.extend(
            EAFAnnotationInfo(
                tier,
                annotator=annotator,
                participant=participant,
                content=tuple(a[2:]),
            ) for a in valid)

    if not starts_ends:
        raise RuntimeError(
            "All tiers {} were found to be empty in file\n{}".format(
                tiers, filepath))

    if cls is SequenceLabels:
        return cls(starts_ends, labels, samplerate)
    return (starts_ends, labels, samplerate, kwargs)
def _eaf_tier_segments(eaf, tier_name):
    """Return the onset/offset of each annotation in ``eaf.tiers[tier_name][0]``."""
    annotations = eaf.tiers[tier_name][0]
    rows = []
    for start_ts, end_ts, _value, _svg_ref in annotations.values():
        # Annotations store time-slot ids; eaf.timeslots resolves them.
        rows.append({
            'segment_onset': int(eaf.timeslots[start_ts]),
            'segment_offset': int(eaf.timeslots[end_ts]),
        })
    return pd.DataFrame(rows)


def _sorted_bounds(frame):
    """Return the onset/offset columns of ``frame``, sorted and re-indexed."""
    columns = ['segment_onset', 'segment_offset']
    return frame[columns].sort_values(columns).reset_index(drop=True)


def test_periodic(project):
    """
    Check the EAF produced by ``EafBuilderPipeline`` for periodic sampling.

    The 'code_periodic' tier must contain exactly the sampled segments, each
    'VTC-*' tier must match the imported annotations of the corresponding
    speaker type, and the media descriptor must point at 'sound.wav'.
    """
    import os

    # The sampled segments and the EAF are written below this directory.
    os.makedirs('output/eaf', exist_ok=True)

    data = pd.read_csv("tests/data/eaf_segments.csv")

    am = AnnotationManager(project)
    am.import_annotations(
        pd.DataFrame([{
            "set": "vtc",
            "raw_filename": "file.rttm",
            "time_seek": 0,
            "recording_filename": "sound.wav",
            "range_onset": 0,
            "range_offset": 4000,
            "format": "vtc_rttm",
        }]),
        import_function=partial(fake_vocs, data),
    )

    sampler = PeriodicSampler(project, 500, 500, 250, recordings=['sound.wav'])
    sampler.sample()
    sampler.segments.to_csv('output/eaf/segments.csv')

    ranges = sampler.segments.rename(columns={
        "segment_onset": "range_onset",
        "segment_offset": "range_offset",
    })
    annotations = am.get_within_ranges(ranges, [IMP_FROM], 'warn')
    annot_segments = am.get_segments(annotations)

    eaf_builder = EafBuilderPipeline()
    eaf_builder.run(
        destination='output/eaf',
        segments='output/eaf/segments.csv',
        eaf_type='periodic',
        template='basic',
        context_onset=250,
        context_offset=250,
        path='output/eaf',
        import_speech_from='vtc',
    )

    eaf = Eaf('output/eaf/sound/sound_periodic_basic.eaf')

    # The coding tier holds exactly the sampled segments.
    pd.testing.assert_frame_equal(
        _sorted_bounds(_eaf_tier_segments(eaf, 'code_periodic')),
        _sorted_bounds(sampler.segments))

    # Each VTC tier mirrors the imported segments of one speaker type; the
    # speech tier corresponds to rows without a speaker type.
    tier_speakers = (
        ('VTC-SPEECH', None),
        ('VTC-CHI', 'CHI'),
        ('VTC-OCH', 'OCH'),
        ('VTC-FEM', 'FEM'),
    )
    for tier_name, speaker in tier_speakers:
        tier_segments = _eaf_tier_segments(eaf, tier_name)
        if speaker is None:
            expected = annot_segments[pd.isnull(annot_segments['speaker_type'])]
        else:
            expected = annot_segments[annot_segments['speaker_type'] == speaker]
        pd.testing.assert_frame_equal(_sorted_bounds(tier_segments),
                                      _sorted_bounds(expected))

    assert eaf.media_descriptors[0]['MEDIA_URL'] == 'sound.wav'
class ElanObject:
    """
    Convenience wrapper around a pympi ``Eaf`` document.

    Attributes set by the constructor:
      path             -- the path the document was loaded from
      Eaf              -- the underlying ``Eaf`` object
      tiers_lst        -- ``Tier`` wrappers sorted by their ``ordinal``
      annot_data_lst   -- annotation tuples of top-level tiers with the tier
                          name appended, sorted by their first field
      participants_lst -- title-cased PARTICIPANT values, without duplicates
    """

    def __init__(self, path_to_file):
        """Load the ELAN file at ``path_to_file`` and build the derived lists."""
        self.path = path_to_file
        self.Eaf = Eaf(path_to_file)
        self.Eaf.clean_time_slots()
        self.load_tiers()
        self.load_annotation_data()
        self.load_participants()

    def load_participants(self):
        """Collect the distinct title-cased PARTICIPANT attribute of every tier."""
        participants_lst = []
        for tier_obj in self.tiers_lst:
            try:
                p_title = tier_obj.attributes['PARTICIPANT'].title()
            except KeyError:
                # Tier has no PARTICIPANT attribute.
                continue
            if p_title not in participants_lst:
                participants_lst.append(p_title)
        self.participants_lst = participants_lst

    def load_tiers(self):
        """Rebuild ``tiers_lst`` from the document's tiers, ordered by ordinal."""
        tiers_lst = [
            Tier(tier_name, tier_data)
            for tier_name, tier_data in self.Eaf.tiers.items()
        ]
        self.tiers_lst = sorted(tiers_lst, key=lambda data: data.ordinal)

    def load_annotation_data(self):
        """Gather the annotations of all top-level tiers, sorted by first field."""
        annot_data_lst = []
        for tier_obj in self.tiers_lst:
            if not tier_obj.top_level:
                continue
            for annot_data in self.Eaf.get_annotation_data_for_tier(
                    tier_obj.name):
                # Append the tier name so each tuple records where it came from.
                annot_data_lst.append(annot_data + (tier_obj.name, ))
        self.annot_data_lst = sorted(annot_data_lst, key=lambda data: data[0])

    def get_tier_obj_by_name(self, tier_name):
        """Return the first tier whose ``name`` equals ``tier_name``, else None."""
        for tier_obj in self.tiers_lst:
            if tier_obj.name == tier_name:
                return tier_obj
        return None

    def add_extra_tags(self, parent_tier_name, start, end, value, typ):
        """
        Write ``value`` on the child tier of ``parent_tier_name`` chosen by ``typ``.

        ``typ`` is 'annotation' or 'standartization'; any other value makes the
        call a no-op returning None. The child tier is created on first use.
        An annotation found at the midpoint of ``start``/``end`` on that tier
        is removed before the new one is added.
        """
        if typ == 'annotation':
            tier_name = parent_tier_name + '_annotation'
            ling = 'tokenz_and_annot'
        elif typ == 'standartization':
            tier_name = parent_tier_name + '_standartization'
            ling = 'stndz_clause'
        else:
            return None

        if self.get_tier_obj_by_name(tier_name) is None:
            self.Eaf.add_tier(tier_name,
                              ling=ling,
                              parent=parent_tier_name,
                              locale=None,
                              part=None,
                              ann=None,
                              language=None,
                              tier_dict=None)
            # Keep tiers_lst in sync with the newly created tier.
            self.load_tiers()

        try:
            self.Eaf.remove_annotation(tier_name, (start + end) / 2,
                                       clean=True)
        except KeyError:
            pass
        self.Eaf.add_annotation(tier_name, start, end, value, svg_ref=None)

    def _discard_backup(self):
        """Delete ``<path>.bak`` if it exists; a missing file is not an error."""
        try:
            os.remove(self.path + '.bak')
        except OSError:
            pass

    def save(self):
        """
        Write the document back to ``self.path`` without leaving a ``.bak`` file.

        A ``.bak`` file is removed both before and after writing. The second
        removal is now tolerant of a missing file, like the first one, so a
        successful save no longer fails when no backup was produced.
        """
        self.Eaf.clean_time_slots()
        self._discard_backup()
        Elan.to_eaf(self.path, self.Eaf, pretty=True)
        self._discard_backup()