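# The converter classes below rely on the following imports and on the
# EAF_TIME_MULTIPLIER constant. The module paths for Txt2JSON and MediaCutter
# and the value of the constant are assumptions (they follow the usual layout
# of tsakorpus-style source convertors) and may need to be adjusted.
import copy
import html
import itertools
import json
import os
import re

from lxml import etree

from txt2json import Txt2JSON              # assumed module path
from media_operations import MediaCutter   # assumed module path

EAF_TIME_MULTIPLIER = 1000  # assumption: EAF time values are in milliseconds
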
class Eaf2JSON(Txt2JSON):
    """
    Contains methods to make JSONs ready for indexing from
    ELAN aligned files, a csv with metadata and a list with
    parsed word forms.
    """

    mediaExtensions = {'.wav', '.mp3', '.mp4', '.avi'}
    rxSpaces = re.compile('[ \t]+')
    rxLetters = re.compile('\w+')

    def __init__(self, settingsDir='conf'):
        Txt2JSON.__init__(self, settingsDir=settingsDir)
        self.speakerMeta = self.load_speaker_meta()
        self.mc = MediaCutter(settings=self.corpusSettings)
        self.srcExt = 'eaf'
        self.tlis = {}       # time labels
        self.pID = 0         # id of last aligned segment
        self.glosses = set()
        self.participants = {}     # main tier ID -> participant ID
        self.segmentTree = {}      # aID -> (contents, parent aID, tli1, tli2)
        self.segmentChildren = {}  # (aID, child tier type) -> [child aID]

    def load_speaker_meta(self):
        speakerMeta = {}
        if 'speaker_meta_filename' not in self.corpusSettings:
            return speakerMeta
        try:
            f = open(os.path.join('..', self.corpusSettings['speaker_meta_filename']),
                     'r', encoding='utf-8-sig')
            speakerMeta = json.loads(f.read())
            f.close()
        except FileNotFoundError:
            print('The speaker metadata file not found.')
        return speakerMeta

    def get_tlis(self, srcTree):
        """
        Retrieve and return all time labels from the XML tree.
        """
        tlis = {}
        iTli = 0
        for tli in srcTree.xpath('/ANNOTATION_DOCUMENT/TIME_ORDER/TIME_SLOT'):
            timeValue = ''
            if 'TIME_VALUE' in tli.attrib:
                timeValue = tli.attrib['TIME_VALUE']
            tlis[tli.attrib['TIME_SLOT_ID']] = {'n': iTli, 'time': timeValue}
            iTli += 1
        return tlis

    def traverse_tree(self, srcTree, callback):
        """
        Iterate over all tiers in the XML tree and call the callback function
        for each of them.
        """
        for tierNode in srcTree.xpath('/ANNOTATION_DOCUMENT/TIER'):
            if 'TIER_ID' not in tierNode.attrib:
                continue
            callback(tierNode)

    def cb_build_segment_tree(self, tierNode):
        tierType = ''  # analysis tiers: word/POS/gramm/gloss etc.
        if 'analysis_tiers' in self.corpusSettings:
            for k, v in self.corpusSettings['analysis_tiers'].items():
                if not k.startswith('^'):
                    k = '^' + k
                if not k.endswith('$'):
                    k += '$'
                try:
                    rxTierID = re.compile(k)
                    if (rxTierID.search(tierNode.attrib['TIER_ID']) is not None
                            or rxTierID.search(tierNode.attrib['LINGUISTIC_TYPE_REF']) is not None):
                        tierType = v
                        break
                except:
                    print('Except')
        for segNode in tierNode.xpath('ANNOTATION/REF_ANNOTATION | ANNOTATION/ALIGNABLE_ANNOTATION'):
            if 'ANNOTATION_ID' not in segNode.attrib:
                continue
            aID = segNode.attrib['ANNOTATION_ID']
            try:
                segContents = segNode.xpath('ANNOTATION_VALUE')[0].text.strip()
            except AttributeError:
                segContents = ''
            try:
                segParent = segNode.attrib['ANNOTATION_REF']
            except KeyError:
                segParent = None
            tli1, tli2 = None, None
            if 'TIME_SLOT_REF1' in segNode.attrib:
                tli1 = segNode.attrib['TIME_SLOT_REF1']
            elif segParent in self.segmentTree and self.segmentTree[segParent][2] is not None:
                tli1 = self.segmentTree[segParent][2]
            if 'TIME_SLOT_REF2' in segNode.attrib:
                tli2 = segNode.attrib['TIME_SLOT_REF2']
            elif segParent in self.segmentTree and self.segmentTree[segParent][3] is not None:
                tli2 = self.segmentTree[segParent][3]
            self.segmentTree[aID] = (segContents, segParent, tli1, tli2)
            if segParent is None:
                continue
            if len(tierType) > 0:
                try:
                    self.segmentChildren[(segParent, tierType)].append(aID)
                except KeyError:
                    self.segmentChildren[(segParent, tierType)] = [aID]

    def build_segment_tree(self, srcTree):
        """
        Read the entire XML tree and save all segment data (contents, links to
        the parents and timestamps, if any).
        """
        self.segmentTree = {}
        self.segmentChildren = {}
        self.traverse_tree(srcTree, self.cb_build_segment_tree)

    def fragmentize_src_alignment(self, alignment):
        """
        Find corresponding media file fragment and transform a JSON
        dictionary with the information about the alignment.
        """
        fileName, fileExt = os.path.splitext(alignment['src'].lower())
        if fileExt not in self.mediaExtensions:
            return
        ts1 = alignment['off_start_src']
        ts2 = alignment['off_end_src']
        if len(ts1) <= 0 or len(ts2) <= 0:
            return
        ts1frag, ts2frag, srcFileFrag = self.mc.get_media_name(
            alignment['src'],
            float(ts1) / EAF_TIME_MULTIPLIER,
            float(ts2) / EAF_TIME_MULTIPLIER)
        alignment['src'] = srcFileFrag
        alignment['off_start_src'] = str(ts1frag)
        alignment['off_end_src'] = str(ts2frag)

    def add_src_alignment(self, sent, tli1, tli2, srcFile):
        """
        Add the alignment of the sentence with the sound/video. If
        word-level time data is available, align words, otherwise
        align the whole sentence.
        """
        sentAlignments = []
        ts1 = self.tlis[tli1]['time']
        ts2 = self.tlis[tli2]['time']
        sentAlignments.append({'off_start_src': ts1,
                               'off_end_src': ts2,
                               'true_off_start_src': float(ts1) / EAF_TIME_MULTIPLIER,
                               'off_start_sent': 0,
                               'off_end_sent': len(sent['text']),
                               'mtype': 'audio',
                               'src_id': ts1 + '_' + ts2,
                               'src': srcFile})
        for alignment in sentAlignments:
            self.fragmentize_src_alignment(alignment)
        sent['src_alignment'] = sentAlignments

    def add_punc(self, text, startOffset):
        """
        Make one or several punctuation tokens out of the text.
        """
        tokens = []
        curToken = {'wf': '', 'off_start': startOffset, 'off_end': startOffset, 'wtype': 'punc'}
        for i in range(len(text)):
            if self.rxSpaces.search(text[i]) is not None:
                if len(curToken['wf']) > 0:
                    curToken['off_end'] = startOffset + i
                    tokens.append(curToken)
                curToken = {'wf': '', 'off_start': startOffset + i,
                            'off_end': startOffset + i, 'wtype': 'punc'}
            else:
                curToken['wf'] += text[i]
        if len(curToken['wf']) > 0:
            curToken['off_end'] = startOffset + len(text)
            tokens.append(curToken)
        return tokens

    def retrieve_analyses(self, aID, lang=''):
        """
        Compile list of analyses retrieved from the relevant tiers of an
        analyzed EAF file associated with the token identified by aID.
        """
        analyses = []
        analysisTiers = []
        for tierType in ['pos', 'gramm', 'lemma', 'parts', 'gloss', 'trans_ru']:
            if (aID, tierType) not in self.segmentChildren:
                continue
            analysisTiers.append([])
            for childID in self.segmentChildren[(aID, tierType)]:
                if childID not in self.segmentTree:
                    continue
                contents = self.segmentTree[childID][0]
                for ana in self.retrieve_analyses(childID, lang=lang):
                    if tierType == 'lemma':
                        ana['lex'] = contents
                    elif tierType == 'parts':
                        ana['parts'] = contents
                    elif tierType == 'gloss':
                        ana['gloss'] = contents
                    elif tierType == 'trans_ru':
                        # print(contents)
                        if re.findall('[а-яёА-ЯЁ]+', contents):
                            ana['trans_ru'] = re.findall('[а-яёА-ЯЁ._]+', contents)[0].strip('.')
                    elif tierType == 'pos' and len(contents) > 0:
                        ana['gr.pos'] = contents
                    elif tierType == 'gramm':
                        grJSON = self.tp.parser.transform_gramm_str(contents, lang=lang)
                        ana.update(grJSON)
                    analysisTiers[-1].append(ana)
            analysisTiers[-1] = [ana for ana in analysisTiers[-1] if len(ana) > 0]
        if len(analysisTiers) <= 0:
            return [{}]
        for combination in itertools.product(*analysisTiers):
            ana = {}
            for partAna in combination:
                ana.update(partAna)
            if len(ana) > 0:
                self.tp.parser.process_gloss_in_ana(ana)
                analyses.append(ana)
        if len(analyses) <= 0:
            return [{}]
        return analyses

    def retrieve_words(self, text, wordIDs, lang=''):
        """
        Return a list of words with their analyses retrieved from the relevant
        tiers of an analyzed EAF file. Try to align words with the text of
        the entire sentence.
        """
        words = []
        iSentPos = 0
        iBufferStart = 0
        sBuffer = ''
        for iWord in range(len(wordIDs)):
            iWordPos = 0
            word = self.segmentTree[wordIDs[iWord]][0]
            if len(sBuffer) <= 0:
                iBufferStart = iSentPos
            if len(word) <= 0:
                continue
            while iSentPos < len(text) and text[iSentPos].lower() != word[iWordPos].lower():
                sBuffer += text[iSentPos]
                iSentPos += 1
            if len(sBuffer) > 0:
                words += self.add_punc(sBuffer, iBufferStart)
                sBuffer = ''
                iBufferStart = iSentPos
            if iSentPos == len(text):
                print('Unexpected end of sentence:', text)
                return words
            token = {'wf': word, 'off_start': iSentPos,
                     'off_end': iSentPos + len(word), 'wtype': 'word'}
            while iSentPos < len(text) and iWordPos < len(word):
                if text[iSentPos].lower() == word[iWordPos].lower():
                    iSentPos += 1
                    iWordPos += 1
                    continue
                if (self.rxLetters.search(word[iWordPos]) is None
                        and self.rxLetters.search(text[iSentPos]) is not None):
                    iWordPos += 1
                    continue
                iSentPos += 1
            token['off_end'] = iSentPos
            analyses = [ana for ana in self.retrieve_analyses(wordIDs[iWord], lang=lang)
                        if len(ana) > 0]
            if len(analyses) > 0:
                token['ana'] = analyses
            words.append(token)
        if iSentPos < len(text):
            words += self.add_punc(text[iSentPos:], iSentPos)
        return words

    def process_tier(self, tierNode, aID2pID, srcFile, alignedTier=False):
        """
        Extract segments from the tier node and iterate over them,
        returning them as JSON sentences.
        If alignedTier is False, store the start and end timestamps, as well as
        pIDs for alignment, in the dictionary aID2pID. If alignedTier is True,
        use the information from aID2pID for establishing time boundaries of
        the sentences and aligning it with the source tier.
        """
        lang = ''
        # We have to find out what language the tier represents.
        # First, check the tier type. If it is not associated with any language,
        # check all tier ID regexes.
        if 'TIER_ID' not in tierNode.attrib:
            return
        if ('LINGUISTIC_TYPE_REF' in tierNode.attrib
                and tierNode.attrib['LINGUISTIC_TYPE_REF'] in self.corpusSettings['tier_languages']):
            lang = self.corpusSettings['tier_languages'][tierNode.attrib['LINGUISTIC_TYPE_REF']]
        else:
            for k, v in self.corpusSettings['tier_languages'].items():
                if not k.startswith('^'):
                    k = '^' + k
                if not k.endswith('$'):
                    k += '$'
                try:
                    rxTierID = re.compile(k)
                    if rxTierID.search(tierNode.attrib['TIER_ID']) is not None:
                        lang = v
                        break
                except:
                    continue
        if len(lang) <= 0 or lang not in self.corpusSettings['languages']:
            return
        langID = self.corpusSettings['languages'].index(lang)
        speaker = ''
        if not alignedTier and 'PARTICIPANT' in tierNode.attrib:
            speaker = tierNode.attrib['PARTICIPANT']
            self.participants[tierNode.attrib['TIER_ID']] = speaker
        else:
            if ('PARENT_REF' in tierNode.attrib
                    and tierNode.attrib['PARENT_REF'] in self.participants):
                speaker = self.participants[tierNode.attrib['PARENT_REF']]
            elif 'PARTICIPANT' in tierNode.attrib:
                speaker = tierNode.attrib['PARTICIPANT']
        segments = tierNode.xpath('ANNOTATION/REF_ANNOTATION | ANNOTATION/ALIGNABLE_ANNOTATION')
        for segNode in segments:
            if ('ANNOTATION_ID' not in segNode.attrib
                    or segNode.attrib['ANNOTATION_ID'] not in self.segmentTree):
                continue
            segData = self.segmentTree[segNode.attrib['ANNOTATION_ID']]
            if not alignedTier:
                if segData[2] is None or segData[3] is None:
                    continue
                tli1 = segData[2]
                tli2 = segData[3]
            elif segData[1] is not None:
                aID = segData[1]
                pID, tli1, tli2 = aID2pID[aID]
            else:
                continue
            text = segData[0]
            curSent = {'text': text, 'words': None, 'lang': langID,
                       'meta': {'speaker': speaker}}
            if speaker in self.speakerMeta:
                for k, v in self.speakerMeta[speaker].items():
                    curSent['meta'][k] = v
            if (segNode.attrib['ANNOTATION_ID'], 'word') not in self.segmentChildren:
                curSent['words'] = self.tp.tokenizer.tokenize(text)
                self.tp.splitter.add_next_word_id_sentence(curSent)
                self.tp.parser.analyze_sentence(curSent, lang=lang)
            else:
                curSent['words'] = self.retrieve_words(
                    text,
                    self.segmentChildren[(segNode.attrib['ANNOTATION_ID'], 'word')],
                    lang=lang)
                self.tp.splitter.add_next_word_id_sentence(curSent)
            if len(self.corpusSettings['aligned_tiers']) > 0:
                if not alignedTier:
                    self.pID += 1
                    aID = segNode.attrib['ANNOTATION_ID']
                    aID2pID[aID] = (self.pID, tli1, tli2)
                    paraAlignment = {'off_start': 0, 'off_end': len(curSent['text']),
                                     'para_id': self.pID}
                    curSent['para_alignment'] = [paraAlignment]
                else:
                    paraAlignment = {'off_start': 0, 'off_end': len(curSent['text']),
                                     'para_id': pID}
                    curSent['para_alignment'] = [paraAlignment]
            self.add_src_alignment(curSent, tli1, tli2, srcFile)
            yield curSent

    def get_sentences(self, srcTree, srcFile):
        """
        Iterate over sentences in the XML tree.
        """
        # mainTierTypes = '(' + ' | '.join('/ANNOTATION_DOCUMENT/TIER[@LINGUISTIC_TYPE_REF=\'' + x + '\'] | ' +
        #                                  '/ANNOTATION_DOCUMENT/TIER[@TIER_ID=\'' + x + '\']'
        #                                  for x in self.corpusSettings['main_tiers']) + ')'
        # mainTiers = srcTree.xpath(mainTierTypes)
        mainTiers = []
        alignedTiers = []
        for tierNode in srcTree.xpath('/ANNOTATION_DOCUMENT/TIER'):
            for tierRegex in self.corpusSettings['main_tiers']:
                if not tierRegex.startswith('^'):
                    tierRegex = '^' + tierRegex
                if not tierRegex.endswith('$'):
                    tierRegex += '$'
                try:
                    if re.search(tierRegex, tierNode.attrib['TIER_ID']) is not None:
                        mainTiers.append(tierNode)
                        break
                    elif ('LINGUISTIC_TYPE_REF' in tierNode.attrib
                          and re.search(tierRegex, tierNode.attrib['LINGUISTIC_TYPE_REF']) is not None):
                        mainTiers.append(tierNode)
                        break
                except:
                    pass
            for tierRegex in self.corpusSettings['aligned_tiers']:
                if not tierRegex.startswith('^'):
                    tierRegex = '^' + tierRegex
                if not tierRegex.endswith('$'):
                    tierRegex += '$'
                try:
                    if re.search(tierRegex, tierNode.attrib['TIER_ID']) is not None:
                        alignedTiers.append(tierNode)
                        break
                    elif ('LINGUISTIC_TYPE_REF' in tierNode.attrib
                          and re.search(tierRegex, tierNode.attrib['LINGUISTIC_TYPE_REF']) is not None):
                        alignedTiers.append(tierNode)
                        break
                except:
                    pass
        if len(mainTiers) <= 0:
            return
        # if len(self.corpusSettings['aligned_tiers']) > 0:
        #     alignedTierTypes = '(' + ' | '.join('/ANNOTATION_DOCUMENT/TIER[@LINGUISTIC_TYPE_REF=\'' + x + '\'] | ' +
        #                                         '/ANNOTATION_DOCUMENT/TIER[@TIER_ID=\'' + x + '\']'
        #                                         for x in self.corpusSettings['aligned_tiers']) + ')'
        #     alignedTiers = srcTree.xpath(alignedTierTypes)
        aID2pID = {}    # annotation ID -> (pID, tli1, tli2) correspondence
        for tier in mainTiers:
            for sent in self.process_tier(tier, aID2pID, srcFile, alignedTier=False):
                yield sent
        for tier in alignedTiers:
            for sent in self.process_tier(tier, aID2pID, srcFile, alignedTier=True):
                yield sent

    def add_speaker_marks(self, sentences):
        """
        Add the name/code of the speaker in the beginning of every sentence
        that starts the turn.
        """
        prevSpeaker = ''
        for i in range(len(sentences)):
            if 'meta' not in sentences[i] or 'speaker' not in sentences[i]['meta']:
                continue
            speaker = '[' + sentences[i]['meta']['speaker'] + ']'
            addOffset = len(speaker) + 2
            if sentences[i]['meta']['speaker'] != prevSpeaker:
                sentences[i]['text'] = '\n' + speaker + ' ' + sentences[i]['text']
                sentences[i]['words'].insert(0, {'off_start': -len(speaker) - 1,
                                                 'off_end': -1,
                                                 'wf': speaker,
                                                 'wtype': 'punc',
                                                 'next_word': 0})
                sentences[i]['words'].insert(0, {'off_start': -len(speaker) - 2,
                                                 'off_end': -len(speaker) - 1,
                                                 'wf': '\n',
                                                 'wtype': 'punc',
                                                 'next_word': -1})
                for w in sentences[i]['words']:
                    w['off_start'] += addOffset
                    w['off_end'] += addOffset
                    w['next_word'] += 2
                if 'para_alignment' in sentences[i]:
                    for pa in sentences[i]['para_alignment']:
                        if pa['off_start'] > 0:
                            pa['off_start'] += addOffset
                        pa['off_end'] += addOffset
                if 'src_alignment' in sentences[i]:
                    for sa in sentences[i]['src_alignment']:
                        if sa['off_start_sent'] > 0:
                            sa['off_start_sent'] += addOffset
                        sa['off_end_sent'] += addOffset
            prevSpeaker = sentences[i]['meta']['speaker']
            if 'last' in sentences[i] and sentences[i]['last']:
                prevSpeaker = ''

    def add_sentence_meta(self, sentences, meta):
        """
        Add some of the document-level metadata to the sentences.
        """
        for s in sentences:
            if 'meta' not in s:
                continue
            if 'year1' in meta and 'year2' in meta and meta['year1'] == meta['year2']:
                s['meta']['year'] = meta['year1']

    def convert_file(self, fnameSrc, fnameTarget):
        curMeta = self.get_meta(fnameSrc)
        textJSON = {'meta': curMeta, 'sentences': []}
        nTokens, nWords, nAnalyzed = 0, 0, 0
        srcTree = etree.parse(fnameSrc)
        self.tlis = self.get_tlis(srcTree)
        self.build_segment_tree(srcTree)
        srcFileNode = srcTree.xpath('/ANNOTATION_DOCUMENT/HEADER/MEDIA_DESCRIPTOR')
        if len(srcFileNode) > 0 and 'RELATIVE_MEDIA_URL' in srcFileNode[0].attrib:
            srcFile = self.rxStripDir.sub(
                '', html.unescape(srcFileNode[0].attrib['RELATIVE_MEDIA_URL']))
        elif len(srcFileNode) > 0 and 'MEDIA_URL' in srcFileNode[0].attrib:
            srcFile = self.rxStripDir.sub(
                '', html.unescape(srcFileNode[0].attrib['MEDIA_URL']))
        else:
            srcFile = ''
        textJSON['sentences'] = [s for s in self.get_sentences(srcTree, srcFile)]
        textJSON['sentences'].sort(key=lambda s: (s['lang'],
                                                  s['src_alignment'][0]['true_off_start_src']))
        for i in range(len(textJSON['sentences']) - 1):
            # del textJSON['sentences'][i]['src_alignment'][0]['true_off_start_src']
            if textJSON['sentences'][i]['lang'] != textJSON['sentences'][i + 1]['lang']:
                textJSON['sentences'][i]['last'] = True
            for word in textJSON['sentences'][i]['words']:
                nTokens += 1
                if word['wtype'] == 'word':
                    nWords += 1
                    if 'ana' in word and len(word['ana']) > 0:
                        nAnalyzed += 1
        self.tp.splitter.recalculate_offsets(textJSON['sentences'])
        self.tp.splitter.add_next_word_id(textJSON['sentences'])
        self.add_speaker_marks(textJSON['sentences'])
        self.add_sentence_meta(textJSON['sentences'], curMeta)
        self.write_output(fnameTarget, textJSON)
        return nTokens, nWords, nAnalyzed

    def process_corpus(self, cutMedia=False):
        """
        Take every eaf file from the source directory subtree, turn it
        into a parsed json and store it in the target directory.
        """
        Txt2JSON.process_corpus(self)
        if not cutMedia:
            return
        for path, dirs, files in os.walk(os.path.join('..', self.srcExt)):
            for fname in files:
                fileExt = os.path.splitext(fname.lower())[1]
                if fileExt in self.mediaExtensions:
                    fname = os.path.abspath(os.path.join(path, fname))
                    print('Cutting media file', fname)
                    self.mc.cut_media(fname)
class ISO_TEI_Hamburg2JSON(Txt2JSON):
    """
    Contains methods to make JSONs ready for indexing from transcriptions
    aligned in Exmaralda in the format used in documentation projects
    carried out in Hamburg and then translated into a certain ISO TEI subset.
    """

    rxBracketGloss = re.compile('[.-]?\\[.*?\\]')
    rxWordPunc = re.compile('^( *)([^\\w]*)(.*?)([^\\w]*?)( *)$')
    rxLetters = re.compile('\w+')
    rxFloat = re.compile('^[0-9]+(?:\.[0-9]+)?$')
    rxTrailingZeroes = re.compile('^0+(?=[1-9])|\.0+$')
    rxNonDigit = re.compile('[^0-9]+')
    mediaExtensions = {'.wav', '.mp3', '.mp4', '.avi'}
    sentenceEndPunct = {'declarative': '.', 'interrogative': '?'}
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0',
                  'xml': 'http://www.w3.org/XML/1998/namespace'}
    pfx_xml = '{http://www.w3.org/XML/1998/namespace}'
    pfx_tei = '{http://www.tei-c.org/ns/1.0}'

    def __init__(self, settingsDir='conf'):
        Txt2JSON.__init__(self, settingsDir=settingsDir)
        self.mc = MediaCutter(settings=self.corpusSettings)
        self.srcExt = 'xml'        # extension of the source files to be converted
        self.participants = {}     # participant ID -> dictionary of properties
        self.tlis = {}             # time labels (id -> {'n': number, 'time': time value})
        self.wordsByID = {}        # word ID -> word object
        self.morph2wordID = {}     # morph ID -> (word ID, position in the word)
        self.pID = 0               # id of last aligned segment
        self.seg2pID = {}          # ids of <seg> tags -> parallel IDs of corresponding sentences
        self.wordIDseq = []        # sequence of word/punctuation/incident IDs
                                   # (needed to understand ranges such as "w13 to inc2")
        self.glosses = set()
        self.posRules = {}
        self.load_pos_rules(os.path.join(self.corpusSettings['corpus_dir'], 'conf/posRules.txt'))

    def load_pos_rules(self, fname):
        """
        Load mapping of the POS tags used in the source files to your corpus POS tags.
        """
        if len(fname) <= 0 or not os.path.isfile(fname):
            return
        rules = {}
        f = open(fname, 'r', encoding='utf-8-sig')
        for line in f:
            line = line.strip('\r\n')
            if len(line) > 0:
                rule = [i.strip() for i in line.split('\t')]
                if len(rule) != 2:
                    continue
                rules[rule[0]] = rule[1]
        f.close()
        self.posRules = rules

    def load_speaker_meta(self, srcTree):
        speakerMeta = {}
        if 'speaker_meta_filename' in self.corpusSettings:
            try:
                f = open(os.path.join(self.corpusSettings['corpus_dir'],
                                      self.corpusSettings['speaker_meta_filename']),
                         'r', encoding='utf-8-sig')
                speakerMeta = json.loads(f.read())
                f.close()
            except FileNotFoundError:
                self.log_message('The speaker metadata file not found.')
        else:
            for speaker in srcTree.xpath('/tei:TEI/tei:teiHeader/tei:profileDesc/tei:particDesc/tei:person',
                                         namespaces=self.namespaces):
                if self.pfx_xml + 'id' not in speaker.attrib:
                    continue
                speakerID = speaker.attrib[self.pfx_xml + 'id']
                if 'n' in speaker.attrib:
                    speakerCode = speaker.attrib['n']
                else:
                    speakerCode = speakerID
                speakerMeta[speakerID] = {'speaker': speakerCode}
                if 'sex' in speaker.attrib:
                    if speaker.attrib['sex'] in ['1', 'M']:
                        speakerMeta[speakerID]['gender'] = 'M'
                    elif speaker.attrib['sex'] in ['2', 'F']:
                        speakerMeta[speakerID]['gender'] = 'F'
                    else:
                        speakerMeta[speakerID]['gender'] = speaker.attrib['sex']
                if 'age' in speaker.attrib:
                    speakerMeta[speakerID]['age'] = speaker.attrib['age']
                if 'role' in speaker.attrib:
                    speakerMeta[speakerID]['role'] = speaker.attrib['role']
        return speakerMeta
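    # For reference (an illustrative assumption, values invented): whether read
    # from the JSON file or built from the TEI header, the returned dictionary
    # maps participant IDs to property dictionaries such as
    #     {"SPK0": {"speaker": "AB", "gender": "F", "age": "42", "role": "interviewee"}}
    # The property names follow the code above ('speaker', 'gender', 'age', 'role').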
""" tlis = {} iTli = 0 for tli in srcTree.xpath('/tei:TEI/tei:text/tei:timeline', namespaces=self.namespaces)[0]: timeValue = tli.attrib[self.pfx_xml + 'id'] if 'interval' in tli.attrib: timeValue = tli.attrib['interval'] elif tli.attrib[self.pfx_xml + 'id'] in ['T0', 'T_START']: timeValue = '0' timeValue = self.rxTrailingZeroes.sub('', timeValue) tlis[tli.attrib[self.pfx_xml + 'id']] = { 'n': iTli, 'time': timeValue } iTli += 1 return tlis def id_range2list(self, idFrom, idTo): """ Turn a range of word/punctuation/incident (such as "w13 to inc2") IDs into a list of consecutive IDs. """ if idFrom not in self.wordIDseq or idTo not in self.wordIDseq: return [] return self.wordIDseq[self.wordIDseq. index(idFrom):self.wordIDseq.index(idTo) + 1] def add_pos_ana(self, ana, pos): """ Add the part of speech tag to single JSON analysis, taking into account the correspondences between source file tags and the target corpus tags. Change the analysis, do not return anything. """ if pos in self.posRules: pos = self.posRules[pos] if 'gr.pos' not in ana: ana['gr.pos'] = pos elif type(ana['gr.pos']) == str and ana['gr.pos'] != pos: ana['gr.pos'] = [ana['gr.pos'], pos] elif pos not in ana['gr.pos']: ana['gr.pos'].append(pos) def collect_annotation(self, annoTree): """ Return a dictionary that contains all word-level annotation events within an annotation block, the keys are word IDs. """ wordAnno = {} for tier in annoTree.xpath('tei:spanGrp', namespaces=self.namespaces): if 'type' not in tier.attrib: continue tierID = tier.attrib['type'] prevWordID = '-1' curWordNMorphs = 0 for wSpan in tier: if 'from' not in wSpan.attrib or 'to' not in wSpan.attrib: continue spanIDs = [wSpan.attrib['from']] wSpanTexts = [wSpan.text] if wSpan.attrib['from'] != wSpan.attrib['to']: # continue if (wSpan.attrib['from'].startswith(('w', 'pc', 'inc')) and wSpan.attrib['to'].startswith( ('w', 'pc', 'inc'))): # Some tiers, such as information structure, allow spans that include # multiple words. In this case, assign the value to each of the words # in the span in case of annotation tiers. However, if the tier is # SpeakerContribution_Event, try to split it into words so that each # word gets a corresponding part of the value. 
if tierID == 'SpeakerContribution_Event' and wSpan.text is not None: wSpanParts = re.findall('[^ ]+ *', wSpan.text) wSpanTexts = [] iSpanPart = 0 spanIDs = self.id_range2list(wSpan.attrib['from'], wSpan.attrib['to']) for wID in spanIDs: if tierID == 'SpeakerContribution_Event' and wSpan.text is not None: if iSpanPart < len(wSpanParts): wSpanTexts.append(wSpanParts[iSpanPart]) else: wSpanTexts.append('') iSpanPart += 1 else: wSpanTexts.append(wSpan.text) if wSpan.text is not None: self.log_message('Warning: span[from] = ' + wSpan.attrib['from'] + ', span[to] = ' + wSpan.attrib['to'] + ', text = "' + wSpan.text + '".') else: self.log_message('Warning: span[from] = ' + wSpan.attrib['from'] + ', span[to] = ' + wSpan.attrib['to'] + ', text is empty.') else: continue for spanID in spanIDs: wSpanText = wSpanTexts.pop(0) if spanID.startswith('seg'): continue elif spanID.startswith('w'): wordID = spanID elif spanID.startswith('inc'): wordID = spanID elif spanID.startswith('m'): wordID = self.morph2wordID[spanID][0] else: continue if wordID != prevWordID: prevWordID = wordID curWordNMorphs = 0 if wordID not in wordAnno: wordAnno[wordID] = {} if self.pfx_xml + 'id' in wSpan.attrib: self.morph2wordID[wSpan.attrib[self.pfx_xml + 'id']] = ( wordID, curWordNMorphs) curWordNMorphs += 1 if wSpanText is not None: wordAnno[wordID][tierID] = wSpanText else: wordAnno[wordID][tierID] = '' elif tierID not in ['mb', 'mp', 'ge', 'gr']: # Word-based annotations: one flat span for each word if tierID not in wordAnno[wordID]: wordAnno[wordID][tierID] = '' if len(wordAnno[wordID][tierID]) > 0: wordAnno[wordID][tierID] += '-' if wSpanText is not None: wordAnno[wordID][tierID] += wSpanText else: # Multiple morphemes inside one span in e.g. the mb tier wordAnno[wordID][tierID] = '' for mSpan in wSpan: mText = mSpan.text if self.pfx_xml + 'id' in mSpan.attrib: mID = mSpan.attrib[self.pfx_xml + 'id'] elif ('from' in mSpan.attrib and 'to' in mSpan.attrib and mSpan.attrib['from'] == mSpan.attrib['to']): mID = mSpan.attrib['from'] else: # continue mID = wordID + '_covert' # categories not expressed overtly if mText is None: self.log_message( 'Empty morpheme description cell: word ID ' + wordID + ', tier ' + tierID + '.') continue mText = '[' + mText + ']' # if 'mb' not in wordAnno[wordID]: # wordAnno[wordID]['mb'] = '∅' # elif mID not in self.morph2wordID: # wordAnno[wordID]['mb'] += '-∅' # if 'mp' not in wordAnno[wordID]: # wordAnno[wordID]['mp'] = '∅' # elif mID not in self.morph2wordID: # wordAnno[wordID]['mp'] += '-∅' self.morph2wordID[mID] = (wordID, curWordNMorphs) curWordNMorphs += 1 if tierID not in wordAnno[wordID]: wordAnno[wordID][tierID] = '' if len(wordAnno[wordID][tierID]) > 0: wordAnno[wordID][tierID] += '-' if mText is not None: wordAnno[wordID][tierID] += mText else: wordAnno[wordID][tierID] += '∅' return wordAnno def add_ana_fields(self, ana, curWordAnno): """ Add the information from the annotation tier events for the current word to the analysis. For each tier, the name of the tier is the used as the name of the field, and the text of the event is used as the value. """ for tierName in curWordAnno: if tierName in [ 'tx', 'mb', 'mp', 'gr', 'ge', 'ps', 'SpeakerContribution_Event' ]: continue elif len(curWordAnno[tierName]) > 0: ana[tierName] = curWordAnno[tierName] def process_words(self, annoTree): """ Iterate over words in an annotation block and add their analyses to the corresponding word objects in the sentences. 
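    # For orientation (an illustrative assumption, values invented):
    # collect_annotation() returns a dictionary of the form
    #     {"w1": {"mb": "kala-t", "mp": "kala-t", "ge": "fish-PL", "ps": "n"},
    #      "w2": {...}}
    # i.e. word IDs mapped to {tier ID -> concatenated span text}.
    # process_words() below turns each of these records into a JSON analysis.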
""" wordAnno = self.collect_annotation(annoTree) for wordID in wordAnno: ana = {} curWordAnno = wordAnno[wordID] # mp: morph breaks with empty morphemes (corresponds to the mc tier: POS and morph categories) # mb: morph breaks without empty morphemes (corresponds to the gr/ge tiers: actual glosses) if 'ge' in curWordAnno: ana['gloss'] = curWordAnno['ge'] self.glosses |= set(g for g in ana['gloss'].split('-') if g.upper() == g) if 'mp' in curWordAnno: # mp contains normalized versions of morphemes. If this tier exists, # take normalized stem from it and make it a lemma. Then forget mp # and write glosses based on the mb tier, if it exists. ana['parts'] = curWordAnno['mp'] self.tp.parser.process_gloss_in_ana(ana) if 'gloss_index' in ana: stems, newIndexGloss = self.tp.parser.find_stems( ana['gloss_index'], self.corpusSettings['languages'][0]) ana['lex'] = ' '.join(s[1] for s in stems) if 'mb' in curWordAnno: ana['parts'] = curWordAnno['mb'] if 'gr' in curWordAnno: ana['gloss_ru'] = curWordAnno['gr'] self.tp.parser.process_gloss_in_ana(ana, 'ru') if 'ps' in curWordAnno: self.add_pos_ana(ana, curWordAnno['ps']) self.tp.parser.process_gloss_in_ana(ana) if 'gloss_index' in ana: stems, newIndexGloss = self.tp.parser.find_stems( ana['gloss_index'], self.corpusSettings['languages'][0]) if 'lex' not in ana: ana['lex'] = ' '.join(s[1] for s in stems) ana['trans_en'] = self.rxBracketGloss.sub( '', ' '.join(s[0] for s in stems)) self.add_ana_fields(ana, curWordAnno) self.tp.parser.gloss2gr(ana, self.corpusSettings['languages'][0]) ana['gloss_index'] = self.rxBracketGloss.sub('', newIndexGloss) if 'gloss_index_ru' in ana: stems, newIndexGloss = self.tp.parser.find_stems( ana['gloss_index_ru'], self.corpusSettings['languages'][0]) ana['trans_ru'] = self.rxBracketGloss.sub( '', ' '.join(s[0] for s in stems)) del ana['gloss_index_ru'] del ana['gloss_ru'] if 'glosses_covert_ru' in ana: del ana['glosses_covert_ru'] if 'gloss' in ana: ana['gloss'] = self.rxBracketGloss.sub('', ana['gloss']) self.wordsByID[wordID]['ana'] = [ana] self.wordsByID[wordID]['word_source'] = '' if 'SpeakerContribution_Event' in curWordAnno: self.wordsByID[wordID]['word_source'] = curWordAnno[ 'SpeakerContribution_Event'] def fragmentize_src_alignment(self, alignment): """ Find corresponding media file fragment and transform a JSON dictionary with the information about the alignment. """ fileName, fileExt = os.path.splitext(alignment['src'].lower()) if fileExt not in self.mediaExtensions: return ts1 = alignment['off_start_src'] ts2 = alignment['off_end_src'] if len(ts1) <= 0 or len(ts2) <= 0: return ts1frag, ts2frag, srcFileFrag = self.mc.get_media_name( alignment['src'], float(ts1), float(ts2)) alignment['src'] = srcFileFrag alignment['off_start_src'] = str(ts1frag) alignment['off_end_src'] = str(ts2frag) def add_src_alignment(self, sent, sentBoundaries, srcFile): """ Add the alignment of the sentence with the sound/video. 
""" alignment = { 'off_start_src': self.tlis[sentBoundaries[0]]['time'], 'off_end_src': self.tlis[sentBoundaries[1]]['time'], 'off_start_sent': 0, 'off_end_sent': len(sent['text']), 'mtype': 'audio', 'src_id': sentBoundaries[0] + '_' + sentBoundaries[1], 'src': srcFile } if (self.rxFloat.search(alignment['off_start_src']) is None or self.rxFloat.search(alignment['off_end_src']) is None): return self.fragmentize_src_alignment(alignment) sent['src_alignment'] = [alignment] def get_parallel_sentences(self, srcTree, sentBoundaries, srcFile): """ Iterate over sentences in description tiers aligned with the sentence in the main tx tier. The sentence to align with is defined by the tuple sentBoundaries that contains the start and the end time label for the sentence. """ self.pID += 1 for iTier in range(len(self.corpusSettings['translation_tiers'])): tierName = self.corpusSettings['translation_tiers'][iTier] events = srcTree.xpath('/tei:basic-transcription/tei:basic-body/' 'tei:tier[@xml:id=\'' + tierName + '\']/' 'tei:event[@tei:start=\'' + sentBoundaries[0] + '\' and @tei:end=\'' + sentBoundaries[1] + '\']', namespaces=self.namespaces) for event in events: text = '' for child in event: if child.tail is not None: text += child.tail if len(text) <= 0: text = event.text if text is None or len(text) <= 0: continue text = self.tp.cleaner.clean_text(text) words = self.tp.tokenizer.tokenize(text) paraAlignment = { 'off_start': 0, 'off_end': len(text), 'para_id': self.pID } paraSent = { 'words': words, 'text': text, 'para_alignment': [paraAlignment], 'lang': len(self.corpusSettings['languages']) + iTier } self.add_src_alignment(paraSent, sentBoundaries, srcFile) yield paraSent def get_segment_words(self, segment): """ Extract all words and punctuation from a <seg> node. Return list of words and fill the self.wordsByID dictionary ({word ID -> word object in the list}). """ wordList = [] prevTag = '' for wordNode in segment: if wordNode in (self.pfx_tei + 'w', self.pfx_tei + 'pc' ) and self.pfx_xml + 'id' not in wordNode.attrib: continue try: wordID = wordNode.attrib[self.pfx_xml + 'id'] except KeyError: continue if wordNode.tag == self.pfx_tei + 'w': # if prevTag == self.pfx_tei + 'w' and len(wordList) > 0: # # If there is no time anchor between two words, # # treat it as a single token divided by a word-internal whitespace. # # TODO: This is a temporary solution. Changes have to be made # # to the source Exmaralda files to avoid splitting such words. # wordList[-1]['wf'] += ' ' + wordNode.text.strip() # self.wordsByID[wordNode.attrib[self.pfx_xml + 'id']] = wordList[-1] # print('Warning: consecutive words with no acnhor between them (' + wordList[-1]['wf'] + ')') # else: word = {'wf': wordNode.text.strip(), 'wtype': 'word'} wordList.append(word) self.wordsByID[wordID] = word self.wordIDseq.append(wordID) elif wordNode.tag == self.pfx_tei + 'pc': word = {'wf': wordNode.text.strip(), 'wtype': 'punct'} wordList.append(word) self.wordsByID[wordID] = word elif wordNode.tag == self.pfx_tei + 'incident': # Treat "incidents" as punctuation # continue word = { 'wf': '((' + wordNode[0].text.strip() + '))', 'wtype': 'punct', 'incident': True } wordList.append(word) self.wordsByID[wordID] = word self.wordIDseq.append(wordID) prevTag = wordNode.tag return wordList def align_words_and_baseline(self, sent): """ Fill in the offset fields for individual words in the sentence. 
""" iSentPos = 0 for iWord in range(len(sent['words'])): iWordPos = 0 word = sent['words'][iWord] wf = word['wf'] if len(wf) <= 0: continue # if 'incident' in word: # sent['text'] = sent['text'][:iSentPos] + ' ' + wf + ' ' + sent['text'][iSentPos:] while (iSentPos < len(sent['text']) and sent['text'][iSentPos].lower() != wf[iWordPos].lower()): iSentPos += 1 if iSentPos == len(sent['text']): if iWord == 0 and word['wtype'] == 'punct': # Try repairing it by inserting that punctuation to the sentence text sent['text'] = wf + sent['text'] iSentPos = 0 print( 'Unexpected end of sentence, attempting to repair sentence text. ' 'Details:\nSentence (SpeakerContribution_Event):', sent['text'], '\nWords (annotationBlock/u/seg):', '+'.join(w['wf'] for w in sent['words'])) else: for iWordRest in range(iWord, len(sent['words'])): sent['words'][iWordRest]['off_start'] = len( sent['text']) - 1 sent['words'][iWordRest]['off_end'] = len( sent['text']) - 1 word['off_end'] = len(sent['text']) - 1 print( 'Unexpected end of sentence, terminating alignment now. ' 'Details:\nSentence (SpeakerContribution_Event):', sent['text'], '\nWords (annotationBlock/u/seg):', '+'.join(w['wf'] for w in sent['words'])) return word['off_start'] = iSentPos word['off_end'] = iSentPos + len(wf) while iSentPos < len(sent['text']) and iWordPos < len(wf): if sent['text'][iSentPos].lower() == wf[iWordPos].lower(): iSentPos += 1 iWordPos += 1 continue if self.rxLetters.search( wf[iWordPos]) is None and self.rxLetters.search( sent['text'][iSentPos]) is not None: iWordPos += 1 continue iSentPos += 1 word['off_end'] = iSentPos if len(sent['words']) > 0 and sent['words'][0]['off_start'] > 0: # Add the beginning of the sentence as punctuation. leadingPunct = { 'wf': sent['text'][:sent['words'][0]['off_start']], 'wtype': 'punct', 'off_start': 0, 'off_end': sent['words'][0]['off_start'] } sent['words'].insert(0, leadingPunct) def add_full_text(self, anno, curSentences, tierName=''): """ Add full texts of the sentences from the tier requested (ts stands for the main text tier). Find relevant sentences based on the time anchors. If there is no such tier, restore the text of the sentence from the word_source properties of individual words. Do not return anything. 
""" seg2text = {} # (from, to) -> sentence text for spanGr in anno.xpath('tei:spanGrp', namespaces=self.namespaces): if 'type' in spanGr.attrib and spanGr.attrib['type'] == tierName: for span in spanGr.xpath('tei:span', namespaces=self.namespaces): if 'from' not in span.attrib or 'to' not in span.attrib: continue if span.attrib['from'] != span.attrib['to']: self.log_message( '"from" attribute != "to" attribute: ' + span.attrib['from'] + '; ' + span.attrib['to']) if span.attrib['from'] not in self.seg2pID: self.log_message('Wrong "from" attribute: ' + span.attrib['from']) continue if span.attrib['to'] not in self.seg2pID: self.log_message('Wrong "to" attribute: ' + span.attrib['to']) continue spanText = span.text if spanText is None: spanText = '' seg2text[( self.seg2pID[span.attrib['from']], self.seg2pID[span.attrib['from']])] = spanText.strip() for s in curSentences: if 'para_alignment' not in s or len(s['para_alignment']) <= 0: continue paraID = (s['para_alignment'][0]['para_id'], s['para_alignment'][0]['para_id']) if 'text' not in s: s['text'] = '' if paraID in seg2text: s['text'] += seg2text[paraID] else: for w in s['words']: if 'word_source' in w: s['text'] += w['word_source'] del w['word_source'] s['text'] = s['text'].strip(' \t') if 'src_alignment' in s: for sa in s['src_alignment']: sa['off_end_sent'] = len(s['text']) def add_para_offsets(self, sentences): """ Add character offsets to the parallel alignments of each of the sentences. Do not return anything. """ for s in sentences: if 'para_alignment' not in s: continue for para in s['para_alignment']: para['off_start'] = 0 para['off_end'] = len(s['text']) def get_sentences(self, srcTree, srcFile): """ Iterate over sentences in the XML tree. """ annotations = srcTree.xpath( '/tei:TEI/tei:text/tei:body/tei:annotationBlock', namespaces=self.namespaces) if len(annotations) <= 0: return for anno in annotations: firstSentence = False if len(annotations) > 1: firstSentence = True curSentences = [] paraSentences = { } # tier name -> parallel sentences (translations, alternative transcriptions, etc.) 
sentMeta = {} if 'start' not in anno.attrib or 'end' not in anno.attrib: self.log_message( 'No start or end attribute in annotationBlock ' + anno.attrib[self.pfx_xml + 'id']) continue if 'who' in anno.attrib and anno.attrib['who'] in self.participants: sentMeta = self.participants[anno.attrib['who']] curAnchor = prevAnchor = anno.attrib['start'] endAnchor = anno.attrib['end'] curSent = None for u in anno.xpath('tei:u', namespaces=self.namespaces): for seg_anchor in u: if seg_anchor.tag == self.pfx_tei + 'anchor' and 'synch' in seg_anchor.attrib: curAnchor = seg_anchor.attrib['synch'] if curSent is not None: self.add_src_alignment(curSent, [prevAnchor, curAnchor], srcFile) prevAnchor = curAnchor elif (seg_anchor.tag == self.pfx_tei + 'seg' and self.pfx_xml + 'id' in seg_anchor.attrib): if curSent is not None: curSentences.append(curSent) self.pID += 1 segID = seg_anchor.attrib[self.pfx_xml + 'id'] self.seg2pID[segID] = self.pID curSent = { 'words': self.get_segment_words(seg_anchor), 'text': '', 'lang': 0, 'para_alignment': [{ 'para_id': self.pID }] } if firstSentence and 'who' in anno.attrib and anno.attrib[ 'who'] in self.participants: firstSentence = False curSent['words'].insert( 0, { 'wtype': 'punct', 'wf': '[' + self.participants[anno.attrib['who']] ['speaker'] + ']' }) curSent['words'].insert(0, { 'wtype': 'punct', 'wf': '\n' }) curSent['text'] = '\n[' + self.participants[ anno.attrib['who']]['speaker'] + '] ' if len(sentMeta) > 0: curSent['meta'] = copy.deepcopy(sentMeta) if curSent is not None: curSentences.append(curSent) if curSent is not None: self.add_src_alignment(curSent, [curAnchor, endAnchor], srcFile) self.process_words(anno) self.add_full_text(anno, curSentences) self.add_para_offsets(curSentences) for tierName in self.corpusSettings['tier_languages']: lang = self.corpusSettings['tier_languages'][tierName] langID = self.corpusSettings['languages'].index(lang) if langID == 0: continue paraSentences[tierName] = [] for sent in curSentences: paraSent = { 'words': [], 'text': '', 'lang': langID, 'para_alignment': copy.deepcopy(sent['para_alignment']) } if 'src_alignment' in sent: paraSent['src_alignment'] = copy.deepcopy( sent['src_alignment']) paraSentences[tierName].append(paraSent) self.add_full_text(anno, paraSentences[tierName], tierName) for sent in curSentences: if len(sent['text']) <= 0: self.log_message( 'Zero length sentence: ' + json.dumps(sent, ensure_ascii=False, indent=None)) continue self.align_words_and_baseline(sent) yield sent for tierName in paraSentences: for paraSent in paraSentences[tierName]: if len(paraSent['text']) <= 0: paraSent['words'] = [{ 'wf': '—', 'wtype': 'punct', 'off_start': 0, 'off_end': 1 }] paraSent['text'] = '—' else: paraSent['words'] = self.tp.tokenizer.tokenize( paraSent['text']) paraSent['para_alignment'][0]['off_end'] = len( paraSent['text']) yield paraSent def convert_file(self, fnameSrc, fnameTarget): """ Take one source Exmaralda file fnameSrc, parse the XML tree, extract timestamps, align sentences with words and their analyses and ultimately generate a parsed JSON file ready for indexing. Write the output to fnameTarget. Return number of tokens, number of words and number of words with at least one analysis in the document. 
""" # curMeta = self.get_meta(fnameSrc) # Currently, no metadata are loaded: print(fnameSrc) curMeta = { 'title': fnameSrc, 'author': '', 'year1': '1900', 'year2': '2017' } textJSON = {'meta': curMeta, 'sentences': []} nTokens, nWords, nAnalyze = 0, 0, 0 self.seg2pID = {} self.morph2wordID = {} self.wordIDseq = [] srcTree = etree.parse(fnameSrc) self.tlis = self.get_tlis(srcTree) self.participants = self.load_speaker_meta(srcTree) srcFileNode = srcTree.xpath( '/tei:TEI/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:recordingStmt/tei:recording/tei:media', namespaces=self.namespaces) if len(srcFileNode) > 0 and 'url' in srcFileNode[0].attrib: srcFile = self.rxStripDir.sub('', srcFileNode[0].attrib['url']) else: srcFile = '' textJSON['sentences'] = [ s for s in self.get_sentences(srcTree, srcFile) ] textJSON['sentences'].sort(key=lambda s: s['lang']) for i in range(len(textJSON['sentences']) - 1): if textJSON['sentences'][i]['lang'] != textJSON['sentences'][ i + 1]['lang']: textJSON['sentences'][i]['last'] = True self.tp.splitter.recalculate_offsets(textJSON['sentences']) self.tp.splitter.add_next_word_id(textJSON['sentences']) self.write_output(fnameTarget, textJSON) return nTokens, nWords, nAnalyze def process_corpus(self, cutMedia=True): """ Take every Exmaralda file from the source directory subtree, turn it into a parsed json and store it in the target directory. Split all the corpus media files into overlapping chunks of small duration. This is the main function of the class. """ Txt2JSON.process_corpus(self) if not cutMedia: return for path, dirs, files in os.walk( os.path.join(self.corpusSettings['corpus_dir'], self.srcExt)): for fname in files: fileExt = os.path.splitext(fname.lower())[1] if fileExt in self.mediaExtensions: fname = os.path.abspath(os.path.join(path, fname)) print('Cutting media file', fname) self.mc.cut_media(fname)
class Eaf2JSON(Txt2JSON):
    """
    Contains methods to make JSONs ready for indexing from
    ELAN aligned files, a csv with metadata and a list with
    parsed word forms.
    """

    mediaExtensions = {'.wav', '.mp3', '.mp4', '.avi', '.mov', '.mts'}
    rxSpaces = re.compile('[ \t]+')
    rxLetters = re.compile('\w+')
    bracketPairs = {
        ']': re.compile('\\[[^ \\]]*$'),
        ')': re.compile('\\([^ \\)]*$'),
        '>': re.compile('<[^ >]*$'),
        '}': re.compile('\\{[^ \\}]*$'),
    }
    standardAnaTiers = ['pos', 'gramm', 'lemma', 'parts', 'gloss']

    def __init__(self, settingsDir='conf_conversion'):
        Txt2JSON.__init__(self, settingsDir=settingsDir)
        self.speakerMeta = self.load_speaker_meta()
        self.mc = MediaCutter(settings=self.corpusSettings)
        self.srcExt = 'eaf'
        self.tlis = {}       # time labels
        self.pID = 0         # id of last aligned segment
        self.glosses = set()
        self.participants = {}     # main tier ID -> participant ID
        self.segmentTree = {}      # aID -> (contents, parent aID, tli1, tli2)
        self.segmentChildren = {}  # (aID, child tier type) -> [child aID]
        self.spanAnnoTiers = {}    # span annotation tier type -> {tier ID -> [(tli1, tli2, contents)]}
        self.alignedSpanAnnoTiers = {}  # aID of a segment -> {span annotation tier ID -> contents}
        self.additionalWordFields = []  # names of additional word-level fields associated with some analysis tiers
        self.privacySegments = {}  # segments (start_ms, end_ms) that should be beeped out, one list per source file
        self.rxIgnoreTokens = None
        self.set_ignore_tokens()
        self.usedMediaFiles = set()     # filenames of media fragments referenced in the JSONs

    def set_ignore_tokens(self):
        """
        Compile regexes for tokens which should be ignored when aligning
        the token tier with the text tier.
        """
        if 'ignore_tokens' not in self.corpusSettings:
            self.corpusSettings['ignore_tokens'] = ''
        if not self.corpusSettings['ignore_tokens'].startswith('^'):
            self.corpusSettings['ignore_tokens'] = '^' + self.corpusSettings['ignore_tokens']
        if not self.corpusSettings['ignore_tokens'].endswith('$'):
            self.corpusSettings['ignore_tokens'] += '$'
        try:
            self.rxIgnoreTokens = re.compile(self.corpusSettings['ignore_tokens'])
        except:
            print('Please check your ignore token regex.')

    def load_speaker_meta(self):
        speakerMeta = {}
        if 'speaker_meta_filename' not in self.corpusSettings:
            return speakerMeta
        try:
            f = open(os.path.join(self.corpusSettings['corpus_dir'],
                                  self.corpusSettings['speaker_meta_filename']),
                     'r', encoding='utf-8-sig')
            speakerMeta = json.loads(f.read())
            f.close()
        except FileNotFoundError:
            print('The speaker metadata file not found.')
        return speakerMeta

    def get_tlis(self, srcTree):
        """
        Retrieve and return all time labels from the XML tree.
        """
        tlis = {}
        iTli = 0
        for tli in srcTree.xpath('/ANNOTATION_DOCUMENT/TIME_ORDER/TIME_SLOT'):
            timeValue = ''
            if 'TIME_VALUE' in tli.attrib:
                timeValue = tli.attrib['TIME_VALUE']
            tlis[tli.attrib['TIME_SLOT_ID']] = {'n': iTli, 'time': timeValue}
            iTli += 1
        return tlis

    def traverse_tree(self, srcTree, callback):
        """
        Iterate over all tiers in the XML tree and call the callback function
        for each of them.
        """
        for tierNode in srcTree.xpath('/ANNOTATION_DOCUMENT/TIER'):
            if 'TIER_ID' not in tierNode.attrib:
                continue
            callback(tierNode)

    def add_aligned_style_span_data(self, parentID, annoTierID, text):
        if annoTierID is None or len(annoTierID) <= 0 or parentID is None:
            return
        if parentID not in self.alignedSpanAnnoTiers:
            self.alignedSpanAnnoTiers[parentID] = {}
        self.alignedSpanAnnoTiers[parentID][annoTierID] = text

    def get_span_tier_id(self, tierNode):
        """
        Return tier ID and the sentence-level metadata field name for a tier
        that contains sentence-level annotation, based on the span_annotation_tiers
        dictionary in conversion_settings.json.
        """
        if 'span_annotation_tiers' not in self.corpusSettings:
            return tierNode.attrib['TIER_ID'], None
        annoTierRules = {}
        if ('LINGUISTIC_TYPE_REF' in tierNode.attrib
                and tierNode.attrib['LINGUISTIC_TYPE_REF'] in self.corpusSettings['span_annotation_tiers']):
            annoTierRules = self.corpusSettings['span_annotation_tiers'][tierNode.attrib['LINGUISTIC_TYPE_REF']]
        else:
            for k, v in self.corpusSettings['span_annotation_tiers'].items():
                if not k.startswith('^'):
                    k = '^' + k
                if not k.endswith('$'):
                    k += '$'
                try:
                    rxTierID = re.compile(k)
                    if rxTierID.search(tierNode.attrib['TIER_ID']) is not None:
                        annoTierRules = v
                        break
                except:
                    continue
        if len(annoTierRules) <= 0 or 'sentence_meta' not in annoTierRules:
            return tierNode.attrib['TIER_ID'], None
        return tierNode.attrib['TIER_ID'], annoTierRules['sentence_meta']

    def cb_build_segment_tree(self, tierNode):
        tierType = ''  # analysis tiers: word/POS/gramm/gloss etc.
        if 'analysis_tiers' in self.corpusSettings:
            for k, v in self.corpusSettings['analysis_tiers'].items():
                if not k.startswith('^'):
                    k = '^' + k
                if not k.endswith('$'):
                    k += '$'
                try:
                    rxTierID = re.compile(k)
                    if (rxTierID.search(tierNode.attrib['TIER_ID']) is not None
                            or rxTierID.search(tierNode.attrib['LINGUISTIC_TYPE_REF']) is not None):
                        tierType = v
                        if tierType not in self.standardAnaTiers:
                            self.additionalWordFields.append(tierType)
                        break
                except:
                    print('Something is wrong with an analysis tier regex: ' + k)
        for segNode in tierNode.xpath('ANNOTATION/REF_ANNOTATION | ANNOTATION/ALIGNABLE_ANNOTATION'):
            if 'ANNOTATION_ID' not in segNode.attrib:
                continue
            aID = segNode.attrib['ANNOTATION_ID']
            try:
                segContents = segNode.xpath('ANNOTATION_VALUE')[0].text.strip()
            except AttributeError:
                segContents = ''
            try:
                segParent = segNode.attrib['ANNOTATION_REF']
            except KeyError:
                segParent = None
            tli1, tli2 = None, None
            if 'TIME_SLOT_REF1' in segNode.attrib:
                tli1 = segNode.attrib['TIME_SLOT_REF1']
            elif segParent in self.segmentTree and self.segmentTree[segParent][2] is not None:
                tli1 = self.segmentTree[segParent][2]
            if 'TIME_SLOT_REF2' in segNode.attrib:
                tli2 = segNode.attrib['TIME_SLOT_REF2']
            elif segParent in self.segmentTree and self.segmentTree[segParent][3] is not None:
                tli2 = self.segmentTree[segParent][3]
            self.segmentTree[aID] = (segContents, segParent, tli1, tli2)
            if segParent is None:
                continue
            if len(tierType) > 0:
                try:
                    self.segmentChildren[(segParent, tierType)].append(aID)
                except KeyError:
                    self.segmentChildren[(segParent, tierType)] = [aID]
            annoTierID, annoTierType = self.get_span_tier_id(tierNode)
            self.add_aligned_style_span_data(segParent, annoTierType, segContents)
""" self.segmentTree = {} self.segmentChildren = {} self.traverse_tree(srcTree, self.cb_build_segment_tree) def fragmentize_src_alignment(self, sent): """ Find corresponding media file fragment and transform a JSON dictionaries with the information about the alignment. """ if 'src_alignment' not in sent: return sent['src_alignment'].sort(key=lambda a: a['off_start_src']) minTime = sent['src_alignment'][0]['off_start_src'] maxTime = sent['src_alignment'][-1]['off_end_src'] for alignment in sent['src_alignment']: fileName, fileExt = os.path.splitext(alignment['src'].lower()) if fileExt not in self.mediaExtensions: return segStart = alignment['off_start_src'] segEnd = alignment['off_end_src'] ts1frag, ts2frag, srcFileFrag = self.mc.get_media_name(alignment['src'], segStart, segEnd, minTime=minTime, maxTime=maxTime) self.usedMediaFiles.add(srcFileFrag) alignment['src'] = srcFileFrag alignment['off_start_src'] = ts1frag alignment['off_end_src'] = ts2frag def add_src_alignment(self, sent, tli1, tli2, srcFile): """ Add the alignment of the sentence with the sound/video. If word-level time data is available, align words, otherwise align the whole sentence. """ sentAlignments = [] ts1 = self.tlis[tli1]['time'] ts2 = self.tlis[tli2]['time'] sentAlignments.append({'off_start_src': float(ts1) / EAF_TIME_MULTIPLIER, 'off_end_src': float(ts2) / EAF_TIME_MULTIPLIER, 'true_off_start_src': float(ts1) / EAF_TIME_MULTIPLIER, 'off_start_sent': 0, 'off_end_sent': len(sent['text']), 'mtype': 'audio', 'src_id': ts1 + '_' + ts2, 'src': srcFile}) # for alignment in sentAlignments: # self.fragmentize_src_alignment(alignment) sent['src_alignment'] = sentAlignments def add_punc(self, words, text, prevText, startOffset): """ Make one or several punctuation tokens out of the text and add them to the words list. """ if len(text) <= 0: return # First, check for closing brackets that should belong to the word: if text[0] in self.bracketPairs and len(words) > 0: if self.bracketPairs[text[0]].search(prevText) is not None: words[-1]['off_end'] += 1 text = text[1:] curToken = {'wf': '', 'off_start': startOffset, 'off_end': startOffset, 'wtype': 'punct'} for i in range(len(text)): if self.rxSpaces.search(text[i]) is not None: if len(curToken['wf']) > 0: curToken['off_end'] = startOffset + i words.append(curToken) curToken = {'wf': '', 'off_start': startOffset + i, 'off_end': startOffset + i, 'wtype': 'punct'} else: curToken['wf'] += text[i] if len(curToken['wf']) > 0: curToken['off_end'] = startOffset + len(text) words.append(curToken) def retrieve_analyses(self, aID, lang='', topLevel=True): """ Compile list of analyses retrieved from the relevant tiers of an analyzed EAF file associated with the token identified by aID. topLevel == True iff the function was called by a token processor, rather than by the same function recursively. This is needed because certain wrap-up operations should be performed only on the top level, e.g. gloss-to-tag conversion or collation of analyses. TODO: actually, the top-level tier here is the lowest tier in the hierarchy where subdivision of a parent cell implies multiple analyses. A POS or a lemma tier could be top-level, for example. 
""" analyses = [] analysisTiers = [] for tierType in set(self.standardAnaTiers) | set(self.additionalWordFields): if (aID, tierType) not in self.segmentChildren: continue analysisTiers.append([]) for childID in self.segmentChildren[(aID, tierType)]: if childID not in self.segmentTree: continue contents = self.segmentTree[childID][0] for ana in self.retrieve_analyses(childID, lang=lang, topLevel=False): if tierType == 'lemma': ana['lex'] = contents elif tierType == 'parts': ana['parts'] = contents elif tierType == 'gloss': ana['gloss'] = contents elif tierType == 'pos' and len(contents) > 0: ana['gr.pos'] = contents elif tierType == 'gramm': grJSON = self.tp.parser.transform_gramm_str(contents, lang=lang) ana.update(grJSON) elif tierType in self.additionalWordFields: ana[tierType] = contents analysisTiers[-1].append(ana) analysisTiers[-1] = [ana for ana in analysisTiers[-1] if len(ana) > 0] if len(analysisTiers) <= 0: return [{}] for combination in itertools.product(*analysisTiers): ana = {} for partAna in combination: ana.update(partAna) if len(ana) > 0: analyses.append(ana) if topLevel: if ('one_morph_per_cell' in self.corpusSettings and self.corpusSettings['one_morph_per_cell']): curLex = set() curStemGloss = set() allAnaFields = set() for ana in analyses: for k in ana: allAnaFields.add(k) totalAna = {k: '' for k in allAnaFields} for k in totalAna: for ana in analyses: if k in ['lex'] or k.startswith('gr.'): if k in ana: if len(totalAna[k]) <= 0: totalAna[k] = ana[k] elif type(totalAna[k]) == str and totalAna[k] != ana[k]: totalAna[k] = [totalAna[k], ana[k]] elif type(totalAna[k]) == list and ana[k] not in totalAna[k]: totalAna[k].append(ana[k]) else: if len(totalAna[k]) > 0 and k not in ['parts']: totalAna[k] += '-' if k not in ana: totalAna[k] += '∅' else: totalAna[k] += ana[k] if k == 'parts' and not ana[k].startswith('-') and not ana[k].endswith('-'): curLex.add(ana[k]) if 'gloss' in ana: curStemGloss.add(ana['gloss']) if 'lex' not in totalAna or len(totalAna['lex']) <= 0: totalAna['lex'] = [l for l in sorted(curLex)] if len(totalAna['lex']) == 1: totalAna['lex'] = totalAna['lex'][0] if 'trans_en' not in totalAna or len(totalAna['trans_en']) <= 0: totalAna['trans_en'] = [t for t in sorted(curStemGloss)] if len(totalAna['trans_en']) == 1: totalAna['trans_en'] = totalAna['trans_en'][0] analyses = [totalAna] for ana in analyses: self.tp.parser.process_gloss_in_ana(ana) if 'gloss_index' in ana: if 'analysis_tiers' in self.corpusSettings and 'gramm' not in self.corpusSettings['analysis_tiers']: self.tp.parser.gloss2gr(ana, self.corpusSettings['languages'][0]) if len(analyses) <= 0: return [{}] return analyses def retrieve_words(self, text, wordIDs, lang=''): """ Return a list of words with their analyses retrieved from the relevant tiers of an analyzed EAF file. Try to align words with the text of the entire sentence. Return the text as well, since it may be slightly altered if there is no exact correspondence between the text tier and the token tier. 
""" words = [] iSentPos = 0 iBufferStart = 0 sBuffer = '' for iWord in range(len(wordIDs)): iWordPos = 0 word = self.segmentTree[wordIDs[iWord]][0] if len(sBuffer) <= 0: iBufferStart = iSentPos if len(word) <= 0 or self.rxIgnoreTokens.search(word) is not None: continue while iSentPos < len(text) and text[iSentPos].lower() != word[iWordPos].lower(): sBuffer += text[iSentPos] iSentPos += 1 if len(sBuffer) > 0: self.add_punc(words, sBuffer, text[:iBufferStart], iBufferStart) sBuffer = '' iBufferStart = iSentPos if iSentPos == len(text): # If the remaining tokens consist of punctuation, add them to the sentence if self.rxLetters.search(word) is None and self.rxIgnoreTokens.search(word) is None: text += word self.add_punc(words, word, text[:iSentPos], iSentPos) continue else: print('Unexpected end of sentence:', text) return words, text token = {'wf': word, 'off_start': iSentPos, 'off_end': iSentPos + len(word), 'wtype': 'word', 'n_orig': iWord} while iSentPos < len(text) and iWordPos < len(word): if text[iSentPos].lower() == word[iWordPos].lower(): iSentPos += 1 iWordPos += 1 continue if self.rxLetters.search(word[iWordPos]) is None and self.rxLetters.search(text[iSentPos]) is not None: iWordPos += 1 continue iSentPos += 1 token['off_end'] = iSentPos analyses = [ana for ana in self.retrieve_analyses(wordIDs[iWord], lang=lang) if len(ana) > 0] if len(analyses) > 0: token['ana'] = analyses words.append(token) if iSentPos < len(text): self.add_punc(words, text[iSentPos:], text[:iSentPos], iSentPos) return words, text def process_span_annotation_tier(self, tierNode): """ If the tier in tierNode is a span annotation tier, extract its data. If the tier is time-aligned, save the data to self.spanAnnoTiers[annoTierID] as time labels. """ if ('span_annotation_tiers' not in self.corpusSettings or len(self.corpusSettings['span_annotation_tiers']) <= 0): return annoTierID, annoTierType = self.get_span_tier_id(tierNode) if annoTierType is None or len(annoTierType) <= 0: return if annoTierType not in self.spanAnnoTiers: self.spanAnnoTiers[annoTierType] = {} if annoTierID not in self.spanAnnoTiers[annoTierType]: self.spanAnnoTiers[annoTierType][annoTierID] = [] segments = tierNode.xpath('ANNOTATION/ALIGNABLE_ANNOTATION') for segNode in segments: if ('ANNOTATION_ID' not in segNode.attrib or segNode.attrib['ANNOTATION_ID'] not in self.segmentTree): continue segData = self.segmentTree[segNode.attrib['ANNOTATION_ID']] if segData[2] is None or segData[3] is None: continue tli1 = segData[2] tli2 = segData[3] text = segData[0] self.spanAnnoTiers[annoTierType][annoTierID].append((tli1, tli2, text)) self.spanAnnoTiers[annoTierType][annoTierID].sort( key=lambda x: (float(self.tlis[x[0]]['time']), float(self.tlis[x[1]]['time']), x[2]) ) def add_privacy_segments(self, srcTree, srcFile): """ Remember segments that should be beeped out because they contain sensitive data. 
""" if 'privacy_tier' not in self.corpusSettings or len(srcFile) <= 0: return privTierID = self.corpusSettings['privacy_tier'] if srcFile not in self.privacySegments: self.privacySegments[srcFile] = [] for tierNode in srcTree.xpath('/ANNOTATION_DOCUMENT/TIER'): if 'TIER_ID' not in tierNode.attrib: continue if (tierNode.attrib['TIER_ID'] == privTierID or ('LINGUISTIC_TYPE_REF' in tierNode.attrib and tierNode.attrib['LINGUISTIC_TYPE_REF'] == privTierID)): segments = tierNode.xpath('ANNOTATION/ALIGNABLE_ANNOTATION') for segNode in segments: if ('ANNOTATION_ID' not in segNode.attrib or segNode.attrib['ANNOTATION_ID'] not in self.segmentTree): continue segData = self.segmentTree[segNode.attrib['ANNOTATION_ID']] if segData[2] is None or segData[3] is None: continue tli1 = segData[2] tli2 = segData[3] self.privacySegments[srcFile].append((int(self.tlis[tli1]['time']), int(self.tlis[tli2]['time']))) def process_tier(self, tierNode, aID2pID, srcFile, alignedTier=False): """ Extract segments from the tier node and iterate over them, returning them as JSON sentences. If alignedTier is False, store the start and end timestamps, as well as pIDs for alignment, in the dictionary aID2pID. If alignedTier is True, use the information from aID2pID for establishing time boundaries of the sentences and aligning it with the source tier. """ lang = '' # We have to find out what language the tier represents. # First, check the tier type. If it is not associated with any language, # check all tier ID regexes. if 'TIER_ID' not in tierNode.attrib: return # Find out the participant (speaker) and save that information speaker = '' if not alignedTier and 'PARTICIPANT' in tierNode.attrib: speaker = tierNode.attrib['PARTICIPANT'] self.participants[tierNode.attrib['TIER_ID']] = speaker else: if ('PARENT_REF' in tierNode.attrib and tierNode.attrib['PARENT_REF'] in self.participants): speaker = self.participants[tierNode.attrib['PARENT_REF']] self.participants[tierNode.attrib['TIER_ID']] = speaker elif 'PARTICIPANT' in tierNode.attrib: speaker = tierNode.attrib['PARTICIPANT'] self.participants[tierNode.attrib['TIER_ID']] = speaker # Find out the language of the tier if ('LINGUISTIC_TYPE_REF' in tierNode.attrib and tierNode.attrib['LINGUISTIC_TYPE_REF'] in self.corpusSettings['tier_languages']): lang = self.corpusSettings['tier_languages'][tierNode.attrib['LINGUISTIC_TYPE_REF']] else: for k, v in self.corpusSettings['tier_languages'].items(): if not k.startswith('^'): k = '^' + k if not k.endswith('$'): k += '$' try: rxTierID = re.compile(k) if rxTierID.search(tierNode.attrib['TIER_ID']) is not None: lang = v break except: continue if len(lang) <= 0 or lang not in self.corpusSettings['languages']: # A tier can also contain span annotations, let's check it: if len(lang) <= 0 and not alignedTier: self.process_span_annotation_tier(tierNode) # Otherwise, we do not want a tier with no language association return langID = self.corpusSettings['languages'].index(lang) segments = tierNode.xpath('ANNOTATION/REF_ANNOTATION | ANNOTATION/ALIGNABLE_ANNOTATION') for segNode in segments: if ('ANNOTATION_ID' not in segNode.attrib or segNode.attrib['ANNOTATION_ID'] not in self.segmentTree): continue segData = self.segmentTree[segNode.attrib['ANNOTATION_ID']] if not alignedTier: if segData[2] is None or segData[3] is None: continue tli1 = segData[2] tli2 = segData[3] elif segData[1] is not None: aID = segData[1] pID, tli1, tli2 = aID2pID[aID] else: continue text = segData[0] curSent = {'text': text, 'words': None, 'lang': langID, 'meta': 
{'speaker': speaker}} # Add speaker metadata if speaker in self.speakerMeta: for k, v in self.speakerMeta[speaker].items(): curSent['meta'][k] = v # Add metadata and style spans from sentence-aligned annotation tiers if segNode.attrib['ANNOTATION_ID'] in self.alignedSpanAnnoTiers: spanAnnoData = self.alignedSpanAnnoTiers[segNode.attrib['ANNOTATION_ID']] for annoTierID in spanAnnoData: curSpanValue = spanAnnoData[annoTierID] if annoTierID not in curSent['meta']: curSent['meta'][annoTierID] = [] if curSpanValue not in curSent['meta'][annoTierID]: curSent['meta'][annoTierID].append(curSpanValue) # Add style spans curRules = {} for tierID in self.corpusSettings['span_annotation_tiers']: if ('sentence_meta' in self.corpusSettings['span_annotation_tiers'][tierID] and self.corpusSettings['span_annotation_tiers'][tierID][ 'sentence_meta'] == annoTierID): curRules = self.corpusSettings['span_annotation_tiers'][tierID] break if len(curRules) <= 0: continue if 'styles' in curRules and curSpanValue in curRules['styles']: spanStyle = curRules['styles'][curSpanValue] if 'style_spans' not in curSent: curSent['style_spans'] = [] curSent['style_spans'].append({ 'off_start': 0, 'off_end': len(curSent['text']), 'span_class': spanStyle, 'tooltip_text': curSpanValue }) # Tokenize the sentence or align it with an existing tokenization if (segNode.attrib['ANNOTATION_ID'], 'word') not in self.segmentChildren: curSent['words'] = self.tp.tokenizer.tokenize(text) self.tp.splitter.add_next_word_id_sentence(curSent) self.tp.parser.analyze_sentence(curSent, lang=lang) curSent['nTokensOrig'] = len(curSent['words']) else: tokensOrig = self.segmentChildren[(segNode.attrib['ANNOTATION_ID'], 'word')] curSent['nTokensOrig'] = len(tokensOrig) curSent['words'], curSent['text'] = self.retrieve_words(text, tokensOrig, lang=lang) self.tp.splitter.add_next_word_id_sentence(curSent) if len(self.corpusSettings['aligned_tiers']) > 0: if not alignedTier: self.pID += 1 aID = segNode.attrib['ANNOTATION_ID'] aID2pID[aID] = (self.pID, tli1, tli2) paraAlignment = {'off_start': 0, 'off_end': len(curSent['text']), 'para_id': self.pID} curSent['para_alignment'] = [paraAlignment] else: paraAlignment = {'off_start': 0, 'off_end': len(curSent['text']), 'para_id': pID} curSent['para_alignment'] = [paraAlignment] self.add_src_alignment(curSent, tli1, tli2, srcFile) yield curSent def add_span_annotations(self, sentences): """ Add span-like annotations, i.e. annotations that could span several tokens or even sentences and reside in time-aligned tiers. Add them to the relevant sentences as style spans and/or as sentence-level metadata values, depending on what is said in corpusSettings['span_annotation_tiers']. Modify sentences, do not return anything. """ sentences.sort(key=lambda s: s['src_alignment'][0]['true_off_start_src']) for annoTierType in self.spanAnnoTiers: curRules = {} for tierID in self.corpusSettings['span_annotation_tiers']: if ('sentence_meta' in self.corpusSettings['span_annotation_tiers'][tierID] and self.corpusSettings['span_annotation_tiers'][tierID]['sentence_meta'] == annoTierType): curRules = self.corpusSettings['span_annotation_tiers'][tierID] break if len(curRules) <= 0: continue for annoTierID in self.spanAnnoTiers[annoTierType]: # There may be more than one span-like annotation tier of a given type. # Different tiers may refer to different participants, so we have to # check which tiers should trigger metadata changes for which sentences. 
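                # For reference, a hypothetical entry of corpusSettings['span_annotation_tiers']
                # (the structural keys below are the ones this method reads; the tier ID,
                # field name and values are invented):
                #
                #     "span_annotation_tiers": {
                #         "cs@.*": {
                #             "sentence_meta": "code_switching",
                #             "languages": ["meadow_mari"],
                #             "styles": {"rus": "span_cs_rus"}
                #         }
                #     }
                #
                # A time-aligned annotation with the value "rus" on such a tier would add
                # "rus" to the covered sentences' meta['code_switching'] lists and, because
                # of the "styles" mapping, produce a style span with span_class "span_cs_rus"
                # over the tokens it overlaps with.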
                curSpeaker = ''
                if annoTierID in self.participants:
                    curSpeaker = self.participants[annoTierID]
                iSentence = 0
                iSpan = 0
                while iSentence < len(sentences) and iSpan < len(self.spanAnnoTiers[annoTierType][annoTierID]):
                    curSpan = self.spanAnnoTiers[annoTierType][annoTierID][iSpan]
                    curSentence = sentences[iSentence]
                    if 'languages' in curRules and 'lang' in curSentence:
                        if self.corpusSettings['languages'][curSentence['lang']] not in curRules['languages']:
                            iSentence += 1
                            continue
                    if (len(curSpeaker) > 0 and 'meta' in curSentence
                            and 'speaker' in curSentence['meta']
                            and curSentence['meta']['speaker'] != curSpeaker):
                        iSentence += 1
                        continue
                    curSpanStart = float(self.tlis[curSpan[0]]['time']) / EAF_TIME_MULTIPLIER
                    curSpanEnd = float(self.tlis[curSpan[1]]['time']) / EAF_TIME_MULTIPLIER
                    curSpanValue = curSpan[2]
                    # This happens after the offsets have been recalculated to account for media cutting.
                    curSentenceStart = curSentence['src_alignment'][0]['true_off_start_src']
                    curSentenceEnd = curSentenceStart + (float(curSentence['src_alignment'][0]['off_end_src'])
                                                         - float(curSentence['src_alignment'][0]['off_start_src']))
                    if curSpanStart >= curSentenceEnd - 0.03 or len(curSentence['words']) <= 0:
                        iSentence += 1
                        continue
                    elif curSpanEnd <= curSentenceStart + 0.03:
                        iSpan += 1
                        continue
                    if 'meta' not in curSentence:
                        curSentence['meta'] = {}
                    if annoTierType not in curSentence['meta']:
                        curSentence['meta'][annoTierType] = []
                    if curSpanValue not in curSentence['meta'][annoTierType]:
                        curSentence['meta'][annoTierType].append(curSpanValue)
                    # The ugly part: span-like annotations in ELAN are time-aligned, but usually
                    # they refer to tokens, which are symbolic subdivisions of a time-aligned
                    # sentence. So the "real" time boundaries of span-like annotations are visually
                    # aligned with "imaginary" time boundaries of tokens.
                    # We calculate these imaginary boundaries and compare them to the annotation
                    # boundaries to find out which tokens the annotation should cover.
                    # Note that the visual alignment can be imperfect, so we have to account for that.
                    # We use the original tokenization as represented in ELAN for the calculations,
                    # which might be different from what is in curSentence['words'] now (e.g. punctuation
                    # might have been absent from the original tokens).
                    tokenDuration = (curSentenceEnd - curSentenceStart) / curSentence['nTokensOrig']
                    tokensInvolvedOrig = []
                    tokensInvolved = []
                    for iToken in range(curSentence['nTokensOrig']):
                        tokenStart = curSentenceStart + (iToken + 0.1) * tokenDuration
                        tokenEnd = curSentenceStart + (iToken + 0.9) * tokenDuration
                        if curSpanStart <= tokenStart and tokenEnd <= curSpanEnd:
                            tokensInvolvedOrig.append(iToken)
                    # Find which actual token numbers correspond to the original ones.
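                    # A worked example of the projection above (all numbers are made up):
                    # with curSentenceStart = 10.0, curSentenceEnd = 12.0 and nTokensOrig = 4,
                    # tokenDuration is 0.5 s, so the "core" of original token 1 occupies
                    # 10.55..10.95 s and that of token 2 occupies 11.05..11.45 s. An annotation
                    # aligned to 10.5..11.5 s therefore covers original tokens 1 and 2, even
                    # though its boundaries were drawn slightly off the invisible token
                    # boundaries at 10.5 and 11.5 s.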
if any('n_orig' in t for t in curSentence['words']): for iToken in range(len(curSentence['words'])): curToken = curSentence['words'][iToken] if 'n_orig' in curToken and curToken['n_orig'] in tokensInvolvedOrig: tokensInvolved.append(iToken) else: tokensInvolved = tokensInvolvedOrig # I'm not sure this is really necessary if (len(tokensInvolved) > 0 and 'styles' in curRules and curSpanValue in curRules['styles']): spanOffStart = curSentence['words'][tokensInvolved[0]]['off_start'] spanOffEnd = curSentence['words'][tokensInvolved[-1]]['off_end'] spanStyle = curRules['styles'][curSpanValue] if 'style_spans' not in curSentence: curSentence['style_spans'] = [] curSentence['style_spans'].append({ 'off_start': spanOffStart, 'off_end': spanOffEnd, 'span_class': spanStyle, 'tooltip_text': curSpanValue + ' [' + str(iSpan) + ']' }) if curSpanEnd < curSentenceEnd: iSpan += 1 else: iSentence += 1 def get_sentences(self, srcTree, srcFile): """ Iterate over sentences in the XML tree. """ # mainTierTypes = '(' + ' | '.join('/ANNOTATION_DOCUMENT/TIER[@LINGUISTIC_TYPE_REF=\'' + x + '\'] | ' + # '/ANNOTATION_DOCUMENT/TIER[@TIER_ID=\'' + x + '\']' # for x in self.corpusSettings['main_tiers']) + ')' # mainTiers = srcTree.xpath(mainTierTypes) mainTiers = [] alignedTiers = [] for tierNode in srcTree.xpath('/ANNOTATION_DOCUMENT/TIER'): for tierRegex in self.corpusSettings['main_tiers']: if not tierRegex.startswith('^'): tierRegex = '^' + tierRegex if not tierRegex.endswith('$'): tierRegex += '$' try: if re.search(tierRegex, tierNode.attrib['TIER_ID']) is not None: mainTiers.append(tierNode) break elif ('LINGUISTIC_TYPE_REF' in tierNode.attrib and re.search(tierRegex, tierNode.attrib['LINGUISTIC_TYPE_REF']) is not None): mainTiers.append(tierNode) break except: pass for tierRegex in self.corpusSettings['aligned_tiers']: if not tierRegex.startswith('^'): tierRegex = '^' + tierRegex if not tierRegex.endswith('$'): tierRegex += '$' try: if re.search(tierRegex, tierNode.attrib['TIER_ID']) is not None: alignedTiers.append(tierNode) break elif ('LINGUISTIC_TYPE_REF' in tierNode.attrib and re.search(tierRegex, tierNode.attrib['LINGUISTIC_TYPE_REF']) is not None): alignedTiers.append(tierNode) break except: pass if len(mainTiers) <= 0: return # if len(self.corpusSettings['aligned_tiers']) > 0: # alignedTierTypes = '(' + ' | '.join('/ANNOTATION_DOCUMENT/TIER[@LINGUISTIC_TYPE_REF=\'' + x + '\'] | ' + # '/ANNOTATION_DOCUMENT/TIER[@TIER_ID=\'' + x + '\']' # for x in self.corpusSettings['aligned_tiers']) + ')' # alignedTiers = srcTree.xpath(alignedTierTypes) aID2pID = {} # annotation ID -> (pID, tli1, tli2) correspondence for tier in mainTiers: for sent in self.process_tier(tier, aID2pID, srcFile, alignedTier=False): yield sent for tier in alignedTiers: for sent in self.process_tier(tier, aID2pID, srcFile, alignedTier=True): yield sent def add_speaker_marks(self, sentences): """ Add the name/code of the speaker in the beginning of every sentence that starts the turn. 
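        For example (speaker code hypothetical), if a turn-initial sentence was
        produced by speaker "KP", its text is prefixed with "\n[KP] ", two
        punctuation tokens ("\n" and "[KP]") are prepended to its word list, and
        the character offsets in words, para_alignment, src_alignment and
        style_spans are shifted accordingly by addOffset == len('[KP]') + 2 == 6.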
""" if 'insert_speaker_marks' in self.corpusSettings and not self.corpusSettings['insert_speaker_marks']: return langs2process = [i for i in range(len(self.corpusSettings['languages']))] if 'speaker_marks_languages' in self.corpusSettings: langs2process = [i for i in range(len(self.corpusSettings['languages'])) if self.corpusSettings['languages'][i] in self.corpusSettings['speaker_marks_languages']] langs2process = set(langs2process) prevSpeaker = '' for i in range(len(sentences)): if 'meta' not in sentences[i] or 'speaker' not in sentences[i]['meta']: continue if 'lang' in sentences[i] and sentences[i]['lang'] not in langs2process: continue speaker = '[' + sentences[i]['meta']['speaker'] + ']' addOffset = len(speaker) + 2 if sentences[i]['meta']['speaker'] != prevSpeaker: sentences[i]['text'] = '\n' + speaker + ' ' + sentences[i]['text'] sentences[i]['words'].insert(0, {'off_start': -len(speaker) - 1, 'off_end': -1, 'wf': speaker, 'wtype': 'punct', 'next_word': 0}) sentences[i]['words'].insert(0, {'off_start': -len(speaker) - 2, 'off_end': -len(speaker) - 1, 'wf': '\n', 'wtype': 'punct', 'next_word': -1}) for w in sentences[i]['words']: w['off_start'] += addOffset w['off_end'] += addOffset w['next_word'] += 2 if 'para_alignment' in sentences[i]: for pa in sentences[i]['para_alignment']: if pa['off_start'] > 0: pa['off_start'] += addOffset pa['off_end'] += addOffset if 'src_alignment' in sentences[i]: for sa in sentences[i]['src_alignment']: if sa['off_start_sent'] > 0: sa['off_start_sent'] += addOffset sa['off_end_sent'] += addOffset if 'style_spans' in sentences[i]: for ss in sentences[i]['style_spans']: ss['off_start'] += addOffset ss['off_end'] += addOffset prevSpeaker = sentences[i]['meta']['speaker'] if 'last' in sentences[i] and sentences[i]['last']: prevSpeaker = '' def add_sentence_meta(self, sentences, meta): """ Add some of the document-level metadata to the sentences. """ for s in sentences: if 'meta' not in s: continue if 'year1' in meta and 'year2' in meta and meta['year1'] == meta['year2']: s['meta']['year'] = meta['year1'] def clean_up_sentences(self, sentences): """ Remove temporary keys that are no longer needed. """ for s in sentences: if 'nTokensOrig' in s: del s['nTokensOrig'] for word in s['words']: if 'n_orig' in word: del word['n_orig'] def convert_file(self, fnameSrc, fnameTarget): curMeta = self.get_meta(fnameSrc) textJSON = {'meta': curMeta, 'sentences': []} nTokens, nWords, nAnalyzed = 0, 0, 0 self.spanAnnoTiers = {} srcTree = etree.parse(fnameSrc) self.tlis = self.get_tlis(srcTree) self.build_segment_tree(srcTree) srcFileNode = srcTree.xpath('/ANNOTATION_DOCUMENT/HEADER/MEDIA_DESCRIPTOR') if len(srcFileNode) > 0 and 'RELATIVE_MEDIA_URL' in srcFileNode[0].attrib: srcFile = self.rxStripDir.sub('', html.unescape(srcFileNode[0].attrib['RELATIVE_MEDIA_URL'])) elif len(srcFileNode) > 0 and 'MEDIA_URL' in srcFileNode[0].attrib: srcFile = self.rxStripDir.sub('', html.unescape(srcFileNode[0].attrib['MEDIA_URL'])) else: srcFile = '' textJSON['sentences'] = [s for s in self.get_sentences(srcTree, srcFile)] self.add_privacy_segments(srcTree, srcFile) self.add_span_annotations(textJSON['sentences']) # First sorting: sort sentences by language, but keep them sorted by speaker # (which they are now, since each speaker has a separate set of tiers in ELAN). 
textJSON['sentences'].sort(key=lambda s: (s['lang'])) if 'sentence_segmentation' in self.corpusSettings and self.corpusSettings['sentence_segmentation']: self.tp.splitter.resegment_sentences(textJSON['sentences']) for s in textJSON['sentences']: self.fragmentize_src_alignment(s) # Final sorting: inside each language, sort sentences by their time offsets. textJSON['sentences'].sort(key=lambda s: (s['lang'], s['src_alignment'][0]['true_off_start_src'])) for i in range(len(textJSON['sentences']) - 1): # del textJSON['sentences'][i]['src_alignment'][0]['true_off_start_src'] if textJSON['sentences'][i]['lang'] != textJSON['sentences'][i + 1]['lang']: textJSON['sentences'][i]['last'] = True for word in textJSON['sentences'][i]['words']: nTokens += 1 if word['wtype'] == 'word': nWords += 1 if 'ana' in word and len(word['ana']) > 0: nAnalyzed += 1 self.tp.splitter.recalculate_offsets(textJSON['sentences']) self.tp.splitter.add_next_word_id(textJSON['sentences']) self.add_speaker_marks(textJSON['sentences']) self.add_sentence_meta(textJSON['sentences'], curMeta) self.clean_up_sentences(textJSON['sentences']) if 'capitalize_sentences' in self.corpusSettings and self.corpusSettings['capitalize_sentences']: self.tp.splitter.capitalize_sentences(textJSON['sentences']) self.write_output(fnameTarget, textJSON) return nTokens, nWords, nAnalyzed def process_corpus(self, cutMedia=True): """ Take every eaf file from the source directory subtree, turn it into a parsed json and store it in the target directory. """ Txt2JSON.process_corpus(self) if not cutMedia: return mediaDir = os.path.join(self.corpusSettings['corpus_dir'], self.srcExt) if 'media_dir' in self.corpusSettings: mediaDir = self.corpusSettings['media_dir'] for path, dirs, files in os.walk(mediaDir): # Process video files first files = [fname for fname in files if fname.lower().endswith(('.avi', '.mts', '.mov'))] + \ [fname for fname in files if fname.lower().endswith('.mp4')] + \ [fname for fname in files if not fname.lower().endswith(('.avi', '.mts', '.mov', '.mp4'))] for fname in files: fileExt = os.path.splitext(fname.lower())[1] if fileExt in self.mediaExtensions: privacySegments = [] if fname in self.privacySegments: privacySegments = self.privacySegments[fname] fname = os.path.abspath(os.path.join(path, fname)) print('Cutting media file', fname) self.mc.cut_media(fname, usedFilenames=self.usedMediaFiles, privacySegments=privacySegments)
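# For orientation, a minimal sketch of the conversion settings the EAF converter above
# relies on. The key names follow the code in this class; the tier regexes, language
# names and values are invented and would have to be adapted to a concrete corpus.
_EXAMPLE_EAF_SETTINGS = {
    'languages': ['evenki', 'russian'],
    'main_tiers': ['tx@.*'],                      # sentence-level transcription tiers
    'aligned_tiers': ['ft@.*'],                   # translation tiers aligned with the main ones
    'tier_languages': {'tx@.*': 'evenki', 'ft@.*': 'russian'},
    'analysis_tiers': {'wd@.*': 'word', 'mb@.*': 'parts',
                       'ge@.*': 'gloss', 'ps@.*': 'pos'},
    'privacy_tier': 'private',
    'insert_speaker_marks': True,
    'one_morph_per_cell': True
}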
class Exmaralda_Hamburg2JSON(Txt2JSON): """ Contains methods to make JSONs ready for indexing from aligned Exmaralda files in the format used in documentation projects carried out in Hamburg. """ rxBracketGloss = re.compile('\\.?\\[.*?\\]') rxSplitGlosses = re.compile('-|\\.(?=\\[)') rxWordPunc = re.compile('^( *)([^\\w]*)(.*?)([^\\w]*?)( *)$') txTierXpath = '/basic-transcription/basic-body/tier[@id=\'tx\']' mediaExtensions = {'.wav', '.mp3', '.mp4', '.avi'} def __init__(self, settingsDir='conf'): Txt2JSON.__init__(self, settingsDir=settingsDir) self.mc = MediaCutter(settings=self.corpusSettings) self.srcExt = 'exb' # extension of the source files to be converted self.tlis = {} # time labels (id -> {'n': number, 'time': time value}) self.pID = 0 # id of last aligned segment self.glosses = set() def get_tlis(self, srcTree): """ Retrieve and return all time labels from the XML tree. """ tlis = {} iTli = 0 for tli in srcTree.xpath( '/basic-transcription/basic-body/common-timeline')[0]: timeValue = '' if 'time' in tli.attrib: timeValue = tli.attrib['time'] tlis[tli.attrib['id']] = {'n': iTli, 'time': timeValue} iTli += 1 return tlis def find_sentence_index(self, sentenceBoundaries, tli): """ Find the number of the sentence the event with the given time label (start or end) belongs to. """ if tli not in self.tlis: return -1 for i in range(len(sentenceBoundaries)): tliStart, tliEnd = sentenceBoundaries[i] if (tli == tliStart or self.tlis[tliStart]['n'] <= self.tlis[tli]['n'] < self.tlis[tliEnd]['n']): return i return -1 def get_sentence_boundaries(self, refTier): """ Go over the reference tier (as XML node). For each event in the tier, extract start and end attributes. Return a list with (start time label, end time label) tuples. """ boundaries = [] for event in refTier: if 'start' not in event.attrib or 'end' not in event.attrib: continue sentStart, sentEnd = event.attrib['start'], event.attrib['end'] if sentStart not in self.tlis or sentEnd not in self.tlis: continue boundaries.append((sentStart, sentEnd)) return boundaries def get_word_tlis(self, srcTree): """ Collect all pairs of time labels that delimit words. """ txTiers = srcTree.xpath(Exmaralda_Hamburg2JSON.txTierXpath) tliTuples = set() for txTier in txTiers: for event in txTier: if 'start' not in event.attrib or 'end' not in event.attrib: continue tliTuple = (event.attrib['start'], event.attrib['end']) tliTuples.add(tliTuple) return tliTuples def collect_annotation(self, srcTree): """ Return a dictionary that contains all word-level annotation events, the keys are tuples (start time label, end time label). 
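        Illustrative shape of the return value (tier categories and cell values
        are made up):

            {('T10', 'T12'): {'mb': 'kota-la', 'ge': 'house-LAT', 'ps': 'n'},
             ('T12', 'T14'): {'mb': 'ile', 'ge': 'live', 'ps': 'v'}}

        i.e. all word-level annotations grouped under the (start, end) time-label
        pair of the token they describe; annotations spanning several tokens are
        copied to each of the tokens they cover.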
""" wordTlis = self.get_word_tlis(srcTree) wordAnno = {} for tier in srcTree.xpath( '/basic-transcription/basic-body/tier[@type=\'a\']'): if 'id' not in tier.attrib: continue # tierID = tier.attrib['id'] tierID = tier.attrib['category'] if tierID in self.corpusSettings[ 'translation_tiers'] or tierID in ('tx', 'ts'): continue for event in tier: if ('start' not in event.attrib or 'end' not in event.attrib or event.text is None): continue tupleKey = (event.attrib['start'], event.attrib['end']) # If an annotation spans several tokens, add it to each of them: tupleKeys = [tupleKey] if tupleKey not in wordTlis: for wordTli in wordTlis: if ((wordTli[0] == tupleKey[0] or self.tlis[tupleKey[0]]['n'] <= self.tlis[wordTli[0]]['n']) and (wordTli[1] == tupleKey[1] or self.tlis[tupleKey[1]]['n'] >= self.tlis[wordTli[1]]['n'])): tupleKeys.append(wordTli) for tk in tupleKeys: if tk not in wordAnno: wordAnno[tk] = {} wordAnno[tk][tierID] = event.text return wordAnno def add_ana_fields(self, ana, curWordAnno): """ Add the information from the annotation tier events for the current word to the analysis. For each tier, the name of the tier is the used as the name of the field, and the text of the event is used as the value. """ for tierName in curWordAnno: if tierName in ['tx', 'mb', 'mp', 'gr', 'ge']: continue if tierName == 'ps': ana['gr.pos'] = curWordAnno[tierName] else: ana[tierName] = curWordAnno[tierName] def get_words(self, srcTree): """ Iterate over words found in the tx tier of the XML tree. """ txTier = srcTree.xpath(Exmaralda_Hamburg2JSON.txTierXpath) wordAnno = self.collect_annotation(srcTree) for event in txTier[0]: if 'start' not in event.attrib or 'end' not in event.attrib: continue tupleKey = (event.attrib['start'], event.attrib['end']) if tupleKey not in wordAnno: continue wf = event.text if wf is None: continue curToken = { 'wf': wf, 'wtype': 'word', 'tli_start': event.attrib['start'], 'tli_end': event.attrib['end'] } if self.tp.tokenizer.rxOnlyPunc.search(wf.strip()) is not None: curToken['wtype'] = 'punct' yield curToken continue ana = {} curWordAnno = wordAnno[tupleKey] # mp: morph breaks with empty morphemes (corresponds to the mc tier: POS and morph categories) # mb: morph breaks without empty morphemes (corresponds to the gr/ge tiers: actual glosses) if 'mb' in curWordAnno: ana['parts'] = curWordAnno['mb'] if 'ge' in curWordAnno: ana['gloss'] = curWordAnno['ge'] self.glosses |= set( g for g in self.rxSplitGlosses.split(ana['gloss']) if g.upper() == g) # print(ana['gloss'], self.rxSplitGlosses.split(ana['gloss'])) self.tp.parser.process_gloss_in_ana(ana) if 'gloss_index' in ana: stems, newIndexGloss = self.tp.parser.find_stems( ana['gloss_index'], self.corpusSettings['languages'][0]) ana['lex'] = ' '.join(s[1] for s in stems) ana['trans_en'] = self.rxBracketGloss.sub( '', ' '.join(s[0] for s in stems)) self.add_ana_fields(ana, curWordAnno) useGlossList = False if 'glosses' in self.corpusSettings: useGlossList = True self.tp.parser.gloss2gr(ana, self.corpusSettings['languages'][0], useGlossList=useGlossList) ana['gloss_index'] = self.rxBracketGloss.sub('', newIndexGloss) curToken['ana'] = [ana] yield curToken def fragmentize_src_alignment(self, alignment): """ Find corresponding media file fragment and transform a JSON dictionary with the information about the alignment. 
""" fileName, fileExt = os.path.splitext(alignment['src'].lower()) if fileExt not in self.mediaExtensions: return ts1 = alignment['off_start_src'] ts2 = alignment['off_end_src'] if len(ts1) <= 0 or len(ts2) <= 0: return ts1frag, ts2frag, srcFileFrag = self.mc.get_media_name( alignment['src'], float(ts1), float(ts2)) alignment['src'] = srcFileFrag alignment['off_start_src'] = str(ts1frag) alignment['off_end_src'] = str(ts2frag) def add_src_alignment(self, sent, sentBoundaries, srcFile): """ Add the alignment of the sentence with the sound/video. If word-level time data is available, align words, otherwise align the whole sentence. """ wordAlignments = [] for word in sent['words']: if 'tli_start' not in word or 'tli_end' not in word: continue if len(self.tlis[word['tli_start']]['time']) > 0: for wa in wordAlignments: if len(wa['off_end_src']) <= 0: wa['off_end_src'] = self.tlis[ word['tli_start']]['time'] wa['src_id'] += word['tli_start'] wordAlignments.append({ 'off_start_src': self.tlis[word['tli_start']]['time'], 'off_end_src': '', 'off_start_sent': word['off_start'], 'off_end_sent': word['off_end'], 'mtype': 'audio', 'src': srcFile, 'src_id': word['tli_start'] + '_' }) if len(self.tlis[word['tli_end']]['time']) > 0: for wa in wordAlignments: if len(wa['off_end_src']) <= 0: wa['off_end_src'] = self.tlis[word['tli_end']]['time'] wa['off_end_sent'] = word['off_end'] wa['src_id'] += word['tli_end'] for wa in wordAlignments: if len(wa['off_end_src']) <= 0: if len(self.tlis[sentBoundaries[1]]['time']) > 0: wa['off_end_src'] = self.tlis[sentBoundaries[1]]['time'] wa['src_id'] += sentBoundaries[1] else: wa['off_end_src'] = wa['off_start_src'] wa['src_id'] += wa['src_id'][:-1] wa['off_end_sent'] = len(sent['text']) # if len(wordAlignments) <= 0 and len(self.tlis[sentBoundaries[0]]['time']) > 0: if len(self.tlis[sentBoundaries[0]]['time']) > 0: wordAlignments = [] # for the time being wordAlignments.append({ 'off_start_src': self.tlis[sentBoundaries[0]]['time'], 'off_end_src': self.tlis[sentBoundaries[1]]['time'], 'off_start_sent': 0, 'off_end_sent': len(sent['text']), 'mtype': 'audio', 'src_id': sentBoundaries[0] + '_' + sentBoundaries[1], 'src': srcFile }) if len(wordAlignments) > 0: for alignment in wordAlignments: self.fragmentize_src_alignment(alignment) sent['src_alignment'] = wordAlignments def get_parallel_sentences(self, srcTree, sentBoundaries, srcFile): """ Iterate over sentences in description tiers aligned with the sentence in the main tx tier. The sentence to align with is defined by the tuple sentBoundaries that contains the start and the end time label for the sentence. 
""" self.pID += 1 for iTier in range(len(self.corpusSettings['translation_tiers'])): tierName = self.corpusSettings['translation_tiers'][iTier] events = srcTree.xpath('/basic-transcription/basic-body/' 'tier[@id=\'' + tierName + '\']/' 'event[@start=\'' + sentBoundaries[0] + '\' and @end=\'' + sentBoundaries[1] + '\']') for event in events: text = '' for child in event: if child.tail is not None: text += child.tail if len(text) <= 0: text = event.text if text is None or len(text) <= 0: text = '' text = self.tp.cleaner.clean_text(text) if len(text) <= 0: words = [{ 'wf': '—', 'wtype': 'punct', 'off_start': 0, 'off_end': 1 }] text = '—' else: words = self.tp.tokenizer.tokenize(text) paraAlignment = { 'off_start': 0, 'off_end': len(text), 'para_id': self.pID } paraSent = { 'words': words, 'text': text, 'para_alignment': [paraAlignment], 'lang': len(self.corpusSettings['languages']) + iTier } self.add_src_alignment(paraSent, sentBoundaries, srcFile) yield paraSent def get_sentences(self, srcTree, srcFile): """ Iterate over sentences in the XML tree. """ refTiers = srcTree.xpath( '/basic-transcription/basic-body/tier[@id=\'ref\']') if len(refTiers) <= 0: return refTier = refTiers[0] # TODO: Multiple layers sentBoundaries = self.get_sentence_boundaries(refTier) prevSentIndex = -1 curSent = {'text': '', 'words': [], 'lang': 0} for word in self.get_words(srcTree): curSentIndex = self.find_sentence_index(sentBoundaries, word['tli_start']) if curSentIndex != prevSentIndex and len(curSent['text']) > 0: paraAlignment = { 'off_start': 0, 'off_end': len(curSent['text']), 'para_id': self.pID } curSent['para_alignment'] = [paraAlignment] self.add_src_alignment(curSent, sentBoundaries[prevSentIndex], srcFile) yield curSent curSent = {'text': '', 'words': [], 'lang': 0} for paraSent in self.get_parallel_sentences( srcTree, sentBoundaries[curSentIndex], srcFile): yield paraSent prevSentIndex = curSentIndex if word['wtype'] == 'punct': word['off_start'] = len(curSent['text']) curSent['text'] += word['wf'] word['off_end'] = len(curSent['text']) word['wf'] = word['wf'].strip() continue m = self.rxWordPunc.search(word['wf']) spacesL, punctL, wf, punctR, spacesR =\ m.group(1), m.group(2), m.group(3), m.group(4), m.group(5) curSent['text'] += spacesL if len(punctL) > 0: punc = { 'wf': punctL, 'wtype': 'punct', 'off_start': len(curSent['text']), 'off_end': len(curSent['text']) + len(punctL) } curSent['text'] += punctL curSent['words'].append(punc) word['off_start'] = len(curSent['text']) curSent['text'] += wf word['off_end'] = len(curSent['text']) word['wf'] = wf curSent['words'].append(word) if len(punctR) > 0: punc = { 'wf': punctR, 'wtype': 'punct', 'off_start': len(curSent['text']), 'off_end': len(curSent['text']) + len(punctR) } curSent['text'] += punctR curSent['words'].append(punc) curSent['text'] += spacesR if len(curSent['text']) > 0: paraAlignment = { 'off_start': 0, 'off_end': len(curSent['text']), 'para_id': self.pID } curSent['para_alignment'] = [paraAlignment] self.add_src_alignment(curSent, sentBoundaries[curSentIndex], srcFile) yield curSent def convert_file(self, fnameSrc, fnameTarget): """ Take one source Exmaralda file fnameSrc, parse the XML tree, extract timestamps, align sentences with words and their analyses and ultimately generate a parsed JSON file ready for indexing. Write the output to fnameTarget. Return number of tokens, number of words and number of words with at least one analysis in the document. 
""" curMeta = self.get_meta(fnameSrc) # curMeta = {'title': fnameSrc, 'author': '', 'year1': '1900', 'year2': '2017'} if curMeta is None: return 0, 0, 0 textJSON = {'meta': curMeta, 'sentences': []} nTokens, nWords, nAnalyzed = 0, 0, 0 srcTree = etree.parse(fnameSrc) self.tlis = self.get_tlis(srcTree) srcFileNode = srcTree.xpath( '/basic-transcription/head/meta-information/referenced-file') if len(srcFileNode) > 0 and 'url' in srcFileNode[0].attrib: srcFile = self.rxStripDir.sub('', srcFileNode[0].attrib['url']) else: srcFile = '' textJSON['sentences'] = [ s for s in self.get_sentences(srcTree, srcFile) ] textJSON['sentences'].sort(key=lambda s: s['lang']) for i in range(len(textJSON['sentences']) - 1): if textJSON['sentences'][i]['lang'] != textJSON['sentences'][ i + 1]['lang']: textJSON['sentences'][i]['last'] = True for word in textJSON['sentences'][i]['words']: nTokens += 1 if word['wtype'] == 'word': nWords += 1 if 'ana' in word and len(word['ana']) > 0: nAnalyzed += 1 self.tp.splitter.recalculate_offsets(textJSON['sentences']) self.tp.splitter.add_next_word_id(textJSON['sentences']) self.write_output(fnameTarget, textJSON) return nTokens, nWords, nAnalyzed def process_corpus(self, cutMedia=True): """ Take every Exmaralda file from the source directory subtree, turn it into a parsed json and store it in the target directory. Split all the corpus media files into overlapping chunks of small duration. This is the main function of the class. """ Txt2JSON.process_corpus(self) if not cutMedia: return for path, dirs, files in os.walk( os.path.join(self.corpusSettings['corpus_dir'], self.srcExt)): for fname in files: fileExt = os.path.splitext(fname.lower())[1] if fileExt in self.mediaExtensions: fname = os.path.abspath(os.path.join(path, fname)) print('Cutting media file', fname) self.mc.cut_media(fname)