Example #1
def __init__(self, settingsDir='conf'):
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    self.mc = MediaCutter(settings=self.corpusSettings)
    self.srcExt = 'exb'  # extension of the source files to be converted
    self.tlis = {}  # time labels (id -> {'n': number, 'time': time value})
    self.pID = 0  # id of last aligned segment
    self.glosses = set()
Example #2
def __init__(self, settingsDir='conf'):
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    self.speakerMeta = self.load_speaker_meta()
    self.mc = MediaCutter(settings=self.corpusSettings)
    self.srcExt = 'eaf'
    self.tlis = {}  # time labels
    self.pID = 0  # id of last aligned segment
    self.glosses = set()
    self.participants = {}  # main tier ID -> participant ID
    self.segmentTree = {}  # aID -> (contents, parent aID, tli1, tli2)
    self.segmentChildren = {}  # (aID, child tier type) -> [child aID]
Example #3
def __init__(self, settingsDir='conf'):
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    self.mc = MediaCutter(settings=self.corpusSettings)
    self.srcExt = 'xml'  # extension of the source files to be converted
    self.participants = {}   # participant ID -> dictionary of properties
    self.tlis = {}       # time labels (id -> {'n': number, 'time': time value})
    self.wordsByID = {}  # word ID -> word object
    self.morph2wordID = {}   # morph ID -> (word ID, position in the word)
    self.pID = 0         # id of last aligned segment
    self.seg2pID = {}    # ids of <seg> tags -> parallel IDs of corresponding sentences
    self.wordIDseq = []  # sequence of word/punctuation/incident IDs
                         # (needed to understand ranges such as "w13 to inc2")
    self.glosses = set()
    self.posRules = {}
    self.load_pos_rules(os.path.join(self.corpusSettings['corpus_dir'], 'conf/posRules.txt'))
Example #4
def __init__(self, settingsDir='conf_conversion'):
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    self.speakerMeta = self.load_speaker_meta()
    self.mc = MediaCutter(settings=self.corpusSettings)
    self.srcExt = 'eaf'
    self.tlis = {}  # time labels
    self.pID = 0  # id of last aligned segment
    self.glosses = set()
    self.participants = {}  # main tier ID -> participant ID
    self.segmentTree = {}  # aID -> (contents, parent aID, tli1, tli2)
    self.segmentChildren = {}  # (aID, child tier type) -> [child aID]
    self.spanAnnoTiers = {}  # span annotation tier type -> {tier ID -> [(tli1, tli2, contents)]}
    self.alignedSpanAnnoTiers = {}  # aID of a segment -> {span annotation tier ID -> contents}
    self.additionalWordFields = []  # names of additional word-level fields associated with some analysis tiers
    self.privacySegments = {}  # segments (start_ms, end_ms) that should be beeped out, one list per source file
    self.rxIgnoreTokens = None
    self.set_ignore_tokens()
    self.usedMediaFiles = set()  # filenames of media fragments referenced in the JSONs
Example #5
class Eaf2JSON(Txt2JSON):
    """
    Contains methods to make JSONs ready for indexing from
    ELAN-aligned files, a CSV file with metadata and a list of parsed
    word forms.
    """

    mediaExtensions = {'.wav', '.mp3', '.mp4', '.avi'}
    rxSpaces = re.compile('[ \t]+')
    rxLetters = re.compile(r'\w+')

    def __init__(self, settingsDir='conf'):
        Txt2JSON.__init__(self, settingsDir=settingsDir)
        self.speakerMeta = self.load_speaker_meta()
        self.mc = MediaCutter(settings=self.corpusSettings)
        self.srcExt = 'eaf'
        self.tlis = {}  # time labels
        self.pID = 0  # id of last aligned segment
        self.glosses = set()
        self.participants = {}  # main tier ID -> participant ID
        self.segmentTree = {}  # aID -> (contents, parent aID, tli1, tli2)
        self.segmentChildren = {}  # (aID, child tier type) -> [child aID]

    def load_speaker_meta(self):
        speakerMeta = {}
        if 'speaker_meta_filename' not in self.corpusSettings:
            return speakerMeta
        try:
            with open(os.path.join('..', self.corpusSettings['speaker_meta_filename']),
                      'r', encoding='utf-8-sig') as f:
                speakerMeta = json.loads(f.read())
        except FileNotFoundError:
            print('Speaker metadata file not found.')
        return speakerMeta

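    # Illustration for load_speaker_meta() above (hypothetical data, not from
    # the source): the metadata file is expected to be a JSON object mapping
    # speaker codes to dictionaries of sentence-level metadata fields, e.g.
    # {"AB": {"gender": "F", "birth_year": "1960"},
    #  "CD": {"gender": "M", "birth_year": "1948"}}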
    def get_tlis(self, srcTree):
        """
        Retrieve and return all time labels from the XML tree.
        """
        tlis = {}
        iTli = 0
        for tli in srcTree.xpath('/ANNOTATION_DOCUMENT/TIME_ORDER/TIME_SLOT'):
            timeValue = ''
            if 'TIME_VALUE' in tli.attrib:
                timeValue = tli.attrib['TIME_VALUE']
            tlis[tli.attrib['TIME_SLOT_ID']] = {'n': iTli, 'time': timeValue}
            iTli += 1
        return tlis

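    # Illustration for get_tlis() above (hypothetical input, not from the
    # source): two time slots
    #   <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="0"/>
    #   <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="1520"/>
    # would yield {'ts1': {'n': 0, 'time': '0'}, 'ts2': {'n': 1, 'time': '1520'}}.
    # Time values are EAF milliseconds kept as strings; slots without a
    # TIME_VALUE get an empty string.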
    def traverse_tree(self, srcTree, callback):
        """
        Iterate over all tiers in the XML tree and call the callback function
        for each of them.
        """
        for tierNode in srcTree.xpath('/ANNOTATION_DOCUMENT/TIER'):
            if 'TIER_ID' not in tierNode.attrib:
                continue
            callback(tierNode)

    def cb_build_segment_tree(self, tierNode):
        tierType = ''  # analysis tiers: word/POS/gramm/gloss etc.
        if 'analysis_tiers' in self.corpusSettings:
            for k, v in self.corpusSettings['analysis_tiers'].items():
                if not k.startswith('^'):
                    k = '^' + k
                if not k.endswith('$'):
                    k += '$'
                try:
                    rxTierID = re.compile(k)
                    if (rxTierID.search(tierNode.attrib['TIER_ID']) is not None
                            or rxTierID.search(
                                tierNode.attrib['LINGUISTIC_TYPE_REF'])
                            is not None):
                        tierType = v
                        break
                except (re.error, KeyError):
                    print('Invalid regex or missing attribute in analysis_tiers:', k)
        for segNode in tierNode.xpath(
                'ANNOTATION/REF_ANNOTATION | ANNOTATION/ALIGNABLE_ANNOTATION'):
            if 'ANNOTATION_ID' not in segNode.attrib:
                continue
            aID = segNode.attrib['ANNOTATION_ID']
            try:
                segContents = segNode.xpath('ANNOTATION_VALUE')[0].text.strip()
            except AttributeError:
                segContents = ''
            try:
                segParent = segNode.attrib['ANNOTATION_REF']
            except KeyError:
                segParent = None
            tli1, tli2 = None, None
            if 'TIME_SLOT_REF1' in segNode.attrib:
                tli1 = segNode.attrib['TIME_SLOT_REF1']
            elif (segParent in self.segmentTree
                  and self.segmentTree[segParent][2] is not None):
                tli1 = self.segmentTree[segParent][2]
            if 'TIME_SLOT_REF2' in segNode.attrib:
                tli2 = segNode.attrib['TIME_SLOT_REF2']
            elif (segParent in self.segmentTree
                  and self.segmentTree[segParent][3] is not None):
                tli2 = self.segmentTree[segParent][3]
            self.segmentTree[aID] = (segContents, segParent, tli1, tli2)
            if segParent is None:
                continue
            if len(tierType) > 0:
                try:
                    self.segmentChildren[(segParent, tierType)].append(aID)
                except KeyError:
                    self.segmentChildren[(segParent, tierType)] = [aID]

    def build_segment_tree(self, srcTree):
        """
        Read the entire XML tree and save all segment data (contents, links to
        the parents and timestamps, if any).
        """
        self.segmentTree = {}
        self.segmentChildren = {}
        self.traverse_tree(srcTree, self.cb_build_segment_tree)

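    # Illustration for build_segment_tree() above (hypothetical IDs): after a
    # run, self.segmentTree might contain
    #   {'a1': ('Sentence text', None, 'ts1', 'ts2'),  # time-aligned segment
    #    'a2': ('word', 'a1', 'ts1', 'ts2')}           # child referring to a1
    # and, if 'a2' belongs to a tier mapped to the 'word' analysis tier type,
    # self.segmentChildren would contain {('a1', 'word'): ['a2']}.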
    def fragmentize_src_alignment(self, alignment):
        """
        Find the corresponding media file fragment and rewrite the JSON
        dictionary that describes the alignment accordingly.
        """
        fileName, fileExt = os.path.splitext(alignment['src'].lower())
        if fileExt not in self.mediaExtensions:
            return
        ts1 = alignment['off_start_src']
        ts2 = alignment['off_end_src']
        if len(ts1) <= 0 or len(ts2) <= 0:
            return
        ts1frag, ts2frag, srcFileFrag = self.mc.get_media_name(
            alignment['src'],
            float(ts1) / EAF_TIME_MULTIPLIER,
            float(ts2) / EAF_TIME_MULTIPLIER)
        alignment['src'] = srcFileFrag
        alignment['off_start_src'] = str(ts1frag)
        alignment['off_end_src'] = str(ts2frag)

    def add_src_alignment(self, sent, tli1, tli2, srcFile):
        """
        Add the alignment of the sentence with the sound/video. If
        word-level time data is available, align words, otherwise
        align the whole sentence.
        """
        sentAlignments = []
        ts1 = self.tlis[tli1]['time']
        ts2 = self.tlis[tli2]['time']
        sentAlignments.append({
            'off_start_src': ts1,
            'off_end_src': ts2,
            'true_off_start_src': float(ts1) / EAF_TIME_MULTIPLIER,
            'off_start_sent': 0,
            'off_end_sent': len(sent['text']),
            'mtype': 'audio',
            'src_id': ts1 + '_' + ts2,
            'src': srcFile
        })
        for alignment in sentAlignments:
            self.fragmentize_src_alignment(alignment)
        sent['src_alignment'] = sentAlignments

    def add_punc(self, text, startOffset):
        """
        Make one or several punctuation tokens out of the text.
        """
        tokens = []
        curToken = {
            'wf': '',
            'off_start': startOffset,
            'off_end': startOffset,
            'wtype': 'punc'
        }
        for i in range(len(text)):
            if self.rxSpaces.search(text[i]) is not None:
                if len(curToken['wf']) > 0:
                    curToken['off_end'] = startOffset + i
                    tokens.append(curToken)
                    curToken = {
                        'wf': '',
                        'off_start': startOffset + i,
                        'off_end': startOffset + i,
                        'wtype': 'punc'
                    }
            else:
                curToken['wf'] += text[i]
        if len(curToken['wf']) > 0:
            curToken['off_end'] = startOffset + len(text)
            tokens.append(curToken)
        return tokens

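    # Illustration for add_punc() above: add_punc('. ', 10) returns a single
    # token [{'wf': '.', 'off_start': 10, 'off_end': 11, 'wtype': 'punc'}];
    # whitespace only delimits tokens and is never included in them.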
    def retrieve_analyses(self, aID, lang=''):
        """
        Compile a list of analyses retrieved from the relevant tiers of an
        analyzed EAF file associated with the token identified by aID.
        """
        analyses = []
        analysisTiers = []
        for tierType in ['pos', 'gramm', 'lemma', 'parts', 'gloss', 'trans_ru']:
            if (aID, tierType) not in self.segmentChildren:
                continue
            analysisTiers.append([])
            for childID in self.segmentChildren[(aID, tierType)]:
                if childID not in self.segmentTree:
                    continue
                contents = self.segmentTree[childID][0]
                for ana in self.retrieve_analyses(childID, lang=lang):
                    if tierType == 'lemma':
                        ana['lex'] = contents
                    elif tierType == 'parts':
                        ana['parts'] = contents
                    elif tierType == 'gloss':
                        ana['gloss'] = contents
                    elif tierType == 'trans_ru':
                        if re.findall('[а-яёА-ЯЁ]+', contents):
                            ana['trans_ru'] = re.findall(
                                '[а-яёА-ЯЁ._]+', contents)[0].strip('.')
                    elif tierType == 'pos' and len(contents) > 0:
                        ana['gr.pos'] = contents
                    elif tierType == 'gramm':
                        grJSON = self.tp.parser.transform_gramm_str(contents,
                                                                    lang=lang)
                        ana.update(grJSON)
                    analysisTiers[-1].append(ana)
            analysisTiers[-1] = [
                ana for ana in analysisTiers[-1] if len(ana) > 0
            ]
        if len(analysisTiers) <= 0:
            return [{}]
        for combination in itertools.product(*analysisTiers):
            ana = {}
            for partAna in combination:
                ana.update(partAna)
            if len(ana) > 0:
                self.tp.parser.process_gloss_in_ana(ana)
                analyses.append(ana)
        if len(analyses) <= 0:
            return [{}]
        return analyses

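    # Illustration for retrieve_analyses() above (hypothetical tiers): if the
    # token has a lemma tier child with contents 'run' and a pos tier child
    # with contents 'V', itertools.product combines the per-tier analyses into
    # a single dictionary, e.g. [{'lex': 'run', 'gr.pos': 'V'}]. With several
    # alternatives on one tier, each combination yields a separate analysis.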
    def retrieve_words(self, text, wordIDs, lang=''):
        """
        Return a list of words with their analyses retrieved from the relevant
        tiers of an analyzed EAF file. Try to align words with the text of the
        entire sentence.
        """
        words = []
        iSentPos = 0
        iBufferStart = 0
        sBuffer = ''
        for iWord in range(len(wordIDs)):
            iWordPos = 0
            word = self.segmentTree[wordIDs[iWord]][0]
            if len(sBuffer) <= 0:
                iBufferStart = iSentPos
            if len(word) <= 0:
                continue
            while (iSentPos < len(text)
                   and text[iSentPos].lower() != word[iWordPos].lower()):
                sBuffer += text[iSentPos]
                iSentPos += 1
            if len(sBuffer) > 0:
                words += self.add_punc(sBuffer, iBufferStart)
                sBuffer = ''
                iBufferStart = iSentPos
            if iSentPos == len(text):
                print('Unexpected end of sentence:', text)
                return words
            token = {
                'wf': word,
                'off_start': iSentPos,
                'off_end': iSentPos + len(word),
                'wtype': 'word'
            }
            while iSentPos < len(text) and iWordPos < len(word):
                if text[iSentPos].lower() == word[iWordPos].lower():
                    iSentPos += 1
                    iWordPos += 1
                    continue
                if (self.rxLetters.search(word[iWordPos]) is None
                        and self.rxLetters.search(text[iSentPos]) is not None):
                    iWordPos += 1
                    continue
                iSentPos += 1
            token['off_end'] = iSentPos
            analyses = [
                ana
                for ana in self.retrieve_analyses(wordIDs[iWord], lang=lang)
                if len(ana) > 0
            ]
            if len(analyses) > 0:
                token['ana'] = analyses
            words.append(token)
        if iSentPos < len(text):
            words += self.add_punc(text[iSentPos:], iSentPos)
        return words

    def process_tier(self, tierNode, aID2pID, srcFile, alignedTier=False):
        """
        Extract segments from the tier node and iterate over them, returning
        them as JSON sentences. If alignedTier is False, store the start and end
        timestamps, as well as pIDs for alignment, in the dictionary aID2pID.
        If alignedTier is True, use the information from aID2pID for establishing
        time boundaries of the sentences and aligning them with the source tier.
        """
        lang = ''
        # We have to find out what language the tier represents.
        # First, check the tier type. If it is not associated with any language,
        # check all tier ID regexes.
        if 'TIER_ID' not in tierNode.attrib:
            return
        if ('LINGUISTIC_TYPE_REF' in tierNode.attrib
                and tierNode.attrib['LINGUISTIC_TYPE_REF']
                in self.corpusSettings['tier_languages']):
            lang = self.corpusSettings['tier_languages'][
                tierNode.attrib['LINGUISTIC_TYPE_REF']]
        else:
            for k, v in self.corpusSettings['tier_languages'].items():
                if not k.startswith('^'):
                    k = '^' + k
                if not k.endswith('$'):
                    k += '$'
                try:
                    rxTierID = re.compile(k)
                    if rxTierID.search(tierNode.attrib['TIER_ID']) is not None:
                        lang = v
                        break
                except re.error:
                    continue
        if len(lang) <= 0 or lang not in self.corpusSettings['languages']:
            return
        langID = self.corpusSettings['languages'].index(lang)

        speaker = ''
        if not alignedTier and 'PARTICIPANT' in tierNode.attrib:
            speaker = tierNode.attrib['PARTICIPANT']
            self.participants[tierNode.attrib['TIER_ID']] = speaker
        else:
            if ('PARENT_REF' in tierNode.attrib
                    and tierNode.attrib['PARENT_REF'] in self.participants):
                speaker = self.participants[tierNode.attrib['PARENT_REF']]
            elif 'PARTICIPANT' in tierNode.attrib:
                speaker = tierNode.attrib['PARTICIPANT']

        segments = tierNode.xpath(
            'ANNOTATION/REF_ANNOTATION | ANNOTATION/ALIGNABLE_ANNOTATION')

        for segNode in segments:
            if ('ANNOTATION_ID' not in segNode.attrib or
                    segNode.attrib['ANNOTATION_ID'] not in self.segmentTree):
                continue
            segData = self.segmentTree[segNode.attrib['ANNOTATION_ID']]
            if not alignedTier:
                if segData[2] is None or segData[3] is None:
                    continue
                tli1 = segData[2]
                tli2 = segData[3]
            elif segData[1] is not None:
                aID = segData[1]
                pID, tli1, tli2 = aID2pID[aID]
            else:
                continue
            text = segData[0]
            curSent = {
                'text': text,
                'words': None,
                'lang': langID,
                'meta': {
                    'speaker': speaker
                }
            }
            if speaker in self.speakerMeta:
                for k, v in self.speakerMeta[speaker].items():
                    curSent['meta'][k] = v
            if (segNode.attrib['ANNOTATION_ID'],
                    'word') not in self.segmentChildren:
                curSent['words'] = self.tp.tokenizer.tokenize(text)
                self.tp.splitter.add_next_word_id_sentence(curSent)
                self.tp.parser.analyze_sentence(curSent, lang=lang)
            else:
                curSent['words'] = self.retrieve_words(
                    text,
                    self.segmentChildren[(segNode.attrib['ANNOTATION_ID'],
                                          'word')],
                    lang=lang)
                self.tp.splitter.add_next_word_id_sentence(curSent)
            if len(self.corpusSettings['aligned_tiers']) > 0:
                if not alignedTier:
                    self.pID += 1
                    aID = segNode.attrib['ANNOTATION_ID']
                    aID2pID[aID] = (self.pID, tli1, tli2)
                    paraAlignment = {
                        'off_start': 0,
                        'off_end': len(curSent['text']),
                        'para_id': self.pID
                    }
                    curSent['para_alignment'] = [paraAlignment]
                else:
                    paraAlignment = {
                        'off_start': 0,
                        'off_end': len(curSent['text']),
                        'para_id': pID
                    }
                    curSent['para_alignment'] = [paraAlignment]
            self.add_src_alignment(curSent, tli1, tli2, srcFile)
            yield curSent

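    # Illustration for process_tier() above (hypothetical IDs): for a main-tier
    # segment with ANNOTATION_ID 'a1' and time slots ('ts1', 'ts2'), aID2pID
    # receives {'a1': (1, 'ts1', 'ts2')}; a translation segment on an aligned
    # tier whose ANNOTATION_REF is 'a1' then reuses para_id 1 and the same
    # time slots for its para_alignment and src_alignment.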
    def get_sentences(self, srcTree, srcFile):
        """
        Iterate over sentences in the XML tree.
        """
        # mainTierTypes = '(' + ' | '.join('/ANNOTATION_DOCUMENT/TIER[@LINGUISTIC_TYPE_REF=\'' + x + '\'] | ' +
        #                                  '/ANNOTATION_DOCUMENT/TIER[@TIER_ID=\'' + x + '\']'
        #                                  for x in self.corpusSettings['main_tiers']) + ')'
        # mainTiers = srcTree.xpath(mainTierTypes)
        mainTiers = []
        alignedTiers = []
        for tierNode in srcTree.xpath('/ANNOTATION_DOCUMENT/TIER'):
            for tierRegex in self.corpusSettings['main_tiers']:
                if not tierRegex.startswith('^'):
                    tierRegex = '^' + tierRegex
                if not tierRegex.endswith('$'):
                    tierRegex += '$'
                try:
                    if re.search(tierRegex,
                                 tierNode.attrib['TIER_ID']) is not None:
                        mainTiers.append(tierNode)
                        break
                    elif ('LINGUISTIC_TYPE_REF' in tierNode.attrib
                          and re.search(tierRegex,
                                        tierNode.attrib['LINGUISTIC_TYPE_REF'])
                          is not None):
                        mainTiers.append(tierNode)
                        break
                except (KeyError, re.error):
                    pass
            for tierRegex in self.corpusSettings['aligned_tiers']:
                if not tierRegex.startswith('^'):
                    tierRegex = '^' + tierRegex
                if not tierRegex.endswith('$'):
                    tierRegex += '$'
                try:
                    if re.search(tierRegex,
                                 tierNode.attrib['TIER_ID']) is not None:
                        alignedTiers.append(tierNode)
                        break
                    elif ('LINGUISTIC_TYPE_REF' in tierNode.attrib
                          and re.search(tierRegex,
                                        tierNode.attrib['LINGUISTIC_TYPE_REF'])
                          is not None):
                        alignedTiers.append(tierNode)
                        break
                except (KeyError, re.error):
                    pass
        if len(mainTiers) <= 0:
            return
        # if len(self.corpusSettings['aligned_tiers']) > 0:
        #     alignedTierTypes = '(' + ' | '.join('/ANNOTATION_DOCUMENT/TIER[@LINGUISTIC_TYPE_REF=\'' + x + '\'] | ' +
        #                                         '/ANNOTATION_DOCUMENT/TIER[@TIER_ID=\'' + x + '\']'
        #                                         for x in self.corpusSettings['aligned_tiers']) + ')'
        #     alignedTiers = srcTree.xpath(alignedTierTypes)
        aID2pID = {}  # annotation ID -> (pID, tli1, tli2) correspondence
        for tier in mainTiers:
            for sent in self.process_tier(tier,
                                          aID2pID,
                                          srcFile,
                                          alignedTier=False):
                yield sent
        for tier in alignedTiers:
            for sent in self.process_tier(tier,
                                          aID2pID,
                                          srcFile,
                                          alignedTier=True):
                yield sent

    def add_speaker_marks(self, sentences):
        """
        Add the name/code of the speaker at the beginning of every
        sentence that starts a turn.
        """
        prevSpeaker = ''
        for i in range(len(sentences)):
            if 'meta' not in sentences[i] or 'speaker' not in sentences[i]['meta']:
                continue
            speaker = '[' + sentences[i]['meta']['speaker'] + ']'
            addOffset = len(speaker) + 2
            if sentences[i]['meta']['speaker'] != prevSpeaker:
                sentences[i]['text'] = '\n' + speaker + ' ' + sentences[i]['text']
                sentences[i]['words'].insert(
                    0, {
                        'off_start': -len(speaker) - 1,
                        'off_end': -1,
                        'wf': speaker,
                        'wtype': 'punc',
                        'next_word': 0
                    })
                sentences[i]['words'].insert(
                    0, {
                        'off_start': -len(speaker) - 2,
                        'off_end': -len(speaker) - 1,
                        'wf': '\n',
                        'wtype': 'punc',
                        'next_word': -1
                    })
                for w in sentences[i]['words']:
                    w['off_start'] += addOffset
                    w['off_end'] += addOffset
                    w['next_word'] += 2
                if 'para_alignment' in sentences[i]:
                    for pa in sentences[i]['para_alignment']:
                        if pa['off_start'] > 0:
                            pa['off_start'] += addOffset
                        pa['off_end'] += addOffset
                if 'src_alignment' in sentences[i]:
                    for sa in sentences[i]['src_alignment']:
                        if sa['off_start_sent'] > 0:
                            sa['off_start_sent'] += addOffset
                        sa['off_end_sent'] += addOffset
            prevSpeaker = sentences[i]['meta']['speaker']
            if 'last' in sentences[i] and sentences[i]['last']:
                prevSpeaker = ''

    def add_sentence_meta(self, sentences, meta):
        """
        Add some of the document-level metadata to the sentences.
        """
        for s in sentences:
            if 'meta' not in s:
                continue
            if 'year1' in meta and 'year2' in meta and meta['year1'] == meta['year2']:
                s['meta']['year'] = meta['year1']

    def convert_file(self, fnameSrc, fnameTarget):
        curMeta = self.get_meta(fnameSrc)
        textJSON = {'meta': curMeta, 'sentences': []}
        nTokens, nWords, nAnalyzed = 0, 0, 0
        srcTree = etree.parse(fnameSrc)
        self.tlis = self.get_tlis(srcTree)
        self.build_segment_tree(srcTree)
        srcFileNode = srcTree.xpath(
            '/ANNOTATION_DOCUMENT/HEADER/MEDIA_DESCRIPTOR')
        if len(srcFileNode) > 0 and 'RELATIVE_MEDIA_URL' in srcFileNode[0].attrib:
            srcFile = self.rxStripDir.sub(
                '', html.unescape(srcFileNode[0].attrib['RELATIVE_MEDIA_URL']))
        elif len(srcFileNode) > 0 and 'MEDIA_URL' in srcFileNode[0].attrib:
            srcFile = self.rxStripDir.sub(
                '', html.unescape(srcFileNode[0].attrib['MEDIA_URL']))
        else:
            srcFile = ''
        textJSON['sentences'] = [
            s for s in self.get_sentences(srcTree, srcFile)
        ]
        textJSON['sentences'].sort(
            key=lambda s: (s['lang'], s['src_alignment'][0]['true_off_start_src']))
        for i in range(len(textJSON['sentences']) - 1):
            # del textJSON['sentences'][i]['src_alignment'][0]['true_off_start_src']
            if textJSON['sentences'][i]['lang'] != textJSON['sentences'][i + 1]['lang']:
                textJSON['sentences'][i]['last'] = True
            for word in textJSON['sentences'][i]['words']:
                nTokens += 1
                if word['wtype'] == 'word':
                    nWords += 1
                if 'ana' in word and len(word['ana']) > 0:
                    nAnalyzed += 1
        self.tp.splitter.recalculate_offsets(textJSON['sentences'])
        self.tp.splitter.add_next_word_id(textJSON['sentences'])
        self.add_speaker_marks(textJSON['sentences'])
        self.add_sentence_meta(textJSON['sentences'], curMeta)
        self.write_output(fnameTarget, textJSON)
        return nTokens, nWords, nAnalyzed

    def process_corpus(self, cutMedia=False):
        """
        Take every EAF file from the source directory subtree, turn it
        into a parsed JSON file and store it in the target directory.
        """
        Txt2JSON.process_corpus(self)
        if not cutMedia:
            return
        for path, dirs, files in os.walk(os.path.join('..', self.srcExt)):
            for fname in files:
                fileExt = os.path.splitext(fname.lower())[1]
                if fileExt in self.mediaExtensions:
                    fname = os.path.abspath(os.path.join(path, fname))
                    print('Cutting media file', fname)
                    self.mc.cut_media(fname)
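
A minimal usage sketch for the converter above (an assumption based on the entry point this class implies, not code from the source; 'conf' is the default settings directory from __init__):

if __name__ == '__main__':
    t2j = Eaf2JSON(settingsDir='conf')
    t2j.process_corpus(cutMedia=True)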
Example #6
class ISO_TEI_Hamburg2JSON(Txt2JSON):
    """
    Contains methods to make JSONs ready for indexing from transcriptions
    aligned in EXMARaLDA in the format used in documentation projects
    carried out in Hamburg and then converted into a certain ISO/TEI subset.
    """

    rxBracketGloss = re.compile('[.-]?\\[.*?\\]')
    rxWordPunc = re.compile('^( *)([^\\w]*)(.*?)([^\\w]*?)( *)$')
    rxLetters = re.compile(r'\w+')
    rxFloat = re.compile(r'^[0-9]+(?:\.[0-9]+)?$')
    rxTrailingZeroes = re.compile(r'^0+(?=[1-9])|\.0+$')
    rxNonDigit = re.compile('[^0-9]+')
    mediaExtensions = {'.wav', '.mp3', '.mp4', '.avi'}
    sentenceEndPunct = {'declarative': '.', 'interrogative': '?'}
    namespaces = {
        'tei': 'http://www.tei-c.org/ns/1.0',
        'xml': 'http://www.w3.org/XML/1998/namespace'
    }
    pfx_xml = '{http://www.w3.org/XML/1998/namespace}'
    pfx_tei = '{http://www.tei-c.org/ns/1.0}'

    def __init__(self, settingsDir='conf'):
        Txt2JSON.__init__(self, settingsDir=settingsDir)
        self.mc = MediaCutter(settings=self.corpusSettings)
        self.srcExt = 'xml'  # extension of the source files to be converted
        self.participants = {}  # participant ID -> dictionary of properties
        self.tlis = {}  # time labels (id -> {'n': number, 'time': time value})
        self.wordsByID = {}  # word ID -> word object
        self.morph2wordID = {}  # morph ID -> (word ID, position in the word)
        self.pID = 0  # id of last aligned segment
        self.seg2pID = {}    # ids of <seg> tags -> parallel IDs of corresponding sentences
        self.wordIDseq = []  # sequence of word/punctuation/incident IDs
                             # (needed to understand ranges such as "w13 to inc2")
        self.glosses = set()
        self.posRules = {}
        self.load_pos_rules(
            os.path.join(self.corpusSettings['corpus_dir'],
                         'conf/posRules.txt'))

    def load_pos_rules(self, fname):
        """
        Load the mapping from the POS tags used in the source files to the
        target corpus POS tags.
        """
        if len(fname) <= 0 or not os.path.isfile(fname):
            return
        rules = {}
        with open(fname, 'r', encoding='utf-8-sig') as f:
            for line in f:
                line = line.strip('\r\n')
                if len(line) <= 0:
                    continue
                rule = [i.strip() for i in line.split('\t')]
                if len(rule) != 2:
                    continue
                rules[rule[0]] = rule[1]
        self.posRules = rules

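    # Illustration for load_pos_rules() above (hypothetical file contents):
    # posRules.txt is a tab-separated, two-column file, one rule per line,
    # mapping a source POS tag to a corpus POS tag, e.g.
    #   vblex<TAB>V
    #   n<TAB>N
    # which yields self.posRules == {'vblex': 'V', 'n': 'N'}.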
    def load_speaker_meta(self, srcTree):
        speakerMeta = {}
        if 'speaker_meta_filename' in self.corpusSettings:
            try:
                with open(os.path.join(
                        self.corpusSettings['corpus_dir'],
                        self.corpusSettings['speaker_meta_filename']),
                          'r', encoding='utf-8-sig') as f:
                    speakerMeta = json.loads(f.read())
            except FileNotFoundError:
                self.log_message('Speaker metadata file not found.')
        else:
            for speaker in srcTree.xpath(
                    '/tei:TEI/tei:teiHeader/tei:profileDesc/tei:particDesc/tei:person',
                    namespaces=self.namespaces):
                if self.pfx_xml + 'id' not in speaker.attrib:
                    continue
                speakerID = speaker.attrib[self.pfx_xml + 'id']
                if 'n' in speaker.attrib:
                    speakerCode = speaker.attrib['n']
                else:
                    speakerCode = speakerID
                speakerMeta[speakerID] = {'speaker': speakerCode}
                if 'sex' in speaker.attrib:
                    if speaker.attrib['sex'] in ['1', 'M']:
                        speakerMeta[speakerID]['gender'] = 'M'
                    elif speaker.attrib['sex'] in ['2', 'F']:
                        speakerMeta[speakerID]['gender'] = 'F'
                    else:
                        speakerMeta[speakerID]['gender'] = speaker.attrib['sex']
                if 'age' in speaker.attrib:
                    speakerMeta[speakerID]['age'] = speaker.attrib['age']
                if 'role' in speaker.attrib:
                    speakerMeta[speakerID]['role'] = speaker.attrib['role']
        return speakerMeta

    def get_tlis(self, srcTree):
        """
        Retrieve and return all time labels from the XML tree.
        """
        tlis = {}
        iTli = 0
        for tli in srcTree.xpath('/tei:TEI/tei:text/tei:timeline',
                                 namespaces=self.namespaces)[0]:
            timeValue = tli.attrib[self.pfx_xml + 'id']
            if 'interval' in tli.attrib:
                timeValue = tli.attrib['interval']
            elif tli.attrib[self.pfx_xml + 'id'] in ['T0', 'T_START']:
                timeValue = '0'
            timeValue = self.rxTrailingZeroes.sub('', timeValue)
            tlis[tli.attrib[self.pfx_xml + 'id']] = {
                'n': iTli,
                'time': timeValue
            }
            iTli += 1
        return tlis

    def id_range2list(self, idFrom, idTo):
        """
        Turn a range of word/punctuation/incident IDs (such as "w13 to inc2")
        into a list of consecutive IDs.
        """
        if idFrom not in self.wordIDseq or idTo not in self.wordIDseq:
            return []
        iFrom = self.wordIDseq.index(idFrom)
        iTo = self.wordIDseq.index(idTo)
        return self.wordIDseq[iFrom:iTo + 1]

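    # Illustration for id_range2list() above (hypothetical IDs): with
    # self.wordIDseq == ['w12', 'w13', 'inc1', 'w14'], the call
    # id_range2list('w13', 'w14') returns ['w13', 'inc1', 'w14'];
    # unknown IDs yield an empty list.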
    def add_pos_ana(self, ana, pos):
        """
        Add the part-of-speech tag to a single JSON analysis, taking into
        account the correspondences between the source file tags and the
        target corpus tags. Change the analysis in place; do not return anything.
        """
        if pos in self.posRules:
            pos = self.posRules[pos]
        if 'gr.pos' not in ana:
            ana['gr.pos'] = pos
        elif isinstance(ana['gr.pos'], str) and ana['gr.pos'] != pos:
            ana['gr.pos'] = [ana['gr.pos'], pos]
        elif pos not in ana['gr.pos']:
            ana['gr.pos'].append(pos)

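    # Illustration for add_pos_ana() above (hypothetical tags): with
    # self.posRules == {'vblex': 'V'}, add_pos_ana(ana, 'vblex') sets
    # ana['gr.pos'] = 'V'; a subsequent add_pos_ana(ana, 'N') turns it into
    # the list ['V', 'N'], and repeated tags are not duplicated.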
    def collect_annotation(self, annoTree):
        """
        Return a dictionary that contains all word-level annotation events
        within an annotation block; the keys are word IDs.
        """
        wordAnno = {}
        for tier in annoTree.xpath('tei:spanGrp', namespaces=self.namespaces):
            if 'type' not in tier.attrib:
                continue
            tierID = tier.attrib['type']
            prevWordID = '-1'
            curWordNMorphs = 0
            for wSpan in tier:
                if 'from' not in wSpan.attrib or 'to' not in wSpan.attrib:
                    continue
                spanIDs = [wSpan.attrib['from']]
                wSpanTexts = [wSpan.text]
                if wSpan.attrib['from'] != wSpan.attrib['to']:
                    # continue
                    if (wSpan.attrib['from'].startswith(('w', 'pc', 'inc'))
                            and wSpan.attrib['to'].startswith(
                                ('w', 'pc', 'inc'))):
                        # Some tiers, such as information structure, allow spans that include
                        # multiple words. In this case, assign the value to each of the words
                        # in the span in case of annotation tiers. However, if the tier is
                        # SpeakerContribution_Event, try to split it into words so that each
                        # word gets a corresponding part of the value.
                        if tierID == 'SpeakerContribution_Event' and wSpan.text is not None:
                            wSpanParts = re.findall('[^ ]+ *', wSpan.text)
                            wSpanTexts = []
                        iSpanPart = 0
                        spanIDs = self.id_range2list(wSpan.attrib['from'],
                                                     wSpan.attrib['to'])
                        for wID in spanIDs:
                            if tierID == 'SpeakerContribution_Event' and wSpan.text is not None:
                                if iSpanPart < len(wSpanParts):
                                    wSpanTexts.append(wSpanParts[iSpanPart])
                                else:
                                    wSpanTexts.append('')
                                iSpanPart += 1
                            else:
                                wSpanTexts.append(wSpan.text)
                        if wSpan.text is not None:
                            self.log_message('Warning: span[from] = ' +
                                             wSpan.attrib['from'] +
                                             ', span[to] = ' +
                                             wSpan.attrib['to'] +
                                             ', text = "' + wSpan.text + '".')
                        else:
                            self.log_message('Warning: span[from] = ' +
                                             wSpan.attrib['from'] +
                                             ', span[to] = ' +
                                             wSpan.attrib['to'] +
                                             ', text is empty.')
                    else:
                        continue
                for spanID in spanIDs:
                    wSpanText = wSpanTexts.pop(0)
                    if spanID.startswith('seg'):
                        continue
                    elif spanID.startswith('w'):
                        wordID = spanID
                    elif spanID.startswith('inc'):
                        wordID = spanID
                    elif spanID.startswith('m'):
                        wordID = self.morph2wordID[spanID][0]
                    else:
                        continue
                    if wordID != prevWordID:
                        prevWordID = wordID
                        curWordNMorphs = 0
                    if wordID not in wordAnno:
                        wordAnno[wordID] = {}
                    if self.pfx_xml + 'id' in wSpan.attrib:
                        wSpanID = wSpan.attrib[self.pfx_xml + 'id']
                        self.morph2wordID[wSpanID] = (wordID, curWordNMorphs)
                        curWordNMorphs += 1
                        if wSpanText is not None:
                            wordAnno[wordID][tierID] = wSpanText
                        else:
                            wordAnno[wordID][tierID] = ''
                    elif tierID not in ['mb', 'mp', 'ge', 'gr']:
                        # Word-based annotations: one flat span for each word
                        if tierID not in wordAnno[wordID]:
                            wordAnno[wordID][tierID] = ''
                        if len(wordAnno[wordID][tierID]) > 0:
                            wordAnno[wordID][tierID] += '-'
                        if wSpanText is not None:
                            wordAnno[wordID][tierID] += wSpanText
                    else:
                        # Multiple morphemes inside one span in e.g. the mb tier
                        wordAnno[wordID][tierID] = ''
                        for mSpan in wSpan:
                            mText = mSpan.text
                            if self.pfx_xml + 'id' in mSpan.attrib:
                                mID = mSpan.attrib[self.pfx_xml + 'id']
                            elif ('from' in mSpan.attrib
                                  and 'to' in mSpan.attrib and
                                  mSpan.attrib['from'] == mSpan.attrib['to']):
                                mID = mSpan.attrib['from']
                            else:
                                # continue
                                mID = wordID + '_covert'  # categories not expressed overtly
                                if mText is None:
                                    self.log_message(
                                        'Empty morpheme description cell: word ID '
                                        + wordID + ', tier ' + tierID + '.')
                                    continue
                                mText = '[' + mText + ']'
                                # if 'mb' not in wordAnno[wordID]:
                                #     wordAnno[wordID]['mb'] = '∅'
                                # elif mID not in self.morph2wordID:
                                #     wordAnno[wordID]['mb'] += '-∅'
                                # if 'mp' not in wordAnno[wordID]:
                                #     wordAnno[wordID]['mp'] = '∅'
                                # elif mID not in self.morph2wordID:
                                #     wordAnno[wordID]['mp'] += '-∅'
                            self.morph2wordID[mID] = (wordID, curWordNMorphs)
                            curWordNMorphs += 1
                            if tierID not in wordAnno[wordID]:
                                wordAnno[wordID][tierID] = ''
                            if len(wordAnno[wordID][tierID]) > 0:
                                wordAnno[wordID][tierID] += '-'
                            if mText is not None:
                                wordAnno[wordID][tierID] += mText
                            else:
                                wordAnno[wordID][tierID] += '∅'
        return wordAnno

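    # Illustration for collect_annotation() above (hypothetical IDs): the
    # result maps word IDs to flat {tier ID -> value} dictionaries, e.g.
    #   {'w1': {'ps': 'n', 'mb': 'kniga-t', 'ge': 'book-PL'}}
    # where morpheme-level tiers such as mb/ge are joined with '-' and
    # multi-word spans have been distributed over their member words.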
    def add_ana_fields(self, ana, curWordAnno):
        """
        Add the information from the annotation tier events for the
        current word to the analysis. For each tier, the name of the
        tier is used as the name of the field, and the text of
        the event is used as the value.
        """
        for tierName in curWordAnno:
            if tierName in [
                    'tx', 'mb', 'mp', 'gr', 'ge', 'ps',
                    'SpeakerContribution_Event'
            ]:
                continue
            elif len(curWordAnno[tierName]) > 0:
                ana[tierName] = curWordAnno[tierName]

    def process_words(self, annoTree):
        """
        Iterate over words in an annotation block and add their
        analyses to the corresponding word objects in the sentences.
        """
        wordAnno = self.collect_annotation(annoTree)
        for wordID in wordAnno:
            ana = {}
            curWordAnno = wordAnno[wordID]
            # mp: morph breaks with empty morphemes (corresponds to the mc tier: POS and morph categories)
            # mb: morph breaks without empty morphemes (corresponds to the gr/ge tiers: actual glosses)
            if 'ge' in curWordAnno:
                ana['gloss'] = curWordAnno['ge']
                self.glosses |= set(g for g in ana['gloss'].split('-')
                                    if g.upper() == g)
            if 'mp' in curWordAnno:
                # mp contains normalized versions of morphemes. If this tier exists,
                # take normalized stem from it and make it a lemma. Then forget mp
                # and write glosses based on the mb tier, if it exists.
                ana['parts'] = curWordAnno['mp']
                self.tp.parser.process_gloss_in_ana(ana)
                if 'gloss_index' in ana:
                    stems, newIndexGloss = self.tp.parser.find_stems(
                        ana['gloss_index'],
                        self.corpusSettings['languages'][0])
                    ana['lex'] = ' '.join(s[1] for s in stems)
            if 'mb' in curWordAnno:
                ana['parts'] = curWordAnno['mb']
            if 'gr' in curWordAnno:
                ana['gloss_ru'] = curWordAnno['gr']
                self.tp.parser.process_gloss_in_ana(ana, 'ru')
            if 'ps' in curWordAnno:
                self.add_pos_ana(ana, curWordAnno['ps'])
            self.tp.parser.process_gloss_in_ana(ana)
            if 'gloss_index' in ana:
                stems, newIndexGloss = self.tp.parser.find_stems(
                    ana['gloss_index'], self.corpusSettings['languages'][0])
                if 'lex' not in ana:
                    ana['lex'] = ' '.join(s[1] for s in stems)
                ana['trans_en'] = self.rxBracketGloss.sub(
                    '', ' '.join(s[0] for s in stems))
                self.add_ana_fields(ana, curWordAnno)
                self.tp.parser.gloss2gr(ana,
                                        self.corpusSettings['languages'][0])
                ana['gloss_index'] = self.rxBracketGloss.sub('', newIndexGloss)
            if 'gloss_index_ru' in ana:
                stems, newIndexGloss = self.tp.parser.find_stems(
                    ana['gloss_index_ru'], self.corpusSettings['languages'][0])
                ana['trans_ru'] = self.rxBracketGloss.sub(
                    '', ' '.join(s[0] for s in stems))
                del ana['gloss_index_ru']
                del ana['gloss_ru']
                if 'glosses_covert_ru' in ana:
                    del ana['glosses_covert_ru']
            if 'gloss' in ana:
                ana['gloss'] = self.rxBracketGloss.sub('', ana['gloss'])
            self.wordsByID[wordID]['ana'] = [ana]
            self.wordsByID[wordID]['word_source'] = ''
            if 'SpeakerContribution_Event' in curWordAnno:
                self.wordsByID[wordID]['word_source'] = curWordAnno[
                    'SpeakerContribution_Event']

    def fragmentize_src_alignment(self, alignment):
        """
        Find the corresponding media file fragment and rewrite the JSON
        dictionary that describes the alignment accordingly.
        """
        fileName, fileExt = os.path.splitext(alignment['src'].lower())
        if fileExt not in self.mediaExtensions:
            return
        ts1 = alignment['off_start_src']
        ts2 = alignment['off_end_src']
        if len(ts1) <= 0 or len(ts2) <= 0:
            return
        ts1frag, ts2frag, srcFileFrag = self.mc.get_media_name(
            alignment['src'], float(ts1), float(ts2))
        alignment['src'] = srcFileFrag
        alignment['off_start_src'] = str(ts1frag)
        alignment['off_end_src'] = str(ts2frag)

    def add_src_alignment(self, sent, sentBoundaries, srcFile):
        """
        Add the alignment of the sentence with the sound/video.
        """
        alignment = {
            'off_start_src': self.tlis[sentBoundaries[0]]['time'],
            'off_end_src': self.tlis[sentBoundaries[1]]['time'],
            'off_start_sent': 0,
            'off_end_sent': len(sent['text']),
            'mtype': 'audio',
            'src_id': sentBoundaries[0] + '_' + sentBoundaries[1],
            'src': srcFile
        }
        if (self.rxFloat.search(alignment['off_start_src']) is None
                or self.rxFloat.search(alignment['off_end_src']) is None):
            return
        self.fragmentize_src_alignment(alignment)
        sent['src_alignment'] = [alignment]

    def get_parallel_sentences(self, srcTree, sentBoundaries, srcFile):
        """
        Iterate over sentences in description tiers aligned with the
        sentence in the main tx tier. The sentence to align with is
        defined by the tuple sentBoundaries that contains the start
        and the end time label for the sentence.
        """
        self.pID += 1
        for iTier in range(len(self.corpusSettings['translation_tiers'])):
            tierName = self.corpusSettings['translation_tiers'][iTier]
            events = srcTree.xpath('/tei:basic-transcription/tei:basic-body/'
                                   'tei:tier[@xml:id=\'' + tierName + '\']/'
                                   'tei:event[@tei:start=\'' +
                                   sentBoundaries[0] + '\' and @tei:end=\'' +
                                   sentBoundaries[1] + '\']',
                                   namespaces=self.namespaces)
            for event in events:
                text = ''
                for child in event:
                    if child.tail is not None:
                        text += child.tail
                if len(text) <= 0:
                    text = event.text
                if text is None or len(text) <= 0:
                    continue
                text = self.tp.cleaner.clean_text(text)
                words = self.tp.tokenizer.tokenize(text)
                paraAlignment = {
                    'off_start': 0,
                    'off_end': len(text),
                    'para_id': self.pID
                }
                paraSent = {
                    'words': words,
                    'text': text,
                    'para_alignment': [paraAlignment],
                    'lang': len(self.corpusSettings['languages']) + iTier
                }
                self.add_src_alignment(paraSent, sentBoundaries, srcFile)
                yield paraSent

    def get_segment_words(self, segment):
        """
        Extract all words and punctuation from a <seg> node.
        Return list of words and fill the self.wordsByID dictionary
        ({word ID -> word object in the list}).
        """
        wordList = []
        prevTag = ''
        for wordNode in segment:
            if (wordNode.tag in (self.pfx_tei + 'w', self.pfx_tei + 'pc')
                    and self.pfx_xml + 'id' not in wordNode.attrib):
                continue
            try:
                wordID = wordNode.attrib[self.pfx_xml + 'id']
            except KeyError:
                continue
            if wordNode.tag == self.pfx_tei + 'w':
                # if prevTag == self.pfx_tei + 'w' and len(wordList) > 0:
                #     # If there is no time anchor between two words,
                #     # treat it as a single token divided by a word-internal whitespace.
                #     # TODO: This is a temporary solution. Changes have to be made
                #     # to the source Exmaralda files to avoid splitting such words.
                #     wordList[-1]['wf'] += ' ' + wordNode.text.strip()
                #     self.wordsByID[wordNode.attrib[self.pfx_xml + 'id']] = wordList[-1]
                #     print('Warning: consecutive words with no anchor between them (' + wordList[-1]['wf'] + ')')
                # else:
                word = {'wf': wordNode.text.strip(), 'wtype': 'word'}
                wordList.append(word)
                self.wordsByID[wordID] = word
                self.wordIDseq.append(wordID)
            elif wordNode.tag == self.pfx_tei + 'pc':
                word = {'wf': wordNode.text.strip(), 'wtype': 'punct'}
                wordList.append(word)
                self.wordsByID[wordID] = word
            elif wordNode.tag == self.pfx_tei + 'incident':
                # Treat "incidents" as punctuation
                # continue
                word = {
                    'wf': '((' + wordNode[0].text.strip() + '))',
                    'wtype': 'punct',
                    'incident': True
                }
                wordList.append(word)
                self.wordsByID[wordID] = word
                self.wordIDseq.append(wordID)
            prevTag = wordNode.tag
        return wordList

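    # Illustration for get_segment_words() above (hypothetical node contents):
    # a <seg> with <w xml:id="w1">kniga</w> and <pc xml:id="pc1">.</pc>
    # yields [{'wf': 'kniga', 'wtype': 'word'}, {'wf': '.', 'wtype': 'punct'}],
    # registers both under self.wordsByID, and appends 'w1' (but not 'pc1')
    # to self.wordIDseq.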
    def align_words_and_baseline(self, sent):
        """
        Fill in the offset fields for individual words in the sentence.
        """
        iSentPos = 0
        for iWord in range(len(sent['words'])):
            iWordPos = 0
            word = sent['words'][iWord]
            wf = word['wf']
            if len(wf) <= 0:
                continue
            # if 'incident' in word:
            #     sent['text'] = sent['text'][:iSentPos] + ' ' + wf + ' ' + sent['text'][iSentPos:]
            while (iSentPos < len(sent['text'])
                   and sent['text'][iSentPos].lower() != wf[iWordPos].lower()):
                iSentPos += 1
            if iSentPos == len(sent['text']):
                if iWord == 0 and word['wtype'] == 'punct':
                    # Try repairing it by inserting that punctuation to the sentence text
                    sent['text'] = wf + sent['text']
                    iSentPos = 0
                    print(
                        'Unexpected end of sentence, attempting to repair sentence text. '
                        'Details:\nSentence (SpeakerContribution_Event):',
                        sent['text'], '\nWords (annotationBlock/u/seg):',
                        '+'.join(w['wf'] for w in sent['words']))
                else:
                    for iWordRest in range(iWord, len(sent['words'])):
                        sent['words'][iWordRest]['off_start'] = len(
                            sent['text']) - 1
                        sent['words'][iWordRest]['off_end'] = len(
                            sent['text']) - 1
                    word['off_end'] = len(sent['text']) - 1
                    print(
                        'Unexpected end of sentence, terminating alignment now. '
                        'Details:\nSentence (SpeakerContribution_Event):',
                        sent['text'], '\nWords (annotationBlock/u/seg):',
                        '+'.join(w['wf'] for w in sent['words']))
                    return
            word['off_start'] = iSentPos
            word['off_end'] = iSentPos + len(wf)
            while iSentPos < len(sent['text']) and iWordPos < len(wf):
                if sent['text'][iSentPos].lower() == wf[iWordPos].lower():
                    iSentPos += 1
                    iWordPos += 1
                    continue
                if (self.rxLetters.search(wf[iWordPos]) is None
                        and self.rxLetters.search(sent['text'][iSentPos]) is not None):
                    iWordPos += 1
                    continue
                iSentPos += 1
            word['off_end'] = iSentPos
        if len(sent['words']) > 0 and sent['words'][0]['off_start'] > 0:
            # Add the beginning of the sentence as punctuation.
            leadingPunct = {
                'wf': sent['text'][:sent['words'][0]['off_start']],
                'wtype': 'punct',
                'off_start': 0,
                'off_end': sent['words'][0]['off_start']
            }
            sent['words'].insert(0, leadingPunct)

    def add_full_text(self, anno, curSentences, tierName=''):
        """
        Add full texts of the sentences from the tier requested
        (ts stands for the main text tier). Find relevant sentences
        based on the time anchors. If there is no such
        tier, restore the text of the sentence from the word_source
        properties of individual words. 
        Do not return anything.
        """
        seg2text = {}  # (from, to) -> sentence text
        for spanGr in anno.xpath('tei:spanGrp', namespaces=self.namespaces):
            if 'type' in spanGr.attrib and spanGr.attrib['type'] == tierName:
                for span in spanGr.xpath('tei:span',
                                         namespaces=self.namespaces):
                    if 'from' not in span.attrib or 'to' not in span.attrib:
                        continue
                    if span.attrib['from'] != span.attrib['to']:
                        self.log_message(
                            '"from" attribute != "to" attribute: ' +
                            span.attrib['from'] + '; ' + span.attrib['to'])
                    if span.attrib['from'] not in self.seg2pID:
                        self.log_message('Wrong "from" attribute: ' +
                                         span.attrib['from'])
                        continue
                    if span.attrib['to'] not in self.seg2pID:
                        self.log_message('Wrong "to" attribute: ' +
                                         span.attrib['to'])
                        continue
                    spanText = span.text
                    if spanText is None:
                        spanText = ''
                    seg2text[(
                        self.seg2pID[span.attrib['from']],
                        self.seg2pID[span.attrib['to']])] = spanText.strip()
        for s in curSentences:
            if 'para_alignment' not in s or len(s['para_alignment']) <= 0:
                continue
            paraID = (s['para_alignment'][0]['para_id'],
                      s['para_alignment'][0]['para_id'])
            if 'text' not in s:
                s['text'] = ''
            if paraID in seg2text:
                s['text'] += seg2text[paraID]
            else:
                for w in s['words']:
                    if 'word_source' in w:
                        s['text'] += w['word_source']
                        del w['word_source']
                s['text'] = s['text'].strip(' \t')
            if 'src_alignment' in s:
                for sa in s['src_alignment']:
                    sa['off_end_sent'] = len(s['text'])

    def add_para_offsets(self, sentences):
        """
        Add character offsets to the parallel alignments of each of the sentences.
        Do not return anything.
        """
        for s in sentences:
            if 'para_alignment' not in s:
                continue
            for para in s['para_alignment']:
                para['off_start'] = 0
                para['off_end'] = len(s['text'])

    def get_sentences(self, srcTree, srcFile):
        """
        Iterate over sentences in the XML tree.
        """
        annotations = srcTree.xpath(
            '/tei:TEI/tei:text/tei:body/tei:annotationBlock',
            namespaces=self.namespaces)
        if len(annotations) <= 0:
            return
        for anno in annotations:
            firstSentence = len(annotations) > 1
            curSentences = []
            paraSentences = {}  # tier name -> parallel sentences (translations, alternative transcriptions, etc.)
            sentMeta = {}
            if 'start' not in anno.attrib or 'end' not in anno.attrib:
                self.log_message(
                    'No start or end attribute in annotationBlock ' +
                    anno.attrib[self.pfx_xml + 'id'])
                continue
            if 'who' in anno.attrib and anno.attrib['who'] in self.participants:
                sentMeta = self.participants[anno.attrib['who']]
            curAnchor = prevAnchor = anno.attrib['start']
            endAnchor = anno.attrib['end']
            curSent = None
            for u in anno.xpath('tei:u', namespaces=self.namespaces):
                for seg_anchor in u:
                    if seg_anchor.tag == self.pfx_tei + 'anchor' and 'synch' in seg_anchor.attrib:
                        curAnchor = seg_anchor.attrib['synch']
                        if curSent is not None:
                            self.add_src_alignment(curSent,
                                                   [prevAnchor, curAnchor],
                                                   srcFile)
                        prevAnchor = curAnchor
                    elif (seg_anchor.tag == self.pfx_tei + 'seg'
                          and self.pfx_xml + 'id' in seg_anchor.attrib):
                        if curSent is not None:
                            curSentences.append(curSent)
                        self.pID += 1
                        segID = seg_anchor.attrib[self.pfx_xml + 'id']
                        self.seg2pID[segID] = self.pID
                        curSent = {
                            'words': self.get_segment_words(seg_anchor),
                            'text': '',
                            'lang': 0,
                            'para_alignment': [{'para_id': self.pID}]
                        }
                        if (firstSentence and 'who' in anno.attrib
                                and anno.attrib['who'] in self.participants):
                            firstSentence = False
                            speakerMark = '[' + self.participants[anno.attrib['who']]['speaker'] + ']'
                            curSent['words'].insert(0, {'wtype': 'punct', 'wf': speakerMark})
                            curSent['words'].insert(0, {'wtype': 'punct', 'wf': '\n'})
                            curSent['text'] = '\n' + speakerMark + ' '
                        if len(sentMeta) > 0:
                            curSent['meta'] = copy.deepcopy(sentMeta)
                if curSent is not None:
                    curSentences.append(curSent)
            if curSent is not None:
                self.add_src_alignment(curSent, [curAnchor, endAnchor],
                                       srcFile)
            self.process_words(anno)
            self.add_full_text(anno, curSentences)
            self.add_para_offsets(curSentences)
            for tierName in self.corpusSettings['tier_languages']:
                lang = self.corpusSettings['tier_languages'][tierName]
                langID = self.corpusSettings['languages'].index(lang)
                if langID == 0:
                    continue
                paraSentences[tierName] = []
                for sent in curSentences:
                    paraSent = {
                        'words': [],
                        'text': '',
                        'lang': langID,
                        'para_alignment': copy.deepcopy(sent['para_alignment'])
                    }
                    if 'src_alignment' in sent:
                        paraSent['src_alignment'] = copy.deepcopy(
                            sent['src_alignment'])
                    paraSentences[tierName].append(paraSent)
                self.add_full_text(anno, paraSentences[tierName], tierName)
            for sent in curSentences:
                if len(sent['text']) <= 0:
                    self.log_message(
                        'Zero length sentence: ' +
                        json.dumps(sent, ensure_ascii=False, indent=None))
                    continue
                self.align_words_and_baseline(sent)
                yield sent
            for tierName in paraSentences:
                for paraSent in paraSentences[tierName]:
                    if len(paraSent['text']) <= 0:
                        paraSent['words'] = [{
                            'wf': '—',
                            'wtype': 'punct',
                            'off_start': 0,
                            'off_end': 1
                        }]
                        paraSent['text'] = '—'
                    else:
                        paraSent['words'] = self.tp.tokenizer.tokenize(
                            paraSent['text'])
                    paraSent['para_alignment'][0]['off_end'] = len(
                        paraSent['text'])
                    yield paraSent

    def convert_file(self, fnameSrc, fnameTarget):
        """
        Take one source Exmaralda file fnameSrc, parse the XML tree,
        extract timestamps, align sentences with words and their
        analyses and ultimately generate a parsed JSON file
        ready for indexing. Write the output to fnameTarget.
        Return number of tokens, number of words and number of
        words with at least one analysis in the document.
        """
        # curMeta = self.get_meta(fnameSrc)
        # Currently, no metadata are loaded:
        print(fnameSrc)
        curMeta = {
            'title': fnameSrc,
            'author': '',
            'year1': '1900',
            'year2': '2017'
        }

        textJSON = {'meta': curMeta, 'sentences': []}
        nTokens, nWords, nAnalyze = 0, 0, 0
        self.seg2pID = {}
        self.morph2wordID = {}
        self.wordIDseq = []
        srcTree = etree.parse(fnameSrc)
        self.tlis = self.get_tlis(srcTree)
        self.participants = self.load_speaker_meta(srcTree)
        srcFileNode = srcTree.xpath(
            '/tei:TEI/tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:recordingStmt/tei:recording/tei:media',
            namespaces=self.namespaces)
        if len(srcFileNode) > 0 and 'url' in srcFileNode[0].attrib:
            srcFile = self.rxStripDir.sub('', srcFileNode[0].attrib['url'])
        else:
            srcFile = ''
        textJSON['sentences'] = [
            s for s in self.get_sentences(srcTree, srcFile)
        ]
        textJSON['sentences'].sort(key=lambda s: s['lang'])
        for i in range(len(textJSON['sentences']) - 1):
            if textJSON['sentences'][i]['lang'] != textJSON['sentences'][i + 1]['lang']:
                textJSON['sentences'][i]['last'] = True
        self.tp.splitter.recalculate_offsets(textJSON['sentences'])
        self.tp.splitter.add_next_word_id(textJSON['sentences'])
        # Count tokens, words and analyzed words, as promised by the docstring:
        for s in textJSON['sentences']:
            for word in s['words']:
                nTokens += 1
                if word['wtype'] == 'word':
                    nWords += 1
                if 'ana' in word and len(word['ana']) > 0:
                    nAnalyze += 1
        self.write_output(fnameTarget, textJSON)
        return nTokens, nWords, nAnalyze

    def process_corpus(self, cutMedia=True):
        """
        Take every Exmaralda file from the source directory subtree, turn it
        into a parsed json and store it in the target directory.
        Split all the corpus media files into overlapping chunks of
        small duration.
        This is the main function of the class.
        """
        Txt2JSON.process_corpus(self)
        if not cutMedia:
            return
        for path, dirs, files in os.walk(
                os.path.join(self.corpusSettings['corpus_dir'], self.srcExt)):
            for fname in files:
                fileExt = os.path.splitext(fname.lower())[1]
                if fileExt in self.mediaExtensions:
                    fname = os.path.abspath(os.path.join(path, fname))
                    print('Cutting media file', fname)
                    self.mc.cut_media(fname)
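
# A hedged driver sketch (not part of the original excerpt): the class statement
# of this converter lies above this fragment, so the name ISO_TEI_Hamburg2JSON
# below is an assumption. tsakorpus-style converters are typically run as
# standalone scripts:
#
#     converter = ISO_TEI_Hamburg2JSON(settingsDir='conf')
#     converter.process_corpus(cutMedia=True)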
Example #7
class Eaf2JSON(Txt2JSON):
    """
    Contains methods to make JSONs ready for indexing from
    ELAN aligned files, a csv with metadata and a list with parsed
    word forms.
    """

    mediaExtensions = {'.wav', '.mp3', '.mp4', '.avi', '.mov', '.mts'}
    rxSpaces = re.compile('[ \t]+')
    rxLetters = re.compile(r'\w+')
    bracketPairs = {
        ']': re.compile('\\[[^ \\]]*$'),
        ')': re.compile('\\([^ \\)]*$'),
        '>': re.compile('<[^ >]*$'),
        '}': re.compile('\\{[^ \\}]*$'),
    }
    standardAnaTiers = ['pos', 'gramm', 'lemma', 'parts', 'gloss']

    def __init__(self, settingsDir='conf_conversion'):
        Txt2JSON.__init__(self, settingsDir=settingsDir)
        self.speakerMeta = self.load_speaker_meta()
        self.mc = MediaCutter(settings=self.corpusSettings)
        self.srcExt = 'eaf'
        self.tlis = {}  # time labels
        self.pID = 0  # id of last aligned segment
        self.glosses = set()
        self.participants = {}  # main tier ID -> participant ID
        self.segmentTree = {}  # aID -> (contents, parent aID, tli1, tli2)
        self.segmentChildren = {}  # (aID, child tier type) -> [child aID]
        self.spanAnnoTiers = {}  # span annotation tier type -> {tier ID -> [(tli1, tli2, contents)}
        self.alignedSpanAnnoTiers = {}  # aID of a segment -> {span annotation tier ID -> contents}
        self.additionalWordFields = []  # names of additional word-level fields associated with some analysis tiers
        self.privacySegments = {}  # segments (start_ms, end_ms) that should be beeped out, one list per source file
        self.rxIgnoreTokens = None
        self.set_ignore_tokens()
        self.usedMediaFiles = set()  # filenames of media fragments referenced in the JSONs

    def set_ignore_tokens(self):
        """
        Compile regexes for tokens which should be ignored when
        aligning the token tier with the text tier.
        """
        if 'ignore_tokens' not in self.corpusSettings:
            self.corpusSettings['ignore_tokens'] = ''
        if not self.corpusSettings['ignore_tokens'].startswith('^'):
            self.corpusSettings['ignore_tokens'] = '^' + self.corpusSettings['ignore_tokens']
        if not self.corpusSettings['ignore_tokens'].endswith('$'):
            self.corpusSettings['ignore_tokens'] += '$'
        try:
            self.rxIgnoreTokens = re.compile(self.corpusSettings['ignore_tokens'])
        except re.error:
            print('Please check your ignore_tokens regex.')
            # Fall back to a regex that only matches the empty token, so that
            # later calls to self.rxIgnoreTokens.search() do not crash:
            self.rxIgnoreTokens = re.compile('^$')
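
    # Illustration (hedged): "ignore_tokens": "[.?!]+" in the conversion settings
    # becomes the anchored regex '^[.?!]+$', so only tokens consisting entirely
    # of such characters are skipped during token/baseline alignment. The anchors
    # are prepended/appended literally, so a top-level alternation like 'a|b'
    # should be written as '(a|b)' to be anchored as a whole.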

    def load_speaker_meta(self):
        """
        Load the speaker metadata from the JSON file specified
        in the settings, if any.
        """
        speakerMeta = {}
        if 'speaker_meta_filename' not in self.corpusSettings:
            return speakerMeta
        try:
            with open(os.path.join(self.corpusSettings['corpus_dir'],
                                   self.corpusSettings['speaker_meta_filename']),
                      'r', encoding='utf-8-sig') as f:
                speakerMeta = json.loads(f.read())
        except FileNotFoundError:
            print('Speaker metadata file not found.')
        return speakerMeta

    def get_tlis(self, srcTree):
        """
        Retrieve and return all time labels from the XML tree.
        """
        tlis = {}
        iTli = 0
        for tli in srcTree.xpath('/ANNOTATION_DOCUMENT/TIME_ORDER/TIME_SLOT'):
            timeValue = ''
            if 'TIME_VALUE' in tli.attrib:
                timeValue = tli.attrib['TIME_VALUE']
            tlis[tli.attrib['TIME_SLOT_ID']] = {'n': iTli, 'time': timeValue}
            iTli += 1
        return tlis
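
    # Illustration (hedged, made-up data): for an EAF header like
    #     <TIME_ORDER>
    #         <TIME_SLOT TIME_SLOT_ID="ts1" TIME_VALUE="0"/>
    #         <TIME_SLOT TIME_SLOT_ID="ts2" TIME_VALUE="1520"/>
    #     </TIME_ORDER>
    # get_tlis() returns
    #     {'ts1': {'n': 0, 'time': '0'}, 'ts2': {'n': 1, 'time': '1520'}}.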

    def traverse_tree(self, srcTree, callback):
        """
        Iterate over all tiers in the XML tree and call the callback function
        for each of them.
        """
        for tierNode in srcTree.xpath('/ANNOTATION_DOCUMENT/TIER'):
            if 'TIER_ID' not in tierNode.attrib:
                continue
            callback(tierNode)

    def add_aligned_style_span_data(self, parentID, annoTierID, text):
        """
        Remember the value of a sentence-aligned span annotation tier
        for the segment (parentID) it refers to.
        """
        if annoTierID is None or len(annoTierID) <= 0 or parentID is None:
            return
        if parentID not in self.alignedSpanAnnoTiers:
            self.alignedSpanAnnoTiers[parentID] = {}
        self.alignedSpanAnnoTiers[parentID][annoTierID] = text

    def get_span_tier_id(self, tierNode):
        """
        Return tier ID and the sentence-level metadata field name for a tier that contains
        sentence-level annotation, based on the span_annotation_tiers dictionary
        in conversion_settings.json.
        """
        if 'span_annotation_tiers' not in self.corpusSettings:
            return tierNode.attrib['TIER_ID'], None
        annoTierRules = {}
        if ('LINGUISTIC_TYPE_REF' in tierNode.attrib and
                tierNode.attrib['LINGUISTIC_TYPE_REF'] in self.corpusSettings['span_annotation_tiers']):
            annoTierRules = self.corpusSettings['span_annotation_tiers'][tierNode.attrib['LINGUISTIC_TYPE_REF']]
        else:
            for k, v in self.corpusSettings['span_annotation_tiers'].items():
                if not k.startswith('^'):
                    k = '^' + k
                if not k.endswith('$'):
                    k += '$'
                try:
                    rxTierID = re.compile(k)
                    if rxTierID.search(tierNode.attrib['TIER_ID']) is not None:
                        annoTierRules = v
                        break
                except re.error:
                    continue
        if len(annoTierRules) <= 0 or 'sentence_meta' not in annoTierRules:
            return tierNode.attrib['TIER_ID'], None
        return tierNode.attrib['TIER_ID'], annoTierRules['sentence_meta']
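
    # A hedged example of what corpusSettings['span_annotation_tiers'] might look
    # like in conversion_settings.json (the key is a tier type or a tier ID regex;
    # the field names are taken from how they are used in this class):
    #     "span_annotation_tiers": {
    #         "code_switching": {
    #             "sentence_meta": "code_switching",
    #             "styles": {"rus": "cs_rus"},
    #             "languages": ["udmurt"]
    #         }
    #     }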

    def cb_build_segment_tree(self, tierNode):
        tierType = ''  # analysis tiers: word/POS/gramm/gloss etc.
        if 'analysis_tiers' in self.corpusSettings:
            for k, v in self.corpusSettings['analysis_tiers'].items():
                if not k.startswith('^'):
                    k = '^' + k
                if not k.endswith('$'):
                    k += '$'
                try:
                    rxTierID = re.compile(k)
                    if (rxTierID.search(tierNode.attrib['TIER_ID']) is not None
                            or rxTierID.search(tierNode.attrib.get('LINGUISTIC_TYPE_REF', '')) is not None):
                        tierType = v
                        if tierType not in self.standardAnaTiers:
                            self.additionalWordFields.append(tierType)
                        break
                except re.error:
                    print('Something is wrong with an analysis tier regex: ' + k)
        for segNode in tierNode.xpath('ANNOTATION/REF_ANNOTATION | ANNOTATION/ALIGNABLE_ANNOTATION'):
            if 'ANNOTATION_ID' not in segNode.attrib:
                continue
            aID = segNode.attrib['ANNOTATION_ID']
            try:
                segContents = segNode.xpath('ANNOTATION_VALUE')[0].text.strip()
            except AttributeError:
                segContents = ''
            try:
                segParent = segNode.attrib['ANNOTATION_REF']
            except KeyError:
                segParent = None
            tli1, tli2 = None, None
            if 'TIME_SLOT_REF1' in segNode.attrib:
                tli1 = segNode.attrib['TIME_SLOT_REF1']
            elif segParent in self.segmentTree and self.segmentTree[segParent][2] is not None:
                tli1 = self.segmentTree[segParent][2]
            if 'TIME_SLOT_REF2' in segNode.attrib:
                tli2 = segNode.attrib['TIME_SLOT_REF2']
            elif segParent in self.segmentTree and self.segmentTree[segParent][3] is not None:
                tli2 = self.segmentTree[segParent][3]
            self.segmentTree[aID] = (segContents, segParent, tli1, tli2)
            if segParent is None:
                continue
            if len(tierType) > 0:
                try:
                    self.segmentChildren[(segParent, tierType)].append(aID)
                except KeyError:
                    self.segmentChildren[(segParent, tierType)] = [aID]
            annoTierID, annoTierType = self.get_span_tier_id(tierNode)
            self.add_aligned_style_span_data(segParent, annoTierType, segContents)

    def build_segment_tree(self, srcTree):
        """
        Read the entire XML tree and save all segment data (contents, links to
        the parents and timestamps, if any).
        """
        self.segmentTree = {}
        self.segmentChildren = {}
        self.traverse_tree(srcTree, self.cb_build_segment_tree)

    def fragmentize_src_alignment(self, sent):
        """
        Find corresponding media file fragment and transform a JSON
        dictionaries with the information about the alignment.
        """
        if 'src_alignment' not in sent:
            return
        sent['src_alignment'].sort(key=lambda a: a['off_start_src'])
        minTime = sent['src_alignment'][0]['off_start_src']
        maxTime = sent['src_alignment'][-1]['off_end_src']
        for alignment in sent['src_alignment']:
            fileName, fileExt = os.path.splitext(alignment['src'].lower())
            if fileExt not in self.mediaExtensions:
                return
            segStart = alignment['off_start_src']
            segEnd = alignment['off_end_src']
            ts1frag, ts2frag, srcFileFrag = self.mc.get_media_name(alignment['src'],
                                                                   segStart,
                                                                   segEnd,
                                                                   minTime=minTime,
                                                                   maxTime=maxTime)
            self.usedMediaFiles.add(srcFileFrag)
            alignment['src'] = srcFileFrag
            alignment['off_start_src'] = ts1frag
            alignment['off_end_src'] = ts2frag

    def add_src_alignment(self, sent, tli1, tli2, srcFile):
        """
        Add the alignment of the sentence with the sound/video. If
        word-level time data is available, align words, otherwise
        align the whole sentence.
        """
        sentAlignments = []
        ts1 = self.tlis[tli1]['time']
        ts2 = self.tlis[tli2]['time']
        sentAlignments.append({'off_start_src': float(ts1) / EAF_TIME_MULTIPLIER,
                               'off_end_src': float(ts2) / EAF_TIME_MULTIPLIER,
                               'true_off_start_src': float(ts1) / EAF_TIME_MULTIPLIER,
                               'off_start_sent': 0,
                               'off_end_sent': len(sent['text']),
                               'mtype': 'audio',
                               'src_id': ts1 + '_' + ts2,
                               'src': srcFile})
        # for alignment in sentAlignments:
        #     self.fragmentize_src_alignment(alignment)
        sent['src_alignment'] = sentAlignments
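
    # Illustration (hedged, made-up data): with
    #     self.tlis == {'ts1': {'n': 0, 'time': '1200'}, 'ts2': {'n': 1, 'time': '3450'}}
    # and EAF_TIME_MULTIPLIER == 1000 (an assumption; the constant is defined
    # outside this excerpt), add_src_alignment(sent, 'ts1', 'ts2', 'rec.wav')
    # gives the sentence a single alignment with off_start_src == 1.2,
    # off_end_src == 3.45 and src_id == '1200_3450'.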

    def add_punc(self, words, text, prevText, startOffset):
        """
        Make one or several punctuation tokens out of the text and
        add them to the words list.
        """
        if len(text) <= 0:
            return

        # First, check for closing brackets that should belong to the word:
        if text[0] in self.bracketPairs and len(words) > 0:
            if self.bracketPairs[text[0]].search(prevText) is not None:
                words[-1]['off_end'] += 1
                text = text[1:]

        curToken = {'wf': '', 'off_start': startOffset, 'off_end': startOffset, 'wtype': 'punct'}
        for i in range(len(text)):
            if self.rxSpaces.search(text[i]) is not None:
                if len(curToken['wf']) > 0:
                    curToken['off_end'] = startOffset + i
                    words.append(curToken)
                    curToken = {'wf': '', 'off_start': startOffset + i, 'off_end': startOffset + i, 'wtype': 'punct'}
            else:
                curToken['wf'] += text[i]
        if len(curToken['wf']) > 0:
            curToken['off_end'] = startOffset + len(text)
            words.append(curToken)
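
    # Illustration (hedged, made-up data): add_punc(words, '?!', prevText, 10)
    # appends one 'punct' token {'wf': '?!', 'off_start': 10, 'off_end': 12};
    # whitespace inside the text splits it into several such tokens, and a leading
    # closing bracket is instead glued to the preceding word whenever prevText
    # ends in the matching unclosed opening bracket.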

    def retrieve_analyses(self, aID, lang='', topLevel=True):
        """
        Compile list of analyses retrieved from the relevant tiers of an analyzed
        EAF file associated with the token identified by aID.
        topLevel == True iff the function was called by a token processor,
        rather than by the same function recursively. This is needed because
        certain wrap-up operations should be performed only on the top level,
        e.g. gloss-to-tag conversion or collation of analyses.
        TODO: actually, the top-level tier here is the lowest tier in the
        hierarchy where subdivision of a parent cell implies multiple
        analyses. A POS or a lemma tier could be top-level, for example.
        """
        analyses = []
        analysisTiers = []
        for tierType in set(self.standardAnaTiers) | set(self.additionalWordFields):
            if (aID, tierType) not in self.segmentChildren:
                continue
            analysisTiers.append([])
            for childID in self.segmentChildren[(aID, tierType)]:
                if childID not in self.segmentTree:
                    continue
                contents = self.segmentTree[childID][0]
                for ana in self.retrieve_analyses(childID, lang=lang, topLevel=False):
                    if tierType == 'lemma':
                        ana['lex'] = contents
                    elif tierType == 'parts':
                        ana['parts'] = contents
                    elif tierType == 'gloss':
                        ana['gloss'] = contents
                    elif tierType == 'pos' and len(contents) > 0:
                        ana['gr.pos'] = contents
                    elif tierType == 'gramm':
                        grJSON = self.tp.parser.transform_gramm_str(contents, lang=lang)
                        ana.update(grJSON)
                    elif tierType in self.additionalWordFields:
                        ana[tierType] = contents
                    analysisTiers[-1].append(ana)
            analysisTiers[-1] = [ana for ana in analysisTiers[-1] if len(ana) > 0]
        if len(analysisTiers) <= 0:
            return [{}]
        for combination in itertools.product(*analysisTiers):
            ana = {}
            for partAna in combination:
                ana.update(partAna)
            if len(ana) > 0:
                analyses.append(ana)
        if topLevel:
            if ('one_morph_per_cell' in self.corpusSettings
                    and self.corpusSettings['one_morph_per_cell']):
                curLex = set()
                curStemGloss = set()
                allAnaFields = set()
                for ana in analyses:
                    for k in ana:
                        allAnaFields.add(k)
                totalAna = {k: '' for k in allAnaFields}
                for k in totalAna:
                    for ana in analyses:
                        if k in ['lex'] or k.startswith('gr.'):
                            if k in ana:
                                if len(totalAna[k]) <= 0:
                                    totalAna[k] = ana[k]
                                elif type(totalAna[k]) == str and totalAna[k] != ana[k]:
                                    totalAna[k] = [totalAna[k], ana[k]]
                                elif type(totalAna[k]) == list and ana[k] not in totalAna[k]:
                                    totalAna[k].append(ana[k])
                        else:
                            if len(totalAna[k]) > 0 and k not in ['parts']:
                                totalAna[k] += '-'
                            if k not in ana:
                                totalAna[k] += '∅'
                            else:
                                totalAna[k] += ana[k]
                                if k == 'parts' and not ana[k].startswith('-') and not ana[k].endswith('-'):
                                    curLex.add(ana[k])
                                    if 'gloss' in ana:
                                        curStemGloss.add(ana['gloss'])
                if 'lex' not in totalAna or len(totalAna['lex']) <= 0:
                    totalAna['lex'] = sorted(curLex)
                    if len(totalAna['lex']) == 1:
                        totalAna['lex'] = totalAna['lex'][0]
                if 'trans_en' not in totalAna or len(totalAna['trans_en']) <= 0:
                    totalAna['trans_en'] = sorted(curStemGloss)
                    if len(totalAna['trans_en']) == 1:
                        totalAna['trans_en'] = totalAna['trans_en'][0]
                analyses = [totalAna]

            for ana in analyses:
                self.tp.parser.process_gloss_in_ana(ana)
                if 'gloss_index' in ana:
                    if 'analysis_tiers' in self.corpusSettings and 'gramm' not in self.corpusSettings['analysis_tiers']:
                        self.tp.parser.gloss2gr(ana, self.corpusSettings['languages'][0])
        if len(analyses) <= 0:
            return [{}]
        return analyses
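
    # A worked example (hedged, made-up data) of the one_morph_per_cell collation
    # above: two morph-level analyses
    #     [{'parts': 'kirja', 'gloss': 'book'}, {'parts': '-os', 'gloss': 'PL'}]
    # are merged into a single analysis
    #     {'parts': 'kirja-os', 'gloss': 'book-PL', 'lex': 'kirja', 'trans_en': 'book'},
    # where 'lex' and 'trans_en' fall back to the morph whose 'parts' value has no
    # leading or trailing hyphen, i.e. the stem.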

    def retrieve_words(self, text, wordIDs, lang=''):
        """
        Return a list of words with their analyses retrieved from the relevant
        tiers of an analyzed EAF file. Try to align words with the text of the
        entire sentence. Return the text as well, since it may be slightly altered
        if there is no exact correspondence between the text tier and the token tier.
        """
        words = []
        iSentPos = 0
        iBufferStart = 0
        sBuffer = ''
        for iWord in range(len(wordIDs)):
            iWordPos = 0
            word = self.segmentTree[wordIDs[iWord]][0]
            if len(sBuffer) <= 0:
                iBufferStart = iSentPos
            if len(word) <= 0 or self.rxIgnoreTokens.search(word) is not None:
                continue
            while iSentPos < len(text) and text[iSentPos].lower() != word[iWordPos].lower():
                sBuffer += text[iSentPos]
                iSentPos += 1
            if len(sBuffer) > 0:
                self.add_punc(words, sBuffer, text[:iBufferStart], iBufferStart)
                sBuffer = ''
                iBufferStart = iSentPos
            if iSentPos == len(text):
                # If the remaining tokens consist of punctuation, add them to the sentence
                if self.rxLetters.search(word) is None and self.rxIgnoreTokens.search(word) is None:
                    text += word
                    self.add_punc(words, word, text[:iSentPos], iSentPos)
                    continue
                else:
                    print('Unexpected end of sentence:', text)
                    return words, text
            token = {'wf': word, 'off_start': iSentPos, 'off_end': iSentPos + len(word), 'wtype': 'word',
                     'n_orig': iWord}
            while iSentPos < len(text) and iWordPos < len(word):
                if text[iSentPos].lower() == word[iWordPos].lower():
                    iSentPos += 1
                    iWordPos += 1
                    continue
                if self.rxLetters.search(word[iWordPos]) is None and self.rxLetters.search(text[iSentPos]) is not None:
                    iWordPos += 1
                    continue
                iSentPos += 1
            token['off_end'] = iSentPos
            analyses = [ana for ana in self.retrieve_analyses(wordIDs[iWord], lang=lang) if len(ana) > 0]
            if len(analyses) > 0:
                token['ana'] = analyses
            words.append(token)
        if iSentPos < len(text):
            self.add_punc(words, text[iSentPos:], text[:iSentPos], iSentPos)
        return words, text
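
    # Illustration (hedged, made-up data): for text == 'Mon, vetli.' and token-tier
    # words ['mon', 'vetli'], the method matches case-insensitively and emits the
    # word tokens at offsets (0, 3) and (5, 10), a 'punct' token ',' at (3, 4)
    # in between, and a final 'punct' token '.' at (10, 11).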

    def process_span_annotation_tier(self, tierNode):
        """
        If the tier in tierNode is a span annotation tier, extract its data.
        If the tier is time-aligned, save the data to self.spanAnnoTiers[annoTierID]
        as time labels.
        """
        if ('span_annotation_tiers' not in self.corpusSettings
                or len(self.corpusSettings['span_annotation_tiers']) <= 0):
            return
        annoTierID, annoTierType = self.get_span_tier_id(tierNode)
        if annoTierType is None or len(annoTierType) <= 0:
            return
        if annoTierType not in self.spanAnnoTiers:
            self.spanAnnoTiers[annoTierType] = {}
        if annoTierID not in self.spanAnnoTiers[annoTierType]:
            self.spanAnnoTiers[annoTierType][annoTierID] = []

        segments = tierNode.xpath('ANNOTATION/ALIGNABLE_ANNOTATION')
        for segNode in segments:
            if ('ANNOTATION_ID' not in segNode.attrib
                    or segNode.attrib['ANNOTATION_ID'] not in self.segmentTree):
                continue
            segData = self.segmentTree[segNode.attrib['ANNOTATION_ID']]
            if segData[2] is None or segData[3] is None:
                continue
            tli1 = segData[2]
            tli2 = segData[3]
            text = segData[0]
            self.spanAnnoTiers[annoTierType][annoTierID].append((tli1, tli2, text))
        self.spanAnnoTiers[annoTierType][annoTierID].sort(
            key=lambda x: (float(self.tlis[x[0]]['time']), float(self.tlis[x[1]]['time']), x[2])
        )

    def add_privacy_segments(self, srcTree, srcFile):
        """
        Remember segments that should be beeped out because they
        contain sensitive data.
        """
        if 'privacy_tier' not in self.corpusSettings or len(srcFile) <= 0:
            return
        privTierID = self.corpusSettings['privacy_tier']
        if srcFile not in self.privacySegments:
            self.privacySegments[srcFile] = []

        for tierNode in srcTree.xpath('/ANNOTATION_DOCUMENT/TIER'):
            if 'TIER_ID' not in tierNode.attrib:
                continue
            if (tierNode.attrib['TIER_ID'] == privTierID or
                    ('LINGUISTIC_TYPE_REF' in tierNode.attrib
                     and tierNode.attrib['LINGUISTIC_TYPE_REF'] == privTierID)):
                segments = tierNode.xpath('ANNOTATION/ALIGNABLE_ANNOTATION')
                for segNode in segments:
                    if ('ANNOTATION_ID' not in segNode.attrib
                            or segNode.attrib['ANNOTATION_ID'] not in self.segmentTree):
                        continue
                    segData = self.segmentTree[segNode.attrib['ANNOTATION_ID']]
                    if segData[2] is None or segData[3] is None:
                        continue
                    tli1 = segData[2]
                    tli2 = segData[3]
                    self.privacySegments[srcFile].append((int(self.tlis[tli1]['time']), int(self.tlis[tli2]['time'])))

    def process_tier(self, tierNode, aID2pID, srcFile, alignedTier=False):
        """
        Extract segments from the tier node and iterate over them, returning
        them as JSON sentences. If alignedTier is False, store the start and end
        timestamps, as well as pIDs for alignment, in the dictionary aID2pID.
        If alignedTier is True, use the information from aID2pID for establishing
        time boundaries of the sentences and aligning it with the source tier. 
        """
        lang = ''
        if 'TIER_ID' not in tierNode.attrib:
            return

        # Find out the participant (speaker) and save that information
        speaker = ''
        if not alignedTier and 'PARTICIPANT' in tierNode.attrib:
            speaker = tierNode.attrib['PARTICIPANT']
            self.participants[tierNode.attrib['TIER_ID']] = speaker
        else:
            if ('PARENT_REF' in tierNode.attrib
                    and tierNode.attrib['PARENT_REF'] in self.participants):
                speaker = self.participants[tierNode.attrib['PARENT_REF']]
                self.participants[tierNode.attrib['TIER_ID']] = speaker
            elif 'PARTICIPANT' in tierNode.attrib:
                speaker = tierNode.attrib['PARTICIPANT']
                self.participants[tierNode.attrib['TIER_ID']] = speaker

        # Find out the language of the tier. First, check the tier type. If it
        # is not associated with any language, check all tier ID regexes.
        if ('LINGUISTIC_TYPE_REF' in tierNode.attrib and
                tierNode.attrib['LINGUISTIC_TYPE_REF'] in self.corpusSettings['tier_languages']):
            lang = self.corpusSettings['tier_languages'][tierNode.attrib['LINGUISTIC_TYPE_REF']]
        else:
            for k, v in self.corpusSettings['tier_languages'].items():
                if not k.startswith('^'):
                    k = '^' + k
                if not k.endswith('$'):
                    k += '$'
                try:
                    rxTierID = re.compile(k)
                    if rxTierID.search(tierNode.attrib['TIER_ID']) is not None:
                        lang = v
                        break
                except re.error:
                    continue
        if len(lang) <= 0 or lang not in self.corpusSettings['languages']:
            # A tier can also contain span annotations, let's check it:
            if len(lang) <= 0 and not alignedTier:
                self.process_span_annotation_tier(tierNode)
            # Otherwise, we do not want a tier with no language association
            return
        langID = self.corpusSettings['languages'].index(lang)

        segments = tierNode.xpath('ANNOTATION/REF_ANNOTATION | ANNOTATION/ALIGNABLE_ANNOTATION')

        for segNode in segments:
            if ('ANNOTATION_ID' not in segNode.attrib
                    or segNode.attrib['ANNOTATION_ID'] not in self.segmentTree):
                continue
            segData = self.segmentTree[segNode.attrib['ANNOTATION_ID']]
            if not alignedTier:
                if segData[2] is None or segData[3] is None:
                    continue
                tli1 = segData[2]
                tli2 = segData[3]
            elif segData[1] is not None:
                aID = segData[1]
                pID, tli1, tli2 = aID2pID[aID]
            else:
                continue
            text = segData[0]
            curSent = {'text': text, 'words': None, 'lang': langID,
                       'meta': {'speaker': speaker}}
            # Add speaker metadata
            if speaker in self.speakerMeta:
                for k, v in self.speakerMeta[speaker].items():
                    curSent['meta'][k] = v
            # Add metadata and style spans from sentence-aligned annotation tiers
            if segNode.attrib['ANNOTATION_ID'] in self.alignedSpanAnnoTiers:
                spanAnnoData = self.alignedSpanAnnoTiers[segNode.attrib['ANNOTATION_ID']]
                for annoTierID in spanAnnoData:
                    curSpanValue = spanAnnoData[annoTierID]
                    if annoTierID not in curSent['meta']:
                        curSent['meta'][annoTierID] = []
                    if curSpanValue not in curSent['meta'][annoTierID]:
                        curSent['meta'][annoTierID].append(curSpanValue)
                    # Add style spans
                    curRules = {}
                    for tierID in self.corpusSettings['span_annotation_tiers']:
                        if ('sentence_meta' in self.corpusSettings['span_annotation_tiers'][tierID]
                                and self.corpusSettings['span_annotation_tiers'][tierID][
                                    'sentence_meta'] == annoTierID):
                            curRules = self.corpusSettings['span_annotation_tiers'][tierID]
                            break
                    if len(curRules) <= 0:
                        continue
                    if 'styles' in curRules and curSpanValue in curRules['styles']:
                        spanStyle = curRules['styles'][curSpanValue]
                        if 'style_spans' not in curSent:
                            curSent['style_spans'] = []
                        curSent['style_spans'].append({
                            'off_start': 0,
                            'off_end': len(curSent['text']),
                            'span_class': spanStyle,
                            'tooltip_text': curSpanValue
                        })
            # Tokenize the sentence or align it with an existing tokenization
            if (segNode.attrib['ANNOTATION_ID'], 'word') not in self.segmentChildren:
                curSent['words'] = self.tp.tokenizer.tokenize(text)
                self.tp.splitter.add_next_word_id_sentence(curSent)
                self.tp.parser.analyze_sentence(curSent, lang=lang)
                curSent['nTokensOrig'] = len(curSent['words'])
            else:
                tokensOrig = self.segmentChildren[(segNode.attrib['ANNOTATION_ID'], 'word')]
                curSent['nTokensOrig'] = len(tokensOrig)
                curSent['words'], curSent['text'] = self.retrieve_words(text,
                                                                        tokensOrig,
                                                                        lang=lang)
                self.tp.splitter.add_next_word_id_sentence(curSent)
            if len(self.corpusSettings['aligned_tiers']) > 0:
                if not alignedTier:
                    self.pID += 1
                    aID = segNode.attrib['ANNOTATION_ID']
                    aID2pID[aID] = (self.pID, tli1, tli2)
                    paraAlignment = {'off_start': 0, 'off_end': len(curSent['text']), 'para_id': self.pID}
                    curSent['para_alignment'] = [paraAlignment]
                else:
                    paraAlignment = {'off_start': 0, 'off_end': len(curSent['text']), 'para_id': pID}
                    curSent['para_alignment'] = [paraAlignment]
            self.add_src_alignment(curSent, tli1, tli2, srcFile)
            yield curSent

    def add_span_annotations(self, sentences):
        """
        Add span-like annotations, i.e. annotations that could span several
        tokens or even sentences and reside in time-aligned tiers.
        Add them to the relevant sentences as style spans and/or as sentence-level
        metadata values, depending on what is said in corpusSettings['span_annotation_tiers'].
        Modify sentences, do not return anything.
        """
        sentences.sort(key=lambda s: s['src_alignment'][0]['true_off_start_src'])
        for annoTierType in self.spanAnnoTiers:
            curRules = {}
            for tierID in self.corpusSettings['span_annotation_tiers']:
                if ('sentence_meta' in self.corpusSettings['span_annotation_tiers'][tierID]
                        and self.corpusSettings['span_annotation_tiers'][tierID]['sentence_meta'] == annoTierType):
                    curRules = self.corpusSettings['span_annotation_tiers'][tierID]
                    break
            if len(curRules) <= 0:
                continue

            for annoTierID in self.spanAnnoTiers[annoTierType]:
                # There may be more than one span-like annotation tier of a given type.
                # Different tiers may refer to different participants, so we have to
                # check which tiers should trigger metadata changes for which sentences.
                curSpeaker = ''
                if annoTierID in self.participants:
                    curSpeaker = self.participants[annoTierID]

                iSentence = 0
                iSpan = 0
                while iSentence < len(sentences) and iSpan < len(self.spanAnnoTiers[annoTierType][annoTierID]):
                    curSpan = self.spanAnnoTiers[annoTierType][annoTierID][iSpan]
                    curSentence = sentences[iSentence]
                    if 'languages' in curRules and 'lang' in curSentence:
                        if self.corpusSettings['languages'][curSentence['lang']] not in curRules['languages']:
                            iSentence += 1
                            continue
                    if (len(curSpeaker) > 0 and 'meta' in curSentence
                            and 'speaker' in curSentence['meta']
                            and curSentence['meta']['speaker'] != curSpeaker):
                        iSentence += 1
                        continue
                    curSpanStart = float(self.tlis[curSpan[0]]['time']) / EAF_TIME_MULTIPLIER
                    curSpanEnd = float(self.tlis[curSpan[1]]['time']) / EAF_TIME_MULTIPLIER
                    curSpanValue = curSpan[2]
                    # This is happening after the offsets are recalculated to account for media cutting
                    curSentenceStart = curSentence['src_alignment'][0]['true_off_start_src']
                    curSentenceEnd = curSentenceStart + (float(curSentence['src_alignment'][0]['off_end_src'])
                                                         - float(curSentence['src_alignment'][0]['off_start_src']))
                    if curSpanStart >= curSentenceEnd - 0.03 or len(curSentence['words']) <= 0:
                        iSentence += 1
                        continue
                    elif curSpanEnd <= curSentenceStart + 0.03:
                        iSpan += 1
                        continue

                    if 'meta' not in curSentence:
                        curSentence['meta'] = {}
                    if annoTierType not in curSentence['meta']:
                        curSentence['meta'][annoTierType] = []
                    if curSpanValue not in curSentence['meta'][annoTierType]:
                        curSentence['meta'][annoTierType].append(curSpanValue)

                    # The ugly part: span-like annotations in ELAN are time-aligned, but usually
                    # they refer to tokens, which are symbolic subdivisions of a time-aligned
                    # sentence. So the "real" time boundaries of span-like annotations are visually
                    # aligned with "imaginary" time boundaries of tokens.
                    # We will calculate these imaginary boundaries to compare them to the annotation
                    # boundaries and know which tokens the annotation should cover.
                    # Note that the visual alignment can be imperfect, so we have to account for that.
                    # We use the original tokenization as represented in ELAN for calculations,
                    # which might be different from what is in curSentence['words'] now (e.g. punctuation
                    # might have been absent from the original tokens).
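                    # Worked example (hedged, illustrative numbers): a sentence
                    # spanning 2.0-4.0 s with nTokensOrig == 4 gives
                    # tokenDuration == 0.5 s; token 0 is treated as occupying
                    # 2.05-2.45 s, token 1 as 2.55-2.95 s, and so on. A span
                    # annotation covering 2.5-3.5 s then involves exactly
                    # tokens 1 (2.55-2.95) and 2 (3.05-3.45), even if its anchors
                    # are slightly off the token boundaries.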
                    tokenDuration = (curSentenceEnd - curSentenceStart) / curSentence['nTokensOrig']
                    tokensInvolvedOrig = []
                    tokensInvolved = []
                    for iToken in range(curSentence['nTokensOrig']):
                        tokenStart = curSentenceStart + (iToken + 0.1) * tokenDuration
                        tokenEnd = curSentenceStart + (iToken + 0.9) * tokenDuration
                        if curSpanStart <= tokenStart and tokenEnd <= curSpanEnd:
                            tokensInvolvedOrig.append(iToken)
                    # Find which actual token numbers correspond to the original ones.
                    if any('n_orig' in t for t in curSentence['words']):
                        for iToken in range(len(curSentence['words'])):
                            curToken = curSentence['words'][iToken]
                            if 'n_orig' in curToken and curToken['n_orig'] in tokensInvolvedOrig:
                                tokensInvolved.append(iToken)
                    else:
                        tokensInvolved = tokensInvolvedOrig     # I'm not sure this is really necessary
                    if (len(tokensInvolved) > 0
                            and 'styles' in curRules
                            and curSpanValue in curRules['styles']):
                        spanOffStart = curSentence['words'][tokensInvolved[0]]['off_start']
                        spanOffEnd = curSentence['words'][tokensInvolved[-1]]['off_end']
                        spanStyle = curRules['styles'][curSpanValue]
                        if 'style_spans' not in curSentence:
                            curSentence['style_spans'] = []
                        curSentence['style_spans'].append({
                            'off_start': spanOffStart,
                            'off_end': spanOffEnd,
                            'span_class': spanStyle,
                            'tooltip_text': curSpanValue + ' [' + str(iSpan) + ']'
                        })
                    if curSpanEnd < curSentenceEnd:
                        iSpan += 1
                    else:
                        iSentence += 1

    def get_sentences(self, srcTree, srcFile):
        """
        Iterate over sentences in the XML tree.
        """
        # mainTierTypes = '(' + ' | '.join('/ANNOTATION_DOCUMENT/TIER[@LINGUISTIC_TYPE_REF=\'' + x + '\'] | ' +
        #                                  '/ANNOTATION_DOCUMENT/TIER[@TIER_ID=\'' + x + '\']'
        #                                  for x in self.corpusSettings['main_tiers']) + ')'
        # mainTiers = srcTree.xpath(mainTierTypes)
        mainTiers = []
        alignedTiers = []
        for tierNode in srcTree.xpath('/ANNOTATION_DOCUMENT/TIER'):
            for tierRegex in self.corpusSettings['main_tiers']:
                if not tierRegex.startswith('^'):
                    tierRegex = '^' + tierRegex
                if not tierRegex.endswith('$'):
                    tierRegex += '$'
                try:
                    if re.search(tierRegex, tierNode.attrib['TIER_ID']) is not None:
                        mainTiers.append(tierNode)
                        break
                    elif ('LINGUISTIC_TYPE_REF' in tierNode.attrib
                          and re.search(tierRegex, tierNode.attrib['LINGUISTIC_TYPE_REF']) is not None):
                        mainTiers.append(tierNode)
                        break
                except:
                    pass
            for tierRegex in self.corpusSettings['aligned_tiers']:
                if not tierRegex.startswith('^'):
                    tierRegex = '^' + tierRegex
                if not tierRegex.endswith('$'):
                    tierRegex += '$'
                try:
                    if re.search(tierRegex, tierNode.attrib['TIER_ID']) is not None:
                        alignedTiers.append(tierNode)
                        break
                    elif ('LINGUISTIC_TYPE_REF' in tierNode.attrib
                          and re.search(tierRegex, tierNode.attrib['LINGUISTIC_TYPE_REF']) is not None):
                        alignedTiers.append(tierNode)
                        break
                except:
                    pass
        if len(mainTiers) <= 0:
            return
        # if len(self.corpusSettings['aligned_tiers']) > 0:
        #     alignedTierTypes = '(' + ' | '.join('/ANNOTATION_DOCUMENT/TIER[@LINGUISTIC_TYPE_REF=\'' + x + '\'] | ' +
        #                                         '/ANNOTATION_DOCUMENT/TIER[@TIER_ID=\'' + x + '\']'
        #                                         for x in self.corpusSettings['aligned_tiers']) + ')'
        #     alignedTiers = srcTree.xpath(alignedTierTypes)
        aID2pID = {}  # annotation ID -> (pID, tli1, tli2) correspondence
        for tier in mainTiers:
            for sent in self.process_tier(tier, aID2pID, srcFile, alignedTier=False):
                yield sent
        for tier in alignedTiers:
            for sent in self.process_tier(tier, aID2pID, srcFile, alignedTier=True):
                yield sent

    def add_speaker_marks(self, sentences):
        """
        Add the name/code of the speaker in the beginning of every
        sentence that starts the turn.
        """
        if 'insert_speaker_marks' in self.corpusSettings and not self.corpusSettings['insert_speaker_marks']:
            return
        langs2process = [i for i in range(len(self.corpusSettings['languages']))]
        if 'speaker_marks_languages' in self.corpusSettings:
            langs2process = [i for i in range(len(self.corpusSettings['languages']))
                             if self.corpusSettings['languages'][i] in self.corpusSettings['speaker_marks_languages']]
        langs2process = set(langs2process)
        prevSpeaker = ''
        for i in range(len(sentences)):
            if 'meta' not in sentences[i] or 'speaker' not in sentences[i]['meta']:
                continue
            if 'lang' in sentences[i] and sentences[i]['lang'] not in langs2process:
                continue
            speaker = '[' + sentences[i]['meta']['speaker'] + ']'
            addOffset = len(speaker) + 2
            if sentences[i]['meta']['speaker'] != prevSpeaker:
                sentences[i]['text'] = '\n' + speaker + ' ' + sentences[i]['text']
                sentences[i]['words'].insert(0, {'off_start': -len(speaker) - 1,
                                                 'off_end': -1,
                                                 'wf': speaker,
                                                 'wtype': 'punct',
                                                 'next_word': 0})
                sentences[i]['words'].insert(0, {'off_start': -len(speaker) - 2,
                                                 'off_end': -len(speaker) - 1,
                                                 'wf': '\n',
                                                 'wtype': 'punct',
                                                 'next_word': -1})
                for w in sentences[i]['words']:
                    w['off_start'] += addOffset
                    w['off_end'] += addOffset
                    w['next_word'] += 2
                if 'para_alignment' in sentences[i]:
                    for pa in sentences[i]['para_alignment']:
                        if pa['off_start'] > 0:
                            pa['off_start'] += addOffset
                        pa['off_end'] += addOffset
                if 'src_alignment' in sentences[i]:
                    for sa in sentences[i]['src_alignment']:
                        if sa['off_start_sent'] > 0:
                            sa['off_start_sent'] += addOffset
                        sa['off_end_sent'] += addOffset
                if 'style_spans' in sentences[i]:
                    for ss in sentences[i]['style_spans']:
                        ss['off_start'] += addOffset
                        ss['off_end'] += addOffset
            prevSpeaker = sentences[i]['meta']['speaker']
            if 'last' in sentences[i] and sentences[i]['last']:
                prevSpeaker = ''

    def add_sentence_meta(self, sentences, meta):
        """
        Add some of the document-level metadata to the sentences.
        """
        for s in sentences:
            if 'meta' not in s:
                continue
            if 'year1' in meta and 'year2' in meta and meta['year1'] == meta['year2']:
                s['meta']['year'] = meta['year1']

    def clean_up_sentences(self, sentences):
        """
        Remove temporary keys that are no longer needed.
        """
        for s in sentences:
            if 'nTokensOrig' in s:
                del s['nTokensOrig']
            for word in s['words']:
                if 'n_orig' in word:
                    del word['n_orig']

    def convert_file(self, fnameSrc, fnameTarget):
        """
        Take one source EAF file fnameSrc, parse the XML tree, extract
        timestamps, align sentences with words and their analyses, and
        generate a parsed JSON file ready for indexing. Write the output
        to fnameTarget. Return the number of tokens, words and analyzed
        words in the document.
        """
        curMeta = self.get_meta(fnameSrc)
        textJSON = {'meta': curMeta, 'sentences': []}
        nTokens, nWords, nAnalyzed = 0, 0, 0
        self.spanAnnoTiers = {}
        srcTree = etree.parse(fnameSrc)
        self.tlis = self.get_tlis(srcTree)
        self.build_segment_tree(srcTree)
        srcFileNode = srcTree.xpath('/ANNOTATION_DOCUMENT/HEADER/MEDIA_DESCRIPTOR')
        if len(srcFileNode) > 0 and 'RELATIVE_MEDIA_URL' in srcFileNode[0].attrib:
            srcFile = self.rxStripDir.sub('', html.unescape(srcFileNode[0].attrib['RELATIVE_MEDIA_URL']))
        elif len(srcFileNode) > 0 and 'MEDIA_URL' in srcFileNode[0].attrib:
            srcFile = self.rxStripDir.sub('', html.unescape(srcFileNode[0].attrib['MEDIA_URL']))
        else:
            srcFile = ''
        textJSON['sentences'] = [s for s in self.get_sentences(srcTree, srcFile)]
        self.add_privacy_segments(srcTree, srcFile)
        self.add_span_annotations(textJSON['sentences'])
        # First sorting: sort sentences by language, but keep them sorted by speaker
        # (which they are now, since each speaker has a separate set of tiers in ELAN).
        textJSON['sentences'].sort(key=lambda s: (s['lang']))
        if 'sentence_segmentation' in self.corpusSettings and self.corpusSettings['sentence_segmentation']:
            self.tp.splitter.resegment_sentences(textJSON['sentences'])
        for s in textJSON['sentences']:
            self.fragmentize_src_alignment(s)
        # Final sorting: inside each language, sort sentences by their time offsets.
        textJSON['sentences'].sort(key=lambda s: (s['lang'], s['src_alignment'][0]['true_off_start_src']))
        for i in range(len(textJSON['sentences']) - 1):
            # del textJSON['sentences'][i]['src_alignment'][0]['true_off_start_src']
            if textJSON['sentences'][i]['lang'] != textJSON['sentences'][i + 1]['lang']:
                textJSON['sentences'][i]['last'] = True
        # Count tokens, words and analyzed words over all sentences:
        for s in textJSON['sentences']:
            for word in s['words']:
                nTokens += 1
                if word['wtype'] == 'word':
                    nWords += 1
                if 'ana' in word and len(word['ana']) > 0:
                    nAnalyzed += 1
        self.tp.splitter.recalculate_offsets(textJSON['sentences'])
        self.tp.splitter.add_next_word_id(textJSON['sentences'])
        self.add_speaker_marks(textJSON['sentences'])
        self.add_sentence_meta(textJSON['sentences'], curMeta)
        self.clean_up_sentences(textJSON['sentences'])
        if 'capitalize_sentences' in self.corpusSettings and self.corpusSettings['capitalize_sentences']:
            self.tp.splitter.capitalize_sentences(textJSON['sentences'])
        self.write_output(fnameTarget, textJSON)
        return nTokens, nWords, nAnalyzed

    def process_corpus(self, cutMedia=True):
        """
        Take every eaf file from the source directory subtree, turn it
        into a parsed json and store it in the target directory.
        """
        Txt2JSON.process_corpus(self)
        if not cutMedia:
            return
        mediaDir = os.path.join(self.corpusSettings['corpus_dir'], self.srcExt)
        if 'media_dir' in self.corpusSettings:
            mediaDir = self.corpusSettings['media_dir']
        for path, dirs, files in os.walk(mediaDir):
            # Process video files first
            files = [fname for fname in files if fname.lower().endswith(('.avi', '.mts', '.mov'))] + \
                    [fname for fname in files if fname.lower().endswith('.mp4')] + \
                    [fname for fname in files if not fname.lower().endswith(('.avi', '.mts', '.mov', '.mp4'))]
            for fname in files:
                fileExt = os.path.splitext(fname.lower())[1]
                if fileExt in self.mediaExtensions:
                    privacySegments = []
                    if fname in self.privacySegments:
                        privacySegments = self.privacySegments[fname]
                    fname = os.path.abspath(os.path.join(path, fname))
                    print('Cutting media file', fname)
                    self.mc.cut_media(fname,
                                      usedFilenames=self.usedMediaFiles,
                                      privacySegments=privacySegments)
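    # A minimal driver sketch for this ELAN converter (class and settings
    # names follow the surrounding code; the corpus layout is hypothetical):
    #
    #   converter = Eaf2JSON(settingsDir='conf_conversion')
    #   converter.process_corpus(cutMedia=True)
    #
    # With cutMedia=True, the per-file privacy segments and the set of media
    # fragments actually referenced in the JSONs are passed on to
    # MediaCutter.cut_media for each media file found in the media directory.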
Example #8
0
class Exmaralda_Hamburg2JSON(Txt2JSON):
    """
    Contains methods to make JSONs ready for indexing from aligned
    Exmaralda files in the format used in documentation projects
    carried out in Hamburg.
    """

    rxBracketGloss = re.compile('\\.?\\[.*?\\]')
    rxSplitGlosses = re.compile('-|\\.(?=\\[)')
    rxWordPunc = re.compile('^( *)([^\\w]*)(.*?)([^\\w]*?)( *)$')
    txTierXpath = '/basic-transcription/basic-body/tier[@id=\'tx\']'
    mediaExtensions = {'.wav', '.mp3', '.mp4', '.avi'}

    def __init__(self, settingsDir='conf'):
        Txt2JSON.__init__(self, settingsDir=settingsDir)
        self.mc = MediaCutter(settings=self.corpusSettings)
        self.srcExt = 'exb'  # extension of the source files to be converted
        self.tlis = {}  # time labels (id -> {'n': number, 'time': time value})
        self.pID = 0  # id of last aligned segment
        self.glosses = set()

    def get_tlis(self, srcTree):
        """
        Retrieve and return all time labels from the XML tree.
        """
        tlis = {}
        iTli = 0
        for tli in srcTree.xpath(
                '/basic-transcription/basic-body/common-timeline')[0]:
            timeValue = ''
            if 'time' in tli.attrib:
                timeValue = tli.attrib['time']
            tlis[tli.attrib['id']] = {'n': iTli, 'time': timeValue}
            iTli += 1
        return tlis
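    # For illustration: given a common timeline like
    #   <tli id="T0" time="0.0"/><tli id="T1" time="1.25"/><tli id="T2"/>
    # (hypothetical labels), get_tlis returns
    #   {'T0': {'n': 0, 'time': '0.0'},
    #    'T1': {'n': 1, 'time': '1.25'},
    #    'T2': {'n': 2, 'time': ''}}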

    def find_sentence_index(self, sentenceBoundaries, tli):
        """
        Find the number of the sentence the event with the given
        time label (start or end) belongs to.
        """
        if tli not in self.tlis:
            return -1
        for i in range(len(sentenceBoundaries)):
            tliStart, tliEnd = sentenceBoundaries[i]
            if (tli == tliStart
                    or self.tlis[tliStart]['n'] <= self.tlis[tli]['n'] < self.tlis[tliEnd]['n']):
                return i
        return -1
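    # For illustration (hypothetical labels): with sentenceBoundaries ==
    # [('T0', 'T2'), ('T2', 'T5')], a token starting at 'T1' falls into
    # sentence 0 and a token starting at 'T3' into sentence 1; a label
    # missing from self.tlis yields -1.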

    def get_sentence_boundaries(self, refTier):
        """
        Go over the reference tier (as XML node). For each event
        in the tier, extract start and end attributes. Return a list
        with (start time label, end time label) tuples.
        """
        boundaries = []
        for event in refTier:
            if 'start' not in event.attrib or 'end' not in event.attrib:
                continue
            sentStart, sentEnd = event.attrib['start'], event.attrib['end']
            if sentStart not in self.tlis or sentEnd not in self.tlis:
                continue
            boundaries.append((sentStart, sentEnd))
        return boundaries

    def get_word_tlis(self, srcTree):
        """
        Collect all pairs of time labels that delimit words.
        """
        txTiers = srcTree.xpath(Exmaralda_Hamburg2JSON.txTierXpath)
        tliTuples = set()
        for txTier in txTiers:
            for event in txTier:
                if 'start' not in event.attrib or 'end' not in event.attrib:
                    continue
                tliTuple = (event.attrib['start'], event.attrib['end'])
                tliTuples.add(tliTuple)
        return tliTuples

    def collect_annotation(self, srcTree):
        """
        Return a dictionary that contains all word-level annotation events;
        its keys are tuples (start time label, end time label).
        """
        wordTlis = self.get_word_tlis(srcTree)
        wordAnno = {}
        for tier in srcTree.xpath(
                '/basic-transcription/basic-body/tier[@type=\'a\']'):
            # The code reads the tier's category, so guard against a missing
            # 'category' attribute rather than 'id'.
            if 'category' not in tier.attrib:
                continue
            tierID = tier.attrib['category']
            if tierID in self.corpusSettings['translation_tiers'] or tierID in ('tx', 'ts'):
                continue
            for event in tier:
                if ('start' not in event.attrib or 'end' not in event.attrib
                        or event.text is None):
                    continue
                tupleKey = (event.attrib['start'], event.attrib['end'])

                # If an annotation spans several tokens, add it to each of them:
                tupleKeys = [tupleKey]
                if tupleKey not in wordTlis:
                    for wordTli in wordTlis:
                        if ((wordTli[0] == tupleKey[0]
                             or self.tlis[tupleKey[0]]['n'] <= self.tlis[wordTli[0]]['n'])
                                and (wordTli[1] == tupleKey[1]
                                     or self.tlis[tupleKey[1]]['n'] >= self.tlis[wordTli[1]]['n'])):
                            tupleKeys.append(wordTli)

                for tk in tupleKeys:
                    if tk not in wordAnno:
                        wordAnno[tk] = {}
                    wordAnno[tk][tierID] = event.text
        return wordAnno
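    # The resulting mapping might look like this (tier names and labels are
    # hypothetical):
    #   {('T0', 'T1'): {'mb': 'kirja-t', 'ge': 'book-PL', 'ps': 'n'}}
    # An annotation whose span covers several word-level spans is copied
    # to each of those spans.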

    def add_ana_fields(self, ana, curWordAnno):
        """
        Add the information from the annotation tier events for the
        current word to the analysis. For each tier, the name of the
        tier is used as the name of the field, and the text of
        the event is used as the value.
        """
        for tierName in curWordAnno:
            if tierName in ['tx', 'mb', 'mp', 'gr', 'ge']:
                continue
            if tierName == 'ps':
                ana['gr.pos'] = curWordAnno[tierName]
            else:
                ana[tierName] = curWordAnno[tierName]
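    # For illustration (hypothetical tier values): curWordAnno ==
    # {'ps': 'n', 'SeR': 'np.h:A'} sets ana['gr.pos'] = 'n' and
    # ana['SeR'] = 'np.h:A'; the tx/mb/mp/gr/ge tiers are processed
    # elsewhere and are therefore skipped here.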

    def get_words(self, srcTree):
        """
        Iterate over words found in the tx tier of the XML tree.
        """
        txTier = srcTree.xpath(Exmaralda_Hamburg2JSON.txTierXpath)
        wordAnno = self.collect_annotation(srcTree)
        for event in txTier[0]:
            if 'start' not in event.attrib or 'end' not in event.attrib:
                continue
            tupleKey = (event.attrib['start'], event.attrib['end'])
            if tupleKey not in wordAnno:
                continue
            wf = event.text
            if wf is None:
                continue
            curToken = {
                'wf': wf,
                'wtype': 'word',
                'tli_start': event.attrib['start'],
                'tli_end': event.attrib['end']
            }
            if self.tp.tokenizer.rxOnlyPunc.search(wf.strip()) is not None:
                curToken['wtype'] = 'punct'
                yield curToken
                continue
            ana = {}
            curWordAnno = wordAnno[tupleKey]
            # mp: morph breaks with empty morphemes (corresponds to the mc tier: POS and morph categories)
            # mb: morph breaks without empty morphemes (corresponds to the gr/ge tiers: actual glosses)
            if 'mb' in curWordAnno:
                ana['parts'] = curWordAnno['mb']
            if 'ge' in curWordAnno:
                ana['gloss'] = curWordAnno['ge']
                self.glosses |= set(
                    g for g in self.rxSplitGlosses.split(ana['gloss'])
                    if g.upper() == g)
                # print(ana['gloss'], self.rxSplitGlosses.split(ana['gloss']))
            self.tp.parser.process_gloss_in_ana(ana)
            if 'gloss_index' in ana:
                stems, newIndexGloss = self.tp.parser.find_stems(
                    ana['gloss_index'], self.corpusSettings['languages'][0])
                ana['lex'] = ' '.join(s[1] for s in stems)
                ana['trans_en'] = self.rxBracketGloss.sub(
                    '', ' '.join(s[0] for s in stems))
                self.add_ana_fields(ana, curWordAnno)
                useGlossList = False
                if 'glosses' in self.corpusSettings:
                    useGlossList = True
                self.tp.parser.gloss2gr(ana,
                                        self.corpusSettings['languages'][0],
                                        useGlossList=useGlossList)
                ana['gloss_index'] = self.rxBracketGloss.sub('', newIndexGloss)
            curToken['ana'] = [ana]
            yield curToken
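    # Each yielded token is a plain dictionary; a word with an analysis
    # might look like this (all values are hypothetical):
    #   {'wf': 'kirjat', 'wtype': 'word', 'tli_start': 'T0', 'tli_end': 'T1',
    #    'ana': [{'parts': 'kirja-t', 'gloss': 'book-PL', 'lex': 'kirja',
    #             'gr.pos': 'n'}]}
    # Events consisting only of punctuation are yielded with
    # 'wtype': 'punct' and no analysis.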

    def fragmentize_src_alignment(self, alignment):
        """
        Find the corresponding media file fragment and update the JSON
        dictionary that holds the alignment information accordingly.
        """
        fileName, fileExt = os.path.splitext(alignment['src'].lower())
        if fileExt not in self.mediaExtensions:
            return
        ts1 = alignment['off_start_src']
        ts2 = alignment['off_end_src']
        if len(ts1) <= 0 or len(ts2) <= 0:
            return
        ts1frag, ts2frag, srcFileFrag = self.mc.get_media_name(
            alignment['src'], float(ts1), float(ts2))
        alignment['src'] = srcFileFrag
        alignment['off_start_src'] = str(ts1frag)
        alignment['off_end_src'] = str(ts2frag)
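    # For illustration (hypothetical values): an alignment such as
    #   {'src': 'rec1.wav', 'off_start_src': '12.5', 'off_end_src': '14.2', ...}
    # is rewritten in place so that 'src' points to the cut fragment
    # returned by MediaCutter.get_media_name and both offsets are
    # recalculated relative to the start of that fragment.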

    def add_src_alignment(self, sent, sentBoundaries, srcFile):
        """
        Add the alignment of the sentence with the sound/video. If
        word-level time data is available, align words, otherwise
        align the whole sentence.
        """
        wordAlignments = []
        for word in sent['words']:
            if 'tli_start' not in word or 'tli_end' not in word:
                continue
            if len(self.tlis[word['tli_start']]['time']) > 0:
                for wa in wordAlignments:
                    if len(wa['off_end_src']) <= 0:
                        wa['off_end_src'] = self.tlis[word['tli_start']]['time']
                        wa['src_id'] += word['tli_start']
                wordAlignments.append({
                    'off_start_src': self.tlis[word['tli_start']]['time'],
                    'off_end_src': '',
                    'off_start_sent': word['off_start'],
                    'off_end_sent': word['off_end'],
                    'mtype': 'audio',
                    'src': srcFile,
                    'src_id': word['tli_start'] + '_'
                })
            if len(self.tlis[word['tli_end']]['time']) > 0:
                for wa in wordAlignments:
                    if len(wa['off_end_src']) <= 0:
                        wa['off_end_src'] = self.tlis[word['tli_end']]['time']
                        wa['off_end_sent'] = word['off_end']
                        wa['src_id'] += word['tli_end']
        for wa in wordAlignments:
            if len(wa['off_end_src']) <= 0:
                if len(self.tlis[sentBoundaries[1]]['time']) > 0:
                    wa['off_end_src'] = self.tlis[sentBoundaries[1]]['time']
                    wa['src_id'] += sentBoundaries[1]
                else:
                    wa['off_end_src'] = wa['off_start_src']
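                    # src_id still ends in '_' here (it was initialized as
                    # start label + '_'), so the next line duplicates the
                    # start label, e.g. 'T3_' -> 'T3_T3'.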
                    wa['src_id'] += wa['src_id'][:-1]
                    wa['off_end_sent'] = len(sent['text'])
        # if len(wordAlignments) <= 0 and len(self.tlis[sentBoundaries[0]]['time']) > 0:
        if len(self.tlis[sentBoundaries[0]]['time']) > 0:
            wordAlignments = []  # for the time being
            wordAlignments.append({
                'off_start_src': self.tlis[sentBoundaries[0]]['time'],
                'off_end_src': self.tlis[sentBoundaries[1]]['time'],
                'off_start_sent': 0,
                'off_end_sent': len(sent['text']),
                'mtype': 'audio',
                'src_id': sentBoundaries[0] + '_' + sentBoundaries[1],
                'src': srcFile
            })
        if len(wordAlignments) > 0:
            for alignment in wordAlignments:
                self.fragmentize_src_alignment(alignment)
            sent['src_alignment'] = wordAlignments
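    # The resulting sent['src_alignment'] is a list of dictionaries, e.g.
    # (hypothetical values):
    #   [{'off_start_src': '12.5', 'off_end_src': '14.2', 'off_start_sent': 0,
    #     'off_end_sent': 37, 'mtype': 'audio', 'src_id': 'T0_T2',
    #     'src': 'rec1.wav'}]
    # Note that whenever the sentence start label carries time data, a single
    # sentence-level alignment currently replaces the word-level ones
    # ("for the time being" above).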

    def get_parallel_sentences(self, srcTree, sentBoundaries, srcFile):
        """
        Iterate over sentences in description tiers aligned with the
        sentence in the main tx tier. The sentence to align with is
        defined by the tuple sentBoundaries that contains the start
        and the end time label for the sentence.
        """
        self.pID += 1
        for iTier in range(len(self.corpusSettings['translation_tiers'])):
            tierName = self.corpusSettings['translation_tiers'][iTier]
            events = srcTree.xpath('/basic-transcription/basic-body/'
                                   'tier[@id=\'' + tierName + '\']/'
                                   'event[@start=\'' + sentBoundaries[0] +
                                   '\' and @end=\'' + sentBoundaries[1] +
                                   '\']')
            for event in events:
                text = ''
                for child in event:
                    if child.tail is not None:
                        text += child.tail
                if len(text) <= 0:
                    text = event.text
                if text is None or len(text) <= 0:
                    text = ''
                text = self.tp.cleaner.clean_text(text)
                if len(text) <= 0:
                    words = [{
                        'wf': '—',
                        'wtype': 'punct',
                        'off_start': 0,
                        'off_end': 1
                    }]
                    text = '—'
                else:
                    words = self.tp.tokenizer.tokenize(text)
                paraAlignment = {
                    'off_start': 0,
                    'off_end': len(text),
                    'para_id': self.pID
                }
                paraSent = {
                    'words': words,
                    'text': text,
                    'para_alignment': [paraAlignment],
                    'lang': len(self.corpusSettings['languages']) + iTier
                }
                self.add_src_alignment(paraSent, sentBoundaries, srcFile)
                yield paraSent
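    # Note on the 'lang' values: original-language sentences occupy indices
    # 0..len(languages)-1, so translation tiers are numbered after them.
    # E.g. with one original language and translation_tiers == ['fe', 'fr']
    # (hypothetical tier names), sentences from 'fe' get lang == 1 and
    # sentences from 'fr' get lang == 2.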

    def get_sentences(self, srcTree, srcFile):
        """
        Iterate over sentences in the XML tree.
        """
        refTiers = srcTree.xpath(
            '/basic-transcription/basic-body/tier[@id=\'ref\']')
        if len(refTiers) <= 0:
            return
        refTier = refTiers[0]
        # TODO: Multiple layers
        sentBoundaries = self.get_sentence_boundaries(refTier)
        prevSentIndex = -1
        curSent = {'text': '', 'words': [], 'lang': 0}
        for word in self.get_words(srcTree):
            curSentIndex = self.find_sentence_index(sentBoundaries,
                                                    word['tli_start'])
            if curSentIndex != prevSentIndex and len(curSent['text']) > 0:
                paraAlignment = {
                    'off_start': 0,
                    'off_end': len(curSent['text']),
                    'para_id': self.pID
                }
                curSent['para_alignment'] = [paraAlignment]
                self.add_src_alignment(curSent, sentBoundaries[prevSentIndex],
                                       srcFile)
                yield curSent
                curSent = {'text': '', 'words': [], 'lang': 0}
                for paraSent in self.get_parallel_sentences(
                        srcTree, sentBoundaries[curSentIndex], srcFile):
                    yield paraSent
            prevSentIndex = curSentIndex
            if word['wtype'] == 'punct':
                word['off_start'] = len(curSent['text'])
                curSent['text'] += word['wf']
                word['off_end'] = len(curSent['text'])
                word['wf'] = word['wf'].strip()
                continue
            m = self.rxWordPunc.search(word['wf'])
            spacesL, punctL, wf, punctR, spacesR = m.groups()
            curSent['text'] += spacesL
            if len(punctL) > 0:
                punc = {
                    'wf': punctL,
                    'wtype': 'punct',
                    'off_start': len(curSent['text']),
                    'off_end': len(curSent['text']) + len(punctL)
                }
                curSent['text'] += punctL
                curSent['words'].append(punc)
            word['off_start'] = len(curSent['text'])
            curSent['text'] += wf
            word['off_end'] = len(curSent['text'])
            word['wf'] = wf
            curSent['words'].append(word)
            if len(punctR) > 0:
                punc = {
                    'wf': punctR,
                    'wtype': 'punct',
                    'off_start': len(curSent['text']),
                    'off_end': len(curSent['text']) + len(punctR)
                }
                curSent['text'] += punctR
                curSent['words'].append(punc)
            curSent['text'] += spacesR
        if len(curSent['text']) > 0:
            paraAlignment = {
                'off_start': 0,
                'off_end': len(curSent['text']),
                'para_id': self.pID
            }
            curSent['para_alignment'] = [paraAlignment]
            self.add_src_alignment(curSent, sentBoundaries[curSentIndex],
                                   srcFile)
            yield curSent
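    # Each yielded sentence is a dictionary with the accumulated text, tokens
    # and alignments, roughly (hypothetical values):
    #   {'text': 'Kirjat ovat pöydällä.', 'words': [...], 'lang': 0,
    #    'para_alignment': [{'off_start': 0, 'off_end': 21, 'para_id': 42}],
    #    'src_alignment': [...]}
    # Parallel (translation) sentences sharing a para_id with an original
    # sentence are yielded separately via get_parallel_sentences.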

    def convert_file(self, fnameSrc, fnameTarget):
        """
        Take one source Exmaralda file fnameSrc, parse the XML tree,
        extract timestamps, align sentences with words and their
        analyses and ultimately generate a parsed JSON file
        ready for indexing. Write the output to fnameTarget.
        Return number of tokens, number of words and number of
        words with at least one analysis in the document.
        """
        curMeta = self.get_meta(fnameSrc)
        # curMeta = {'title': fnameSrc, 'author': '', 'year1': '1900', 'year2': '2017'}
        if curMeta is None:
            return 0, 0, 0
        textJSON = {'meta': curMeta, 'sentences': []}
        nTokens, nWords, nAnalyzed = 0, 0, 0
        srcTree = etree.parse(fnameSrc)
        self.tlis = self.get_tlis(srcTree)
        srcFileNode = srcTree.xpath(
            '/basic-transcription/head/meta-information/referenced-file')
        if len(srcFileNode) > 0 and 'url' in srcFileNode[0].attrib:
            srcFile = self.rxStripDir.sub('', srcFileNode[0].attrib['url'])
        else:
            srcFile = ''
        textJSON['sentences'] = [s for s in self.get_sentences(srcTree, srcFile)]
        textJSON['sentences'].sort(key=lambda s: s['lang'])
        for i in range(len(textJSON['sentences']) - 1):
            if textJSON['sentences'][i]['lang'] != textJSON['sentences'][i + 1]['lang']:
                textJSON['sentences'][i]['last'] = True
            for word in textJSON['sentences'][i]['words']:
                nTokens += 1
                if word['wtype'] == 'word':
                    nWords += 1
                if 'ana' in word and len(word['ana']) > 0:
                    nAnalyzed += 1
        self.tp.splitter.recalculate_offsets(textJSON['sentences'])
        self.tp.splitter.add_next_word_id(textJSON['sentences'])
        self.write_output(fnameTarget, textJSON)
        return nTokens, nWords, nAnalyzed
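    # A per-file usage sketch ('conv' is a hypothetical converter instance
    # and the paths are hypothetical; process_corpus calls this method for
    # every source file):
    #
    #   nTokens, nWords, nAnalyzed = conv.convert_file('exb/text1.exb',
    #                                                  'json/text1.json')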

    def process_corpus(self, cutMedia=True):
        """
        Take every Exmaralda file from the source directory subtree, turn it
        into a parsed JSON and store it in the target directory.
        Split all the corpus media files into overlapping chunks of
        small duration.
        This is the main function of the class.
        """
        Txt2JSON.process_corpus(self)
        if not cutMedia:
            return
        for path, dirs, files in os.walk(
                os.path.join(self.corpusSettings['corpus_dir'], self.srcExt)):
            for fname in files:
                fileExt = os.path.splitext(fname.lower())[1]
                if fileExt in self.mediaExtensions:
                    fname = os.path.abspath(os.path.join(path, fname))
                    print('Cutting media file', fname)
                    self.mc.cut_media(fname)
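
# A minimal end-to-end sketch (assumes the defaults used by this class:
# conversion settings in 'conf' and the .exb sources under corpus_dir/exb):
if __name__ == '__main__':
    converter = Exmaralda_Hamburg2JSON(settingsDir='conf')
    converter.process_corpus(cutMedia=True)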