Exemplo n.º 1
0
 def process_corpus(self, cutMedia=True):
     """
     Take every eaf file from the source directory subtree, turn it
     into a parsed json and store it in the target directory (this
     part is delegated to Txt2JSON.process_corpus).
     Afterwards, unless cutMedia is False, walk the
     <corpus_dir>/<srcExt> subtree and pass the absolute path of every
     file whose extension is listed in self.mediaExtensions to
     self.mc.cut_media.

     cutMedia -- if False, stop right after the conversion step and
                 do not touch the media files.
     """
     Txt2JSON.process_corpus(self)
     if not cutMedia:
         return
     srcRoot = os.path.join(self.corpusSettings['corpus_dir'], self.srcExt)
     for path, _dirs, files in os.walk(srcRoot):
         for fname in files:
             # The file name is lowercased before the extension lookup,
             # so "A.WAV" and "a.wav" are treated alike.
             fileExt = os.path.splitext(fname.lower())[1]
             if fileExt not in self.mediaExtensions:
                 continue
             # Only media files are reported; non-media files are
             # skipped silently.
             mediaPath = os.path.abspath(os.path.join(path, fname))
             print('Cutting media file', mediaPath)
             self.mc.cut_media(mediaPath)
Exemplo n.º 2
0
 def __init__(self, settingsDir='conf_conversion'):
     """
     Initialize the converter state for XML source files.

     settingsDir -- settings directory, handed over unchanged to
                    Txt2JSON.__init__.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.pID = 0  # id of last aligned segment
     self.srcExt = 'xml'
     # Rule containers are created empty before load_rules() runs.
     self.posRules = {}
     self.grammRules = []
     self.glosses = []
     self.load_rules()
     # Everything below is assigned only after load_rules() has run.
     self.rxStemGlosses = re.compile('^$')
     self.POSTags = set()  # All POS tags encountered in the XML
     settings = self.corpusSettings
     # Optional settings: fall back to a default when the key is absent.
     self.mainGlossLang = settings.get('main_gloss_language', 'en')
     self.badAnalysisLangs = settings.get('bad_analysis_languages', [])
Exemplo n.º 3
0
 def process_corpus(self):
     """
     Main entry point of the class.
     Convert every Exmaralda file found in the source directory subtree
     to a parsed json stored in the target directory (delegated to
     Txt2JSON.process_corpus), then hand every media file of the
     corpus to the media cutter, which splits it into overlapping
     chunks of small duration.
     """
     Txt2JSON.process_corpus(self)
     srcRoot = os.path.join(self.corpusSettings['corpus_dir'], self.srcExt)
     for curDir, _subdirs, fileNames in os.walk(srcRoot):
         for fileName in fileNames:
             # The extension is taken from the lowercased file name.
             ext = os.path.splitext(fileName.lower())[1]
             if ext not in self.mediaExtensions:
                 continue
             mediaPath = os.path.abspath(os.path.join(curDir, fileName))
             print('Cutting media file', mediaPath)
             self.mc.cut_media(mediaPath)
Exemplo n.º 4
0
 def __init__(self, settingsDir='conf'):
     """
     Initialize the converter state, create the media cutter and load
     the POS rules from <corpus_dir>/conf/posRules.txt.

     settingsDir -- settings directory, handed over unchanged to
                    Txt2JSON.__init__.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.mc = MediaCutter(settings=self.corpusSettings)

     # Extension of the source files to be converted.
     self.srcExt = 'xml'
     # Id of last aligned segment.
     self.pID = 0

     # Participant ID -> dictionary of properties.
     self.participants = {}
     # Time labels (id -> {'n': number, 'time': time value}).
     self.tlis = {}
     # Ids of <seg> tags -> parallel IDs of corresponding sentences.
     self.seg2pID = {}

     # Word ID -> word object.
     self.wordsByID = {}
     # Morph ID -> (word ID, position in the word).
     self.morph2wordID = {}
     # Sequence of word/punctuation/incident IDs
     # (needed to understand ranges such as "w13 to inc2").
     self.wordIDseq = []

     self.glosses = set()
     self.posRules = {}
     posRulesPath = os.path.join(self.corpusSettings['corpus_dir'],
                                 'conf/posRules.txt')
     self.load_pos_rules(posRulesPath)
Exemplo n.º 5
0
 def __init__(self, settingsDir='conf_conversion'):
     """
     Initialize the converter state for eaf source files: load the
     speaker metadata, create the media cutter and reset all
     per-corpus bookkeeping containers.

     settingsDir -- settings directory, handed over unchanged to
                    Txt2JSON.__init__.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.speakerMeta = self.load_speaker_meta()
     self.mc = MediaCutter(settings=self.corpusSettings)

     self.srcExt = 'eaf'
     # Id of last aligned segment.
     self.pID = 0
     # Time labels.
     self.tlis = {}
     # Main tier ID -> participant ID.
     self.participants = {}
     self.glosses = set()

     # aID -> (contents, parent aID, tli1, tli2).
     self.segmentTree = {}
     # (aID, child tier type) -> [child aID].
     self.segmentChildren = {}

     # Span annotation tier type -> {tier ID -> [(tli1, tli2, contents)]}.
     self.spanAnnoTiers = {}
     # aID of a segment -> {span annotation tier ID -> contents}.
     self.alignedSpanAnnoTiers = {}

     # Names of additional word-level fields associated with some
     # analysis tiers.
     self.additionalWordFields = []
     # Segments (start_ms, end_ms) that should be beeped out, one list
     # per source file.
     self.privacySegments = {}

     # Reset to None first, then let set_ignore_tokens() run.
     self.rxIgnoreTokens = None
     self.set_ignore_tokens()

     # Filenames of media fragments referenced in the JSONs.
     self.usedMediaFiles = set()
Exemplo n.º 6
0
 def __init__(self, settingsDir='conf'):
     """
     Initialize the converter state for csv source files.

     settingsDir -- settings directory, handed over unchanged to
                    Txt2JSON.__init__.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     # Id of last aligned segment.
     self.pID = 0
     self.srcExt = 'csv'
 def __init__(self, settingsDir='conf'):
     """
     Initialize the converter state for json source files.

     settingsDir -- settings directory, handed over unchanged to
                    Txt2JSON.__init__.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     # Starts out as an empty set of glosses.
     self.glosses = set()
     self.srcExt = 'json'
Exemplo n.º 8
0
 def __init__(self, settingsDir='conf_conversion'):
     """
     Initialize the converter state for xml source files.

     settingsDir -- settings directory, handed over unchanged to
                    Txt2JSON.__init__.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     # Id of last aligned segment.
     self.pID = 0
     self.srcExt = 'xml'
Exemplo n.º 9
0
 def __init__(self, settingsDir='conf_conversion'):
     """
     Initialize the converter state for yaml source files and compile
     the two punctuation regexes taken from the corpus settings.

     settingsDir -- settings directory, handed over unchanged to
                    Txt2JSON.__init__.

     Both 'punc_space_before' and 'punc_space_after' must be present
     in the corpus settings; they are read without a fallback.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     settings = self.corpusSettings
     self.rxPuncSpaceBefore = re.compile(settings['punc_space_before'])
     self.rxPuncSpaceAfter = re.compile(settings['punc_space_after'])
     # Id of last aligned segment.
     self.pID = 0
     self.srcExt = 'yaml'