def process_corpus(self, cutMedia=True):
    """
    Convert every eaf file of the source directory subtree into a parsed
    json stored in the target directory (this step is delegated to
    Txt2JSON.process_corpus), then optionally cut the media files.

    cutMedia -- if false, return right after the conversion step.
    Otherwise walk <corpus_dir>/<srcExt> and hand every file whose
    lowercased extension is in self.mediaExtensions to
    self.mc.cut_media().
    """
    Txt2JSON.process_corpus(self)
    if cutMedia:
        srcRoot = os.path.join(self.corpusSettings['corpus_dir'],
                               self.srcExt)
        for curDir, _subdirs, fileNames in os.walk(srcRoot):
            for shortName in fileNames:
                # Every file name is echoed, not only the media ones.
                print(shortName)
                extension = os.path.splitext(shortName.lower())[1]
                if extension not in self.mediaExtensions:
                    continue
                mediaPath = os.path.abspath(os.path.join(curDir, shortName))
                print('Cutting media file', mediaPath)
                self.mc.cut_media(mediaPath)
def __init__(self, settingsDir='conf_conversion'):
    """
    Set up the converter for 'xml' source files.

    Runs the Txt2JSON initialisation with the given settings directory,
    resets the per-run containers, calls self.load_rules() and finally
    reads two optional corpus settings: 'main_gloss_language'
    (default 'en') and 'bad_analysis_languages' (default: empty list).
    """
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    self.srcExt = 'xml'
    # id of last aligned segment
    self.pID = 0
    self.glosses = []
    self.grammRules = []
    self.posRules = {}
    # The rule containers above exist before load_rules() runs.
    self.load_rules()
    # All POS tags encountered in the XML
    self.POSTags = set()
    # Pattern that matches only the empty string.
    self.rxStemGlosses = re.compile('^$')

    settings = self.corpusSettings
    self.mainGlossLang = (settings['main_gloss_language']
                          if 'main_gloss_language' in settings
                          else 'en')
    self.badAnalysisLangs = (settings['bad_analysis_languages']
                             if 'bad_analysis_languages' in settings
                             else [])
def process_corpus(self):
    """
    Main function of the class.

    First convert every Exmaralda file of the source directory subtree
    into a parsed json stored in the target directory (delegated to
    Txt2JSON.process_corpus). Then walk <corpus_dir>/<srcExt> and pass
    every file whose lowercased extension is in self.mediaExtensions to
    self.mc.cut_media(), which splits the corpus media files into
    overlapping chunks of small duration.
    """
    Txt2JSON.process_corpus(self)
    srcRoot = os.path.join(self.corpusSettings['corpus_dir'], self.srcExt)
    for curDir, _subdirs, fileNames in os.walk(srcRoot):
        for shortName in fileNames:
            extension = os.path.splitext(shortName.lower())[1]
            if extension not in self.mediaExtensions:
                continue
            mediaPath = os.path.abspath(os.path.join(curDir, shortName))
            print('Cutting media file', mediaPath)
            self.mc.cut_media(mediaPath)
def __init__(self, settingsDir='conf'):
    """
    Set up the converter for 'xml' source files.

    Runs the Txt2JSON initialisation, creates the MediaCutter from the
    corpus settings, resets all lookup tables used during conversion and
    loads the POS rules from <corpus_dir>/conf/posRules.txt.
    """
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    self.mc = MediaCutter(settings=self.corpusSettings)

    # extension of the source files to be converted
    self.srcExt = 'xml'
    # participant ID -> dictionary of properties
    self.participants = {}
    # time labels (id -> {'n': number, 'time': time value})
    self.tlis = {}
    # word ID -> word object
    self.wordsByID = {}
    # morph ID -> (word ID, position in the word)
    self.morph2wordID = {}
    # id of last aligned segment
    self.pID = 0
    # ids of <seg> tags -> parallel IDs of corresponding sentences
    self.seg2pID = {}
    # sequence of word/punctuation/incident IDs
    # (needed to understand ranges such as "w13 to inc2")
    self.wordIDseq = []
    self.glosses = set()
    self.posRules = {}

    # NOTE(review): the 'conf' folder is hard-coded here and does not
    # follow the settingsDir argument -- confirm this is intended.
    posRulesPath = os.path.join(self.corpusSettings['corpus_dir'],
                                'conf/posRules.txt')
    self.load_pos_rules(posRulesPath)
def __init__(self, settingsDir='conf_conversion'):
    """
    Set up the converter for 'eaf' source files.

    Runs the Txt2JSON initialisation, loads the speaker metadata,
    creates the MediaCutter from the corpus settings, resets all lookup
    tables used during conversion and calls self.set_ignore_tokens().
    """
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    self.speakerMeta = self.load_speaker_meta()
    self.mc = MediaCutter(settings=self.corpusSettings)
    self.srcExt = 'eaf'

    # time labels
    self.tlis = {}
    # id of last aligned segment
    self.pID = 0
    self.glosses = set()
    # main tier ID -> participant ID
    self.participants = {}
    # aID -> (contents, parent aID, tli1, tli2)
    self.segmentTree = {}
    # (aID, child tier type) -> [child aID]
    self.segmentChildren = {}
    # span annotation tier type -> {tier ID -> [(tli1, tli2, contents)]}
    self.spanAnnoTiers = {}
    # aID of a segment -> {span annotation tier ID -> contents}
    self.alignedSpanAnnoTiers = {}
    # names of additional word-level fields associated with some
    # analysis tiers
    self.additionalWordFields = []
    # segments (start_ms, end_ms) that should be beeped out, one list
    # per source file
    self.privacySegments = {}

    # The attribute is reset to None before set_ignore_tokens() runs.
    self.rxIgnoreTokens = None
    self.set_ignore_tokens()

    # filenames of media fragments referenced in the JSONs
    self.usedMediaFiles = set()
def __init__(self, settingsDir='conf'):
    """
    Set up the converter for 'csv' source files: run the Txt2JSON
    initialisation with the given settings directory and reset the
    segment counter.
    """
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    # id of last aligned segment
    self.pID = 0
    self.srcExt = 'csv'
def __init__(self, settingsDir='conf'):
    """
    Set up the converter for 'json' source files: run the Txt2JSON
    initialisation with the given settings directory and start with an
    empty set of glosses.
    """
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    self.glosses = set()
    self.srcExt = 'json'
def __init__(self, settingsDir='conf_conversion'):
    """
    Set up the converter for 'xml' source files: run the Txt2JSON
    initialisation with the given settings directory and reset the
    segment counter.
    """
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    # id of last aligned segment
    self.pID = 0
    self.srcExt = 'xml'
def __init__(self, settingsDir='conf_conversion'):
    """
    Set up the converter for 'yaml' source files.

    Runs the Txt2JSON initialisation, then compiles the two punctuation
    patterns stored in the corpus settings under 'punc_space_before' and
    'punc_space_after' (both keys are required) and resets the segment
    counter.
    """
    Txt2JSON.__init__(self, settingsDir=settingsDir)
    settings = self.corpusSettings
    # Compiled one after the other, in this order.
    patternBefore = settings['punc_space_before']
    self.rxPuncSpaceBefore = re.compile(patternBefore)
    patternAfter = settings['punc_space_after']
    self.rxPuncSpaceAfter = re.compile(patternAfter)
    self.srcExt = 'yaml'
    # id of last aligned segment
    self.pID = 0