Exemplo n.º 1
0
 def __init__(self, settingsDir='conf'):
     """
     Run the Txt2JSON initialization with the given settings
     directory, then set up the fields used by this convertor.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)

     # Extension of the source files to be converted
     self.srcExt = 'exb'
     self.mc = MediaCutter(settings=self.corpusSettings)

     # ID of the last aligned segment
     self.pID = 0
     # Time labels: id -> {'n': number, 'time': time value}
     self.tlis = {}
     self.glosses = set()
Exemplo n.º 2
0
 def __init__(self, settingsDir='conf'):
     """
     Run the Txt2JSON initialization with the given settings
     directory and compile the punctuation regexes taken from
     the corpus settings.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)

     # Each pattern is read and compiled in turn, 'before' first.
     spaceBeforePattern = self.corpusSettings['punc_space_before']
     self.rxPuncSpaceBefore = re.compile(spaceBeforePattern)
     spaceAfterPattern = self.corpusSettings['punc_space_after']
     self.rxPuncSpaceAfter = re.compile(spaceAfterPattern)

     # ID of the last aligned segment
     self.pID = 0
     self.srcExt = 'yaml'
Exemplo n.º 3
0
 def __init__(self, settingsDir='conf'):
     """
     Run the Txt2JSON initialization with the given settings
     directory, prepare the rule containers and call load_rules().
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)

     # Everything in this group is initialized before load_rules()
     # is called.
     self.posRules = {}
     self.grammRules = []
     self.glosses = []
     # ID of the last aligned segment
     self.pID = 0
     self.srcExt = 'xml'
     self.load_rules()

     # Set only after load_rules() has returned.
     self.rxStemGlosses = re.compile('^$')
     # All POS tags encountered in the XML
     self.POSTags = set()
Exemplo n.º 4
0
 def __init__(self, settingsDir='conf'):
     """
     Run the Txt2JSON initialization with the given settings
     directory, load the speaker metadata and set up the fields
     used by this convertor.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.speakerMeta = self.load_speaker_meta()

     self.srcExt = 'eaf'
     self.mc = MediaCutter(settings=self.corpusSettings)

     # ID of the last aligned segment
     self.pID = 0
     # Time labels
     self.tlis = {}
     self.glosses = set()

     # Main tier ID -> participant ID
     self.participants = {}
     # (aID, child tier type) -> [child aID]
     self.segmentChildren = {}
     # aID -> (contents, parent aID, tli1, tli2)
     self.segmentTree = {}
Exemplo n.º 5
0
 def __init__(self, settingsDir='conf'):
     """
     Run the Txt2JSON initialization with the given settings
     directory, set up the fields used by this convertor and
     load the POS rules from conf/posRules.txt in the corpus
     directory.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)

     # Extension of the source files to be converted
     self.srcExt = 'xml'
     self.mc = MediaCutter(settings=self.corpusSettings)

     # ID of the last aligned segment
     self.pID = 0
     # IDs of <seg> tags -> parallel IDs of corresponding sentences
     self.seg2pID = {}
     # Time labels: id -> {'n': number, 'time': time value}
     self.tlis = {}
     # Participant ID -> dictionary of properties
     self.participants = {}

     # Word ID -> word object
     self.wordsByID = {}
     # Morph ID -> (word ID, position in the word)
     self.morph2wordID = {}
     # Sequence of word/punctuation/incident IDs
     # (needed to understand ranges such as "w13 to inc2")
     self.wordIDseq = []
     self.glosses = set()

     self.posRules = {}
     # NOTE(review): the 'conf' directory is hard-coded in this path
     # rather than taken from settingsDir -- confirm this is intended.
     posRulesPath = os.path.join(self.corpusSettings['corpus_dir'],
                                 'conf/posRules.txt')
     self.load_pos_rules(posRulesPath)
Exemplo n.º 6
0
 def __init__(self, settingsDir='conf_conversion'):
     """
     Run the Txt2JSON initialization with the given settings
     directory, prepare the rule containers, call load_rules()
     and read the optional gloss language settings.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)

     # Everything in this group is initialized before load_rules()
     # is called.
     self.posRules = {}
     self.grammRules = []
     self.glosses = []
     # ID of the last aligned segment
     self.pID = 0
     self.srcExt = 'xml'
     self.load_rules()

     # Set only after load_rules() has returned.
     self.rxStemGlosses = re.compile('^$')
     # All POS tags encountered in the XML
     self.POSTags = set()

     # Optional corpus settings; the second argument is the value
     # used when the key is absent.
     self.mainGlossLang = self.corpusSettings.get(
         'main_gloss_language', 'en')
     self.badAnalysisLangs = self.corpusSettings.get(
         'bad_analysis_languages', [])
Exemplo n.º 7
0
 def __init__(self, settingsDir='conf_conversion'):
     """
     Run the Txt2JSON initialization with the given settings
     directory, load the speaker metadata, set up the fields used
     by this convertor and call set_ignore_tokens().
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.speakerMeta = self.load_speaker_meta()

     self.srcExt = 'eaf'
     self.mc = MediaCutter(settings=self.corpusSettings)

     # ID of the last aligned segment
     self.pID = 0
     # Time labels
     self.tlis = {}
     self.glosses = set()
     # Names of additional word-level fields associated with some
     # analysis tiers
     self.additionalWordFields = []

     # Main tier ID -> participant ID
     self.participants = {}
     # (aID, child tier type) -> [child aID]
     self.segmentChildren = {}
     # aID -> (contents, parent aID, tli1, tli2)
     self.segmentTree = {}

     # aID of a segment -> {span annotation tier ID -> contents}
     self.alignedSpanAnnoTiers = {}
     # Span annotation tier type
     #   -> {tier ID -> [(tli1, tli2, contents)]}
     self.spanAnnoTiers = {}

     # Segments (start_ms, end_ms) that should be beeped out,
     # one list per source file
     self.privacySegments = {}

     # The attribute is reset to None right before
     # set_ignore_tokens() is called.
     self.rxIgnoreTokens = None
     self.set_ignore_tokens()

     # Filenames of media fragments referenced in the JSONs
     self.usedMediaFiles = set()
Exemplo n.º 8
0
 def __init__(self, settingsDir='conf'):
     """
     Run the Txt2JSON initialization with the given settings
     directory and set the fields used by this convertor.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     # ID of the last aligned segment
     self.pID = 0
     self.srcExt = 'csv'
 def __init__(self, settingsDir='conf'):
     """
     Run the Txt2JSON initialization with the given settings
     directory and set the fields used by this convertor.
     """
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.glosses = set()
     self.srcExt = 'json'
Exemplo n.º 10
0
 def __init__(self, settingsDir='conf_conversion'):
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.srcExt = 'xml'
     self.pID = 0        # id of last aligned segment