Exemplos de Txt2JSON em Python, exemplos de txt2json.Txt2JSON em Python

Exemplo n.º 1

0

Exibir arquivo

 def process_corpus(self, cutMedia=True):
     """
     Take every eaf file from the source directory subtree, turn it
     into a parsed json and store it in the target directory.
     """
     Txt2JSON.process_corpus(self)
     if not cutMedia:
         return
     for path, dirs, files in os.walk(
             os.path.join(self.corpusSettings['corpus_dir'], self.srcExt)):
         for fname in files:
             print(fname)
             fileExt = os.path.splitext(fname.lower())[1]
             if fileExt in self.mediaExtensions:
                 fname = os.path.abspath(os.path.join(path, fname))
                 print('Cutting media file', fname)
                 self.mc.cut_media(fname)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: xml_flex2json.py Projeto: timarkh/tsakorpus_wc

 def __init__(self, settingsDir='conf_conversion'):
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.srcExt = 'xml'
     self.pID = 0  # id of last aligned segment
     self.glosses = []
     self.grammRules = []
     self.posRules = {}
     self.load_rules()
     self.POSTags = set()  # All POS tags encountered in the XML
     self.rxStemGlosses = re.compile('^$')
     self.mainGlossLang = 'en'
     self.badAnalysisLangs = []
     if 'main_gloss_language' in self.corpusSettings:
         self.mainGlossLang = self.corpusSettings['main_gloss_language']
     if 'bad_analysis_languages' in self.corpusSettings:
         self.badAnalysisLangs = self.corpusSettings[
             'bad_analysis_languages']

Exemplo n.º 3

0

Exibir arquivo

 def process_corpus(self):
     """
     Take every Exmaralda file from the source directory subtree, turn it
     into a parsed json and store it in the target directory.
     Split all the corpus media files into overlapping chunks of
     small duration.
     This is the main function of the class.
     """
     Txt2JSON.process_corpus(self)
     for path, dirs, files in os.walk(
             os.path.join(self.corpusSettings['corpus_dir'], self.srcExt)):
         for fname in files:
             fileExt = os.path.splitext(fname.lower())[1]
             if fileExt in self.mediaExtensions:
                 fname = os.path.abspath(os.path.join(path, fname))
                 print('Cutting media file', fname)
                 self.mc.cut_media(fname)

Exemplo n.º 4

0

Exibir arquivo

 def __init__(self, settingsDir='conf'):
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.mc = MediaCutter(settings=self.corpusSettings)
     self.srcExt = 'xml'  # extension of the source files to be converted
     self.participants = {}  # participant ID -> dictionary of properties
     self.tlis = {}  # time labels (id -> {'n': number, 'time': time value})
     self.wordsByID = {}  # word ID -> word object
     self.morph2wordID = {}  # morph ID -> (word ID, position in the word)
     self.pID = 0  # id of last aligned segment
     self.seg2pID = {
     }  # ids of <seg> tags -> parallel IDs of corresponding sentences
     self.wordIDseq = []  # sequence of word/punctuation/incident IDs
     # (needed to understand ranges such as "w13 to inc2")
     self.glosses = set()
     self.posRules = {}
     self.load_pos_rules(
         os.path.join(self.corpusSettings['corpus_dir'],
                      'conf/posRules.txt'))

Exemplo n.º 5

0

Exibir arquivo

Arquivo: eaf2json.py Projeto: timarkh/tsakorpus_wc

 def __init__(self, settingsDir='conf_conversion'):
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.speakerMeta = self.load_speaker_meta()
     self.mc = MediaCutter(settings=self.corpusSettings)
     self.srcExt = 'eaf'
     self.tlis = {}  # time labels
     self.pID = 0  # id of last aligned segment
     self.glosses = set()
     self.participants = {}  # main tier ID -> participant ID
     self.segmentTree = {}  # aID -> (contents, parent aID, tli1, tli2)
     self.segmentChildren = {}  # (aID, child tier type) -> [child aID]
     self.spanAnnoTiers = {}  # span annotation tier type -> {tier ID -> [(tli1, tli2, contents)}
     self.alignedSpanAnnoTiers = {}  # aID of a segment -> {span annotation tier ID -> contents}
     self.additionalWordFields = []  # names of additional word-level fields associated with some analysis tiers
     self.privacySegments = {}  # segments (start_ms, end_ms) that should be beeped out, one list per source file
     self.rxIgnoreTokens = None
     self.set_ignore_tokens()
     self.usedMediaFiles = set()  # filenames of media fragments referenced in the JSONs

Exemplo n.º 6

0

Exibir arquivo

Arquivo: img_csv2json.py Projeto: mansi-team/mansi_corpus

 def __init__(self, settingsDir='conf'):
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.srcExt = 'csv'
     self.pID = 0  # id of last aligned segment

Exemplo n.º 7

0

Exibir arquivo

Arquivo: social_networks2json.py Projeto: mansi-team/mansi_corpus

 def __init__(self, settingsDir='conf'):
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.srcExt = 'json'
     self.glosses = set()

Exemplo n.º 8

0

Exibir arquivo

 def __init__(self, settingsDir='conf_conversion'):
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.srcExt = 'xml'
     self.pID = 0        # id of last aligned segment

Exemplo n.º 9

0

Exibir arquivo

Arquivo: morphy_yaml2json.py Projeto: timarkh/tsakorpus_wc

 def __init__(self, settingsDir='conf_conversion'):
     Txt2JSON.__init__(self, settingsDir=settingsDir)
     self.rxPuncSpaceBefore = re.compile(self.corpusSettings['punc_space_before'])
     self.rxPuncSpaceAfter = re.compile(self.corpusSettings['punc_space_after'])
     self.srcExt = 'yaml'
     self.pID = 0        # id of last aligned segment