def raw_txt_to_xc(txt):
    """
    Build a XIGT corpus from raw text whose instances are separated by
    blank (empty or whitespace-only) lines.

    The text is first passed through ``replace_invalid_xml``. It is then
    split into blocks of consecutive non-blank lines; each block is handed
    to ``raw_txt_to_inst`` and the result is appended to the corpus.

    :param txt: raw text containing zero or more instances
    :rtype: XigtCorpus
    """
    print("Creating XIGT corpus from raw text...")
    xc = XigtCorpus()

    PARSELOG.debug("Replacing invalid XML...")
    data = replace_invalid_xml(txt)

    instances = []
    cur_lines = []

    for line in data.split('\n'):
        if line.strip():
            cur_lines.append(line)
        elif cur_lines:
            # A blank line closes the block collected so far. Blank lines
            # with nothing collected (leading blanks, runs of blanks, the
            # empty tail after a final newline) are ignored so that no
            # empty instance text is produced.
            instances.append('\n'.join(cur_lines))
            cur_lines = []

    # Flush the last block when the text does not end with a blank line.
    if cur_lines:
        instances.append('\n'.join(cur_lines))

    for instance in instances:
        xc.append(raw_txt_to_inst(instance, corpus=xc))

    print("{} instances parsed.".format(len(xc)))
    return xc
def parse_odin_xc(text, require_trans=True, require_gloss=True, require_lang=True, limit=None):
    """
    Read in an odin-style text file's contents to create the xigt corpus.

    Instances are located as spans that start with ``doc_id=`` and run up
    to the next blank line. Each span is parsed with ``parse_odin_inst``
    and appended to the corpus only if it satisfies the requested
    requirements.

    :param text: the full odin-style text
    :param require_trans: only keep instances for which ``trans_lines``
        returns a truthy value
    :param require_gloss: only keep instances with a truthy ``gloss``
    :param require_lang: only keep instances with a truthy ``lang``
    :param limit: stop once this many instances have been parsed; the
        count includes instances that were parsed but then rejected by the
        requirements, and excludes instances skipped because of a
        ``GlossLangAlignException``. ``None`` means no limit.
    :rtype: XigtCorpus
    """
    # Initialize the corpus
    xc = XigtCorpus()

    # Replace invalid characters...
    data = replace_invalid_xml(text)

    # Split the text into instances. The pattern needs a blank line to
    # terminate each instance, so a terminator is appended for the search;
    # otherwise the final instance would be dropped whenever the text does
    # not end with a blank line. Earlier matches are unaffected because
    # the match is non-greedy.
    inst_txts = re.findall(r'doc_id=[\s\S]+?\n\n', data + '\n\n')

    #=======================================================================
    # Begin parsing...
    #=======================================================================

    parsed = 0
    PARSELOG.info('Beginning parse')
    for inst_num, inst_txt in enumerate(inst_txts):

        if parsed % 250 == 0:
            PARSELOG.info('Parsing instance %d...' % parsed)

        # Handle the requirement for 1_to_1 alignment.
        try:
            i = parse_odin_inst(inst_txt, corpus=xc, idnum=inst_num)
        except GlossLangAlignException:
            PARSELOG.warning('Gloss and language could not be automatically aligned for instance "%s". Skipping' % gen_item_id('i', inst_num))
            continue

        # Try to get the translation line. ---------------------------------
        try:
            hastrans = trans_lines(i)
        except NoTransLineException as ntle:
            PARSELOG.info(ntle)
            hastrans = False

        # Try to get the gloss line. --------------------------------------
        try:
            hasgloss = i.gloss
        except NoGlossLineException as ngle:
            PARSELOG.info(ngle)
            hasgloss = False

        # Try to get the language line. ------------------------------------
        try:
            haslang = i.lang
        except NoLangLineException as nlle:
            PARSELOG.info(nlle)
            haslang = False

        parsed += 1

        # A constraint holds when the line is present or is not required.
        trans_constraint = hastrans or not require_trans
        gloss_constraint = hasgloss or not require_gloss
        lang_constraint = haslang or not require_lang

        if trans_constraint and gloss_constraint and lang_constraint:
            xc.append(i)
        else:
            PARSELOG.info('Requirements for instance "%s" were not satisfied. Skipping' % i.id)

        # If we have reached the limit of instances that have been requested,
        # stop processing.
        if limit is not None and limit == parsed:
            break

    # Return the corpus
    return xc