def create_words_tier_from_string(string):
    """
    Build a words tier from a plain string.

    The string is split with ``tokenize_string`` using
    ``whitespace_tokenizer``; every resulting token becomes one item in the
    returned tier.

    :param string: The text to tokenize.
    :return: A new ``Tier`` of type ``WORDS_TYPE`` with one ``Item`` per token.
    """
    words_tier = Tier(type=WORDS_TYPE)

    # ask_item_id() is handed the tier itself, so each item is appended
    # before the next id is requested rather than building them in bulk.
    for word in tokenize_string(string, tokenizer=whitespace_tokenizer):
        words_tier.append(Item(id=ask_item_id(words_tier), text=word.value()))

    return words_tier
def parse_odin_inst(string, corpus=None, idnum=None):
    """
    Method to parse and create an IGT instance from odin-style text.

    The text must contain a header of the form
    ``doc_id=<id> <start> <stop> <tag types>`` followed by a newline, and
    zero or more lines of the form ``line=<n> tag=<tag>:<text>``.

    :param string: The odin-style text for a single instance.
    :param corpus: Optional corpus that is asked for the next instance id
                   (only consulted when ``idnum`` is None).
    :param idnum: Optional number used to generate the instance id; takes
                  precedence over ``corpus``.
    :return: The new ``Igt``, holding a raw tier, after ``basic_processing``
             has been run on it.
    :raises AttributeError: If no ``doc_id=...`` header is found in
                            ``string`` (the header search returns None).
    """
    # Start by looking for the doc_id, and the line range.
    # Raw strings keep the regex escapes (\S, \s) from being read as
    # (invalid) string escapes.
    doc_re = re.search(r'doc_id=(\S+)\s([0-9]+)\s([0-9]+)\s(.*)\n', string)
    docid, lnstart, lnstop, tagtypes = doc_re.groups()

    # Pick the instance id: explicit number first, then the corpus, and
    # finally a fresh corpus as a fallback.
    # NOTE(review): `elif corpus` is a truthiness test, so a corpus that
    # evaluates false (possibly an empty one) takes the fallback - confirm.
    if idnum is not None:
        inst_id = gen_item_id('i', idnum)
    elif corpus:
        inst_id = corpus.askIgtId()
    else:
        corpus = XigtCorpus()
        inst_id = 'i{}'.format(len(corpus))

    inst = Igt(id=inst_id,
               attributes={'doc-id': docid,
                           'line-range': '%s %s' % (lnstart, lnstop),
                           'tag-types': tagtypes})

    # Now, find all the lines
    lines = re.findall(r'line=([0-9]+)\stag=(\S+):(.*)\n?', string)

    # Create a raw tier with one item per matched line.
    rt = Tier(id=RAW_ID, type=ODIN_TIER_TYPE,
              attributes={STATE_ATTRIBUTE: RAW_STATE}, igt=inst)

    for lineno, linetag, linetxt in lines:
        raw_item = Item(id=ask_item_id(rt), text=linetxt,
                        attributes={'tag': linetag, 'line': lineno},
                        tier=rt)
        rt.append(raw_item)

    inst.append(rt)
    basic_processing(inst)

    return inst
def _naacl_alignment_pairs(aln_lines, trans_offset, gloss_offset):
    """
    Yield ``(trans_index, gloss_index)`` pairs parsed from alignment lines.

    Each line is split on whitespace; the first field is the gloss index and
    the second a comma-separated list of translation indices.

    :param aln_lines: Lines taken from the alignment section of an instance.
    :param trans_offset: When true, every translation index is lowered by one.
    :param gloss_offset: When true, every gloss index is lowered by one.
    :raises ValueError: If a line has fewer than two fields, or a field that
                        should be an index is not an integer.
    """
    for line in aln_lines:
        gloss_s, trans_s = line.split()[0:2]

        # Gloss fields containing a '.' are skipped entirely.
        if '.' in gloss_s:
            continue

        gloss_i = int(gloss_s)
        # Shift the gloss index once per line. Doing this inside the
        # translation-token loop would lower it again for every token when
        # one gloss aligns with several translation words.
        if gloss_offset:
            gloss_i -= 1

        for trans_token in trans_s.split(','):
            trans_i = int(trans_token)
            # A translation index of 0 produces no pair.
            if trans_i == 0:
                continue
            if trans_offset:
                trans_i -= 1
            yield (trans_i, gloss_i)


def naacl_to_xigt(naacl_path):
    """
    Convert the NAACL format to XIGT.

    Every instance found in the file gets a raw tier (language, gloss and
    translation lines), normalized tiers generated without cleaning, and -
    where they can be parsed - dependency tiers for the language and
    translation lines plus a manual bilingual alignment.

    :param naacl_path: Path of the NAACL-format file to read.
    :return: An ``XigtCorpus`` with one ``Igt`` per instance in the file.
    :raises AttributeError: If an instance lacks a ``Q6:``, ``Q3:`` or
                            ``Q5:`` delimited section.
    :raises ValueError: If fewer than three lines follow the first line of
                        an instance.
    """
    # Use a context manager so the file handle is closed promptly.
    with open(naacl_path, 'r') as naacl_file:
        content = naacl_file.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        inst = Igt(id='i{}'.format(len(xc)))

        # Lines 2-4 of the instance text are the language, gloss and
        # translation lines, in that order.
        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin',
                        attributes={STATE_ATTRIBUTE: RAW_STATE})
        tagged_lines = ((lang_raw, ODIN_LANG_TAG),
                        (gloss_raw, ODIN_GLOSS_TAG),
                        (trans_raw, ODIN_TRANS_TAG))
        for raw_text, odin_tag in tagged_lines:
            raw_tier.append(Item(id=ask_item_id(raw_tier), text=raw_text,
                                 attributes={ODIN_TAG_ATTRIBUTE: odin_tag}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang dependency structure: the text between the two "Q6:" markers,
        # minus 5 leading and 3 trailing lines.
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst),
                           parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            # Best effort: an instance whose tree cannot be built simply
            # gets no language dependency tier.
            pass

        # Eng dependency structure: the text between the two "Q3:" markers,
        # minus 2 leading and 3 trailing lines.
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst),
                           parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            # Best effort, as above.
            pass

        # Alignment: the text between the two "Q5:" markers, minus 4 leading
        # and 3 trailing lines.
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # Indices are shifted down by one for a line that starts with a space.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        # Created outside the try block so it is always bound when the
        # alignment is stored below.
        a = Alignment()
        try:
            for pair in _naacl_alignment_pairs(biling_aln_lines,
                                               trans_offset, gloss_offset):
                a.add(pair)
        except Exception:
            # Best effort: a malformed line stops parsing, and whatever was
            # collected so far is kept. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), a,
                                aln_method=INTENT_ALN_MANUAL)

    return xc
def raw_txt_to_inst(string, corpus=None, idnum=None):
    """
    Method to create an IGT instance from a raw three lines of text, assuming
    L-G-T.

    Four lines are also accepted, in which case the first line is tagged as
    the language line with an additional ``+FR`` suffix and the remaining
    three are treated as L-G-T.

    :param string: Newline-separated text of the instance.
    :param corpus: Optional corpus that is asked for the next instance id
                   (only consulted when ``idnum`` is None).
    :param idnum: Optional number used to generate the instance id; takes
                  precedence over ``corpus``.
    :return: A new ``Igt`` containing a single raw tier, one item per line.
    :raises RawTextParseError: If the text has fewer than three or more than
                               four lines, or if any line is blank.
    """
    lines = string.split('\n')
    if len(lines) < 3:
        raise RawTextParseError("Three lines are assumed for raw text. Instead got {}".format(len(lines)))

    # Pick the instance id: explicit number first, then the corpus, and
    # finally a fresh corpus as a fallback.
    if idnum is not None:
        inst_id = gen_item_id('i', idnum)
    elif corpus:
        inst_id = corpus.askIgtId()
    else:
        corpus = XigtCorpus()
        inst_id = 'i{}'.format(len(corpus))

    inst = Igt(id=inst_id)
    rt = Tier(id=RAW_ID, type=ODIN_TIER_TYPE,
              attributes={STATE_ATTRIBUTE: RAW_STATE}, igt=inst)

    # Tag layout per supported line count. Two-line input never reaches this
    # point (it is rejected above), so only three and four lines are listed.
    tags_by_line_count = {
        # If we have four lines, assume that the first is native orthography.
        4: (ODIN_LANG_TAG + '+FR', ODIN_LANG_TAG, ODIN_GLOSS_TAG, ODIN_TRANS_TAG),
        3: (ODIN_LANG_TAG, ODIN_GLOSS_TAG, ODIN_TRANS_TAG),
    }
    linetags = tags_by_line_count.get(len(lines))
    if linetags is None:
        raise RawTextParseError("Unknown number of lines...")

    for linetag, line in zip(linetags, lines):
        if not line.strip():
            raise RawTextParseError("The {} line is empty: {}".format(linetag, line))

        rt.append(Item(id=ask_item_id(rt), text=line, attributes={'tag': linetag}))

    inst.append(rt)
    return inst