def same_type_different_tiers_test(self):
    """
    Request an id twice for a 'pos' tier aligned to 'tw'.

    The first request is expected to yield 'tw-pos'. After a tier with that id has
    been appended to the instance, the second request is expected to yield
    'tw-pos_b'.
    """
    # Both requests use exactly the same keyword options.
    id_options = {'tier_type': 'pos', 'alignment': 'tw'}

    first_id = gen_tier_id(self.i, 'pos', **id_options)
    self.assertEqual(first_id, 'tw-pos')

    # Occupy the first id so that the next request has to differ.
    self.i.append(Tier(id=first_id, type='pos', alignment='tw'))

    second_id = gen_tier_id(self.i, 'pos', **id_options)
    self.assertEqual(second_id, 'tw-pos_b')
def create_text_tier_from_lines(inst, lines, id_base, state):
    """
    Build a text tier of type ODIN_TYPE whose items are created from ``lines``.

    Every entry of ``lines`` must be dict-like and contain the keys 'text' and
    'tag'. The optional keys 'labels', 'judgment' and 'lineno' are read as well.
    The new tier is returned; this function does not append it to ``inst``.

    :param inst: instance handed to gen_tier_id when generating the tier id
    :param lines: the line entries to turn into items
    :type lines: list[dict]
    :param id_base: base string handed to gen_tier_id
    :param state: value stored on the tier under STATE_ATTRIBUTE
    :return: the newly created tier, holding one item per entry of ``lines``
    :raises Exception: if an entry has no ``get`` attribute or lacks 'text' or 'tag'
    """
    # -------------------------------------------
    # 1) Create the tier that will hold the items.
    new_tier = Tier(id=gen_tier_id(inst, id_base),
                    type=ODIN_TYPE,
                    attributes={STATE_ATTRIBUTE: state})

    # -------------------------------------------
    # 2) Turn each entry into an item on that tier.
    for entry in lines:

        # Reject anything that is not dict-like with both required keys.
        if not (hasattr(entry, 'get') and 'text' in entry and 'tag' in entry):
            raise Exception("When constructing tier from lines, must be a list of dicts with keys 'text' and 'tag'.")

        # The tag string is the tag followed by the labels (if any), joined by '+'.
        tag_parts = []
        primary_tag = entry.get('tag')
        if primary_tag is not None:
            tag_parts.append(primary_tag)
        labels = entry.get('labels')
        if labels:
            tag_parts.append(labels)

        item_attrs = {ODIN_TAG_ATTRIBUTE: '+'.join(tag_parts)}

        # The judgment is stored whenever it is present, even if it is empty.
        judgment = entry.get('judgment')
        if judgment is not None:
            item_attrs[ODIN_JUDGMENT_ATTRIBUTE] = judgment

        # The line number is only stored when it is truthy.
        lineno = entry.get('lineno')
        if lineno:
            item_attrs['line'] = lineno

        # The item id is derived from the tier id and the current item count.
        new_item = Item(id=gen_item_id(new_tier.id, len(new_tier)),
                        attributes=item_attrs,
                        text=entry.get('text'))
        new_tier.append(new_item)

    return new_tier
def naacl_to_xigt(naacl_path):
    """
    Convert a file in the NAACL format to a XIGT corpus.

    For every instance found in the file this:

    1. builds a raw 'odin' tier from the three lines following the first line of
       the instance (language, gloss and translation, in that order),
    2. generates the normalized tier without cleaning,
    3. tries to add a dependency tier for the language line (the "Q6" section)
       and for the translation line (the "Q3" section),
    4. adds the manual bilingual alignment read from the "Q5" section.

    Dependency sections that fail to parse are skipped. A malformed alignment
    line stops alignment parsing for that instance, and whatever pairs were
    collected up to that point are still used.

    :param naacl_path: path of the NAACL-format text file to read
    :return: a XigtCorpus holding one Igt per instance found in the file
    """
    # Read everything at once; the context manager closes the file handle.
    with open(naacl_path, 'r') as naacl_file:
        content = naacl_file.read()

    # First, collect all the instances.
    instances = re.findall(r'Igt_id[\s\S]+?Q6.*Answer', content)

    xc = XigtCorpus()

    for instance_txt in instances:
        # Ids are assigned sequentially rather than taken from the Igt_id field.
        inst = Igt(id='i{}'.format(len(xc)))

        lang_raw, gloss_raw, trans_raw = instance_txt.split('\n')[1:4]

        # Now, create the raw tier...
        raw_tier = Tier(id=gen_tier_id(inst, 'r'), type='odin',
                        attributes={STATE_ATTRIBUTE: RAW_STATE})
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=lang_raw,
                             attributes={ODIN_TAG_ATTRIBUTE: ODIN_LANG_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=gloss_raw,
                             attributes={ODIN_TAG_ATTRIBUTE: ODIN_GLOSS_TAG}))
        raw_tier.append(Item(id=ask_item_id(raw_tier), text=trans_raw,
                             attributes={ODIN_TAG_ATTRIBUTE: ODIN_TRANS_TAG}))

        inst.append(raw_tier)
        xc.append(inst)

        # Generate the clean/normal tiers, but without any cleaning.
        generate_normal_tier(inst, clean=False)

        # Lang Dependency representation handling...
        # NOTE(review): the [5:-3] slice presumably drops header/footer lines of
        # the Q6 block -- confirm against the file format.
        lang_ds_str = re.search(r'Q6:([\s\S]+?)Q6:', instance_txt).group(1)
        lang_ds_lines = lang_ds_str.split('\n')[5:-3]

        try:
            lang_dt = parse_naacl_dep(lang(inst), lang_ds_lines)
            create_dt_tier(inst, lang_dt, lang(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError):
            # Best effort: leave the instance without a language dependency tier.
            pass

        # Eng DS handling...
        # NOTE(review): the [2:-3] slice presumably drops header/footer lines of
        # the Q3 block -- confirm against the file format.
        eng_ds_str = re.search(r'Q3:([\s\S]+?)Q3:', instance_txt).group(1)
        eng_ds_lines = eng_ds_str.split('\n')[2:-3]

        try:
            eng_dt = parse_naacl_dep(trans(inst), eng_ds_lines)
            create_dt_tier(inst, eng_dt, trans(inst), parse_method=INTENT_POS_MANUAL)
        except (TreeError, IndexError, ValueError):
            # Best effort: leave the instance without a translation dependency tier.
            pass

        # Add Alignment...
        biling_aln_str = re.search(r'Q5:([\s\S]+?)Q5:', instance_txt).group(1)
        biling_aln_lines = biling_aln_str.split('\n')[4:-3]

        # A raw line that starts with a space has its indices shifted down by one.
        trans_offset = trans_raw.startswith(' ')
        gloss_offset = gloss_raw.startswith(' ')

        aln = Alignment()
        try:
            for line in biling_aln_lines:
                gloss_s, trans_s = line.split()[0:2]

                # Gloss indices containing a '.' are not aligned.
                if '.' in gloss_s:
                    continue

                # Apply the gloss offset exactly once per line. Doing it inside
                # the loop below would shift the gloss index again for every
                # additional translation token it is aligned with.
                gloss_i = int(gloss_s)
                if gloss_offset:
                    gloss_i -= 1

                for trans_token in trans_s.split(','):
                    trans_i = int(trans_token)

                    # A translation index of 0 is skipped (no pair is added).
                    if trans_i == 0:
                        continue

                    if trans_offset:
                        trans_i -= 1
                    aln.add((trans_i, gloss_i))
        except Exception:
            # Deliberately best effort: a malformed line ends alignment parsing
            # for this instance and the pairs collected so far are kept. Unlike
            # a bare ``except`` this lets KeyboardInterrupt/SystemExit through.
            pass

        set_bilingual_alignment(inst, trans(inst), gloss(inst), aln,
                                aln_method=INTENT_ALN_MANUAL)

    return xc
def unique_tier_test(self):
    """
    Generate an id for a 'words' tier aligned to 't' with no_hyphenate=True.

    The generated id is only printed; this test makes no assertions.
    """
    id_options = {'tier_type': 'words', 'alignment': 't', 'no_hyphenate': True}
    print(gen_tier_id(self.i, 'w', **id_options))