def write_instances(instance_list, out_path, type, overwrite=False):
    if os.path.exists(out_path) and not overwrite:
        SPLIT_LOG.error('File "{}" already exists and overwrite flag not set. Skipping!'.format(out_path))
        return
    else:
        # Create the directory if need be
        if os.path.dirname(out_path):
            os.makedirs(os.path.dirname(out_path), exist_ok=True)

        num_sents = len(instance_list)
        if num_sents > 0:
            xc = XigtCorpus()
            for i, inst in enumerate(instance_list):
                # inst.id = 'i{}'.format(i)
                xc.append(inst)

            print("Writing {} instances to {}...".format(num_sents, out_path))
            with open(out_path, 'w', encoding='utf-8') as f:
                sort_corpus(xc)
                xigtxml.dump(f, xc)
        else:
            SPLIT_LOG.warning("No instances allocated for {}. Skipping file.".format(type))
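# Usage sketch (illustrative, not from the original module): load a corpus
# with xigt's XML codec, split the instances 80/20, and write each split via
# write_instances() above. The corpus path, output paths, and split ratio
# are hypothetical.
from xigt.codecs import xigtxml

with open('corpus.xml', 'r', encoding='utf-8') as f:
    all_insts = list(xigtxml.load(f))
cut = int(len(all_insts) * 0.8)
write_instances(all_insts[:cut], 'splits/train.xml', 'train', overwrite=True)
write_instances(all_insts[cut:], 'splits/dev.xml', 'dev', overwrite=True)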
def separate_tiers(args):
    tiers = set(args.tiers)
    # assuming XML for now
    with open(args.infile, 'r') as instream:
        src_xc = xigtxml.load(instream)
        sep_xc = XigtCorpus(attributes=src_xc.attributes,
                            metadata=src_xc.metadata)
        for igt in src_xc.igts:
            sep_xc.add(
                Igt(id=igt.id,
                    type=igt.type,
                    attributes=igt.attributes,
                    metadata=igt.metadata,
                    tiers=[t for t in igt.tiers if t.type in tiers])
            )
    with open(args.outfile, 'w') as outstream:
        xigtxml.dump(outstream, sep_xc)

    if not args.remainder:
        return

    with open(args.infile, 'r') as instream:
        src_xc = xigtxml.load(instream)
        rem_xc = XigtCorpus(attributes=src_xc.attributes,
                            metadata=src_xc.metadata)
        for igt in src_xc.igts:
            rem_xc.add(
                Igt(id=igt.id,
                    type=igt.type,
                    attributes=igt.attributes,
                    metadata=igt.metadata,
                    tiers=[t for t in igt.tiers if t.type not in tiers])
            )
    with open(args.remainder, 'w') as outstream:
        xigtxml.dump(outstream, rem_xc)
def filter_corpus(filelist, outpath, **kwargs):
    require_lang = kwargs.get('require_lang', False)
    require_gloss = kwargs.get('require_gloss', False)
    require_trans = kwargs.get('require_trans', False)
    require_aln = kwargs.get('require_aln', False)
    require_gloss_pos = kwargs.get('require_gloss_pos', False)
    require_grammatical = kwargs.get('require_grammatical', False)
    max_instances = kwargs.get('max_instances', 0)

    xc, examined, failures, successes = do_filter(filelist, require_lang, require_gloss, require_trans,
                                                  require_aln, require_gloss_pos, require_grammatical,
                                                  max_instances)

    # Only create a file if there are some instances to create...
    if len(xc) > 0:
        # Make sure the directory exists that contains the output.
        if os.path.dirname(outpath):
            os.makedirs(os.path.dirname(outpath), exist_ok=True)

        with open(outpath, 'w', encoding='utf-8') as out_f:
            FILTER_LOG.log(1000, "{} instances processed, {} filtered out, {} remain.".format(examined, failures, successes))
            FILTER_LOG.log(1000, 'Writing remaining instances to file "{}"...'.format(os.path.basename(outpath)))
            xigtxml.dump(out_f, xc)
            FILTER_LOG.log(1000, "Success.")
    else:
        print("No instances remain after filtering. Skipping.")
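# Usage sketch (hypothetical paths): keep only instances that have lang,
# gloss, and trans lines plus gloss/lang alignment. The keyword names
# mirror the kwargs read at the top of filter_corpus().
filter_corpus(['corpora/odin-deu.xml'],
              'filtered/odin-deu-filtered.xml',
              require_lang=True,
              require_gloss=True,
              require_trans=True,
              require_aln=True)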
def _xigt_import(infile, outfile, options):
    with open(infile, 'r') as in_fh, open(outfile, 'w') as out_fh:
        igts = odin_igts(in_fh, options)
        xc = XigtCorpus(
            igts=igts,
            nsmap=_nsmap,
            mode='transient'
        )
        xigtxml.dump(out_fh, xc)
def write(out_fn, fn_idx):
    xc = XigtCorpus()
    for fn, igt_indices in fn_idx.items():
        # if possible, try to decode needed igts only and skip the rest
        in_xc = xigtxml.load(fn, mode='transient')
        # ignoring corpus-level metadata
        xc.extend(igt for i, igt in enumerate(in_xc) if i in igt_indices)
    # assume the nsmap of the first igt is the same for all
    if xc.igts:
        xc.nsmap = xc[0].nsmap
    xigtxml.dump(out_fn, xc)
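# Usage sketch: fn_idx maps each source file to the (0-based) positions of
# the IGTs to copy into the merged output. The file names are hypothetical.
fn_idx = {
    'part1.xml': {0, 2, 5},  # 1st, 3rd, and 6th IGTs of part1.xml
    'part2.xml': {1},        # 2nd IGT of part2.xml
}
write('merged.xml', fn_idx)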
def run(args):
    if args.infiles:
        for fn in args.infiles:
            logging.info('Normalizing {}'.format(fn))
            xc = xigtxml.load(fn, mode='full')
            normalize_corpus(xc)
            xigtxml.dump(fn, xc)
    else:
        xc = xigtxml.load(sys.stdin, mode='full')
        normalize_corpus(xc)
        print(xigtxml.dumps(xc))
def run(args):
    if args.infiles:
        for fn in args.infiles:
            logging.info('Cleaning {}'.format(fn))
            xc = xigtxml.load(fn, mode='full')
            clean_corpus(xc)
            xigtxml.dump(fn, xc)
    else:
        xc = xigtxml.load(sys.stdin, mode='full')
        clean_corpus(xc)
        print(xigtxml.dumps(xc))
def _xigt_import(infile, outfile, options):
    with open(infile, "r") as in_fh, open(outfile, "w") as out_fh:
        igts = odin_igts(in_fh, options)
        xc = XigtCorpus(
            igts=igts,
            attributes={
                "xmlns:olac": "http://www.language-archives.org/OLAC/1.1/",
                "xmlns:dc": "http://purl.org/dc/elements/1.1/",
                "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            },
            mode="transient",
        )
        xigtxml.dump(out_fh, xc)
def xigt_import(infile, outfile, options=None):
    if options is None:
        options = {}
    options.setdefault("tier_types", default_tier_types)
    options.setdefault("alignments", default_alignments)
    options.setdefault("record_markers", default_record_markers)
    options.setdefault("attribute_map", default_attribute_map)
    with open(infile, "r") as in_fh, open(outfile, "w") as out_fh:
        tb = toolbox.read_toolbox_file(in_fh)
        igts = toolbox_igts(tb, options)
        xc = XigtCorpus(igts=igts, mode="transient")
        xigtxml.dump(out_fh, xc)
def xigt_import(infile, outfile, options=None):
    if options is None:
        options = {}
    options.setdefault('tier_types', default_tier_types)
    options.setdefault('alignments', default_alignments)
    options.setdefault('record_markers', default_record_markers)
    options.setdefault('attribute_map', default_attribute_map)
    options.setdefault('error_recovery_method', default_error_recovery_method)
    with open(infile, 'r') as in_fh, open(outfile, 'w') as out_fh:
        tb = toolbox.read_toolbox_file(in_fh)
        igts = toolbox_igts(tb, options)
        xc = XigtCorpus(igts=igts, mode='transient')
        xigtxml.dump(out_fh, xc)
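# Usage sketch (hypothetical file names): convert a Toolbox file to Xigt
# XML, relying on the default tier types, alignments, record markers, and
# error-recovery method filled in by the setdefault() calls above.
xigt_import('example.txt', 'example.xml')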
def eval_classifier(c, inst_list, context_feats=False, posdict=None):
    """
    :param c: The classifier
    :param inst_list: A list of Igt instances to test against. Must already have POS tags.
    """
    gold_sents = []
    eval_sents = []

    to_dump = XigtCorpus()

    for inst in inst_list:
        to_tag = inst.copy()
        strip_pos(to_tag)

        # Do the classification.
        to_tag.classify_gloss_pos(c, lowercase=True,
                                  feat_next_gram=context_feats,
                                  feat_prev_gram=context_feats,
                                  posdict=posdict)
        to_dump.append(to_tag)

        # Fix the tags...
        # fix_ctn_gloss_line(to_tag, tag_method=INTENT_POS_CLASS)

        # Now, retrieve eval/gold.
        eval_tags = [v.value() for v in to_tag.get_pos_tags(GLOSS_WORD_ID, tag_method=INTENT_POS_CLASS)]
        gold_tags = [v.value() for v in inst.get_pos_tags(GLOSS_WORD_ID, tag_method=INTENT_POS_MANUAL)]

        tag_tokens = [POSToken('a', label=l) for l in eval_tags]
        gold_tokens = [POSToken('a', label=l) for l in gold_tags]

        if len(tag_tokens) != len(gold_tokens):
            print("LENGTH OF SEQUENCE IS MISMATCHED")
            continue

        gold_sents.append(gold_tokens)
        eval_sents.append(tag_tokens)

    with open('./enriched_ctn_dev.xml', 'w') as out_f:
        xigtxml.dump(out_f, to_dump)

    return poseval(eval_sents, gold_sents, details=True, csv=True, matrix=True)
def divide_corpus(args):
    infile = args.infile
    outdir = args.outdir
    igt_index = [0]  # just a list so I don't have to nonlocal it later
    indices = set()

    def make_filename(fn):
        return os.path.join(outdir, fn + '.xml')

    # this should make reading the corpus faster
    def selective_decode_igt(elem):
        idx = igt_index.pop()
        if idx not in indices:
            igt = None
        else:
            igt = xigtxml.default_decode_igt(elem)
            indices.remove(idx)
        igt_index.append(idx + 1)
        return igt

    if args.meta is not None:
        metatype, func = args.meta
        func = eval('lambda m:{}'.format(func))
        get_key = lambda igt: next(
            (func(m) for m in igt.get_meta(metatype, default=[]) if m is not None),
            None
        )

    # get a mapping of code to the indexed position of each IGT
    keymap = defaultdict(set)
    with open(infile, 'r') as instream:
        xc = xigtxml.load(instream, mode='transient')
        for i, igt in enumerate(xc):
            key = get_key(igt)
            keymap[key].add(i)

    xigtxml.decode_igt = selective_decode_igt

    # now group IGTs with similar languages into a file
    for key, indices in keymap.items():
        if key is None:
            key = '-others-'  # FIXME not guaranteed to be unique
        igt_index = [0]
        with open(infile, 'r') as instream:
            xc = xigtxml.load(instream, mode='transient')
            with open(make_filename(key), 'w') as outstream:
                xigtxml.dump(outstream, xc)
def xigt_import(infile, outfile, options=None):
    if options is None:
        options = {}
    options.setdefault('record_markers', default_record_markers)
    options.setdefault('igt_attribute_map', default_igt_attribute_map)
    options.setdefault('tier_map', default_tier_map)
    options.setdefault('make_phrase_tier', default_make_phrase_tier)
    options.setdefault('tier_types', default_tier_types)
    options.setdefault('alignments', default_alignments)
    options.setdefault('error_recovery_method', default_error_recovery_method)
    # just use existing info to create marker-based alignment info
    options['tb_alignments'] = _make_tb_alignments(options)
    with open(infile, 'r') as in_fh, open(outfile, 'w') as out_fh:
        tb = toolbox.read_toolbox_file(in_fh)
        igts = toolbox_igts(tb, options)
        xc = XigtCorpus(igts=igts, mode='transient')
        xigtxml.dump(out_fh, xc)
def run(args):
    xc = xigtxml.load(args.infile)
    if args.igt_key:
        logging.info('Sorting %s IGTs' % args.infile)
        xc.sort(key=make_sortkey(args.igt_key))
    if args.tier_key:
        logging.info('Sorting %s tiers by key' % args.infile)
        for igt in xc:
            igt.sort(key=make_sortkey(args.tier_key))
    elif args.tier_deps:
        logging.info('Sorting %s tiers by ref-dependencies' % args.infile)
        refattrs = [ra.strip() for ra in args.tier_deps.split(',')]
        for igt in xc:
            igt.sort_tiers(refattrs=refattrs)
    if args.item_key:
        logging.info('Sorting %s items by key' % args.infile)
        for igt in xc:
            for tier in igt:
                tier.sort(key=make_sortkey(args.item_key))
    if args.in_place:
        xigtxml.dump(args.infile, xc)
    else:
        print(xigtxml.dumps(xc))
    else:
        return xigtxml.default_decode_meta(elem)


### Encoding ###

def matrix_encode_meta(meta):
    metatype = meta.type.lower()
    if metatype in ('judgment', 'vetted', 'phenomena'):
        attributes = dict(type=meta.type, **meta.attributes)
        e = etree.Element('meta', attrib=attributes)
        if metatype == 'phenomena':
            for phenomenon in meta.content:
                p = etree.Element('phenomenon')
                p.text = phenomenon
                e.append(p)
        return e
    else:
        return xigtxml.default_encode_meta(meta)


### Function maps ###
xigtxml.decode_meta = matrix_decode_meta
xigtxml.encode_meta = matrix_encode_meta


if __name__ == '__main__':
    import sys
    f = sys.argv[1]
    with open(f, 'r') as in_f:
        xc = xigtxml.load(in_f)
    print(xigtxml.dumps(xc, pretty_print=True))
    with open('abkhaz-out.xigt', 'w') as out_f:
        xigtxml.dump(out_f, xc, pretty_print=True)
def do_projection(**kwargs):
    """
    (Re)project annotations from the translation line onto the gloss and language lines.

    :param aln_method: The alignment method
    """
    kwargs = ArgPasser(kwargs)
    aln_method = ALN_ARG_MAP[kwargs.get('aln_method', ARG_ALN_ANY)]

    successes = 0
    failures = 0

    in_path = kwargs.get(ARG_INFILE)
    with open(in_path, 'r', encoding='utf-8') as f:
        PROJ_LOG.log(1000, 'Loading file "{}"...'.format(os.path.basename(in_path)))
        xc = xigtxml.load(f, mode=INCREMENTAL)

        for inst in xc:
            success_fail_string = 'Instance {:20s} {{:10s}}{{}}'.format('"' + inst.id + '"...')

            def fail(reason):
                nonlocal failures, success_fail_string
                success_fail_string = success_fail_string.format('FAIL', reason)
                failures += 1

            def success():
                nonlocal successes, success_fail_string
                success_fail_string = success_fail_string.format('SUCCESS', '')
                successes += 1

            # Query whether we want to require to use only trees
            # where the alignment is 100%.
            completeness_requirement = kwargs.get('completeness', default=0, t=float)

            # TODO: Find better way to do this?
            try:
                if kwargs.get('pos', True):
                    project_trans_pos_to_gloss(inst, aln_method=aln_method, completeness_requirement=completeness_requirement)
                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                if kwargs.get('ds', True):
                    project_pt_tier(inst, proj_aln_method=aln_method)
                    project_ds_tier(inst, proj_aln_method=aln_method, completeness_requirement=completeness_requirement)
            except NoNormLineException:
                fail("Bad Lines")
            except (NoAlignmentProvidedError, ProjectionException):
                fail("Alignment")
            except GlossLangAlignException:
                fail("Gloss-Lang")
            except ProjectionIncompleteAlignment:
                fail("Alignment Incomplete")
            except PhraseStructureProjectionException:
                fail("Projection Failed")
            else:
                success()
            finally:
                PROJ_LOG.info(success_fail_string)

            inst.sort_tiers()

    out_path = kwargs.get(ARG_OUTFILE)
    # Try to make the folder if it doesn't already exist.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    PROJ_LOG.log(1000, 'Writing new file "{}"...'.format(os.path.basename(out_path)))
    with open(out_path, 'w', encoding='utf-8') as out_f:
        xigtxml.dump(out_f, xc)

    PROJ_LOG.log(1000, '{} instances processed, {} successful, {} failed.'.format(len(xc), successes, failures))
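# Usage sketch: ARG_INFILE and ARG_OUTFILE are the module's constants for
# the input/output keyword names; the file paths and the 0.9 completeness
# threshold are hypothetical values, not defaults from the original code.
do_projection(**{ARG_INFILE: 'enriched.xml',
                 ARG_OUTFILE: 'projected.xml',
                 'completeness': 0.9})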
p.add_argument('-d', '--dest', required=True, help='Output directory for modified files.')
p.add_argument('-f', '--force', action='store_true', help='Force overwrite existing files.')

args = p.parse_args()

for path in args.FILE:
    with open(path, 'r', encoding='utf-8') as f:
        xc = xigtxml.load(f, mode=INCREMENTAL)
        for inst in xc:
            JUDG_LOG.info('Processing instance "{}"'.format(inst.id))
            for item in xigtpath.findall(inst, 'tier[@type=' + ODIN_TIER_TYPE + ']/item'):
                # Skip blank lines
                if item.value() is None:
                    continue
                # Get the judgment and add it if it is non-null.
                j = get_judgment(item.value())
                if j is not None:
                    item.attributes[ODIN_JUDGMENT_ATTRIBUTE] = j
                    JUDG_LOG.debug('Judgment found on item "{}"'.format(item.id))

    # Make the output directory if it doesn't exist.
    makedirs(args.dest, exist_ok=True)
    outpath = os.path.join(args.dest, os.path.basename(path))
    if not os.path.exists(outpath) or args.force:
        with open(outpath, 'w', encoding='utf-8') as out_f:
            xigtxml.dump(out_f, xc)
def enrich(**kwargs):

    global classifier

    if ARG_OUTFILE not in kwargs:
        ENRICH_LOG.critical("No output file specified.")
        sys.exit()

    # =============================================================================
    # Set up the alternate classifier path...
    # =============================================================================
    class_path = kwargs.get('class_path')

    #===========================================================================
    # Set up the different arguments...
    #===========================================================================
    inpath = kwargs.get(ARG_INFILE)

    parse_args = kwargs.get(PARSE_VAR, [])
    pos_args = kwargs.get(POS_VAR, [])
    aln_args = kwargs.get(ALN_VAR, [])

    max_parse_length = kwargs.get('max_parse_length', 10)

    if not (parse_args or pos_args or aln_args):
        ENRICH_LOG.warning("No enrichment specified. Basic processing only will be performed.")

    #===========================================================================
    # Sanity check the arguments.
    #===========================================================================

    # Check that alignment is asked for if projection is asked for.
    if (ARG_POS_PROJ in pos_args or ARG_PARSE_PROJ in parse_args) and (not aln_args):
        ENRICH_LOG.warning("You have asked for projection methods but have not requested " +
                           "alignments to be generated. Projection may fail if alignment not already present in file.")

    ENRICH_LOG.log(1000, 'Loading input file...')
    with open(inpath, 'r', encoding='utf-8') as in_f:
        corp = xigtxml.load(in_f, mode=INCREMENTAL)

        # -------------------------------------------
        # Initialize the English tagger if:
        #   A) "proj" option is selected for pos.
        #   B) "trans" option is given for pos.
        #   C) "heurpos" option is given for alignment.
        # -------------------------------------------
        s = None
        if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, 'Initializing tagger...')
            tagger = c.getpath('stanford_tagger_trans')
            try:
                s = StanfordPOSTagger(tagger)
            except TaggerError as te:
                ENRICH_LOG.critical(te)
                sys.exit(2)

        # -------------------------------------------
        # Initialize the parser if:
        #   A) "trans" option is given for parse
        #   B) "proj" option is given for parse.
        # -------------------------------------------
        if ARG_PARSE_TRANS in parse_args or ARG_PARSE_PROJ in parse_args:
            ENRICH_LOG.log(1000, "Initializing English parser...")
            sp = stanford_parser.StanfordParser()

        # -------------------------------------------
        # Initialize the classifier if:
        #   A) "class" option is given for pos
        #   B) "heurpos" option is given for alignment.
        # -------------------------------------------
        m = None
        if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, "Initializing gloss-line classifier...")
            p = load_posdict()
            m = mallet_maxent.MalletMaxent(classifier)

        # -- 1b) Giza Gloss to Translation alignment --------------------------------------
        if ARG_ALN_GIZA in aln_args or ARG_ALN_GIZAHEUR in aln_args:
            ENRICH_LOG.log(1000, 'Aligning gloss and translation lines using mgiza++...')

            try:
                if ARG_ALN_GIZAHEUR in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=True, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
                if ARG_ALN_GIZA in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=False, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
            except GizaAlignmentException as gae:
                gl = logging.getLogger('giza')
                gl.critical(str(gae))
                raise gae

        # -------------------------------------------
        # Begin iterating through the corpus
        # -------------------------------------------

        for inst in corp:

            feedback_string = 'Instance {:15s}: {{:20s}}{{}}'.format(inst.id)

            reasons = []
            inst_status = None

            def fail(reason):
                nonlocal inst_status, reasons
                if reason not in reasons:
                    reasons.append(reason)
                inst_status = 'WARN'

            def success():
                nonlocal inst_status
                inst_status = 'OK'

            # -------------------------------------------
            # Define the reasons for failure
            # -------------------------------------------
            F_GLOSS_LINE = "NOGLOSS"
            F_LANG_LINE = "NOLANG"
            F_TRANS_LINE = "NOTRANS"
            F_BAD_LINES = "BADLINES"
            F_L_G_ALN = "L_G_ALIGN"
            F_T_G_ALN = "G_T_ALIGN"
            F_NO_TRANS_POS = "NO_POS_TRANS"
            F_PROJECTION = "PROJECTION"
            F_UNKNOWN = "UNKNOWN"
            F_PARSELEN = "OVER_MAX_LENGTH"

            try:

                # -------------------------------------------
                # Get the different lines
                # -------------------------------------------
                def tryline(func):
                    nonlocal inst
                    try:
                        return func(inst)
                    except NoNormLineException:
                        return None

                gl = tryline(gloss_line)
                tls = tryline(trans_lines)
                lls = tryline(lang_lines)

                has_gl = gl is not None
                has_tl = tls is not None
                has_ll = lls is not None

                has_all = lambda: (has_gl and has_tl and has_ll)

                # -------------------------------------------
                # Translation Line
                # -------------------------------------------
                if has_tl:
                    if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
                        try:
                            tag_trans_pos(inst, s)
                        except CriticalTaggerError as cte:
                            ENRICH_LOG.critical(str(cte))
                            sys.exit(2)

                    if ARG_PARSE_PROJ in parse_args or ARG_PARSE_TRANS in parse_args:
                        if len(trans(inst)) <= max_parse_length:
                            parse_translation_line(inst, sp, pt=True, dt=True)
                        else:
                            fail(F_PARSELEN)

                # 4) POS tag the gloss line --------------------------------------------
                if has_gl:
                    if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
                        classify_gloss_pos(inst, m, posdict=p)

                # -------------------------------------------
                # Try getting alignments.
                # -------------------------------------------
                if has_gl and has_ll:
                    try:
                        add_gloss_lang_alignments(inst)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                if has_gl and has_tl:
                    if ARG_ALN_HEURPOS in aln_args:
                        heur_align_inst(inst, use_pos=True)
                    if ARG_ALN_HEUR in aln_args:
                        heur_align_inst(inst, use_pos=False)

                # -------------------------------------------
                # Now, do the necessary projection tasks.
                # -------------------------------------------

                # Project the classifier tags...
                if has_ll and has_gl and ARG_POS_CLASS in pos_args:
                    try:
                        project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_CLASS)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                # -------------------------------------------
                # Do the trans-to-lang projection...
                # -------------------------------------------

                if has_all():
                    proj_aln_method = ALN_ARG_MAP[kwargs.get('proj_aln', ARG_ALN_ANY)]
                    aln = get_trans_gloss_alignment(inst, aln_method=proj_aln_method)

                    if not aln or len(aln) == 0:
                        fail(F_T_G_ALN)
                    else:
                        # -------------------------------------------
                        # POS Projection
                        # -------------------------------------------
                        if ARG_POS_PROJ in pos_args:
                            trans_tags = trans_tag_tier(inst)

                            if not trans_tags:
                                fail(F_NO_TRANS_POS)
                            else:
                                project_trans_pos_to_gloss(inst)
                                try:
                                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                                except GlossLangAlignException:
                                    fail(F_L_G_ALN)

                        # -------------------------------------------
                        # Parse projection
                        # -------------------------------------------
                        if ARG_PARSE_PROJ in parse_args:
                            try:
                                project_pt_tier(inst, proj_aln_method=proj_aln_method)
                            except PhraseStructureProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)

                            try:
                                project_ds_tier(inst, proj_aln_method=proj_aln_method)
                            except ProjectionException:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError:
                                fail(F_T_G_ALN)

                # Sort the tiers... ----------------------------------------------------
                inst.sort_tiers()

            except Exception as e:
                # ENRICH_LOG.warn("Unknown Error occurred processing instance {}".format(inst.id))
                ENRICH_LOG.debug(e)
                # raise(e)
                fail(F_UNKNOWN)

            if not reasons:
                success()

            ENRICH_LOG.info(feedback_string.format(inst_status, ','.join(reasons)))

        ENRICH_LOG.log(1000, 'Writing output file...')

        if hasattr(kwargs.get(ARG_OUTFILE), 'write'):
            xigtxml.dump(kwargs.get(ARG_OUTFILE), corp)
        else:
            xigtxml.dump(writefile(kwargs.get(ARG_OUTFILE)), corp)

        ENRICH_LOG.log(1000, 'Done.')
        ENRICH_LOG.log(1000, "{} instances written.".format(len(corp)))
    elif args.subcommand == CMD_FILTER:
        filter_corpus(flatten_list(getattr(args, ARG_INFILE)),
                      getattr(args, ARG_OUTFILE),
                      **vars(args))

    # EXTRACT
    elif args.subcommand == CMD_EXTRACT:
        extract_from_xigt(input_filelist=flatten_list(args.FILE), **vars(args))

    # EVAL
    elif args.subcommand == CMD_EVAL:
        evaluate_intent(flatten_list(args.FILE),
                        eval_alignment=args.alignment,
                        eval_ds=args.ds_projection,
                        eval_posproj=args.pos_projection,
                        classifier_path=args.classifier,
                        classifier_feats=args.classifier_feats,
                        eval_tagger=args.pos_tagger,
                        gold_tagmap=args.tagmap_gold,
                        trans_tagmap=args.tagmap_trans,
                        outpath=args.output)

    # TEXT CONVERT
    elif args.subcommand == CMD_TEXT:
        xc = text_to_xigtxml(args.FILE)
        dump(args.OUT_FILE, xc)

    # PROJECT
    elif args.subcommand == CMD_PROJECT:
        do_projection(**vars(args))

    # REPRO
    elif args.subcommand == CMD_REPRO:
        reproduce(args.action)
    if head_i == -1:
        head_w = 'ROOT'
        head_i = 0
    else:
        head_w = w_tier[int(head_i) - 1].value()

    child_t = Terminal(dep_w, index=int(dep))
    head_t = Terminal(head_w, index=head_i)
    edges.append(DepEdge(head=head_t, dep=child_t))

    dt = build_dep_edges(edges)
    return dt


if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('IN_FILE', type=existsfile)
    p.add_argument('OUT_FILE')

    args = p.parse_args()

    xc = naacl_to_xigt(args.IN_FILE)
    with open(args.OUT_FILE, 'w') as out_f:
        dump(out_f, xc)


class test_naacl(TestCase):
    def test_parse(self):
        p = os.path.join(testfile_dir, 'naacl/ger.naacl')
        o = os.path.join(testfile_dir, 'naacl/ger.xml')
        xc = naacl_to_xigt(p)
        with open(o, 'w') as out_f:
            dump(out_f, xc)
def convert_pml(aln_path, out_path, hindi=True):

    if hindi:
        igt_data = retrieve_hindi()
    else:
        igt_data = retrieve_naacl()

    a_root = load_xml(aln_path)
    doc_a = a_root.find(".//reffile[@name='document_a']").get('href')
    doc_b = a_root.find(".//reffile[@name='document_b']").get('href')

    doc_a = os.path.join(os.path.dirname(aln_path), doc_a)
    doc_b = os.path.join(os.path.dirname(aln_path), doc_b)

    # Load the sentences for each document.
    a_sents, a_glossed = load_sents(doc_a)
    b_sents, b_glossed = load_sents(doc_b)

    sent_alignments = a_root.findall(".//body/LM")

    assert (a_glossed and not b_glossed) or (b_glossed and not a_glossed), "Only one file should have glosses"

    xc = XigtCorpus()

    for sent_alignment in sent_alignments:

        # Get the sentence id...
        aln_id = sent_alignment.attrib.get('id')
        a_snt_id = re.search('^.+?-(.*)$', aln_id).group(1)
        if a_snt_id not in igt_data:
            continue

        # Get the text and tokens from the naacl data.
        pre_txt, lang_txt, gloss_txt, trans_txt = igt_data[a_snt_id]
        lang_tokens = lang_txt.split()
        gloss_tokens = gloss_txt.split()
        trans_tokens = trans_txt.split()

        a_snt_ref = sent_alignment.find('./tree_a.rf').text.split('#')[1]
        b_snt_ref = sent_alignment.find('./tree_b.rf').text.split('#')[1]

        word_alignments = sent_alignment.findall('./node_alignments/LM')

        a_snt, a_edges = a_sents[a_snt_ref]
        b_snt, b_edges = b_sents[b_snt_ref]

        assert isinstance(a_snt, Sentence)
        assert isinstance(b_snt, Sentence)

        # -------------------------------------------
        # Skip sentences if they are not found for whatever reason
        # -------------------------------------------
        if not a_snt or not b_snt:
            continue

        # -------------------------------------------
        # Start constructing the IGT Instance.
        # -------------------------------------------
        trans_snt, trans_indices = a_snt, a_edges
        gloss_snt, gloss_indices = b_snt, b_edges
        if a_glossed:
            trans_snt, trans_indices = b_snt, b_edges
            gloss_snt, gloss_indices = a_snt, a_edges

        # Hindi stuff...
        if hindi:
            lang_tokens = [w.text for w in gloss_snt]
            lang_postags = [w.pos for w in gloss_snt]
            lang_txt = ' '.join(lang_tokens)

            trans_tokens = [w.text for w in trans_snt]
            trans_postags = [w.pos for w in trans_snt]
            trans_txt = ' '.join(trans_tokens)

            gloss_tokens = [w.gloss if w.gloss else 'NULL' for w in gloss_snt]
            gloss_postags = lang_postags
            gloss_txt = ' '.join(gloss_tokens)

        inst = Igt(id=re.sub('s-', 'igt', a_snt_ref))
        nt = Tier(type=ODIN_TIER_TYPE, id=NORM_ID, attributes={STATE_ATTRIBUTE: NORM_STATE})
        ll = Item(id='n1', attributes={ODIN_TAG_ATTRIBUTE: ODIN_LANG_TAG}, text=lang_txt)
        gl = Item(id='n2', attributes={ODIN_TAG_ATTRIBUTE: ODIN_GLOSS_TAG}, text=gloss_txt)
        tl = Item(id='n3', attributes={ODIN_TAG_ATTRIBUTE: ODIN_TRANS_TAG}, text=trans_txt)
        nt.extend([ll, gl, tl])
        inst.append(nt)

        # -------------------------------------------
        # Handle the phrase tiers
        # -------------------------------------------
        generate_lang_phrase_tier(inst)
        generate_trans_phrase_tier(inst)

        def process_postags(sent, tokens):
            postags = []
            for i, token in enumerate(tokens):
                word = sent.getorder(i + 1)
                if word is None:
                    postags.append(None)
                else:
                    postags.append(word.pos)
            return postags

        # -------------------------------------------
        # Now, handle the translation words.
        # -------------------------------------------
        tt = create_word_tier(ODIN_TRANS_TAG, trans_tokens, trans_phrase(inst)[0])
        inst.append(tt)
        if not hindi:
            trans_postags = process_postags(trans_snt, trans_tokens)

        add_pos_tags(inst, tt.id, trans_postags, tag_method=INTENT_POS_MANUAL)

        # -------------------------------------------
        # Handle the words tiers...
        # -------------------------------------------
        wt = create_word_tier(ODIN_LANG_TAG, lang_tokens, lang_phrase(inst)[0])
        gwt = create_word_tier(ODIN_GLOSS_TAG, gloss_tokens, gl)
        inst.extend([wt, gwt])

        # Quickly set the alignment for the gloss words.
        for w, gw in zip(wt, gwt):
            gw.alignment = w.id

        if not hindi:
            lang_postags = process_postags(gloss_snt, gloss_tokens)
            gloss_postags = lang_postags

        add_pos_tags(inst, wt.id, lang_postags, tag_method=INTENT_POS_MANUAL)
        add_pos_tags(inst, gwt.id, gloss_postags, tag_method=INTENT_POS_MANUAL)

        create_dt_tier(inst, assemble_ds(gloss_snt, gloss_indices), wt, INTENT_DS_MANUAL)
        create_dt_tier(inst, assemble_ds(trans_snt, trans_indices), tt, INTENT_DS_MANUAL)

        # -------------------------------------------
        # Now, the word alignments.
        # -------------------------------------------
        a = Alignment()
        for word_alignment in word_alignments:
            a_ref = word_alignment.find('./a.rf').text.split('#')[1]
            b_ref = word_alignment.find('./b.rf').text.split('#')[1]

            a_word = a_snt.getid(a_ref)
            b_word = b_snt.getid(b_ref)

            if a_word is None or b_word is None:
                continue

            if not hindi:
                a_idx = a_word.order
                b_idx = b_word.order
            else:
                a_idx = a_snt.index(a_word) + 1
                b_idx = b_snt.index(b_word) + 1

            # Map the indices depending on which document carried the glosses.
            if a_glossed:
                trans_idx = b_idx
                lang_idx = a_idx
            else:
                trans_idx = a_idx
                lang_idx = b_idx

            a.add((trans_idx, lang_idx))

        set_bilingual_alignment(inst, trans(inst), lang(inst), a, INTENT_ALN_MANUAL)
        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, INTENT_ALN_MANUAL)

        xc.append(inst)

    with open(out_path, 'w', encoding='utf-8') as f:
        xigtxml.dump(f, xc)
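# Usage sketch (hypothetical paths): convert a PML alignment file, plus the
# two documents it references via <reffile> elements, into a Xigt corpus.
convert_pml('data/hindi/alignment.pml', 'out/hindi.xml', hindi=True)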