def separate_tiers(args):
    tiers = set(args.tiers)
    # assuming XML for now
    with open(args.infile, 'r') as instream:
        src_xc = xigtxml.load(instream)
    sep_xc = XigtCorpus(attributes=src_xc.attributes,
                        metadata=src_xc.metadata)
    for igt in src_xc.igts:
        sep_xc.add(
            Igt(id=igt.id, type=igt.type,
                attributes=igt.attributes, metadata=igt.metadata,
                tiers=[t for t in igt.tiers if t.type in tiers]))
    xigtxml.dump(open(args.outfile, 'w'), sep_xc)
    if not args.remainder:
        return
    with open(args.infile, 'r') as instream:
        src_xc = xigtxml.load(instream)
    rem_xc = XigtCorpus(attributes=src_xc.attributes,
                        metadata=src_xc.metadata)
    for igt in src_xc.igts:
        rem_xc.add(
            Igt(id=igt.id, type=igt.type,
                attributes=igt.attributes, metadata=igt.metadata,
                tiers=[t for t in igt.tiers if t.type not in tiers]))
    xigtxml.dump(open(args.remainder, 'w'), rem_xc)
def run(args):
    if args.infiles:
        for fn in args.infiles:
            logging.info('Normalizing {}'.format(fn))
            xc = xigtxml.load(fn, mode='full')
            normalize_corpus(xc)
            xigtxml.dump(fn, xc)
    else:
        xc = xigtxml.load(sys.stdin, mode='full')
        normalize_corpus(xc)
        print(xigtxml.dumps(xc))
def run(args):
    if args.infiles:
        for fn in args.infiles:
            logging.info('Cleaning {}'.format(fn))
            xc = xigtxml.load(fn, mode='full')
            clean_corpus(xc)
            xigtxml.dump(fn, xc)
    else:
        xc = xigtxml.load(sys.stdin, mode='full')
        clean_corpus(xc)
        print(xigtxml.dumps(xc))
def xc_load(path, mode=FULL, do_basic_processing=False):
    f = open(path, 'r', encoding='utf-8')
    xc = xigtxml.load(f, mode=mode)
    if do_basic_processing:
        for inst in xc:
            basic_processing(inst)
    return xc
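# A minimal usage sketch for xc_load above; 'corpus.xml' is a hypothetical
# path, and FULL/basic_processing are assumed from the surrounding module.
xc = xc_load('corpus.xml', do_basic_processing=True)
for inst in xc:
    print(inst.id, len(inst.tiers))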
def run(args):
    passed = []
    from xml.etree import ElementTree as ET
    ids = Counter()
    for i, f in enumerate(args.files):
        with open(f, 'r') as fh:
            try:
                xc = xigtxml.load(fh, mode='transient')
            except ET.ParseError:
                print('Corpus {} ({}) failed to load. First verify '
                      'that the XML file is valid by doing a schema '
                      'validation.'.format(i, f))
            else:
                context = make_context(
                    xc, i, '<xigt-corpus>', 'collection', ids=ids
                )
                report = validate_corpus(xc, context)
                report = filter_empty_reports(
                    report,
                    minlevel=logging.getLogger().getEffectiveLevel()
                )
                if report_is_empty(report):
                    passed.append(True)
                else:
                    passed.append(False)
                    print_report(report, args)
                add_id(ids, xc)
    return all(passed)
def print_stats(args):
    def new_stats():
        return {
            'languages': set(),
            'iso-639-3': defaultdict(lambda: defaultdict(int)),
            'instances': 0,
            'igts': defaultdict(int),
            'tiers': defaultdict(int),
            'items': defaultdict(int),
        }
    stats = new_stats()
    lg_condition = lambda m: 'phrases' in m.attributes.get('tiers', '')
    num_files = 0
    for f in args.files:
        with open(f, 'r') as fh:
            num_files += 1
            cur_stats = new_stats()
            xc = xigtxml.load(fh, mode='transient')
            for igt in xc:
                stats['instances'] += 1
                cur_stats['instances'] += 1
                # language is in a meta element
                lgs = igt.get_meta('language', conditions=[lg_condition])
                if lgs:
                    lg_name = lgs[0].attributes.get('name', '???').strip()
                    lg_iso = lgs[0].attributes.get('iso-639-3', '???').strip()
                else:
                    lg_name = ''
                    lg_iso = ''
                stats['languages'].add(lg_name.lower())
                cur_stats['languages'].add(lg_name.lower())
                stats['iso-639-3'][lg_iso][lg_name] += 1
                cur_stats['iso-639-3'][lg_iso][lg_name] += 1
                # count tiers and items by types, IGTs by tier types
                all_tier_types = set()
                for tier in igt:
                    stats['tiers'][tier.type] += 1
                    cur_stats['tiers'][tier.type] += 1
                    all_tier_types.add(tier.type)
                    for item in tier:
                        stats['items'][item.type] += 1
                        cur_stats['items'][item.type] += 1
                stats['igts'][tuple(sorted(all_tier_types))] += 1
                cur_stats['igts'][tuple(sorted(all_tier_types))] += 1
        if args.summarize_each:
            print_summary('{} summary:'.format(f), cur_stats)
        if args.languages_each:
            print_languages('Languages used in {}:'.format(f),
                            cur_stats['iso-639-3'])
    if args.summarize:
        print_summary(
            'Overall summary ({} file{}):'.format(
                num_files, 's' if num_files != 1 else ''),
            stats)
    if args.languages:
        print_languages(
            'Languages used overall ({} file{}):'.format(
                num_files, 's' if num_files != 1 else ''),
            stats['iso-639-3'])
def divide_corpus(args):
    infile = args.infile
    outdir = args.outdir
    igt_index = [0]  # just a list so I don't have to nonlocal it later
    indices = set()

    def make_filename(fn):
        return os.path.join(outdir, fn + '.xml')

    # this should make reading the corpus faster
    def selective_decode_igt(elem):
        idx = igt_index.pop()
        if idx not in indices:
            igt = None
        else:
            igt = xigtxml.default_decode_igt(elem)
            indices.remove(idx)
        igt_index.append(idx + 1)
        return igt

    if args.meta is not None:
        metatype, func = args.meta
        func = eval('lambda m:{}'.format(func))
        get_key = lambda igt: next(
            (func(m) for m in igt.get_meta(metatype, default=[])
             if m is not None),
            None
        )
    # get a mapping of code to the indexed position of each IGT
    keymap = defaultdict(set)
    xc = xigtxml.load(open(infile, 'r'), mode='transient')
    for i, igt in enumerate(xc):
        key = get_key(igt)
        keymap[key].add(i)
    xigtxml.decode_igt = selective_decode_igt
    # now group IGTs with similar languages into a file
    for key, indices in keymap.items():
        if key is None:
            key = '-others-'  # FIXME not guaranteed to be unique
        igt_index = [0]
        xc = xigtxml.load(open(infile, 'r'), mode='transient')
        xigtxml.dump(open(make_filename(key), 'w'), xc)
def write(out_fn, fn_idx):
    xc = XigtCorpus()
    for fn, igt_indices in fn_idx.items():
        # if possible, try to decode needed igts only and skip the rest
        in_xc = xigtxml.load(fn, mode='transient')  # ignoring corpus-level metadata
        xc.extend(igt for i, igt in enumerate(in_xc) if i in igt_indices)
    # assume the nsmap of the first igt is the same for all
    if xc.igts:
        xc.nsmap = xc[0].nsmap
    xigtxml.dump(out_fn, xc)
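# Sketch of the fn_idx argument that write() expects, derived from its usage
# above: a mapping from each input file to the set of IGT positions to copy.
# The file names here are placeholders.
fn_idx = {
    'corpus1.xml': {0, 2, 5},
    'corpus2.xml': {1},
}
write('merged.xml', fn_idx)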
def run(args):
    job = make_job(args)
    agenda = job['agenda']
    # the original assigned the bare defaultdict class; an int factory is assumed here
    global_c = defaultdict(int)
    for infile in args.infiles:
        filename = basename(infile) if args.basename else infile
        print(job['file_description'].format(filename=filename))
        xc = xigtxml.load(infile)
        results = process_agenda(xc, agenda)
        print_results(results)
        print()
def run(infile, outpath, out_format, config=None):
    cfg = None
    if config:
        import json
        cfg = json.load(open(config, 'r'))
    if out_format == 'latex':
        import xigt.exporters.latex as exporter
    elif out_format == 'itsdb':
        import xigt.exporters.itsdb as exporter
    # elif ...
    with open(infile, 'r') as in_fh:
        xc = xigtxml.load(in_fh, mode='transient')
        exporter.xigt_export(xc, outpath, config=cfg)
def set_vectors(datasets):
    """
    Take loaded datasets pointing to XIGT files, load the IGT from the
    files and then send the glosses to vectors.

    Args:
        datasets: loaded dataset objects

    Returns:
        None
    """
    # Process and convert data
    for dataset in datasets:
        for iso in datasets[dataset]["iso_list"]:
            # Open the current xigt file
            xc = xigtxml.load(open(datasets[dataset]["iso_list"][iso]["xigt"]))
            for igt in xc:
                # Ignore lines without glosses
                if not igt.get('g'):
                    continue

                # Capture the translated words if a translation line exists
                try:
                    words = dict((w, True) for w in ' '.join(
                        str(line.value()).lower() for line in igt.get('t')).split())
                except Exception:
                    words = {}

                # Determine which glosses share a morpheme
                morphemes = {}
                for gloss in igt.get('g'):
                    if gloss.alignment not in morphemes:
                        morphemes[gloss.alignment] = []
                    # The key is guaranteed above, so append directly.
                    # (The original read morphemes.get(alignment, 0) + [...],
                    # whose integer default could never have been valid.)
                    morphemes[gloss.alignment].append(
                        re.sub(' ', '', str(gloss.value()).lower()))

                # Create a vector for each gloss instance
                for gloss in igt.get('g'):
                    if re.sub(PUNCTEX, '', gloss.value()):
                        word_match = re.sub(PUNCTEX, '', gloss.value()).lower() in words
                        shared = morphemes[gloss.alignment] if gloss.alignment else ''
                        set_vector(dataset, iso,
                                   re.sub(PUNCTEX, '', gloss.value()),
                                   shared, word_match)
def split_corpus(filelist, train=0, dev=0, test=0, prefix='', seed=None, overwrite=False, nfold=1):
    # TODO: Make it so we automatically get to one
    # -------------------------------------------
    # Check the arguments
    # -------------------------------------------
    split_sum = train + dev + test
    if split_sum != 1.0:
        SPLIT_LOG.critical('Sum of train({}) + dev({}) + test({}) should = 1, not {}'.format(
            train, dev, test, split_sum))
        raise CorpusSplitException()

    instances = []

    # -- 1) Load all the files
    for f in filelist:
        SPLIT_LOG.info("Loading file {}".format(f))
        xc = xigtxml.load(open(f, 'r', encoding='utf-8'))
        instances.extend(xc)

    # -------------------------------------------
    # Run the requested number of folds
    # -------------------------------------------
    offset = 0
    for fold in range(0, nfold):

        # -- 2) Shuffle with the specified seed if requested.
        #    (The original passed r.seed(seed), i.e. None, as shuffle's
        #    random function, so the seed never actually took effect.)
        if seed is not None:
            random.Random(seed).shuffle(instances)

        # -- 3) Move the files by the sliding offset if specified...
        offset_start = int(len(instances) * offset)
        instances = instances[offset_start:] + instances[:offset_start]

        # Actually split the instances
        train_instances, dev_instances, test_instances = split_instances(instances, train, dev, test)

        train_path = outpath_name(prefix, 'train', nfold, fold)
        dev_path = outpath_name(prefix, 'dev', nfold, fold)
        test_path = outpath_name(prefix, 'test', nfold, fold)

        # -- 6) Write out the output files.
        write_instances(train_instances, train_path, 'train', overwrite)
        write_instances(dev_instances, dev_path, 'dev', overwrite)
        write_instances(test_instances, test_path, 'test', overwrite)

        offset += (1 / nfold)
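# Illustrative call, assuming the helper above is importable: an 80/10/10
# split over two hypothetical files, shuffled with a fixed seed.
split_corpus(['corpus1.xml', 'corpus2.xml'],
             train=0.8, dev=0.1, test=0.1,
             prefix='split', seed=42, overwrite=True)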
def run(args):
    xc = xigtxml.load(args.infile)
    if args.igt_key:
        logging.info('Sorting %s IGTs', args.infile)
        xc.sort(key=make_sortkey(args.igt_key))
    if args.tier_key:
        logging.info('Sorting %s tiers by key', args.infile)
        for igt in xc:
            igt.sort(key=make_sortkey(args.tier_key))
    elif args.tier_deps:
        logging.info('Sorting %s tiers by ref-dependencies', args.infile)
        refattrs = [ra.strip() for ra in args.tier_deps.split(',')]
        for igt in xc:
            igt.sort_tiers(refattrs=refattrs)
    if args.item_key:
        logging.info('Sorting %s items by key', args.infile)
        for igt in xc:
            for tier in igt:
                tier.sort(key=make_sortkey(args.item_key))
    if args.in_place:
        xigtxml.dump(args.infile, xc)
    else:
        print(xigtxml.dumps(xc))
def setUp(self):
    xc = xigtxml.load(ger_file)
    self.inst = xc[0]
def index(fn, by, idx):
    xc = xigtxml.load(fn, mode='transient')
    for i, igt in enumerate(xc):
        idx_key = xp.find(igt, by)
        idx[idx_key][fn].add(i)
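# index() assumes idx supports idx[key][fn].add(i); a nested defaultdict
# satisfies that shape. The xigtpath query string and file name below are
# hypothetical examples, not from the original snippet.
from collections import defaultdict

idx = defaultdict(lambda: defaultdict(set))
index('corpus1.xml', 'metadata//meta/@language', idx)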
affixes.append(split[1])
print(affixes)
feature_dictionary = {}
lang_count = 0
for i in range(len(odin_corpus)):
    filename = os.path.basename(odin_corpus[i])
    language_code = os.path.splitext(filename)[0]
    try:
        # this is just a check to see if we get an error here.
        # we're going to error out if we can't look up this language/feature in WALS.
        wals_code = wals_dictionary.iso_to_wals[language_code]
        wals_value = wals.feature_dictionary[wals_code]
    except KeyError:
        # it wasn't in the dictionary of languages which have reported stats for this feature in WALS
        continue
    xc = xigtxml.load(odin_corpus[i], mode='transient')
    prefix_count = 0
    suffix_count = 0
    affix_count = 0
    no_affix_count = 0
    sentence_count = 0
    hasmarker = False
    igt_list = []
    for igt in xc:
        try:
            gloss = igt["g"]
            alignments = igt["a"]
            # glosspos = igt["gw-pos"]
        except:
            continue
        sentence_count += 1
def do_projection(**kwargs):
    """
    (Re)project the POS tags and phrase/dependency structures across the
    trans-to-gloss alignment.

    :param aln_method: The alignment method
    """
    kwargs = ArgPasser(kwargs)
    aln_method = ALN_ARG_MAP[kwargs.get('aln_method', ARG_ALN_ANY)]

    successes = 0
    failures = 0

    in_path = kwargs.get(ARG_INFILE)
    with open(in_path, 'r', encoding='utf-8') as f:
        PROJ_LOG.log(1000, 'Loading file "{}"...'.format(os.path.basename(in_path)))
        xc = xigtxml.load(f, mode=INCREMENTAL)

        for inst in xc:
            success_fail_string = 'Instance {:20s} {{:10s}}{{}}'.format('"' + inst.id + '"...')

            def fail(reason):
                nonlocal failures, success_fail_string
                success_fail_string = success_fail_string.format('FAIL', reason)
                failures += 1

            def success():
                nonlocal successes, success_fail_string
                success_fail_string = success_fail_string.format('SUCCESS', '')
                successes += 1

            # Query whether we want to require to use only trees
            # where the alignment is 100%.
            completeness_requirement = kwargs.get('completeness', default=0, t=float)

            # TODO: Find better way to do this?
            try:
                if kwargs.get('pos', True):
                    project_trans_pos_to_gloss(inst,
                                               aln_method=aln_method,
                                               completeness_requirement=completeness_requirement)
                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                if kwargs.get('ds', True):
                    project_pt_tier(inst, proj_aln_method=aln_method)
                    project_ds_tier(inst,
                                    proj_aln_method=aln_method,
                                    completeness_requirement=completeness_requirement)
            except NoNormLineException:
                fail("Bad Lines")
            except (NoAlignmentProvidedError, ProjectionException):
                fail("Alignment")
            except GlossLangAlignException:
                fail("Gloss-Lang")
            except ProjectionIncompleteAlignment:
                fail("Alignment Incomplete")
            except PhraseStructureProjectionException:
                fail("Projection Failed")
            else:
                success()
            finally:
                PROJ_LOG.info(success_fail_string)

            inst.sort_tiers()

    out_path = kwargs.get(ARG_OUTFILE)
    # Try to make the folder if it doesn't already exist.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    PROJ_LOG.log(1000, 'Writing new file "{}"...'.format(os.path.basename(out_path)))
    with open(out_path, 'w', encoding='utf-8') as out_f:
        xigtxml.dump(out_f, xc)

    PROJ_LOG.log(1000, '{} instances processed, {} successful, {} failed.'.format(
        len(xc), successes, failures))
def enrich(**kwargs):
    global classifier

    if ARG_OUTFILE not in kwargs:
        ENRICH_LOG.critical("No output file specified.")
        sys.exit()

    # =========================================================================
    # Set up the alternate classifier path...
    # =========================================================================
    class_path = kwargs.get('class_path')

    # =========================================================================
    # Set up the different arguments...
    # =========================================================================
    inpath = kwargs.get(ARG_INFILE)

    parse_args = kwargs.get(PARSE_VAR, [])
    pos_args = kwargs.get(POS_VAR, [])
    aln_args = kwargs.get(ALN_VAR, [])

    max_parse_length = kwargs.get('max_parse_length', 10)

    if not (parse_args or pos_args or aln_args):
        ENRICH_LOG.warning("No enrichment specified. Basic processing only will be performed.")

    # =========================================================================
    # Sanity check the arguments.
    # =========================================================================

    # Check that alignment is asked for if projection is asked for.
    if (ARG_POS_PROJ in pos_args or ARG_PARSE_PROJ in parse_args) and (not aln_args):
        ENRICH_LOG.warning("You have asked for projection methods but have not requested "
                           "alignments to be generated. Projection may fail if alignment "
                           "not already present in file.")

    ENRICH_LOG.log(1000, 'Loading input file...')
    with open(inpath, 'r', encoding='utf-8') as in_f:
        corp = xigtxml.load(in_f, mode=INCREMENTAL)

    # -------------------------------------------
    # Initialize the English tagger if:
    #   A) "proj" option is selected for pos.
    #   B) "trans" option is given for pos.
    #   C) "heurpos" option is given for alignment.
    # -------------------------------------------
    s = None
    if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
        ENRICH_LOG.log(1000, 'Initializing tagger...')
        tagger = c.getpath('stanford_tagger_trans')
        try:
            s = StanfordPOSTagger(tagger)
        except TaggerError as te:
            ENRICH_LOG.critical(te)
            sys.exit(2)

    # -------------------------------------------
    # Initialize the parser if:
    #   A) "trans" option is given for parse
    #   B) "proj" option is given for parse.
    # -------------------------------------------
    if ARG_PARSE_TRANS in parse_args or ARG_PARSE_PROJ in parse_args:
        ENRICH_LOG.log(1000, "Initializing English parser...")
        sp = stanford_parser.StanfordParser()

    # -------------------------------------------
    # Initialize the classifier if:
    #   A) "class" option is given for pos
    #   B) "heurpos" option is given for alignment.
    # -------------------------------------------
    m = None
    if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
        ENRICH_LOG.log(1000, "Initializing gloss-line classifier...")
        p = load_posdict()
        m = mallet_maxent.MalletMaxent(classifier)

    # -- 1b) Giza Gloss to Translation alignment ------------------------------
    if ARG_ALN_GIZA in aln_args or ARG_ALN_GIZAHEUR in aln_args:
        ENRICH_LOG.log(1000, 'Aligning gloss and translation lines using mgiza++...')
        try:
            if ARG_ALN_GIZAHEUR in aln_args:
                giza_align_t_g(corp, resume=True, use_heur=True,
                               symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
            if ARG_ALN_GIZA in aln_args:
                giza_align_t_g(corp, resume=True, use_heur=False,
                               symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
        except GizaAlignmentException as gae:
            gl = logging.getLogger('giza')
            gl.critical(str(gae))
            raise gae

    # -------------------------------------------
    # Begin iterating through the corpus
    # -------------------------------------------
    for inst in corp:
        feedback_string = 'Instance {:15s}: {{:20s}}{{}}'.format(inst.id)

        reasons = []
        inst_status = None

        def fail(reason):
            nonlocal inst_status, reasons
            if reason not in reasons:
                reasons.append(reason)
            inst_status = 'WARN'

        def success():
            nonlocal inst_status
            inst_status = 'OK'

        # -------------------------------------------
        # Define the reasons for failure
        # -------------------------------------------
        F_GLOSS_LINE = "NOGLOSS"
        F_LANG_LINE = "NOLANG"
        F_TRANS_LINE = "NOTRANS"
        F_BAD_LINES = "BADLINES"
        F_L_G_ALN = "L_G_ALIGN"
        F_T_G_ALN = "G_T_ALIGN"
        F_NO_TRANS_POS = "NO_POS_TRANS"
        F_PROJECTION = "PROJECTION"
        F_UNKNOWN = "UNKNOWN"
        F_PARSELEN = "OVER_MAX_LENGTH"

        try:
            # -------------------------------------------
            # Get the different lines
            # -------------------------------------------
            def tryline(func):
                nonlocal inst
                try:
                    return func(inst)
                except NoNormLineException:
                    return None

            gl = tryline(gloss_line)
            tls = tryline(trans_lines)
            lls = tryline(lang_lines)

            has_gl = gl is not None
            has_tl = tls is not None
            has_ll = lls is not None
            has_all = lambda: (has_gl and has_tl and has_ll)

            # -------------------------------------------
            # Translation Line
            # -------------------------------------------
            if has_tl:
                if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
                    try:
                        tag_trans_pos(inst, s)
                    except CriticalTaggerError as cte:
                        ENRICH_LOG.critical(str(cte))
                        sys.exit(2)

                if ARG_PARSE_PROJ in parse_args or ARG_PARSE_TRANS in parse_args:
                    if len(trans(inst)) <= max_parse_length:
                        parse_translation_line(inst, sp, pt=True, dt=True)
                    else:
                        fail(F_PARSELEN)

            # 4) POS tag the gloss line ---------------------------------------
            if has_gl:
                if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
                    classify_gloss_pos(inst, m, posdict=p)

            # -------------------------------------------
            # Try getting alignments.
            # -------------------------------------------
            if has_gl and has_ll:
                try:
                    add_gloss_lang_alignments(inst)
                except GlossLangAlignException:
                    fail(F_L_G_ALN)

            if has_gl and has_tl:
                if ARG_ALN_HEURPOS in aln_args:
                    heur_align_inst(inst, use_pos=True)
                if ARG_ALN_HEUR in aln_args:
                    heur_align_inst(inst, use_pos=False)

            # -------------------------------------------
            # Now, do the necessary projection tasks.
            # -------------------------------------------

            # Project the classifier tags...
            if has_ll and has_gl and ARG_POS_CLASS in pos_args:
                try:
                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_CLASS)
                except GlossLangAlignException:
                    fail(F_L_G_ALN)

            # -------------------------------------------
            # Do the trans-to-lang projection...
            # -------------------------------------------
            if has_all():
                proj_aln_method = ALN_ARG_MAP[kwargs.get('proj_aln', ARG_ALN_ANY)]
                aln = get_trans_gloss_alignment(inst, aln_method=proj_aln_method)

                if not aln or len(aln) == 0:
                    fail(F_T_G_ALN)
                else:
                    # -------------------------------------------
                    # POS Projection
                    # -------------------------------------------
                    if ARG_POS_PROJ in pos_args:
                        trans_tags = trans_tag_tier(inst)
                        if not trans_tags:
                            fail(F_NO_TRANS_POS)
                        else:
                            project_trans_pos_to_gloss(inst)
                            try:
                                project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                            except GlossLangAlignException:
                                fail(F_L_G_ALN)

                    # -------------------------------------------
                    # Parse projection
                    # -------------------------------------------
                    if ARG_PARSE_PROJ in parse_args:
                        try:
                            project_pt_tier(inst, proj_aln_method=proj_aln_method)
                        except PhraseStructureProjectionException:
                            fail(F_PROJECTION)
                        except NoAlignmentProvidedError:
                            fail(F_T_G_ALN)

                        try:
                            project_ds_tier(inst, proj_aln_method=proj_aln_method)
                        except ProjectionException:
                            fail(F_PROJECTION)
                        except NoAlignmentProvidedError:
                            fail(F_T_G_ALN)

            # Sort the tiers... ------------------------------------------------
            inst.sort_tiers()

        except Exception as e:
            # ENRICH_LOG.warning("Unknown Error occurred processing instance {}".format(inst.id))
            ENRICH_LOG.debug(e)
            # raise(e)
            fail(F_UNKNOWN)

        if not reasons:
            success()

        ENRICH_LOG.info(feedback_string.format(inst_status, ','.join(reasons)))

    ENRICH_LOG.log(1000, 'Writing output file...')

    if hasattr(kwargs.get(ARG_OUTFILE), 'write'):
        xigtxml.dump(kwargs.get(ARG_OUTFILE), corp)
    else:
        xigtxml.dump(writefile(kwargs.get(ARG_OUTFILE)), corp)

    ENRICH_LOG.log(1000, 'Done.')
    ENRICH_LOG.log(1000, "{} instances written.".format(len(corp)))
def setUp(self):
    my_path = os.path.join(testfile_dir, 'xigt/kor-ex.xml')
    self.my_igt = xigtxml.load(my_path)
wals_nadj_present = wals_nadj.is_language_present(language_code)
wals_svo_present = wals_svo.is_language_present(language_code)
wals_sv_present = wals_sv.is_language_present(language_code)
wals_ov_present = wals_ov.is_language_present(language_code)
wals_past_tense_present = wals_past_tense.is_language_present(language_code)
wals_future_tense_present = wals_future_tense.is_language_present(language_code)

# check to see if we should bother loading the language
if (wals_svo_present and do_svo) \
        or (wals_sv_present and do_sv) \
        or (wals_ov_present and do_ov) \
        or (wals_nadj_present and do_nadj) \
        or (wals_past_tense_present and do_past_tense) \
        or (wals_future_tense_present and do_future_tense):
    xc = xigtxml.load(language, mode='full')
    if wals_nadj_present and do_nadj:
        calc = NounAdjectiveProbe(xc, language_code, False, args.ndo)
        examine_language(calc, nadj_feature_dictionary,
                         nadj_feature_num_instances_dictionary, nadj_errors)
    if wals_svo_present and do_svo:
        calc = SVOProbe(xc, language_code, False, args.ndo)
        examine_language(calc, svo_feature_dictionary,
                         svo_feature_num_instances_dictionary, svo_errors)
    if wals_sv_present and do_sv:
        calc = SVProbe(xc, language_code, False, args.ndo)
        examine_language(calc, sv_feature_dictionary,
                         sv_feature_num_instances_dictionary, sv_errors)
    if wals_ov_present and do_ov:
        calc = OVProbe(xc, language_code, False, args.ndo)
        examine_language(calc, ov_feature_dictionary,
                         ov_feature_num_instances_dictionary, ov_errors)
    if wals_past_tense_present and do_past_tense:
        calc = PastTenseProbe(xc, language_code, False, args.ndo)
        examine_language(calc, past_tense_feature_dictionary,
                         past_tense_feature_num_instances_dictionary, past_tense_errors)
negc = 0
correct_position = 0
incorrect_position = 0
feature_dictionary = {}
for i in range(len(odin_corpus)):
    filename = os.path.basename(odin_corpus[i])
    language_code = os.path.splitext(filename)[0]
    try:
        # this is just a check to see if we get an error here.
        # we're going to error out if we can't look up this language/feature in WALS.
        wals_code = wals_dictionary.iso_to_wals[language_code]
        wals_value = wals.feature_dictionary[wals_code]
    except KeyError:
        # it wasn't in the dictionary of languages which have reported stats for this feature in WALS
        continue
    xc = xigtxml.load(odin_corpus[i], mode='full')
    hasneg = False
    position = {"VNeg": 0, "NegV": 0, "[V-Neg]": 0, "[Neg-V]": 0}
    number = {"single": 0, "double": 0}
    result1 = findwords(xc)
    hasword = result1[0]
    wordpos = result1[1]
    neglist = result1[2]
    wordnum = result1[3]
    position["NegV"] = wordpos["before"]  # *(524/1059)
    position["VNeg"] = wordpos["after"]   # *(171/1059)
    number["single"] += wordnum["single"]
    number["double"] += wordnum["double"]
    result2 = findmorphs(xc)
def wordlist(filelist, gloss=None, meta=None):
    """
    This function takes a list of Xigt-XML ODIN files, looks for the
    'normalized' ODIN tier, and grabs the contents of all gloss and meta
    lines. It tokenizes simply by matching all word characters (using
    regex's ``\\w`` escape) so as to pull out hyphenated and dotted gloss
    line tokens. The output is returned as a wordlist reverse sorted by
    count.

    :param filelist: List of input files to process.
    :type filelist: list[str]
    :param gloss: Path to use for the output gloss wordlist.
    :type gloss: str
    :param meta: Path to use for the output meta wordlist.
    :type meta: str
    """
    gloss_words = defaultdict(int)
    meta_words = defaultdict(int)

    # -------------------------------------------
    # Iterate over all the paths in the list of files.
    # -------------------------------------------
    for path in filelist:
        with open(path, 'r', encoding='utf-8') as f:
            # Load the XigtCorpus, using the transient mode (most memory efficient)
            xc = xigtxml.load(f, mode='transient')

            # Now, iterate over each `Igt` instance in each file
            for igt in xc:
                # Use a xigtpath expression to find the `tier` item that is a
                # child of this node, with state="normalized" as an attribute.
                norm_tier = xigtpath.find(igt, './tier[@state="normalized"]')

                # Next, since the `tag` attribute can be G+CR or M+AC etc.,
                # grab all lines with a tag that starts with the desired tag letter.
                gloss_lines = [item for item in norm_tier if item.attributes['tag'].startswith("G")]
                meta_lines = [item for item in norm_tier if item.attributes['tag'].startswith("M")]

                # Define a local function to update the wordlists for gloss
                # and meta lines.
                def update_count(l_l, words):
                    for l in l_l:
                        if l.value():
                            for w in l.value().split():
                                for sub_w in re.findall(r'[\w]+', w):  # <-- tokenize
                                    if sub_w.strip():
                                        words[sub_w.lower()] += 1  # <-- lowercase, and add

                # Update the counts.
                update_count(gloss_lines, gloss_words)
                update_count(meta_lines, meta_words)

    # Define a function to write out the wordlist objects to files.
    # Here, we reverse sort by frequency of the word, and tab-delimit
    # the columns.
    def write_items(words, path):
        if path:
            f = open(path, 'w', encoding='utf-8')
            items = sorted(words.items(), key=lambda x: (x[1], x[0]), reverse=True)
            for w, count in items:
                f.write('{}\t{}\n'.format(w, count))
            f.close()

    write_items(gloss_words, gloss)
    write_items(meta_words, meta)
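# Hypothetical invocation of wordlist() above: count gloss and meta tokens
# across two ODIN files and write tab-separated wordlists. Paths are placeholders.
wordlist(['ces.xml', 'deu.xml'], gloss='gloss_words.txt', meta='meta_words.txt')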
from unittest import TestCase
import os

my_dir = os.path.dirname(__file__)
seg_tests_path = os.path.join(my_dir, 'seg_tests.xml')

from xigt.codecs import xigtxml
from xigt import XigtCorpus, Igt
from intent2.xigt_helpers import xigt_find
from intent2.serialize.importers import parse_xigt_instance

# Load the testcase files
with open(seg_tests_path, 'r') as seg_tests_f:
    xc = xigtxml.load(seg_tests_f)  # type: XigtCorpus

# -------------------------------------------
# TestCases
# -------------------------------------------
class EsuTest(TestCase):
    def setUp(self):
        self.inst = xigt_find(xc, id='esu-58')  # type: Igt

    def test_segmentation(self):
        inst = parse_xigt_instance(self.inst)
        self.assertEqual(len(inst.gloss), 1)


class IkxTest(TestCase):
    def setUp(self):
from collections import OrderedDict

from xigt.codecs import xigtxml

# etree is either from lxml.etree or xml.etree.ElementTree
etree = xigtxml.etree


### Decoding ###

### Encoding ###

### Function maps ###


if __name__ == '__main__':
    import sys
    from xigt.codecs import xigttxt
    f = sys.argv[1]
    xc = xigtxml.load(open(f, 'r'))
    print(xigttxt.dumps(xc, pretty_print=True))
from argparse import ArgumentParser

from intent.igt.igtutils import get_judgment
from xigt.codecs import xigtxml
from xigt.consts import INCREMENTAL

if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('FILE', nargs='+')
    p.add_argument('-d', '--dest', required=True, help='Output directory for modified files.')
    p.add_argument('-f', '--force', help='Force overwrite existing files.')

    args = p.parse_args()

    for path in args.FILE:
        with open(path, 'r', encoding='utf-8') as f:
            xc = xigtxml.load(f, mode=INCREMENTAL)
            for inst in xc:
                JUDG_LOG.info('Processing instance "{}"'.format(inst.id))
                for item in xigtpath.findall(inst, 'tier[@type=' + ODIN_TIER_TYPE + ']/item'):
                    # Skip blank lines
                    if item.value() is None:
                        continue
                    # Get the judgment and add it if it is non-null.
                    j = get_judgment(item.value())
                    if j is not None:
                        item.attributes[ODIN_JUDGMENT_ATTRIBUTE] = j
                        JUDG_LOG.debug('Judgment found on item "{}"'.format(item.id))