def produce_files(**c):
    """
    Set up the ablation output files, then parse every matching XML file
    in the input directory.

    Expected keys in ``c``:
      * ``outdir``    -- directory into which the output files are placed.
      * ``input_dir`` -- directory scanned for input XML files.
      * ``pattern``   -- optional glob pattern (defaults to ``'*.xml'``).

    Returns the ArgPasser-wrapped config dict, augmented with the output
    paths (``tag_out``, ``class_out``, ``maxent_path``) and two open file
    handles (``tag_f``, ``class_f``).

    NOTE(review): the opened files are deliberately left open here —
    presumably the XamlParser/callers write to them and close them later;
    confirm ownership of these handles.
    """
    # Set up the output files
    outdir = c.get('outdir')
    c['tag_out'] = os.path.join(outdir, 'ablation_tags.txt')
    c['class_out'] = os.path.join(outdir, 'ablation_class.txt')
    c['maxent_path'] = os.path.join(outdir, 'ablation-model.maxent')

    # Open the output streams; the handles ride along inside the config so
    # downstream code can write to them.
    c['tag_f'] = open(c.get('tag_out'), 'w', encoding='utf-8')
    c['class_f'] = open(c.get('class_out'), 'w', encoding='utf-8')

    # Wrap AFTER the plain-dict mutations above; ArgPasser.get supports the
    # `default=` keyword used below.
    c = ArgPasser(c)
    xp = XamlParser(**c)

    xml_files = glob.glob(os.path.join(c.get('input_dir'),
                                       c.get('pattern', default='*.xml')))
    for x_f in xml_files:
        xp.parse(x_f, **c)
    return c
def write_gram(token, **kwargs):
    """
    Write a single token out either as a classifier training instance
    (one line of svmlight-style features) or as a tagger-format token.

    :param token: token whose ``seq`` (surface form) and ``goldlabel``
                  (assigned POS tag) are consulted.

    Recognized keyword arguments (all optional):
      * ``type``      -- ``'classifier'`` (default) or ``'tagger'``.
      * ``output``    -- writable stream (defaults to ``sys.stdout``).
      * ``posdict``   -- POS lookup dictionary; loaded via
                         ``env.load_posdict()`` when absent.
      * ``prev_gram`` / ``next_gram`` -- neighboring grams for context features.
      * ``aln_labels`` -- heuristic alignment labels.
      * ``lowercase`` -- lowercase grams before processing (default True).
      * feature toggles: ``feat_has_number``, ``feat_align``, ``feat_suffix``,
        ``feat_prefix``, ``feat_morph_num``, ``feat_prev_gram``,
        ``feat_prev_gram_dict``, ``feat_next_gram``, ``feat_next_gram_dict``,
        ``feat_basic``, ``feat_dict``.

    NOTE! Only tokens that have an ASSIGNED pos tag are written out in
    classifier mode.
    """
    # Re-cast the kwargs as an argpasser.
    kwargs = ArgPasser(kwargs)

    output_type = kwargs.get('type', 'classifier')
    output = kwargs.get('output', sys.stdout)

    posdict = kwargs.get('posdict', None)
    if posdict is None:
        posdict = env.load_posdict()

    # Previous/next gram info for context features.
    prev_gram = kwargs.get('prev_gram')
    next_gram = kwargs.get('next_gram')

    # Get heuristic alignment
    aln_labels = kwargs.get('aln_labels', [])

    # ===========================================================================
    # Break apart the token...
    # ===========================================================================
    gram = token.seq
    pos = token.goldlabel

    # Lowercase if asked for.
    # BUGFIX: the 'lowercase' flag used to be fetched but never consulted for
    # the main gram (it was lowercased unconditionally). Honor the flag; the
    # default is True, so default behavior is unchanged.
    lower = kwargs.get('lowercase', True, bool)
    gram = gram.lower() if (lower and gram) else gram

    # Fix the various issues with the grams.
    gram = fix_gram(gram)

    # ===========================================================================
    # Do some cleaning on the gram....
    # ===========================================================================

    # Only take the first of two slashed grams
    gram = re.sub(r'(.*)?/(.*)', r'\1', gram)

    # Remove leading and trailing punctuation-like characters.
    # (Raw strings avoid invalid-escape warnings on modern Python.)
    gram = re.sub(r'^(\S+)[\-=:\[\(\]\)/\*]$', r'\1', gram)
    gram = re.sub(r'^[\-=:\[\(\]\)/\*](\S+)$', r'\1', gram)

    # ===========================================================================
    # Output the grams for a classifier
    #
    # NOTE! Only tokens that have an ASSIGNED pos tag will be written out this way!
    # ===========================================================================
    if output_type == 'classifier' and pos:
        output.write(pos)

        # =======================================================================
        # Get the morphemes.
        # BUGFIX: materialize the tokenizer output once -- previously
        # len(list(morphs)) could exhaust a generator before the later
        # morph-feature loop ran, silently emitting no per-morph features.
        # (Tokenized BEFORE the svmlight character sanitization below,
        # matching the original ordering.)
        # =======================================================================
        morphs = list(intent.utils.token.tokenize_string(
            gram, intent.utils.token.morpheme_tokenizer))

        # =======================================================================
        # Gram cleaning: replace the characters that cause the svmlight
        # format issues.
        # =======================================================================
        gram = gram.replace(':', '-')
        gram = gram.replace('#', '-')

        # =======================================================================
        # Is there a number
        # =======================================================================
        if re.search('[0-9]', gram) and kwargs.get('feat_has_number', False, bool):
            output.write('\thas-number:1')

        # =======================================================================
        # What labels is it aligned with
        # =======================================================================
        if kwargs.get('feat_align', False, bool):
            for aln_label in aln_labels:
                output.write('\taln-label-%s:1' % aln_label)

        # =======================================================================
        # Suffix
        # =======================================================================
        if kwargs.get('feat_suffix', True, bool):
            output.write('\tgram-suffix-3-%s:1' % gram[-3:])
            output.write('\tgram-suffix-2-%s:1' % gram[-2:])
            output.write('\tgram-suffix-1-%s:1' % gram[-1:])

        # =======================================================================
        # Prefix
        # =======================================================================
        if kwargs.get('feat_prefix', True, bool):
            output.write('\tgram-prefix-3-%s:1' % gram[:3])
            output.write('\tgram-prefix-2-%s:1' % gram[:2])
            output.write('\tgram-prefix-1-%s:1' % gram[:1])

        # =======================================================================
        # Number of morphs
        # =======================================================================
        if kwargs.get('feat_morph_num', False, bool):
            output.write('\t%d-morphs:1' % len(morphs))

        # ===================================================================
        # Previous gram
        # ===================================================================
        if prev_gram:
            prev_gram = prev_gram.lower() if lower else prev_gram

            # And then tokenize... (loop var renamed so it no longer shadows
            # the `token` parameter)
            for p_token in intent.utils.token.tokenize_string(
                    prev_gram, intent.utils.token.morpheme_tokenizer):
                if kwargs.get('feat_prev_gram', True, bool):
                    output.write('\tprev-gram-%s:1' % fix_gram(p_token.seq))

                # Add prev dictionary tag
                if (posdict and kwargs.get('feat_prev_gram_dict', True, bool)
                        and p_token.seq in posdict):
                    prev_tags = posdict.top_n(p_token.seq)
                    output.write('\tprev-gram-dict-tag-%s:1' % prev_tags[0][0])

        # Write a "**NONE**" for prev or next...
        elif kwargs.get('feat_prev_gram', True, bool):
            output.write('\tprev-gram-**NONE**:1')

        # ===================================================================
        # Next gram
        # ===================================================================
        if next_gram:
            next_gram = next_gram.lower() if lower else next_gram

            for n_token in intent.utils.token.tokenize_string(
                    next_gram, intent.utils.token.morpheme_tokenizer):
                # ===============================================================
                # Gram itself
                # ===============================================================
                if kwargs.get('feat_next_gram', True, bool):
                    output.write('\tnext-gram-%s:1' % fix_gram(n_token.seq))

                if (posdict and kwargs.get('feat_next_gram_dict', True, bool)
                        and n_token.seq in posdict):
                    next_tags = posdict.top_n(n_token.seq)
                    output.write('\tnext-gram-dict-tag-%s:1' % next_tags[0][0])

        elif kwargs.get('feat_next_gram', True, bool):
            output.write('\tnext-gram-**NONE**:1')

        # =======================================================================
        # Iterate through the morphs
        # =======================================================================
        for morph in morphs:
            # ===================================================================
            # Just write the morph
            # ===================================================================
            if kwargs.get('feat_basic', True, bool):
                output.write('\t%s:1' % morph.seq)

            # ===================================================================
            # If the morph resembles a word in our dictionary, give it
            # a predicted tag
            # ===================================================================
            if posdict and morph.seq in posdict and kwargs.get('feat_dict', True, bool):
                top_tags = posdict.top_n(morph.seq)
                output.write('\ttop-dict-word-%s:1' % top_tags[0][0])
                if len(top_tags) > 1:
                    output.write('\tnext-dict-word-%s:1' % top_tags[1][0])

        output.write('\n')

    # ===========================================================================
    # If writing the gram out for the tagger...
    # ===========================================================================
    if output_type == 'tagger' and kwargs.get('tag_f'):
        output.write('%s/%s ' % (gram, pos))
def do_projection(**kwargs):
    """
    (Re)project annotations (POS tags and/or syntactic structure) for every
    instance in the input Xigt file, then write the result to the output file.

    :param aln_method: key into ``ALN_ARG_MAP`` selecting the alignment used
                       for projection (defaults to ``ARG_ALN_ANY``).

    Other recognized keyword arguments:
      * ``ARG_INFILE`` / ``ARG_OUTFILE`` -- input and output paths.
      * ``pos`` / ``ds`` -- toggles for POS projection and for
        phrase-structure/dependency projection (both default True).
      * ``completeness`` -- required alignment completeness (float, default 0).
    """
    kwargs = ArgPasser(kwargs)
    aln_method = ALN_ARG_MAP[kwargs.get('aln_method', ARG_ALN_ANY)]

    successes = 0
    failures = 0

    # Query whether we want to require to use only trees where the alignment
    # is 100%. Loop-invariant, so hoisted out of the per-instance loop.
    completeness_requirement = kwargs.get('completeness', default=0, t=float)

    in_path = kwargs.get(ARG_INFILE)
    with open(in_path, 'r', encoding='utf-8') as f:

        PROJ_LOG.log(1000, 'Loading file "{}"...'.format(os.path.basename(in_path)))
        xc = xigtxml.load(f, mode=INCREMENTAL)

        for inst in xc:

            # Pre-format a status line; the doubled braces survive the first
            # .format() so fail()/success() can fill them in afterward.
            success_fail_string = 'Instance {:20s} {{:10s}}{{}}'.format('"' + inst.id + '"...')

            def fail(reason):
                nonlocal failures, success_fail_string
                success_fail_string = success_fail_string.format('FAIL', reason)
                failures += 1

            def success():
                nonlocal successes, success_fail_string
                success_fail_string = success_fail_string.format('SUCCESS', '')
                successes += 1

            # TODO: Find better way to do this?
            try:
                if kwargs.get('pos', True):
                    project_trans_pos_to_gloss(inst,
                                               aln_method=aln_method,
                                               completeness_requirement=completeness_requirement)
                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                if kwargs.get('ds', True):
                    project_pt_tier(inst, proj_aln_method=aln_method)
                    project_ds_tier(inst,
                                    proj_aln_method=aln_method,
                                    completeness_requirement=completeness_requirement)
            # Clause order preserved: ProjectionException may subsume the
            # more specific projection errors listed after it.
            except NoNormLineException:
                fail("Bad Lines")
            except (NoAlignmentProvidedError, ProjectionException):
                fail("Alignment")
            except GlossLangAlignException:
                fail("Gloss-Lang")
            except ProjectionIncompleteAlignment:
                fail("Alignment Incomplete")
            except PhraseStructureProjectionException:
                fail("Projection Failed")
            else:
                success()
            finally:
                PROJ_LOG.info(success_fail_string)

            inst.sort_tiers()

    out_path = kwargs.get(ARG_OUTFILE)

    # Try to make the folder if it doesn't already exist.
    # BUGFIX: os.makedirs('') raises FileNotFoundError when out_path is a
    # bare filename (empty dirname); only create a directory when one is
    # actually present in the path.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    PROJ_LOG.log(1000, 'Writing new file "{}"...'.format(os.path.basename(out_path)))
    with open(out_path, 'w', encoding='utf-8') as out_f:
        xigtxml.dump(out_f, xc)

    PROJ_LOG.log(1000, '{} instances processed, {} successful, {} failed.'.format(
        len(xc), successes, failures))