def filter(ef: ExperimentFiles, overwrite = False):
    """
    Run the filtering step for every language of the experiment.

    A language is processed when its filtered file does not exist yet, or
    unconditionally when ``overwrite`` is truthy. With ``USE_CONDOR`` unset
    the corpus is filtered in-process through ``filter_corpus``; otherwise a
    ``filter`` command line is handed to ``condorify`` and, once every
    language has been submitted, the function blocks on the condor wait call
    (with an e-mail notification when ``condor_email`` is set).

    :param ef: experiment layout providing ``langs`` and the per-language paths.
    :param overwrite: rebuild filtered files even if they already exist.
    """
    did_filter = False

    for language in ef.langs:
        source_file = ef.orig(language)
        target_file = ef.filtered(language)

        # Leave an existing output alone unless overwriting was requested.
        if os.path.exists(target_file) and not overwrite:
            continue

        # Announce the step only once, on the first language needing work.
        if not did_filter:
            REPRO_LOG.log(NORM_LEVEL, "Filtering ODIN data.")

        if USE_CONDOR:
            REPRO_LOG.log(NORM_LEVEL, "Filtering {}...".format(language))
            command = ['filter',
                       '--require-aln',
                       '--require-lang',
                       '--require-gloss',
                       '--require-trans',
                       source_file,
                       target_file]
            condorify(command,
                      ef._filter_dir(condor=True),
                      ef.filtered(language, True))
        else:
            filter_corpus([source_file], target_file,
                          require_aln=True,
                          require_lang=True,
                          require_gloss=True,
                          require_trans=True)

        did_filter = True

    # Nothing was (re)built: no jobs to wait for and nothing to report.
    if not did_filter:
        return

    # Under condor, block until the submitted tasks for this step are done.
    if USE_CONDOR:
        if condor_email:
            condor.condor_wait_notify('Filtering of languages performed.',
                                      condor_email,
                                      "Filtration Done")
        else:
            condor.condor_wait()

    REPRO_LOG.log(NORM_LEVEL, "Filtration complete.")
def extract_pos(ef: ExperimentFiles, overwrite=False):
    """
    Run tagger extraction for every tagger configuration of the experiment.

    A configuration is processed when the file at ``tagger_path()`` is
    missing, or unconditionally when ``overwrite`` is truthy. Extraction is
    either run in-process with ``extract_from_xigt`` or submitted through
    ``condorify``, depending on ``USE_CONDOR``. When condor jobs were
    submitted, the function blocks on the condor wait call before returning.

    :param ef: experiment layout providing ``tagger_configs()``.
    :param overwrite: re-extract even if the tagger file already exists.
    """
    ran_any = False

    for config in ef.tagger_configs():
        # Skip configurations whose tagger already exists, unless overwriting.
        if os.path.exists(config.tagger_path()) and not overwrite:
            continue

        if not USE_CONDOR:
            # Local mode: extract_args(False) yields (positional, keyword) arguments.
            positional, keywords = config.extract_args(False)
            extract_from_xigt(positional, **keywords)
        else:
            # Condor mode: extract_args(True) yields the argument tuple for condorify.
            condorify(*config.extract_args(True))

        ran_any = True

    # Only wait when condor jobs were actually submitted.
    if not (ran_any and USE_CONDOR):
        return

    if condor_email:
        condor.condor_wait_notify("Taggers have been extracted.",
                                  condor_email,
                                  "CONDOR: POS Tagger Extraction Complete")
    else:
        condor.condor_wait()
def enrich(ef: ExperimentFiles, overwrite = False, parse=True):
    """
    Enrich the files using all types of word alignment, and tag/parse
    the translation line.

    For each language in ``ef.langs`` the enriched file is (re)built from
    the filtered file when it does not exist yet, or unconditionally when
    ``overwrite`` is truthy. With ``USE_CONDOR`` unset the enrichment runs
    in-process; otherwise an ``enrich`` command line is submitted through
    ``condorify`` and the function blocks on the condor wait call once all
    languages have been submitted.

    :param ef: experiment layout providing ``langs`` and the per-language paths.
    :param overwrite: rebuild enriched files even if they already exist.
    :param parse: when truthy, add ``--parse trans`` to the condor command line.
    """
    enrichment_performed = False

    for lang in ef.langs:
        filtered_f = ef.filtered(lang)
        enriched_f = ef.enriched(lang)

        if not os.path.exists(enriched_f) or overwrite:

            # Notify user of enrichment if at least one file is being enriched.
            if not enrichment_performed:
                REPRO_LOG.log(NORM_LEVEL, "Enriching ODIN data.")

            if not USE_CONDOR:
                enrich_args = {ALN_VAR: aln_methods,
                               POS_VAR: [ARG_POS_TRANS],
                               ARG_INFILE: filtered_f,
                               ARG_OUTFILE: enriched_f}
                # The settings must actually be handed to enrich(); calling it
                # with no arguments drops the in/out paths and the alignment
                # and POS configuration assembled above.
                # NOTE(review): `parse` is not forwarded on this local path --
                # the keyword key for it is not visible here; confirm and add.
                intent.commands.enrich.enrich(**enrich_args)
            else:
                args = ['enrich',
                        '--align', ','.join(aln_methods),
                        '--pos', ARG_POS_TRANS,
                        filtered_f,
                        enriched_f]
                if parse:
                    args += ['--parse', 'trans']
                condorify(args, ef._enriched_dir(condor=True), ef.enriched(lang, True))

            enrichment_performed = True

    # Under condor, block until the submitted tasks for this step are done.
    if USE_CONDOR and enrichment_performed:
        if condor_email:
            condor.condor_wait_notify('Enrichment of languages performed.',
                                      condor_email,
                                      "Enrichment Done")
        else:
            condor.condor_wait()

    if enrichment_performed:
        REPRO_LOG.log(NORM_LEVEL, "Enrichment complete.")
def eval_taggers_igt(ef: ExperimentFiles, overwrite = False):
    """
    Evaluate every tagger configuration on IGT data and print a summary.

    First pass: for each tagger configuration whose evaluation file
    (``rgigt_eval()``) is missing -- or for all of them when ``overwrite``
    is truthy -- run the evaluation, either in-process with
    ``evaluate_intent`` or through ``condorify`` when ``USE_CONDOR`` is set.
    If condor jobs were submitted, block on the condor wait call.

    Second pass: read the last non-blank line of each evaluation file,
    which is expected to hold four comma-separated fields (the second and
    third being the integer match and comparison counts), collect them in a
    ``TaggerEval`` and print it. Missing or empty evaluation files are
    logged as warnings and skipped.

    :param ef: experiment layout providing ``tagger_configs()``.
    :param overwrite: re-run evaluations even if their output exists.
    :raises ValueError: if the last line of an evaluation file does not have
        exactly four comma-separated fields or the counts are not integers.
    """
    evaluation_performed = False

    for tc in ef.tagger_configs():
        if not os.path.exists(tc.rgigt_eval()) or overwrite:

            # Announce the step once, on the first config that needs work.
            if not evaluation_performed:
                REPRO_LOG.log(NORM_LEVEL, "Evaluating POS taggers on IGT data.")

            if USE_CONDOR:
                condorify(*tc.eval_igt_pos_args(True))
            else:
                args, kwargs = tc.eval_igt_pos_args(False)
                evaluate_intent(args, **kwargs)

            evaluation_performed = True

    if USE_CONDOR and evaluation_performed:
        if condor_email:
            condor.condor_wait_notify("IGT Evaluation Complete",
                                      condor_email,
                                      subject="IGT POS Evaluation Complete")
        else:
            condor.condor_wait()

    # -------------------------------------------
    # Now, consolidate all of the different methods and report.
    # -------------------------------------------
    te = TaggerEval()
    for tc in ef.tagger_configs():
        eval_path = tc.rgigt_eval()

        if not os.path.exists(eval_path):
            # `warning` is the supported spelling; `warn` is a deprecated alias.
            REPRO_LOG.warning("Missing eval: {}".format(os.path.basename(eval_path)))
            continue

        # Keep only non-blank rows so trailing newlines or an empty file
        # (e.g. from a failed job) do not crash the report.
        with open(eval_path, 'r') as f:
            rows = [line.strip() for line in f if line.strip()]

        if not rows:
            REPRO_LOG.warning("Empty eval: {}".format(os.path.basename(eval_path)))
            continue

        # Summary row layout: filename, matches, compares, accuracy.
        _, matches, compares, _ = rows[-1].split(',')
        te.add_tc(tc, int(matches), int(compares))

    print(te)
def postag(ef: ExperimentFiles, overwrite=False):
    """
    Run POS tagging for every tagger configuration of the experiment.

    A configuration is processed when the file at ``tagged()`` is missing,
    or unconditionally when ``overwrite`` is truthy. Under ``USE_CONDOR``
    the job is submitted through ``condorify``; otherwise it runs
    in-process, using ``intent.commands.enrich.enrich`` when the
    configuration's method equals ``ARG_POS_CLASS`` and ``do_projection``
    for any other method. When condor jobs were submitted, the function
    blocks on the condor wait call before returning.

    :param ef: experiment layout providing ``tagger_configs()``.
    :param overwrite: re-tag even if the tagged file already exists.
    """
    did_tag = False

    for config in ef.tagger_configs():
        # Skip configurations that are already tagged, unless overwriting.
        if os.path.exists(config.tagged()) and not overwrite:
            continue

        if USE_CONDOR:
            condorify(*config.tag_args(True))
        else:
            local_kwargs = config.tag_args(False)
            # Pick the in-process entry point that matches the tagging method.
            run_tagging = (intent.commands.enrich.enrich
                           if config.method == ARG_POS_CLASS
                           else do_projection)
            run_tagging(**local_kwargs)

        did_tag = True

    # Only wait when condor jobs were actually submitted.
    if not (USE_CONDOR and did_tag):
        return

    if condor_email:
        condor.condor_wait_notify("Data has been tagged.",
                                  condor_email,
                                  "CONDOR: Tagging Complete")
    else:
        condor.condor_wait()