def filter(ef: ExperimentFiles, overwrite = False):
    """
    Run the filtering step for every language of the experiment.

    For each language in ``ef.langs`` the filtered output path is looked up
    with ``ef.filtered(lang)``. A language is processed when that path does
    not exist yet, or when *overwrite* is true. Processing happens either
    in-process through ``filter_corpus`` or, when ``USE_CONDOR`` is set, by
    handing an equivalent ``filter`` argument list to ``condorify``.

    :param ef: Experiment file layout; supplies ``langs``, ``orig()``,
               ``filtered()`` and ``_filter_dir()``.
    :param overwrite: When true, re-filter languages whose filtered file
                      already exists.
    """
    any_filtered = False

    for lang in ef.langs:
        source_file = ef.orig(lang)
        target_file = ef.filtered(lang)

        # Leave existing output alone unless overwriting was requested.
        if os.path.exists(target_file) and not overwrite:
            continue

        # Announce the step only once, right before the first language
        # that actually needs work.
        if not any_filtered:
            REPRO_LOG.log(NORM_LEVEL, "Filtering ODIN data.")

        if USE_CONDOR:
            REPRO_LOG.log(NORM_LEVEL, "Filtering {}...".format(lang))
            condor_args = [
                'filter',
                '--require-aln',
                '--require-lang',
                '--require-gloss',
                '--require-trans',
                source_file,
                target_file,
            ]
            # NOTE(review): condorify is defined elsewhere; presumably it
            # submits the command as a condor job -- confirm.
            condorify(condor_args,
                      ef._filter_dir(condor=True),
                      ef.filtered(lang, True))
        else:
            filter_corpus([source_file], target_file,
                          require_aln=True,
                          require_lang=True,
                          require_gloss=True,
                          require_trans=True)

        any_filtered = True

    # Nothing was filtered, so there is nothing to wait for or report.
    if not any_filtered:
        return

    # With condor, block until the tasks for this step have completed,
    # sending a notification when an address is configured.
    if USE_CONDOR:
        if condor_email:
            condor.condor_wait_notify('Filtering of languages performed.',
                                      condor_email,
                                      "Filtration Done")
        else:
            condor.condor_wait()

    REPRO_LOG.log(NORM_LEVEL, "Filtration complete.")
# 1) Filter the data # ------------------------------------------- filtration_done = False for lang in ef.langs: orig_f = ef.get_original_file(lang) filtered_f = ef.get_filtered_file(lang) if not os.path.exists(filtered_f): filtration_done = True if USE_CONDOR: model_prefix, name = ef.get_condor_filter(lang) run_cmd([p3path, intent_script, 'filter', '--require-aln', '--require-gloss', '--require-trans', '--require-lang', orig_f, filtered_f], model_prefix, name, False) else: filter_corpus([orig_f], filtered_f, require_lang=True, require_gloss=True, require_trans=True, require_aln=True) if USE_CONDOR and filtration_done: condor_wait_notify("Data has been filtered.", email_address, "CONDOR: Filtration complete.") # ------------------------------------------- # 2) Enriched data # ------------------------------------------- enrichment_done = False for lang in ef.langs: filtered_f = ef.get_filtered_file(lang) enriched_f = ef.get_enriched_file(lang) if not os.path.exists(enriched_f): enrichment_done = True
# ENRICH if args.subcommand == CMD_ENRICH: enrich(**vars(args)) # STATS elif args.subcommand == CMD_STATS: igt_stats(flatten_list(args.FILE), type='xigt', show_filename=True) # SPLIT elif args.subcommand == CMD_SPLIT: split_corpus(flatten_list(args.FILE), args.train, args.dev, args.test, prefix=args.prefix, overwrite=args.overwrite, nfold=args.nfold) # FILTER elif args.subcommand == CMD_FILTER: filter_corpus(flatten_list(getattr(args, ARG_INFILE)), getattr(args, ARG_OUTFILE), **vars(args)) # EXTRACT elif args.subcommand == CMD_EXTRACT: extract_from_xigt(input_filelist=flatten_list(args.FILE), **vars(args)) # EVAL elif args.subcommand == CMD_EVAL: evaluate_intent(flatten_list(args.FILE), eval_alignment=args.alignment, eval_ds=args.ds_projection, eval_posproj=args.pos_projection, classifier_path=args.classifier, classifier_feats=args.classifier_feats, eval_tagger=args.pos_tagger, gold_tagmap=args.tagmap_gold, trans_tagmap=args.tagmap_trans, outpath=args.output)