Exemplo n.º 1
0
def filter(ef: ExperimentFiles, overwrite = False):
    """
    Run the filtering step for each language of the experiment.

    For every language in ``ef.langs`` the original file is filtered into
    the filtered-file path. Output files that already exist are skipped
    unless ``overwrite`` is true. The work is done in-process through
    ``filter_corpus`` or, when ``USE_CONDOR`` is set, submitted through
    ``condorify``; in the latter case the function waits for the
    submitted jobs before returning.
    """
    require_flags = ['--require-aln',
                     '--require-lang',
                     '--require-gloss',
                     '--require-trans']
    did_filter = False

    for lang in ef.langs:
        src = ef.orig(lang)
        dst = ef.filtered(lang)

        # Leave existing output untouched unless overwriting was requested.
        if os.path.exists(dst) and not overwrite:
            continue

        # Announce the step once, right before the first language is handled.
        if not did_filter:
            REPRO_LOG.log(NORM_LEVEL, "Filtering ODIN data.")

        if USE_CONDOR:
            REPRO_LOG.log(NORM_LEVEL, "Filtering {}...".format(lang))
            condor_args = ['filter'] + require_flags + [src, dst]
            condorify(condor_args, ef._filter_dir(condor=True), ef.filtered(lang, True))
        else:
            filter_corpus([src], dst,
                          require_aln=True,
                          require_lang=True,
                          require_gloss=True,
                          require_trans=True)

        did_filter = True

    # Nothing was filtered: nothing to wait for and nothing to report.
    if not did_filter:
        return

    # With condor, block until the submitted tasks have completed.
    if USE_CONDOR:
        if condor_email:
            condor.condor_wait_notify('Filtering of languages performed.', condor_email, "Filtration Done")
        else:
            condor.condor_wait()

    REPRO_LOG.log(NORM_LEVEL, "Filtration complete.")
Exemplo n.º 2
0
def extract_pos(ef: ExperimentFiles, overwrite=False):
    """
    Run tagger extraction for each tagger configuration of the experiment.

    A configuration is processed when its ``tagger_path()`` does not exist
    yet, or unconditionally when ``overwrite`` is true. With ``USE_CONDOR``
    the extraction is submitted through ``condorify`` and the function
    waits for the jobs afterwards; otherwise ``extract_from_xigt`` is
    called in-process.
    """
    did_extract = False

    for tc in ef.tagger_configs():
        # Skip taggers that already exist unless overwriting was requested.
        if os.path.exists(tc.tagger_path()) and not overwrite:
            continue

        if not USE_CONDOR:
            positional, keywords = tc.extract_args(False)
            extract_from_xigt(positional, **keywords)
        else:
            condor_args = tc.extract_args(True)
            condorify(*condor_args)

        did_extract = True

    # Only wait on condor when at least one job was actually submitted.
    if not (did_extract and USE_CONDOR):
        return

    if condor_email:
        condor.condor_wait_notify("Taggers have been extracted.", condor_email, "CONDOR: POS Tagger Extraction Complete")
    else:
        condor.condor_wait()
Exemplo n.º 3
0
def enrich(ef: ExperimentFiles, overwrite=False, parse=True):
    """
    Enrich the files using all types of word alignment, and tag/parse the translation line.

    For every language in ``ef.langs`` the filtered file is enriched into
    the enriched-file path. Existing output is skipped unless ``overwrite``
    is true.

    :param ef: the experiment's file layout; supplies ``langs`` and the
               filtered/enriched paths.
    :param overwrite: re-create enriched files even when they already exist.
    :param parse: when true, ``--parse trans`` is appended to the arguments
                  of the condor job.
    """

    enrichment_performed = False
    for lang in ef.langs:

        filtered_f = ef.filtered(lang)
        enriched_f = ef.enriched(lang)

        if not os.path.exists(enriched_f) or overwrite:

            # Notify user of enrichment if at least one file is being enriched.
            if not enrichment_performed:
                REPRO_LOG.log(NORM_LEVEL, "Enriching ODIN data.")

            if not USE_CONDOR:
                enrich_args = {ALN_VAR: aln_methods,
                               POS_VAR: [ARG_POS_TRANS],
                               ARG_INFILE: filtered_f,
                               ARG_OUTFILE: enriched_f}
                # NOTE(review): `parse` is only honored on the condor path;
                # the keyword that enables parsing for the in-process call is
                # not visible from here -- confirm and add it if needed.
                #
                # The arguments must actually be handed to the command;
                # calling it bare would ignore the input/output paths.
                intent.commands.enrich.enrich(**enrich_args)
            else:
                args = ['enrich',
                        '--align', ','.join(aln_methods),
                        '--pos', ARG_POS_TRANS,
                        filtered_f, enriched_f]
                if parse:
                    args += ['--parse', 'trans']

                condorify(args, ef._enriched_dir(condor=True), ef.enriched(lang, True))

            enrichment_performed = True

    # If we're using condor, wait until all the
    # tasks for this step have completed.
    if USE_CONDOR and enrichment_performed:
        if condor_email:
            condor.condor_wait_notify('Enrichment of languages performed.', condor_email, "Enrichment Done")
        else:
            condor.condor_wait()

    if enrichment_performed:
        REPRO_LOG.log(NORM_LEVEL, "Enrichment complete.")
Exemplo n.º 4
0
def eval_taggers_igt(ef: ExperimentFiles, overwrite=False):
    """
    Evaluate every tagger configuration on IGT data and print a summary.

    First, each configuration whose ``rgigt_eval()`` file is missing (or
    every configuration, when ``overwrite`` is true) is evaluated, either
    through ``condorify`` when ``USE_CONDOR`` is set or in-process through
    ``evaluate_intent``. Then the last non-blank line of each eval file is
    read as ``filename,matches,compares,acc`` and the match/compare counts
    are collected into a ``TaggerEval``, which is printed.

    :param ef: the experiment's file layout; supplies ``tagger_configs()``.
    :param overwrite: re-run evaluations even when the eval file exists.
    """
    evaluation_performed = False
    for tc in ef.tagger_configs():

        if not os.path.exists(tc.rgigt_eval()) or overwrite:

            if not evaluation_performed:
                REPRO_LOG.log(NORM_LEVEL, "Evaluating POS taggers on IGT data.")

            if USE_CONDOR:
                condorify(*tc.eval_igt_pos_args(True))
            else:
                args, kwargs = tc.eval_igt_pos_args(False)
                evaluate_intent(args, **kwargs)

            evaluation_performed = True

    # With condor, block until the submitted evaluations have finished.
    if USE_CONDOR and evaluation_performed:
        if condor_email:
            condor.condor_wait_notify("IGT Evaluation Complete", condor_email, subject="IGT POS Evaluation Complete")
        else:
            condor.condor_wait()

    # -------------------------------------------
    # Now, consolidate all of the different methods and report.
    # -------------------------------------------
    te = TaggerEval()
    for tc in ef.tagger_configs():
        eval_path = tc.rgigt_eval()

        if not os.path.exists(eval_path):
            # `warning` rather than the deprecated `warn` alias.
            REPRO_LOG.warning("Missing eval: {}".format(os.path.basename(eval_path)))
            continue

        # Keep only non-blank lines so that trailing newlines do not hide
        # the summary line, which is expected to be the last one.
        with open(eval_path, 'r') as f:
            rows = [line.strip() for line in f if line.strip()]

        # An empty file has no summary line to parse; report it and move
        # on instead of failing on the lookup of the last line.
        if not rows:
            REPRO_LOG.warning("Empty eval: {}".format(os.path.basename(eval_path)))
            continue

        _filename, matches, compares, _acc = rows[-1].split(',')
        te.add_tc(tc, int(matches), int(compares))

    print(te)
Exemplo n.º 5
0
def postag(ef: ExperimentFiles, overwrite=False):
    """
    Run tagging for each tagger configuration of the experiment.

    A configuration is processed when its ``tagged()`` file does not exist
    yet, or unconditionally when ``overwrite`` is true. With ``USE_CONDOR``
    the job is submitted through ``condorify`` and the function waits for
    the jobs afterwards. Otherwise the work is done in-process:
    configurations whose ``method`` equals ``ARG_POS_CLASS`` go through
    ``intent.commands.enrich.enrich`` and all others through
    ``do_projection``.
    """
    did_tag = False

    for tc in ef.tagger_configs():
        # Skip outputs that already exist unless overwriting was requested.
        if os.path.exists(tc.tagged()) and not overwrite:
            continue

        if not USE_CONDOR:
            tag_kwargs = tc.tag_args(False)
            # Pick the in-process implementation based on the tagging method.
            run_tagger = intent.commands.enrich.enrich if tc.method == ARG_POS_CLASS else do_projection
            run_tagger(**tag_kwargs)
        else:
            condor_args = tc.tag_args(True)
            condorify(*condor_args)

        did_tag = True

    # Only wait on condor when at least one job was actually submitted.
    if not (USE_CONDOR and did_tag):
        return

    if condor_email:
        condor.condor_wait_notify("Data has been tagged.", condor_email, "CONDOR: Tagging Complete")
    else:
        condor.condor_wait()
Exemplo n.º 6
0
# Track whether any language had to be filtered. The flag is initialised
# up front so the condor check after the loop is well-defined even when
# every filtered file already exists and the loop body never sets it.
filtration_done = False
for lang in ef.langs:
    orig_f = ef.get_original_file(lang)
    filtered_f = ef.get_filtered_file(lang)

    # Only filter languages whose filtered file is not there yet.
    if not os.path.exists(filtered_f):
        filtration_done = True
        if USE_CONDOR:
            # Hand the filter command to run_cmd together with the
            # condor prefix/name for this language.
            model_prefix, name = ef.get_condor_filter(lang)
            run_cmd([p3path, intent_script, 'filter',
                     '--require-aln', '--require-gloss', '--require-trans', '--require-lang',
                     orig_f, filtered_f], model_prefix, name, False)
        else:
            filter_corpus([orig_f], filtered_f, require_lang=True, require_gloss=True, require_trans=True, require_aln=True)

# With condor, wait for the submitted jobs (and notify) before moving on.
if USE_CONDOR and filtration_done:
    condor_wait_notify("Data has been filtered.", email_address, "CONDOR: Filtration complete.")


# -------------------------------------------
# 2) Enriched data
# -------------------------------------------
enrichment_done = False
for lang in ef.langs:
    filtered_f = ef.get_filtered_file(lang)
    enriched_f = ef.get_enriched_file(lang)

    if not os.path.exists(enriched_f):
        enrichment_done = True
        if USE_CONDOR:
            model_prefix, name = ef.get_condor_enrich(lang)
            run_cmd([p3path, intent_script, 'enrich',