def write_to_cumulog(clargs=None):
    """Write config infos and accuracy measures to cumulative log"""
    inf = {}
    inf["run_id"] = prep.find_run_id()
    try:
        inf["revnum"] = prep.find_git_revnum()
    except OSError:
        print "- Can't get git revision number (OSError)"
        inf["revnum"] = "XXXXX"
    if clargs.comment is not None and clargs.comment != "":
        inf["run_comment"] = clargs.comment
    else:
        inf["run_comment"] = tc.COMMENT
    if clargs.maxdista is not None:
        inf["maxdista"] = clargs.maxdsita
    else:
        inf["maxdista"] = tc.maxdista
    if clargs.distaw is not None:
        inf["distaw"] = clargs.distaw
    else:
        inf["distaw"] = tc.distaw
    if clargs.lmw is not None:
        inf["lmw"] = clargs.lmw
    else:
        inf["lmw"] = tc.lmw
    if clargs.lmpath is not None:
        inf["lmpath"] = os.path.basename(clargs.lmpath)
    else:
        inf["lmpath"] = os.path.basename(tc.lmpath)
    if clargs.lm_window is not None:
        inf["lm_window"] = tc.lm_window
    else:
        inf["lm_window"] = tc.lm_window
    inf["increment_norm"] = tc.increment_norm
    inf["accept_all_IV_regex_outputs"] = tc.accept_all_IV_regex_outputs
    outhead = "Run ID [{0}], RevNum [{1}] {2}\n".format(
        inf["run_id"], inf["revnum"], "=" * 50)
    with codecs.open(tc.EVALFN.format(prep.find_run_id()), "r",
                     "utf8") as done_res:
        with codecs.open(tc.CUMULOG, "a", "utf8") as cumu_res:
            cumu_res.write(outhead)
            cumu_res.write("RunComment: {}\n".format(inf["run_comment"]))
            for key in [
                    "maxdista", "distaw", "lmw", "lmpath", "increment_norm"
            ]:
                cumu_res.write("{}: {}\n".format(key, inf[key]))
            cumu_res.write("".join(done_res.readlines()[-4:]))
Exemplo n.º 2
0
def write_out(corr_dico):
    """Write out the final hash in a format that matches reference output format"""
    # Tweet ids must appear in the reference order given by tc.id_order.
    with codecs.open(tc.id_order, "r", "utf8") as idor:
        ordered_ids = [line.strip() for line in idor]
    with codecs.open(tc.OUTFN.format(prep.find_run_id()), "w",
                     "utf8") as outfh:
        for tweet_id in ordered_ids:
            # Skip ids with no corrections recorded for them.
            if tweet_id not in corr_dico:
                continue
            outfh.write("%s\n" % tweet_id)
            # One tab-indented "original<TAB>correction" line per OOV.
            for oov_corr in corr_dico[tweet_id]:
                outfh.write("\t%s\t%s\n" % (oov_corr[0], oov_corr[1]))
Exemplo n.º 3
0
def preliminary_preps():
    """Set up logger and read command line arguments.

    Returns (lgr, lfh, clargs): the logger, its file handler (so callers
    can remove it at shutdown), and the parsed command-line options.
    Command-line options override the corresponding tc config values.
    """
    # logger
    logfile_name = os.path.join(tc.LOGDIR, "run_%s.log" % prep.find_run_id())
    lgr, lfh = prep.set_log(__name__, logfile_name, False)
    # cl options
    clargs = set_option_parser()
    if clargs.tag is not None and clargs.tag:
        tc.TAG = True
    elif clargs is not None and not clargs.tag:
        # NOTE(review): 'clargs is not None' is always true here; possibly
        # intended as 'clargs.tag is not None' -- confirm against parser.
        tc.TAG = False
    # BUGFIX: these overrides used to be elif branches of the tag chain
    # above, so they could never fire (one of the tag branches always
    # matched first -- the old TODO noted they had no effect). Apply them
    # independently so the command line actually updates tc.
    if clargs.maxdista is not None:
        tc.maxdista = clargs.maxdista
    if clargs.distaw is not None:
        tc.distaw = clargs.distaw
    if clargs.lmw is not None:
        tc.lmw = clargs.lmw
    return lgr, lfh, clargs
Exemplo n.º 4
0
# -*- coding: utf-8 -*-
import codecs
from collections import defaultdict
import os
import re

import tnconfig as tc
import preparation as prep

# logging: module-level logger shared by everything in this file; the log
# file name is keyed to the current run id.
logfile_name = os.path.join(tc.LOGDIR, "run_%s.log" % prep.find_run_id())
lgr, lfh = prep.set_log(__name__, logfile_name, False)

class EdScoreMatrix:
    """Methods to read cost matrix from module in arg cost_module
       and to find costs for individual character-edits."""

    def __init__(self, cost_module):
        # cost_module: module-like object exposing row_names, col_names
        # and costs attributes (see read_cost_matrix).
        self.costm = cost_module

    # Class-level defaults; presumably replaced per instance once the
    # matrix has been read -- TODO confirm against the full class body.
    row_names = None
    col_names = None
    matrix_conts = None
    # Spanish accented vowels plus n-tilde and u-umlaut (unicode escapes).
    accented_chars = [u'\xe1', u'\xe9', u'\xed', u'\xf1', u'\xf3', u'\xfa', u'\xfc']
    matrix_stats = {"max" : None, "min" : None, "ave" : None}

    def read_cost_matrix(self):
        """Read cost matrix into a hash. Set instance values for them"""
        # NOTE(review): the visible body only unpacks the cost module into
        # locals and sets no instance attributes; the method appears
        # truncated in this excerpt -- verify against the full source.
        row_names = self.costm.row_names.strip().split("\t")
        col_names = self.costm.col_names.strip().split("\t")
        costs = self.costm.costs
Exemplo n.º 5
0
import codecs
import inspect
import os
import logging
import psutil
import sys
import time

import preparation as prep
import tnconfig as tc

# logging: module-level logger; increase=True bumps the run id counter, so
# this module is presumably imported once per run -- TODO confirm.
logfile_name = os.path.join(tc.LOGDIR, "run_%s.log" % prep.find_run_id(increase=True))
lgr, lfh = prep.set_log(__name__, logfile_name, False)


def check_server(port):
    """Check if Freeling server is running on port.

    Returns True iff some local process whose name starts with "analyze"
    has its first connection bound to the given local port.
    """
    # NOTE: uses the legacy psutil API (get_process_list/get_connections),
    # matching the psutil version this project was written against.
    flprocs = [p for p in psutil.get_process_list()
               if p.name.startswith("analyze")]
    for flp in flprocs:
        flpcons = flp.get_connections()
        # Only the first connection is inspected, as in the original design.
        if len(flpcons) > 0 and flpcons[0].local_address[1] == port:
            # Return directly instead of setting a flag and breaking out.
            return True
    return False

def start_server(servtype="default"):
Exemplo n.º 6
0
def main():
    """End-to-end normalization run: prep, normalize, write out, evaluate."""

    global lgr
    global tweet
    global clargs
    global ref_OOVs  # debug
    global all_tweets  # debug
    global safe_rules
    global rerules
    global abbrules
    global rinrules
    global ivs
    global ent_hash
    global entmgr
    global ppro
    global edimgr
    global stpwords
    global outdico
    global all_tweeto
    all_tweets = []  # debug

    # prep ---------------------------------------------------------------------
    lgr, lfh, clargs = preliminary_preps()

    # processing ---------------------------------------------------------------

    # Check if need to delete in-memory IV and entities dicos (if just changed config)
    #ok = raw_input("Need to reset the IV dictionary (if changed tc.merged_iv_and_entities)? [y] to reset\n")
    #if ok == "y":
    #    print "- Deleting 'ivs' (Imerged IV + ent) in current scope"
    #    delattr(sys.modules[__name__], "ivs")
    #    if "ivs_only" in dir(sys.modules["__main__"]):
    #        print "- Deleting 'ivs_only' (IV) in current scope"
    #        delattr(sys.modules[__name__], "ivs_only")

    corpusname = {True: "test", False: "dev"}
    print "Corpus: {0}".format(corpusname[tc.EVAL])
    print "Comment: {0}".format(tc.COMMENT)

    print "Start {0}".format(time.asctime(time.localtime()))
    print "Run ID: %s" % prep.find_run_id()
    # Git revision may be unavailable (no git binary / not a repo); log anyway.
    try:
        lgr.info("Run {0} START | Rev [{1}] {2}".format(
            tc.RUNID, prep.find_git_revnum(), "=" * 60))
    except OSError:
        lgr.info("Run {0} START | Rev [{1}] {2}".format(
            tc.RUNID, "XXXX", "=" * 60))

    print "= main: preliminary preps"
    id_order = prep.find_id_order()
    ref_OOVs = prep.find_ref_OOVs(tc.ANNOTS)
    textdico = prep.grab_texts(tc.TEXTS)

    call_freeling(textdico)

    print "= main: load analyzers"
    ppro, safe_rules, rerules, abbrules, rinrules = load_preprocessing()
    ent_hash = load_entities()
    # NOTE(review): 'ivs' is declared global but not assigned in this
    # function before use; presumably loaded at module level -- confirm.
    if tc.merge_iv_and_entities:
        ivs = merge_iv_and_entities(ivs, ent_hash)
    edimgr = load_distance_editor()
    slmmgr, binslm = load_lm()
    # NOTE(review): 'lmmgr' is not defined in this function (load_lm
    # returned slmmgr, binslm); looks like it relies on a module global
    # -- verify, may be a latent NameError.
    entmgr = load_entity_manager(ent_hash, ivs, edimgr, lmmgr)
    # NOTE(review): 'stopwords' is not imported in the visible code
    # (NLTK corpus reader?) -- confirm.
    stpwords = stopwords.words('english')

    print "= twittero: creating Tweet instances"
    all_tweeto, outdico = parse_tweets(textdico)

    print "= main: create baseline"
    baseline_dico = get_baseline_results(all_tweeto)

    if not tc.BASELINE:
        print "= main: NORMALIZATION"
        x = 0
        for tid in all_tweeto:
            lgr.debug("NORMALIZING, TID [{0}]".format(tid))
            tweet = all_tweeto[tid]
            for tok in tweet.toks:
                # Only OOV tokens are normalization targets.
                if not isinstance(tok, OOV):
                    continue
                oov = tok  # easier label
                if tc.activate_prepro:
                    # separate prepro components switched on/off inside preprocess(oov)
                    preprocess(oov)
                if tc.use_ed:
                    create_edit_candidates(oov)
                    find_lm_scores(oov)
                rank_candidates(oov)
                rank_before_entities(oov)
                if tc.use_entities:
                    cf_with_ent(oov)
            x += 1
            #if x == 10: break #debug

            # Progress report every 100 tweets.
            if x % 100 == 0:
                print("Done {0} tweets, {1}".format(
                    x, time.asctime(time.localtime())))

        # Extra step to add more entity candidates
        if tc.use_lmall:
            print "= Adding extra entities, {0}".format(
                time.asctime(time.localtime()))
            add_extra_entities()
            print "= Done"

        #outdico = populate_outdico(all_tweeto, outdico) # old, now use populate_easy

        if tc.generic_workflow or tc.use_entities:  # Doesn't cover all cases. Enough for paper-tests
            wf = "aft"
        else:
            wf = "bef"
        outdico = populate_easy(all_tweeto, outdico, wf)

    # write-out ----------------------------------------------------------------
    print "= writer"
    lgr.info("Writing out")
    if tc.BASELINE:
        chosen_outdico = baseline_dico
    else:
        chosen_outdico = outdico
    write_out(chosen_outdico)

    # evaluation ---------------------------------------------------------------
    print "= evaluation"
    lgr.info("Running evaluation")
    neval.main(tc.ANNOTS, tc.OUTFN.format(prep.find_run_id()))
    write_to_cumulog(clargs=clargs)

    lgr.removeHandler(lfh)

    print "End {0}".format(time.asctime(time.localtime()))
Exemplo n.º 7
0
def write_to_cumulog(clargs=None):
    """Write config infos and accuracy measures to cumulative log"""
    global golden_set_res
    global all_tweeto
    inf = {}
    inf["run_id"] = prep.find_run_id()
    try:
        inf["revnum"] = prep.find_git_revnum()
    except OSError:
        print "- Can't get git revision number (OSError)"
        inf["revnum"] = "XXXXX"
    if clargs.comment is not None and clargs.comment != "":
        inf["run_comment"] = clargs.comment
    else:
        inf["run_comment"] = tc.COMMENT
    inf["generic_lev"] = tc.generic_lev
    if clargs.maxdista is not None:
        inf["maxdista"] = clargs.maxdsita
    else:
        inf["maxdista"] = tc.maxdista
    if clargs.distaw is not None:
        inf["distaw"] = clargs.distaw
    else:
        inf["distaw"] = tc.distaw
    if clargs.lmw is not None:
        inf["lmw"] = clargs.lmw
    else:
        inf["lmw"] = tc.lmw
    if clargs.lmpath is not None:
        inf["lmpath"] = os.path.basename(clargs.lmpath)
    else:
        inf["lmpath"] = os.path.basename(tc.lmpath)
    if clargs.lm_window is not None:
        inf["lm_window"] = tc.lm_window
    else:
        inf["lm_window"] = tc.lm_window
    inf["increment_norm"] = tc.increment_norm
    inf["accept_all_IV_regex_outputs"] = tc.accept_all_IV_regex_outputs
    inf["merge_iv_and_entities"] = tc.merge_iv_and_entities
    inf["accent_check_in_regexes"] = tc.accent_check_in_regexes
    if tc.EVAL:
        inf["corpus"] = "test"
    else:
        inf["corpus"] = "dev"

    golden_set_res = tnstats.hash_gold_standard(tc.ANNOTS)
    coverage_info, coverage_stats = tnstats.get_upper_bound(
        golden_set_res, all_tweeto.values())
    envs_dico = {"W": "work", "H": "home", "S": "hslt-server"}
    inf["enviro"] = envs_dico[tc.ENV]
    wf_dico = {True: "lm_all", False: "lm_one"}
    inf["lm_app"] = wf_dico[tc.use_lmall]
    outhead = "== Run ID [{0}], RevNum [{1}] {2}\n".format(
        inf["run_id"], inf["revnum"], "=" * 48)
    with codecs.open(tc.EVALFN.format(prep.find_run_id()), "r",
                     "utf8") as done_res:
        with codecs.open(tc.CUMULOG, "a", "utf8") as cumu_res:
            cumu_res.write(outhead)
            cumu_res.write("RunComment: {0}\n".format(inf["run_comment"]))
            for key in [
                    "enviro", "corpus", "lm_app", "generic_lev", "maxdista",
                    "distaw", "accent_check_in_regexes", "lmw", "lmpath",
                    "increment_norm", "accept_all_IV_regex_outputs",
                    "merge_iv_and_entities"
            ]:
                cumu_res.write("- {0}: {1}\n".format(key, inf[key]))
            iso_cumu_settings_list = [
                'tc.no_postprocessing', 'tc.activate_prepro',
                'tc.safelist_end', 'tc.abbrev_end', 'tc.use_regexes',
                'tc.use_ed', 'tc.context_sens_ed', 'tc.use_entities'
            ]
            iso_cumu_settings_dict = dict(
                (name, eval(name)) for name in iso_cumu_settings_list)
            cumu_res.write("+ Isolating/Cumulative Module Settings +\n")
            for setting in iso_cumu_settings_dict:
                cumu_res.write("- {0}: {1}\n".format(
                    setting, iso_cumu_settings_dict[setting]))
            cumu_res.write("+ Upper Bound +\n")
            for stat in coverage_stats:
                cumu_res.write("- {0}: {1}\n".format(stat,
                                                     coverage_stats[stat]))

            cumu_res.write("".join(done_res.readlines()[-4:]))
            done_res.seek(0, 0)
        print "+ Results +"
        print "".join(done_res.readlines()[-4:])
def main():
    """End-to-end normalization run (simpler variant: no entity step)."""

    global lgr
    global tweet
    global clargs
    global ref_OOVs  # debug
    global all_tweets  # debug
    global safe_rules
    global rerules
    global ppro
    global edimgr
    global outdico
    all_tweets = []  # debug

    # prep ---------------------------------------------------------------------
    lgr, lfh, clargs = preliminary_preps()

    # processing ---------------------------------------------------------------
    print "Start {0}".format(time.asctime(time.localtime()))
    print "Run ID: %s" % prep.find_run_id()
    # NOTE(review): unlike the other main() variant, find_git_revnum() is
    # not guarded with try/except OSError here -- confirm git is always
    # available where this runs.
    lgr.info("Run {0} START | Rev [{1}] {2}".format(tc.RUNID,
                                                    prep.find_git_revnum(),
                                                    "=" * 60))

    print "= main: preliminary preps"
    id_order = prep.find_id_order()
    ref_OOVs = prep.find_ref_OOVs(tc.ANNOTS)
    textdico = prep.grab_texts(tc.TEXTS)

    call_freeling(textdico)

    print "= main: load analyzers"
    ppro, safe_rules, rerules = load_preprocessing()
    edimgr = load_distance_editor()
    slmmgr, binslm = load_lm()

    print "= twittero: creating Tweet instances"
    all_tweeto, outdico = parse_tweets(textdico)

    print "= main: create baseline"
    baseline_dico = get_baseline_results(all_tweeto)

    print "= main: NORMALIZATION"
    x = 0
    for tid in all_tweeto:
        lgr.debug("NORMALIZING, TID [{0}]".format(tid))
        tweet = all_tweeto[tid]
        for tok in tweet.toks:
            # Only OOV tokens are normalization targets.
            if not isinstance(tok, OOV):
                continue
            oov = tok  # easier label
            preprocess(oov)
            create_edit_candidates(oov)
            find_lm_scores(oov)
            rank_candidates(oov)
        x += 1
        # Progress report every 100 tweets.
        if x % 100 == 0:
            print("Done {0} tweets, {1}".format(x,
                                                time.asctime(
                                                    time.localtime())))

    outdico = populate_outdico(all_tweeto, outdico)

    # write-out ----------------------------------------------------------------
    print "= writer"
    lgr.info("Writing out")
    if tc.BASELINE:
        chosen_outdico = baseline_dico
    else:
        chosen_outdico = outdico
    write_out(chosen_outdico)

    # evaluation ---------------------------------------------------------------
    print "= evaluation"
    lgr.info("Running evaluation")
    neval.main(tc.ANNOTS, tc.OUTFN.format(prep.find_run_id()))
    write_to_cumulog(clargs=clargs)

    lgr.removeHandler(lfh)

    print "End {0}".format(time.asctime(time.localtime()))