예제 #1
0
def omw_fix_dup(cli, args):
    """Report OMW synsets with duplicated definitions and emit SQL to fix them.

    Selects synset IDs from the OMW ``sense`` table (at most ``args.topk``
    rows), asks ``join_definitions`` for the merged definition and the list of
    duplicated definitions of each synset, and writes one
    ``DELETE FROM synset_def`` statement per duplicate to the report opened on
    ``args.output``.

    Args:
        cli: CLI application object; only ``cli.logger`` is used here.
        args: parsed arguments providing ``output`` and ``topk``.
    """
    rp = TextReport(args.output)
    omw = get_omw()
    c = Counter()
    with omw.ctx() as ctx:
        senses = ctx.sense.select(limit=args.topk, columns=('synset', ))
        # A synset usually has several senses, so deduplicate the IDs first
        synsetids = {s.synset for s in senses}
        rp.print("-- OMW synsets: {}\n".format(len(synsetids)))
        for raw_sid in synsetids:
            try:
                sid = SynsetID.from_string(raw_sid)
            except Exception:
                # Malformed IDs are skipped, but Ctrl-C / SystemExit must
                # still be able to stop the run (a bare except hid them).
                cli.logger.warning("Ignored synset ID: {}".format(raw_sid))
                continue
            ss = omw.get_synset(sid, ctx=ctx)
            fixed_def, dup_defs = join_definitions(ss)
            if dup_defs:
                c.count("Duplicated")
                # Lines starting with "--" are SQL comments, so the report
                # stays a runnable script.
                rp.print("-- Original {}: {}".format(ss.ID, ss.definition))
                rp.print("-- Fixed    {}: {}".format(ss.ID, fixed_def))
                for dup in dup_defs:
                    rp.print(
                        "DELETE FROM synset_def WHERE synset='{}' and def='{}';"
                        .format(ss.ID, to_sqlite_string(dup)))
                rp.print()
        c.summarise()
예제 #2
0
def manual_patch(cli, args):
    """Check manually fixed definitions against the definitions stored in OMW.

    ``args.input`` must point to a JSON file holding a list of objects with
    ``synset`` and ``definition`` keys. A warning block is printed for every
    entry whose fixed definition differs from the OMW definition once both
    have been passed through ``remove_puncs``.

    Raises:
        Exception: when ``args.input`` is empty or is not an existing file.
    """
    rp = TextReport()
    omw = get_omw()
    patch_path = args.input
    if not (patch_path and os.path.isfile(patch_path)):
        raise Exception("Input file could not be found")
    with open(patch_path, 'r') as infile, omw.ctx() as ctx:
        entries = json.load(infile)
        for entry in entries:
            sid = entry['synset']
            fixed_def = entry['definition']
            ss = omw.get_synset(sid, ctx=ctx)
            stored = remove_puncs(ss.definition)
            proposed = remove_puncs(fixed_def)
            if proposed == stored:
                # Same text apart from what remove_puncs strips: nothing to report
                continue
            rp.header("WARNING:", sid)
            rp.print(ss.definition)
            rp.print(fixed_def)
예제 #3
0
def verify_patch(cli, args):
    """Verify a YAML file of definition patches against OMW.

    Every patch is resolved from its sensekey to a synset ID through WordNet,
    then the patch's original definition is compared with the OMW definition
    (ignoring one trailing ``;``). Matching and differing patches are printed
    to the report and counted; patches that cannot be resolved are logged and
    counted as "Not found".

    Args:
        cli: CLI application object (unused here).
        args: parsed arguments providing ``input``, the patch file path.

    Raises:
        Exception: when ``args.input`` is empty or is not an existing file.
    """
    rp = TextReport()
    c = Counter()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Patch file not found")
    # load patches
    with open(args.input) as infile:
        patches = [DefPatch.from_dict(p) for p in yaml.safe_load(infile)]
    rp.print("Found {} patches.".format(len(patches)))
    # Validate against GWN-30
    # gwn = get_gwn()  # don't use GWN, for now
    omw = get_omw()
    wn = get_wn()
    with omw.ctx() as ctx, wn.ctx() as wnctx:
        for patch in patches:
            try:
                sid = wn.sk2sid(patch.sensekey, ctx=wnctx)
                if not sid:
                    raise Exception("sensekey `{}' does not exist.".format(
                        patch.sensekey))
                ss = omw.get_synset(sid, ctx=ctx)
                # OMW definitions may carry a trailing ';' that patches omit
                ssdef = ss.definition[:-1] if ss.definition.endswith(
                    ';') else ss.definition
                if patch.orig_def == ssdef:
                    c.count("Found")
                    rp.print("-", "{} [{}]".format(patch.orig_def,
                                                   patch.sensekey))
                    rp.print(" ", patch.new_def)
                    if patch.comment:
                        rp.print("C", patch.comment)
                else:
                    c.count("Found - diff")
                    rp.print("[DIFF]",
                             "{} [{}]".format(patch.orig_def, patch.sensekey))
                    rp.print("New:  ",
                             "{} [{}]".format(patch.new_def, patch.sensekey))
                    rp.print("      ", ssdef)
                    rp.print("Note: ", patch.comment)
            except Exception:
                # Only ordinary errors mean "not found"; KeyboardInterrupt and
                # SystemExit must propagate instead of being counted.
                getLogger().warning("sensekey `{}' couldn't be found".format(
                    patch.sensekey))
                c.count("Not found")
                continue
        c.summarise(report=rp)
예제 #4
0
def find_omw_typo(cli, args):
    """List or patch English OMW definitions that match known typo patterns.

    The candidates are the ``synset_def`` rows selected by the LIKE patterns
    in the query below. With ``args.action == 'list'`` each row is printed
    next to its ``fix_typo`` result; with ``args.action == 'patch'`` an SQL
    UPDATE statement per row is written to the report on ``args.output``.
    Any other action does nothing.
    """
    omw = get_omw()
    with omw.ctx() as ctx:
        candidates = ctx.synset_def.select(
            "lang='eng' and (def like '% )%' or def like '%  %' or def like '% e.g.' or def like '% ,%' or def like '%:')"
        )
        action = args.action
        if action == 'patch':
            patch_script = TextReport(args.output)
            for row in candidates:
                repaired = fix_typo(row._2)
                orig_line = "-- Orig : {} [{}]".format(row._2, row.synset)
                patch_script.writeline(orig_line)
                fixed_line = "-- Fixed: {}".format(repaired)
                patch_script.writeline(fixed_line)
                statement = "UPDATE synset_def SET def = '{}' WHERE synset='{}' AND def='{}';\n".format(
                    to_sqlite_string(repaired), row.synset,
                    to_sqlite_string(row._2))
                patch_script.writeline(statement)
        elif action == 'list':
            print("Found {} definitions with typo".format(len(candidates)))
            for row in candidates:
                print(row)
                repaired_repr = repr(fix_typo(row._2))
                print("Fixed: {}".format(repaired_repr))
예제 #5
0
파일: extract.py 프로젝트: letuananh/omwtk
def extract_omw(cli, args):
    ''' Extract English (lemma, synset, definition) entries from OMW, one POS at a time

    For each POS in 'nvar' the senses are queried by descending frequency
    (limited to args.topk rows when it is set), definitions sharing the same
    (lemma, synset) pair are joined with "; ", and summary counts are printed.
    When args.output is set, a TSV of the senses and a plain list of lemmas
    are written to files named after args.output and the POS.
    '''
    rp = TextReport()
    omw = get_omw()
    base_query = '''SELECT lemma, sense.synset, def as sdef FROM sense LEFT JOIN word ON sense.wordid = word.wordid and sense.lang=word.lang LEFT JOIN synset_def ON sense.synset = synset_def.synset AND sense.lang = synset_def.lang WHERE sense.lang='eng' AND word.lang='eng' AND synset_def.lang='eng' AND pos=? ORDER By freq DESC '''
    with omw.ctx() as ctx:
        for pos in 'nvar':
            rp.header("POS: {}".format(pos))
            query, params = base_query, [pos]
            if args.topk:
                query = query + ' LIMIT ?'
                params.append(args.topk)
            rows = ctx.select(query, params)
            senses = OrderedDict()
            potential_names = 0
            for lemma, sid, sdef in rows:
                # A lemma containing upper-case characters may be a proper name
                if lemma != lemma.lower():
                    potential_names += 1
                key = (lemma, sid)
                if key not in senses:
                    senses[key] = sdef
                else:
                    senses[key] += "; " + sdef
            print("Found {} sense in OMW".format(len(senses)))
            print("Potential name: {}".format(potential_names))
            if not args.output:
                continue
            out_path = "{}_{}.txt".format(args.output, pos)
            # One row per sense: lemma, synset ID, joined definition
            wordsenses = (key + (definition, ) for key, definition in senses.items())
            CSV.write_tsv(out_path, wordsenses, quoting=CSV.QUOTE_MINIMAL)
            print("Written to {}".format(out_path))
            lemma_out_path = "{}_{}_lemma.txt".format(args.output, pos)
            with open(lemma_out_path, 'w') as outfile:
                for lemma, _sid in senses:
                    outfile.write(lemma)
                    outfile.write('\n')
                print("Written to {}".format(lemma_out_path))
예제 #6
0
파일: ttl.py 프로젝트: letuananh/intsem.fx
def _safe_ratio(numerator, denominator):
    ''' Return numerator / denominator, or 0.0 when the denominator is zero '''
    return numerator / denominator if denominator else 0.0


def compare_ttls(cli, args):
    ''' Compare a TTL profile to a gold TTL profile

    Reads the gold profile (args.gold_profile) and the profile under
    evaluation (args.profile) in args.ttl_format, drops the sentence IDs
    listed in the args.ignore file, then reports recall, precision and F1
    computed from the tags returned by prepare_tags() and score().

    Output is controlled by:
      - args.batch: suppress the human-readable report
      - args.verbose: list per-sentence tag counts
      - args.org / args.cols: print one org-mode table row
      - args.debug: path of a debug report listing the missing senses

    A score whose denominator is zero (no tags found, or no true positive)
    is reported as 0 instead of raising ZeroDivisionError.
    '''
    rp = TextReport()
    omw = get_omw()
    ctx = omw.ctx()
    # Release the OMW context even when reading or scoring raises
    try:
        gold = None
        profile = None
        ignored_ids = []
        if args.ignore:
            ignored_ids = [x.strip() for x in read_file(args.ignore).splitlines() if x.strip()]
            getLogger().debug("Ignored sentence IDs: {}".format(', '.join(ignored_ids)))
        if args.gold_profile:
            gold = read_ttl(args.gold_profile, ttl_format=args.ttl_format)
            # remove ignored sentences
            if ignored_ids:
                for sid in ignored_ids:
                    gold.pop(sid, default=None)
            if not args.batch:
                rp.header("Gold sentences: {} | Loc: {}".format(len(gold), args.gold_profile))
            if args.verbose and not args.batch:
                for s in gold:
                    rp.print("Sent #{}: {} tags".format(s.ID, len(s.tags)))
        elif not args.batch:
            print("Oops, no gold!")
        # read profile
        if args.profile:
            profile = read_ttl(args.profile, ttl_format=args.ttl_format)
            if not args.batch:
                rp.header("Profile sentences: {} | Loc: {}".format(len(profile), args.profile))
            # remove ignored sentences
            if ignored_ids:
                for sid in ignored_ids:
                    profile.pop(sid, default=None)
            if not args.batch:
                rp.header("Profile sentences: {} (ignored: {}) | Loc: {}".format(len(profile), len(ignored_ids), args.profile))
            if args.verbose and not args.batch:
                for s in profile:
                    getLogger().debug("Profile/Sent #{}: {} tags".format(s.ID, len(s.tags)))
        elif not args.batch:
            print("Oops, no profile to evaluate")
        # calculate precision and recall
        if gold and profile:
            gold_tags, gold_tags_len, gold_ignored = prepare_tags(gold, args=args, nonsense=args.nonsense)
            profile_tags, profile_tags_len, profile_ignored = prepare_tags(profile, args=args, nonsense=args.nonsense)
            if gold_tags_len == 0:
                rp.print("WARNING: There was no tag found in the gold profile. Please make sure that the tags for comparison are *sentence level* tags")
            if profile_tags_len == 0:
                rp.print("WARNING: There was no tag found in the evaluating profile. Please make sure that the tags for comparison are *sentence level* tags")
            getLogger().debug("Gold tags: {}".format(gold_tags_len))
            getLogger().debug(list(gold_tags.items())[:5])
            getLogger().debug("Profile tags: {}".format(profile_tags_len))
            getLogger().debug(list(profile_tags.items())[:5])
            true_positive, false_negative = score(gold_tags, profile_tags, args=args)
            # Either tag count may be zero (warned about above), and both
            # scores may be zero, so every division is guarded.
            precision = _safe_ratio(len(true_positive), profile_tags_len)
            recall = _safe_ratio(len(true_positive), gold_tags_len)
            f1 = _safe_ratio(2 * precision * recall, precision + recall)
            getLogger().debug("TP: {}".format(len(true_positive)))
            getLogger().debug("FN: {}".format(len(false_negative)))
            getLogger().debug("Recall (TP/Gtags): {}".format(recall))
            getLogger().debug("Precision (TP/Ptags): {}".format(precision))
            getLogger().debug("F1 (2*p*r/(p+r)): {}".format(f1))
            rc_text = "{:.2f}%".format(recall * 100)
            pr_text = "{:.2f}%".format(precision * 100)
            f1_text = "{:.2f}%".format(f1 * 100)
            if not args.batch:
                rp.print("True positive: {}".format(len(true_positive)))
                rp.print("False Negative: {}".format(len(false_negative)))
                rp.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
                rp.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
                rp.print("Recall:    {}".format(rc_text))
                rp.print("Precision: {}".format(pr_text))
                rp.print("F1       : {}".format(f1_text))
            if args.org:
                # output org-mode
                columns = [rc_text, pr_text, f1_text]
                if args.cols:
                    columns = args.cols + columns
                rp.print('| {} |'.format(' | '.join(columns)))
            if args.debug:
                if not args.batch:
                    print("Debug file: {}".format(args.debug))
                debugfile = TextReport(args.debug)
                debugfile.print(".:: Table of content ::.")
                debugfile.print("")
                debugfile.print("[Missing senses]")
                debugfile.print("[By classes]")
                debugfile.print("[Summary]")
                debugfile.print("")
                # Cache synsets by label: the same synset can be missed many times
                ss_map = {}
                debugfile.header("[Missing senses]")
                for sid, cfrom, cto, label in sorted(false_negative):
                    if label not in ss_map:
                        ss = omw.get_synset(label, ctx=ctx)
                        ss_map[label] = ss
                    else:
                        ss = ss_map[label]
                    # get the surface form
                    surface = gold.get(sid).text[int(cfrom):int(cto)]
                    debugfile.print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(sid, cfrom, cto, surface, label, ss.definition, ss.lemmas))
                # by classes
                c = Counter()
                c.update(synsetID for sentID, cfrom, cto, synsetID in false_negative)
                debugfile.header("[By classes]")
                for synsetID, freq in c.most_common():
                    ss = ss_map[synsetID]
                    debugfile.print("{}: {} | ({}) - {}".format(synsetID, freq, ', '.join(ss.lemmas), ss.definition))
                # summary
                debugfile.header("[Summary]")
                debugfile.print("True positive: {}".format(len(true_positive)))
                # This count is the false negatives (it was mislabelled "False positive")
                debugfile.print("False negative: {}".format(len(false_negative)))
                debugfile.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
                debugfile.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
                debugfile.print("Recall (TP/Gtags)   : {}".format(rc_text))
                debugfile.print("Precision (TP/Ptags): {}".format(pr_text))
                debugfile.print("F1  (2*p*r/(p+r))   : {}".format(f1_text))
    finally:
        ctx.close()
예제 #7
0
파일: lex2pred.py 프로젝트: letuananh/omwtk
from puchikarui import Schema, with_ctx
from coolisf import GrammarHub
from chirptext.leutile import grouper
from chirptext.io import CSV
from chirptext import TextReport, FileHelper, Counter, FileHub
from chirptext.cli import CLIApp, setup_logging
from yawlib.helpers import get_gwn
from yawlib.helpers import get_wn, get_omw

# -------------------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------------------

# Absolute path of ./data, resolved against the current working directory
DATA_FOLDER = os.path.abspath(os.path.expanduser('./data'))
# Shared lexical resources, created once when the module is imported
omw = get_omw()
gwn = get_gwn()
wn = get_wn()
# NOTE(review): presumably 'logging.json' is the logging config file and
# 'logs' the log folder - confirm against chirptext.cli.setup_logging
setup_logging('logging.json', 'logs')
ghub = GrammarHub()
# Folder containing this module; used to locate the bundled SQL script
MY_DIR = os.path.dirname(__file__)
SETUP_FILE = os.path.join(MY_DIR, 'scripts', 'ewdb.sql')
# Maps a one-letter POS to a root name; 'r' maps to an empty string.
# NOTE(review): the names look like root predicates per POS - confirm.
ROOTS = {'n': 'root_wn_n', 'v': 'root_wn_v', 'a': 'root_wn_adj', 'r': ''}
DEFAULT_DB_PATH = FileHelper.abspath('data/ewmap.db')


class EWDB(Schema):
    """Database schema built on puchikarui's ``Schema``.

    NOTE(review): only the ``Flags`` constants are visible in this excerpt;
    the rest of the schema definition is not shown here.
    """
    class Flags:
        # Integer flag values.
        # NOTE(review): presumably stored per entry to record its processing
        # state - confirm against the code that reads and writes them.
        PROCESSED = 1
        NO_PARSE = 2
        MWE = 3
예제 #8
0
파일: views.py 프로젝트: letuananh/yawlib
# :license: MIT, see LICENSE for more details.

import os
import json
import logging
import django
from django.http import HttpResponse, Http404
from yawlib import SynsetID, SynsetCollection
from yawlib.helpers import get_omw, get_wn

# ---------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------
# Module-level logger named after this module
logger = logging.getLogger(__name__)
# Data sources created once at import time by the yawlib helpers
wsql = get_wn()
omwsql = get_omw()
# Printed on import to show which OMW object was created
print("OMW: {}".format(omwsql))


def jsonp(func):
    """ JSON/JSONP decorator """
    def decorator(request, *args, **kwargs):
        objects = func(request, *args, **kwargs)
        # ignore HttpResponse
        if isinstance(objects, HttpResponse):
            return objects
        # JSON/JSONP response
        data = json.dumps(objects)
        if 'callback' in request.GET:
            callback = request.GET['callback']
        elif 'callback' in request.POST: