Example #1
def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    if args.topk and args.topk <= 0:
        topk = None
        cli.logger.warning("Invalid k will be ignored (k should be greater than or equal to 1)")
    else:
        topk = args.topk
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
Example #2
def gen_mfs_5500(cli, args):
    ''' Generate 3rd round treebanking '''
    rp = TextReport(args.output)
    topk_synsets = topk_mfs(5500)
    # finished treebanking
    first_round = read_lines('data/omw3000_synsets.txt')
    second_round = read_lines('data/omw5000_synsets.txt')
    done_synsets = set(first_round + second_round)
    # new
    third_round = topk_synsets.difference(done_synsets)
    # report
    print("All     :", len(topk_synsets))
    print("Done    :", len(done_synsets))
    print("New     :", len(third_round))
    # write to a synset file
    with open('data/omw5300_synsets.txt', 'w') as outfile:
        outfile.write('\n'.join(third_round))
    with FileHub(working_dir='data',
                 default_mode='w') as hub, omw.ctx() as ctx:
        profile = 'omw5300'
        filename = 'omw5300A'
        for idx, sid in enumerate(third_round):
            ss = omw.get_synset(sid, ctx=ctx)
            hub[profile].header(ss.ID,
                                'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub[profile].print(d, level=1)
        rp.header("Generated files")
        for f in hub.files.keys():
            rp.print(hub[f].path)
Example #3
def convert(cli, args):
    ''' Convert patches from CSV format to YAML '''
    rp = TextReport()
    # validate input file
    if not args.input:
        patch_path = os.path.join(DATA_FOLDER, 'patches',
                                  '20171112_Wn31_glitches_def.csv')
    else:
        patch_path = args.input
    if not os.path.isfile(patch_path):
        raise Exception("File {} does not exist.".format(patch_path))
    # validate output file
    out_path = args.output if args.output else None
    if out_path == '*.yaml':
        out_path = FileHelper.replace_ext(patch_path, 'yaml')
    rp.print("Input:", patch_path)
    rp.print("Output:", out_path if out_path else '*stdout*')
    # convert patches
    patches = read_csv(patch_path)
    json_patches = [p.to_json() for p in patches]
    yaml_str = yaml.dump(json_patches, default_flow_style=False)
    # dump output
    if out_path:
        with open(out_path, 'w') as outfile:
            outfile.write(yaml_str)
        if args.echo:
            print(yaml_str)
    else:
        print(yaml_str)
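The CSV-to-YAML step itself needs nothing project-specific; a minimal sketch with the standard csv module and PyYAML (the file names and column layout are hypothetical, not the actual patch schema):

import csv
import yaml  # PyYAML

def csv_to_yaml(csv_path, yaml_path=None):
    """Read CSV rows as dicts and dump them as a YAML list of mappings."""
    with open(csv_path, newline='') as infile:
        rows = [dict(r) for r in csv.DictReader(infile)]
    yaml_str = yaml.dump(rows, default_flow_style=False, allow_unicode=True)
    if yaml_path:
        with open(yaml_path, 'w') as outfile:
            outfile.write(yaml_str)
    return yaml_str

# Usage (hypothetical paths):
# print(csv_to_yaml('patches.csv', 'patches.yaml'))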
Example #4
def map_preds(cli, args):
    rp = TextReport(args.output)
    ctx = PredSense.wn.ctx()
    not_found = []
    pred_file = 'data/erg_preds_interesting.txt'
    if args.all:
        pred_file = 'data/erg_preds_sorted.txt'
    name, ext = os.path.splitext(pred_file)
    not_found_file = name + "_notfound" + ext
    with open(pred_file, 'r') as infile:
        for p_str in infile.read().splitlines():
            p = Predicate.from_string(p_str)
            candidates = None
            if p.pos == 'x' and p.sense == 'subord':
                continue  # ignore these for now
            # if (p.pos == 'x' and p.sense == 'deg') or p.pos == 'p':
            if args.all or (p.pos and p.pos in 'xpq'):
                rp.header(p, p.lemma, p.pos, p.sense)
                candidates = PredSense.search_pred_string(p, ctx=ctx)
                for c in candidates:
                    rp.print(c.ID, c.lemmas, c.definition)
            if not candidates:
                not_found.append(p_str)
    with TextReport(not_found_file, 'w') as outfile:
        for p in not_found:
            outfile.print(p)

    if args.output:
        print("Written to: {}".format(args.output))
    print("Done")
Example #5
def read_nttat(cli, args):
    ''' Convert NTTAT patch to JSON '''
    stdout = TextReport()
    ext = 'json'
    rp = TextReport("{}_1.{}".format(args.output, ext))
    rp2 = TextReport("{}_2.{}".format(args.output, ext))
    gwn = get_gwn()
    data = []
    with open(args.input, 'r') as infile, gwn.ctx() as ctx:
        ssids = re.findall(r'\d{8}-[nvarx]', infile.read())
        print(len(ssids))
        print(ssids)
        for sid in ssids:
            ss = gwn.get_synset(sid, ctx=ctx)
            sdef = fix_gwn_def(ss.definition)
            stdout.header(sid, "Lemmas: {}".format(", ".join(ss.lemmas)))
            stdout.print(sdef)
            data.append({
                "synset": sid,
                "lemmas": ss.lemmas,
                "definition": sdef
            })
    cut = int(len(data) / 2)
    # first half
    first_half = json.dumps(data[:cut], indent=2)
    rp.write(first_half)
    # second half
    second_half = json.dumps(data[cut:], indent=2)
    rp2.write(second_half)
Example #6
def import_data(cli, args):
    '''Import XML data into SQLite database'''
    rp = TextReport()
    t = Timer(report=rp)
    db_loc = os.path.abspath(os.path.expanduser(args.jdb))
    rp.print("Jamdict DB location        : {}".format(db_loc))
    rp.print("JMDict XML file location   : {}".format(args.jmdxml))
    rp.print("Kanjidic2 XML file location: {}".format(args.kd2xml))
    jam = get_jam(cli, args)
    if args and (args.jdb or args.kd2):
        if os.path.isfile(db_loc):
            if not confirm(
                    "Database file exists. Do you want to overwite (This action cannot be undone! yes/no?) "
            ):
                cli.logger.warning("Program aborted.")
                exit()
            else:
                os.unlink(db_loc)
        # perform input
        t.start(
            "Creating Jamdict SQLite database. This process may take very long time ..."
        )
        jam.import_data()
        t.stop()
    else:
        print("Database paths were not provided. Process aborted.")
Example #7
def omw_fix_dup(cli, args):
    rp = TextReport(args.output)
    omw = get_omw()
    c = Counter()
    with omw.ctx() as ctx:
        senses = ctx.sense.select(limit=args.topk, columns=('synset', ))
        synsetids = {s.synset for s in senses}
        rp.print("-- OMW synsets: {}\n".format(len(synsetids)))
        for sid in synsetids:
            try:
                sid = SynsetID.from_string(sid)
            except:
                cli.logger.warning("Ignored synset ID: {}".format(sid))
                continue
            ss = omw.get_synset(sid, ctx=ctx)
            fixed_def, dup_defs = join_definitions(ss)
            if dup_defs:
                c.count("Duplicated")
                rp.print("-- Original {}: {}".format(ss.ID, ss.definition))
                rp.print("-- Fixed    {}: {}".format(ss.ID, fixed_def))
                for dup in dup_defs:
                    rp.print(
                        "DELETE FROM synset_def WHERE synset='{}' and def='{}';"
                        .format(ss.ID, to_sqlite_string(dup)))
                rp.print()
        c.summarise()
        pass
Example #8
def list_gpreds(cli, args):
    rp = TextReport(args.output)
    with open('data/erg_preds_sorted.txt', 'r') as infile:
        sorted_preds = (Predicate.from_string(l) for l in infile)
        for pred in sorted_preds:
            if pred.ptype == Predicate.GRAMMARPRED:
                rp.print(pred)
    pass
Example #9
def list_preds(cli, args):
    rp = TextReport(args.output)
    lexdb = read_erg_lex()
    keyrels = set(l.keyrel for l in lexdb if l.keyrel)
    preds = [Predicate.from_string(p) for p in keyrels]
    sorted_preds = sorted(preds, key=lambda x: x.pos or '')
    # All preds
    with open('data/erg_preds_sorted.txt', 'w') as outfile:
        for pred in sorted_preds:
            outfile.write('{}\n'.format(pred))
    poses = set(p.pos for p in preds)
    trivial_preds = [p for p in preds if p.pos and p.pos in TRIVIAL_POS]
    if not args.trivial:
        preds = [p for p in preds if not p.pos or p.pos not in TRIVIAL_POS]
    interesting_poses = set(p.pos for p in preds)
    # write interesting preds to file
    c = Counter()
    with open('data/erg_preds_interesting.txt', 'w') as outfile:
        for pred in sorted(preds, key=lambda x: "cqpx".index(x.pos) if x.pos else 0):
            c.count(pred.pos if pred.pos else 'NONE')
            outfile.write('{}\n'.format(pred))
    # report
    rp.print("Interesting preds: {}".format(len(preds)))
    rp.print("Trivial preds: {}".format(len(trivial_preds)))
    rp.print("POS: {}".format(poses))
    rp.print("Interesting POS: {}".format(interesting_poses))
    c.summarise(rp)
Example #10
def lookup(cli, args):
    '''Lookup words by kanji/kana'''
    jam = get_jam(cli, args)
    results = jam.lookup(args.query, strict_lookup=args.strict)
    report = TextReport(args.output)
    if args.format == 'json':
        report.print(json.dumps(results.to_json(),
                                ensure_ascii=args.ensure_ascii,
                                indent=args.indent if args.indent else None))
    else:
        if args.compact:
            report.print(results.text(separator='\n------\n', entry_sep='\n'))
        else:
            dump_result(results, report=report)
Example #11
def create_ewdb(cli, args):
    db = EWDB(args.db)
    c = Counter()
    rp = TextReport()
    rp.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        for pos in 'nvar':
            file_name = 'data/tsdb/skeletons/omw_{}.txt'.format(pos)
            rp.print("Reading file: {}".format(file_name))
            for idx, row in enumerate(iter_tsv(file_name)):
                lemma, sid, sdef = row
                db.add_sense(sid, lemma, pos, sdef, ctx=ctx)
                c.count("Added")
    c.summarise()
    pass
Example #12
def order_preds(cli, args):
    doc = Document.from_file(args.gold)
    output = TextReport(args.output)
    if not args.ident:
        output.print("No ident was provided")
    for ident in args.ident:
        sent = doc.by_ident(ident, default=None)
        if sent is None:
            output.print("Sent #{} is missing".format(ident))
        else:
            output.print(sent)
            eps = sent[0].dmrs().obj().eps()
            sort_eps(eps)
            output.print(["{}<{}:{}>".format(str(x.pred), x.cfrom, x.cto) for x in eps])
    output.print("Done")
Example #13
def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    output = TextReport(args.output) if 'output' in args else TextReport()
    output.header("Jamdict | {} - Version: {}".format(
        version_info.__description__, version_info.__version__),
                  level='h0')
    output.header("Basic configuration")
    output.print("JAMDICT_HOME:           {}".format(config.home_dir()))
    output.print("Configuration location: {}".format(
        config._get_config_manager().locate_config()))
    output.header("Data files")
    output.print("Jamdict DB location: {} - {}".format(args.jdb,
                                                       file_status(args.jdb)))
    output.print("JMDict XML file    : {} - {}".format(
        args.jmdxml, file_status(args.jmdxml)))
    output.print("KanjiDic2 XML file : {} - {}".format(
        args.kd2xml, file_status(args.kd2xml)))
Example #14
def manual_patch(cli, args):
    rp = TextReport()
    omw = get_omw()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Input file could not be found")
    with open(args.input, 'r') as infile, omw.ctx() as ctx:
        synsets = json.loads(infile.read())
        # for ss in synsets:
        #     rp.print(ss['synset'], ss['definition'])
        # rp.print("Found synsets:", len(synsets))
        for sinfo in synsets:
            sid, fixed_def = sinfo['synset'], sinfo['definition']
            ss = omw.get_synset(sid, ctx=ctx)
            orig_def = remove_puncs(ss.definition)
            if remove_puncs(fixed_def) != orig_def:
                rp.header("WARNING:", sid)
                rp.print(ss.definition)
                rp.print(fixed_def)
Example #15
def doc_stats(cli, args):
    ''' Show document statistics '''
    doc = Document.from_file(args.path)  # input
    output = TextReport(args.output)  # output
    stats = Counter()
    pred_counter = Counter()
    empty_sentences = []
    unknown_preds = Counter()
    all_pos = Counter()
    not_found = None
    if args.ttl:
        ttl_doc = ttl.Document.read_ttl(args.ttl)
        not_found = set(s.ID for s in ttl_doc).difference(s.ident for s in doc)
    for sent in doc:
        stats.count("Sentences")
        if not len(sent):
            stats.count("Sentences-empty")
            empty_sentences.append(sent.ident)
        for reading in sent:
            stats.count("Readings")
            stats['Predicates'] += len(reading.dmrs().layout.nodes)
            # pred_counter.update(n.predstr for n in reading.dmrs().layout.nodes)
            for n in reading.dmrs().layout.nodes:
                if n.pred.pos == 'u' and n.pred.sense == 'unknown':
                    stats.count("Unnown predicates")
                    if '/' in n.pred.lemma:
                        try:
                            lemma, pos = n.pred.lemma.rsplit('/', 1)
                        except:
                            getLogger().warning("Invalid unknown pred: {}".format(n.pred))
                            raise
                        all_pos.count(pos)
                        unknown_preds.count((str(n.pred), lemma, pos))
                    else:
                        stats.count("UFO")
                else:
                    stats.count("Known predicates")
                    pred_counter.count(n.predstr)
    output.header("Summary", level="h0")
    stats.summarise(output)
    output.header("Empty sentences")
    output.print("\n".join(empty_sentences))
    if not_found is not None:
        output.header("Missing from TTL")
        for sid in not_found:
            output.print(sid)
    output.header("Unknown preds POS")
    for pos, count in all_pos.most_common():
        output.print(pos, count, separator='\t')
    output.header("Unknown preds")
    for (pred, lemma, pos), count in unknown_preds.most_common():
        output.print(pred, lemma, pos, count, separator='\t')
    output.header("Known preds", level="h1")
    pred_counter.summarise(output)
Example #16
def gen_mfs_3000(cli, args):
    rp = TextReport(args.output)
    ssids = list(topk_mfs(3000))
    random.shuffle(ssids)
    with FileHub(working_dir='data',
                 default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw3000A'
        for idx, sid in enumerate(ssids):
            ss = omw.get_synset(sid, ctx=ctx)
            if idx > len(ssids) / 2:
                filename = 'omw3000B'
            hub['omw3000'].header(ss.ID,
                                  'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub['omw3000'].print(d, level=1)
        rp.header("Generated files")
        for f in hub.files.keys():
            rp.print(hub[f].path)
Example #17
def lookup(cli, args):
    '''Lookup words by kanji/kana'''
    jam = get_jam(cli, args)
    if jam.ready:
        results = jam.lookup(args.query, strict_lookup=args.strict)
        report = TextReport(args.output)
        if args.format == 'json':
            report.print(
                json.dumps(results.to_dict(),
                           ensure_ascii=args.ensure_ascii,
                           indent=args.indent if args.indent else None))
        else:
            if args.compact:
                report.print(
                    results.text(separator='\n------\n', entry_sep='\n'))
            else:
                dump_result(results, report=report)
    else:
        getLogger().warning(
            f"Jamdict database is not available.\nThere are 3 ways to install data: \n    1) install jamdict_data via PyPI using `pip install jamdict_data` \n    2) download prebuilt dictionary database file from: {jamdict.__url__}, \n    3) or build your own database file from XML source files."
        )
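Outside the CLI wrapper, the same lookup can be done with the jamdict package directly; a minimal sketch, assuming the jamdict-data package or a prebuilt database file is installed:

from jamdict import Jamdict

jam = Jamdict()
if jam.ready:
    result = jam.lookup('食べる')
    for entry in result.entries:
        print(entry)  # kanji/kana forms with glosses
    for char in result.chars:
        print(char, char.stroke_count)
else:
    print("No database found; try `pip install jamdict-data`")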
Example #18
def gen_mfs_5000(cli, args):
    rp = TextReport(args.output)
    from omwtk.wn_ntumc_top3000 import WN_NTUMC_TOP3000
    first_round = set(x['synset'] for x in WN_NTUMC_TOP3000)
    top5000 = topk_mfs(5000)
    round2 = list(top5000.difference(first_round))
    random.shuffle(round2)
    with FileHub(working_dir='data',
                 default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw5000A'
        for idx, sid in enumerate(round2):
            ss = omw.get_synset(sid, ctx=ctx)
            if idx > 200:
                filename = 'omw5000B'
            hub['omw5000'].header(ss.ID,
                                  'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub['omw5000'].print(d, level=1)
        rp.header("Generated files")
        for f in hub.files.keys():
            rp.print(hub[f].path)
Example #19
def extract_wn31(cli, args):
    c = Counter()
    rp = TextReport()
    entries = []
    infile = FileHelper.abspath(args.input)
    if not os.path.isfile(infile):
        rp.print("File not found")
    else:
        rp.print("Processing {}".format(infile))
        tree = etree.iterparse(infile)
        for event, element in tree:
            if event == 'end' and element.tag == 'Synset':
                for child in element:
                    if child.tag == 'Definition':
                        entries.append((element.get('id'), element.get('ili'),
                                        child.text))
                        c.count('Definition')
                c.count("Synset")
                element.clear()
        c.summarise(report=rp)
    # Format: wn31sid ili definition
    CSV.write_tsv(args.output, entries)
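The iterparse-and-clear pattern above helps keep memory usage flat when scanning a large WordNet XML dump; a generic sketch of the same idea with lxml (the tag names are placeholders):

from lxml import etree

def iter_definitions(xml_path):
    """Stream (id, ili, definition) tuples from <Synset> elements without loading the whole tree."""
    for event, element in etree.iterparse(xml_path, events=('end',), tag='Synset'):
        yield element.get('id'), element.get('ili'), element.findtext('Definition')
        element.clear()  # release the element once it has been processed

# Usage (hypothetical file):
# for sid, ili, definition in iter_definitions('wn31.xml'):
#     print(sid, ili, definition)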
Example #20
def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    if args.topk and args.topk <= 0:
        topk = None
        cli.logger.warning(
            "Invalid k will be ignored (k should be greater than or equal to 1)"
        )
    else:
        topk = args.topk
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(
            args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(
                textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
Example #21
def find_lesk_candidates(cli, args):
    doc = Document.from_file(args.gold)
    ne = 0
    for s in doc:
        if len(s):
            ne += 1
    print("Gold ISF: {} | not empty sents: {}".format(args.gold, ne))
    # candidates = dd(lambda: dd(set))
    notfound = dd(list)
    ident_sent_map = {}
    all_preds = Counter()
    missing_preds = Counter()
    found_preds = Counter()
    with PredSense.wn.ctx() as ctx:
        for idx, sent in enumerate(doc):
            if not len(sent):
                continue
            elif args.ident and sent.ident not in args.ident:
                continue
            if args.topk and args.topk < idx:
                break
            print(sent)
            ident_sent_map[sent.ident] = sent
            dmrs = sent[0].dmrs()
            if dmrs.tags:
                for ep in dmrs.get_lexical_preds():
                    all_preds.count(str(ep.pred))
                    if ep.nodeid in dmrs.tags:
                        # if there is a tag for this node
                        ep_synsets = PredSense.search_ep(ep, ctx=ctx)  # return a SynsetCollection()
                        for tag in dmrs.tags[ep.nodeid]:
                            if tag.synset.ID not in ep_synsets:
                                notfound[sent.ident].append((ep.nodeid, str(ep.pred), tag.synset.ID, tag.synset.lemma, [(x.ID, x.lemma) for x in ep_synsets]))
                                missing_preds.count(str(ep.pred))
                            else:
                                found_preds.count(str(ep.pred))
    output = TextReport(args.output)
    # summarise
    total_found = sum(c for pred, c in found_preds.most_common())
    total_missing = sum(c for pred, c in missing_preds.most_common())
    output.print("Found    : {}".format(total_found))
    output.print("Not found: {}".format(total_missing))
    ratio = (total_missing * 100) / (total_found + total_missing)
    output.print("Missing %: {}".format(ratio))
    # preds by sentences
    output.header("By sentences")
    for sid in sorted(notfound.keys()):
        sent = ident_sent_map[sid]
        output.print((sid, sent.text))
        items = notfound[sid]
        for item in items:
            output.print(item)
        output.print()
    # by preds
    output.header("By preds")
    for pred, occurrence in missing_preds.most_common():
        output.print("{}: {}".format(pred, occurrence))
    print("Done")
Example #22
def map_predsense(cli, args):
    ''' Pred-Sense Mapping (gold DMRSes, gold Senses) '''
    rp = TextReport(args.output) if args.output else TextReport()
    rp.header("Pred-Sense mapping / strategy = {}".format(args.strat))
    if args.gold:
        sents = Document.from_file(args.gold)
        if args.patchsid:
            patch_gold_sid(sents)
    else:
        sents = read_gold_mrs()
        patch_gold_sid(sents)
    # ignore empty sentence
    empty_sents = [s for s in sents if not len(s)]
    not_empty_sents = [s for s in sents if len(s)]
    rp.print("MRS-Sents: {}".format(len(sents)))
    rp.print("MRS-Sents not empty: {}".format(len(not_empty_sents)))
    if args.ttl:
        doc = ttl.read(args.ttl, mode=args.ttl_format)
    else:
        # [XXX] using gold by default is bad ...
        doc = ttl.Document(name='gold', path='data').read()
    rp.print("TTL-Sents: {}".format(len(doc)))
    found_sents = 0
    for sent in not_empty_sents:
        if doc.get(sent.ident) is None:
            cli.logger.warning("Sentence {} could not be found".format(sent.ident))
        else:
            found_sents += 1
    rp.print("Matched: {}".format(found_sents))
    rp.print("Empty sentences: {}".format([s.ident for s in empty_sents]))
    # Now mapping is possible
    # ----------------------------------------
    ct = Counter()  # total
    cm = Counter()  # matched
    cnm = Counter()  # not matched
    cig = Counter()  # ignored
    sense_lemmas = dd(set)  # sense, lemma, map
    sense_sents = dd(set)  # not-matched senses to sentences
    lemma_sents = dd(set)  # not matched lemmas to sentences
    rp.print("Performing Pred-Sense Mapping")
    sents_to_map = not_empty_sents[:args.topk] if args.topk else not_empty_sents
    for sent in sents_to_map:
        sent.shallow = doc.get(sent.ident)
        for m, nm, ig in import_shallow(sent, mode=args.strat, no_small_sense=args.noss, fix_token=args.fixtoken, no_nonsense=args.nononsense):
            for c, nid, pred in m:
                ct.count(c.tag)
                cm.count(c.tag)
            for c in ig:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cig.count(c.tag)
            for c in nm:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cnm.count(c.tag)
                sense_sents[c.tag].add(sent)
                lemma_sents[c.clemma].add(sent)
            # print("Sent #{} - Not matched: {}".format(sent.ident, nm))
            # print("           Matched    : {}".format(len(m)))
    rp.header("Not matched", level='h0')
    for sid, c in cnm.most_common():
        rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
    rp.header("Not matched (by lemma)", level='h0')
    for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
        rp.print("{}: {} | sents: {}".format(clemma, len(sents), [s.ident for s in sents]))
    if args.matched:
        rp.header("Total", level='h0')
        ct.summarise()
    rp.header("Ignored", level='h0')
    for sid, c in cig.most_common():
        rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
    # show sense - sentences
    rp.header("Sense - Sentences", level='h0')
    for sid, c in cnm.most_common():
        sents = sense_sents[sid]
        rp.header("{} - {}".format(sid, sense_lemmas[sid]), level='h2')
        for sent in sents:
            ttl_sent = doc.get(sent.ident)
            rp.print(ttl_sent)
            for concept in ttl_sent.concepts:
                if concept.tag == sid:
                    rp.print('  -> {}'.format(concept))
    rp.header("Lemma - Sentences", level='h0')
    for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
        rp.header("#{}".format(clemma,))
        for sent in sents:
            ttl_sent = doc.get(sent.ident)
            rp.print(ttl_sent)
            for concept in ttl_sent.concepts:
                if concept.clemma == clemma:
                    rp.print('  -> {}'.format(concept))
        rp.print()
    # Show final numbers
    total_concepts = sum(x[1] for x in ct.most_common())
    total_matched = sum(x[1] for x in cm.most_common())
    total_notmatched = sum(x[1] for x in cnm.most_common())
    total_ignored = sum(x[1] for x in cig.most_common())
    rp.header("Summarise")
    rp.print("Total concepts: {}".format(total_concepts))
    rp.print("Matched: {}".format(total_matched))
    rp.print("Not matched: {}".format(total_notmatched))
    rp.print("Ignored: {}".format(total_ignored))
    if args.output:
        print("Total concepts: {}".format(total_concepts))
        print("Matched: {}".format(total_matched))
        print("Not matched: {}".format(total_notmatched))
        print("Ignored: {}".format(total_ignored))
        print("Output file: {}".format(args.output))
    print("Done!")
    return total_concepts, total_matched, total_notmatched, total_ignored
Example #23
def verify_patch(cli, args):
    rp = TextReport()
    c = Counter()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Patch file not found")
    # load patches
    with open(args.input) as infile:
        patches = [DefPatch.from_dict(p) for p in yaml.safe_load(infile)]
    rp.print("Found {} patches.".format(len(patches)))
    # Validate against GWN-30
    # gwn = get_gwn()  # don't use GWN, for now
    omw = get_omw()
    wn = get_wn()
    with omw.ctx() as ctx, wn.ctx() as wnctx:
        for patch in patches:
            try:
                sid = wn.sk2sid(patch.sensekey, ctx=wnctx)
                if not sid:
                    raise Exception("sensekey `{}' does not exist.".format(
                        patch.sensekey))
                ss = omw.get_synset(sid, ctx=ctx)
                ssdef = ss.definition[:-1] if ss.definition.endswith(
                    ';') else ss.definition
                if patch.orig_def == ssdef:
                    c.count("Found")
                    rp.print("-", "{} [{}]".format(patch.orig_def,
                                                   patch.sensekey))
                    rp.print(" ", patch.new_def)
                    if patch.comment:
                        rp.print("C", patch.comment)
                else:
                    c.count("Found - diff")
                    rp.print("[DIFF]",
                             "{} [{}]".format(patch.orig_def, patch.sensekey))
                    rp.print("New:  ",
                             "{} [{}]".format(patch.new_def, patch.sensekey))
                    rp.print("      ", ssdef)
                    rp.print("Note: ", patch.comment)
            except:
                getLogger().warn("sensekey `{}' couldn't be found".format(
                    patch.sensekey))
                c.count("Not found")
                continue
        c.summarise(report=rp)
Example #24
def isf_to_ukb(cli, args):
    ''' ISF to UKB '''
    doc = Document.from_file(args.input)
    output = TextReport(args.output)
    tokenfile = TextReport(args.output + '.tokens.txt')
    report = TextReport(args.report)
    report.print("Output file: {}".format(args.output))
    processed = 0
    if not args.ident:
        report.print("No ident was provided")
    for idx, sent in enumerate(doc):
        # sent = doc.by_ident(ident, default=None)
        if args.topk and idx > args.topk:
            break
        if args.ident and sent.ident not in args.ident:
            continue
        if sent is None:
            report.print("Sent #{} is missing".format(sent.ident))
        elif len(sent) == 0:
            report.print("Sent #{} is empty (i.e. there is no parse)".format(sent.ident))
        else:
            sentid = sent.ID if sent.ID else sent.ident
            report.print("Processing {}".format(sentid))
            tokens = sent.readings[0].dmrs().tokenize_pos(strict=args.strict)
            if not tokens:
                report.print("Empty DMRS: {} (no pred???)".format(sentid))
                continue
            # sentence is OK ...
            output.print(sentid)
            for idx, (isf_lemma, pos, cfrom, cto) in enumerate(tokens):
                # In UKB's lemmas, use _ to represent a space
                lemma = isf_lemma.replace('+', '_')
                output.write("{text}#{p}#w{wid}#1 ".format(text=lemma, p=pos, wid=idx))
                tokenfile.writeline('\t'.join((str(sentid), str(idx), str(cfrom), str(cto))))
            output.write('\n\n')
            processed += 1
    report.print("Processed {} sentence(s)".format(processed))
    report.print("Done")
Example #25
    def test_tagging_all(self):
        getLogger().debug("Tagging everything ...")
        sents = self.gold()
        smap = {str(s.ident): s for s in sents}
        # read tags
        doc = ttl.Document('gold', TEST_GOLD_DIR).read()
        filter_wrong_senses(doc)
        count_good_bad = Counter()
        perfects = []
        to_be_checked = dd(list)
        tbc_concepts = dd(list)
        concept_count = Counter()
        fix_texts = []
        instances = Counter()
        tag_map = dd(set)
        report = TextReport('data/gold_report.txt')
        matched_report = TextReport('data/gold_matched.txt')
        not_matched_report = TextReport('data/gold_notmatched.txt')
        for s in sents[:5]:
            sid = str(s.ident)
            if not doc.has_id(sid):
                raise Exception("Cannot find sentence {}".format(sid))
            elif len(s) == 0:
                logging.warning("Empty sentence: {}".format(s))
            else:
                tagged = doc.get(sid)
                if s.text != tagged.text:
                    fix_texts.append((s.ident, s.text, tagged.text))
                # try to tag ...
                dmrs = s[0].dmrs()
                matched, not_matched, ignored = tag_gold(dmrs, tagged, s.text, mode=Lexsem.ROBUST)
                if not not_matched:
                    count_good_bad.count("Perfect")
                    perfects.append((s, matched))
                else:
                    for nm in not_matched:
                        tag_map[nm.tag].add(nm.clemma)
                        tbc_concepts[nm.tag].append(s.ident)
                        concept_count.count(nm.tag)
                        instances.count('instances')
                    to_be_checked[s.ident].append(nm)
                    count_good_bad.count("To be checked")
        # report matched
        for sent, m in perfects:
            tagged = doc.get(str(sent.ident))
            matched_report.header("#{}: {}".format(sent.ident, sent.text), "h0")
            matched_report.writeline(sent[0].dmrs())
            matched_report.header("Concepts")
            for c, nid, pred in m:
                matched_report.writeline("{} ===> {}:{}".format(c, nid, pred))
            matched_report.writeline()
            matched_report.writeline()
        # report not matched
        not_matched_report.header("By senses", "h0")
        for k, v in concept_count.most_common():
            sids = ' '.join(["#{}".format(x) for x in tbc_concepts[k]])
            not_matched_report.print("{}: {} | {} => {}".format(k, v, sids, tag_map[k]))
        not_matched_report.header("By sentences", "h0")
        for sid, nm in to_be_checked.items():
            not_matched_report.print("#{}: {}  | {}".format(sid, nm, smap[str(sid)].text))
        # full details
        for sid, nm in to_be_checked.items():
            sent = smap[str(sid)]
            tagged = doc.get(str(sid))
            not_matched_report.header("#{}: {}".format(sid, sent.text))
            not_matched_report.writeline(sent[0].dmrs())
            for n in nm:
                not_matched_report.writeline(n)

        # for i, t1, t2 in fix_texts:
        #     getLogger().debug(i)
        #     getLogger().debug(t1)
        #     getLogger().debug(t2)
        count_good_bad.summarise(report=report)
        instances.summarise(report=report)
Example #26
rp3 = TextReport.null()  # output to /dev/null, i.e. nowhere
rp3.write("This line goes nowhere")

rp4 = TextReport.string()  # output to a string. Call rp.content() to get the string
rp4.write("This line will be stored in a string buffer")

rp5 = TextReport(TextReport.STRINGIO)  # same as above
rp5.write("This line will also be stored in a string buffer")

# TextReport will close the output stream automatically by using the with statement
with TextReport.string() as rp:
    rp.header("Lorem Ipsum Analysis", level="h0")
    rp.header("Raw", level="h1")
    rp.print(LOREM_IPSUM)
    rp.header("Character Frequency")
    ct.summarise(report=rp)
    print(rp.content())

# ------------------------------------------------------------------------------
# Web fetcher
# ------------------------------------------------------------------------------
from chirptext import WebHelper

web = WebHelper('~/tmp/webcache.db')
data = web.fetch('https://letuananh.github.io/test/data.json')
print(data)
data_json = web.fetch_json('https://letuananh.github.io/test/data.json')
print(data_json)
Example #27
def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    output = TextReport(args.output) if 'output' in args else TextReport()
    if args.config:
        jamdict.config.read_config(args.config)
    output.print("Jamdict " + jamdict.version_info.__version__)
    output.print(jamdict.version_info.__description__)
    jam = get_jam(cli, args)
    output.header("Basic configuration")
    jamdict_home = jamdict.config.home_dir()
    if not os.path.isdir(os.path.expanduser(jamdict_home)):
        jamdict_home += " [Missing]"
    else:
        jamdict_home += " [OK]"
    output.print(f"JAMDICT_HOME: {jamdict_home}")
    if jamdict.util._JAMDICT_DATA_AVAILABLE:
        import jamdict_data
        data_pkg = f"version {jamdict_data.__version__} [OK]"
    else:
        data_pkg = "Not installed"
    output.print(f"jamdict-data: {data_pkg}")
    if args.config:
        _config_path = args.config + " [Custom]"
        if not os.path.isfile(args.config):
            _config_path += " [Missing]"
    else:
        _config_path = jamdict.config._get_config_manager().locate_config()
    if not _config_path:
        _config_path = "Not available.\n     Run `python3 -m jamdict config` to create configuration file if needed."
    output.print(f"Config file : {_config_path}")

    output.header("Data files")
    output.print(
        f"Jamdict DB location: {jam.db_file} - {file_status(jam.db_file)}")
    output.print(
        f"JMDict XML file    : {jam.jmd_xml_file} - {file_status(jam.jmd_xml_file)}"
    )
    output.print(
        f"KanjiDic2 XML file : {jam.kd2_xml_file} - {file_status(jam.kd2_xml_file)}"
    )
    output.print(
        f"JMnedict XML file  : {jam.jmnedict_xml_file} - {file_status(jam.jmnedict_xml_file)}"
    )

    if jam.ready:
        output.header("Jamdict database metadata")
        try:
            for meta in jam.jmdict.meta.select():
                output.print(f"{meta.key}: {meta.value}")
        except Exception as e:
            print(e)
            output.print("Error happened while retrieving database meta data")
    output.header("Others")
    output.print(f"puchikarui: version {puchikarui_version}")
    output.print(f"chirptext : version {chirptext_version}")
    output.print(f"lxml      : {jamdict.jmdict._LXML_AVAILABLE}")
Example #28
# ------------------------------------------------------------------------------
# Sample text report
# ------------------------------------------------------------------------------
# a string report
rp = TextReport()  # by default, TextReport will write to standard output, i.e. terminal
rp = TextReport(TextReport.STDOUT)  # same as above
rp = TextReport('~/tmp/my-report.txt')  # output to a file
rp = TextReport.null()  # output to /dev/null, i.e. nowhere
rp = TextReport.string()  # output to a string. Call rp.content() to get the string
rp = TextReport(TextReport.STRINGIO)  # same as above

# TextReport will close the output stream automatically by using the with statement
with TextReport.string() as rp:
    rp.header("Lorem Ipsum Analysis", level="h0")
    rp.header("Raw", level="h1")
    rp.print(LOREM_IPSUM)
    rp.header("Character Frequency")
    ct.summarise(report=rp)
    print(rp.content())


# ------------------------------------------------------------------------------
# Web fetcher
# ------------------------------------------------------------------------------
from chirptext import WebHelper

web = WebHelper('~/tmp/webcache.db')
data = web.fetch('https://letuananh.github.io/test/data.json')
print(data)
data_json = web.fetch_json('https://letuananh.github.io/test/data.json')
print(data_json)
Example #29
def dump_result(results, report=None):
    if report is None:
        report = TextReport()
    if results.entries:
        report.print("=" * 40)
        report.print("Found entries")
        report.print("=" * 40)
        for e in results.entries:
            kj = ', '.join([k.text for k in e.kanji_forms])
            kn = ', '.join([k.text for k in e.kana_forms])
            report.print("Entry: {} | Kj:  {} | Kn: {}".format(
                e.idseq, kj, kn))
            report.print("-" * 20)
            for idx, s in enumerate(e.senses):
                report.print("{idx}. {s}".format(idx=idx + 1, s=s))
            report.print('')
    else:
        report.print("No dictionary entry was found.")
    if results.chars:
        report.print("=" * 40)
        report.print("Found characters")
        report.print("=" * 40)
        for c in results.chars:
            report.print("Char: {} | Strokes: {}".format(c, c.stroke_count))
            report.print("-" * 20)
            for rmg in c.rm_groups:
                report.print("Readings:",
                             ", ".join([r.value for r in rmg.readings]))
                report.print(
                    "Meanings:", ", ".join([
                        m.value for m in rmg.meanings
                        if not m.m_lang or m.m_lang == 'en'
                    ]))
    else:
        report.print("No character was found.")
Example #30
def remove_msw_ttl(cli, args):
    doc = read_ttl(args.path)
    rp = TextReport(args.debug)
    rp.print("Doc size: {}".format(len(doc)))
    orig_tag_count = 0
    orig_concept_count = 0
    for s in doc:
        orig_concept_count += len(s.concepts)
        orig_tag_count += len(s.tags)
    print("# tags: {}".format(orig_tag_count))
    print("# concepts: {}".format(orig_concept_count))
    manual = dd(lambda: dd(dict))
    nonsenses = set()  # just ignore any tag with these sense IDs
    if args.manual:
        entries = CSV.read_tsv(args.manual)
        for sid, wid, tag, keep, lemma in entries:
            sid, wid, keep = int(sid), int(wid), int(keep)
            if (sid, wid, keep, lemma) == (-1, -1, -1, 'U'):
                nonsenses.add(tag)
            if not lemma:
                manual[sid][wid][tag] = keep
            else:
                manual[sid][wid][(tag, lemma)] = keep
    wn = get_wn()
    ctx = wn.ctx()
    nope_synsets = set()
    ok_synsets = set()
    if args.wn30:
        rp.print("WN30 filter is activated")
    for sidx, sent in enumerate(doc):
        if args.topk and sidx > int(args.topk):
            break
        getLogger().debug("Processing sentence {}/{}".format(sidx + 1, len(doc)))
        getLogger().debug("Before concepts: {}".format(sent.concepts))
        getLogger().debug("Before tags: {}".format(sent.tags))
        # remove concepts that are not in PWN 3.0
        if args.wn30:
            remove_tags = set()
            for tag in sent.tags:
                if tag.tagtype == 'OMW' or tag.label in nonsenses:
                    remove_tags.add(tag)
            for tag in remove_tags:
                sent.tags.remove(tag)
            remove_concepts = set()
            for c in sent.concepts:
                if c.tag in ok_synsets:
                    pass
                elif c.tag in nope_synsets:
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                elif wn.get_synset(c.tag, ctx=ctx) is None:
                    # remove it
                    nope_synsets.add(c.tag)
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                else:
                    ok_synsets.add(c.tag)
            for c in remove_concepts:
                pop_concept(sent, c)
        msw = list(sent.msw())
        tcmap = sent.tcmap()
        # remove_tags = set()
        if msw:
            keep_remove = []
            for w in msw:
                max_len = 0
                keep = []
                remove = set()
                wid = sent.tokens.index(w)
                for c in tcmap[w]:
                    if c.tag in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][c.tag]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif (c.tag, c.clemma) in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][(c.tag, c.clemma)]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif len(c.tokens) == 1 or len(c.tokens) < max_len:
                        remove.add(c)
                    elif c.tag in nonsenses:
                        remove.add(c)
                    else:
                        max_len = len(c.tokens)
                        keep.append(c)
                if len(keep) != 1:
                    keep_remove.append((w, keep, remove))
                else:
                    # everything is OK, remove them now
                    for c in remove:
                        if args.debug:
                            rp.print("Removing concept {} from {}".format(c, sent.ID))
                        getLogger().debug("Removing concept {} from {}".format(c, sent.ID))
                        pop_concept(sent, c)
            if keep_remove:
                rp.header(sent)
                for w, keep, remove in keep_remove:
                    rp.write(w)
                    rp.writeline(" - Keep: {} | Remove: {}".format(keep, remove))
        # remove sent's tags
        # for tag in remove_tags:
        #     getLogger().debug("removing tag: {}".format(tag))
        #     sent.tags.remove(tag)
        getLogger().debug("After concepts: {}".format(sent.concepts))
        getLogger().debug("After tags: {}".format(sent.tags))
    if nope_synsets:
        rp.print("Noped synsets: {}".format(nope_synsets))
    if args.output:
        doc_path = os.path.dirname(args.output)
        doc_name = os.path.basename(args.output)
        new_doc = ttl.Document(doc_name, doc_path)
        sents = doc if not args.topk else list(doc)[:int(args.topk)]
        for s in sents:
            new_doc.add_sent(s)
        tag_count = 0
        concept_count = 0
        for s in sents:
            concept_count += len(s.concepts)
            tag_count += len(s.tags)
        # baking ...
        if args.bake:
            print("Baking doc ...")
            bake_doc(new_doc)
        print("[New] # tags: {}".format(tag_count))
        print("[New] # concepts: {}".format(concept_count))
        rp.print("Writing fixed TTL to {}".format(new_doc.sent_path))
        new_doc.write_ttl()
Example #31
def compare_ttls(cli, args):
    ''' Compare TTL to gold '''
    rp = TextReport()
    omw = get_omw()
    ctx = omw.ctx()
    gold = None
    profile = None
    ignored_ids = []
    if args.ignore:
        ignored_ids = [x.strip() for x in read_file(args.ignore).splitlines() if x.strip()]
        getLogger().debug("Ignored sentence IDs: {}".format(', '.join(ignored_ids)))
    if args.gold_profile:
        gold = read_ttl(args.gold_profile, ttl_format=args.ttl_format)
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                gold.pop(sid, default=None)
        if not args.batch:
            rp.header("Gold sentences: {} | Loc: {}".format(len(gold), args.gold_profile))
        if args.verbose and not args.batch:
            for s in gold:
                rp.print("Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no gold!")
    # read profile
    if args.profile:
        profile = read_ttl(args.profile, ttl_format=args.ttl_format)
        if not args.batch:
            rp.header("Profile sentences: {} | Loc: {}".format(len(profile), args.profile))
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                profile.pop(sid, default=None)
        if not args.batch:
            rp.header("Profile sentences: {} (ignored: {}) | Loc: {}".format(len(profile), len(ignored_ids), args.profile))
        if args.verbose and not args.batch:
            for s in profile:
                getLogger().debug("Profile/Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no profile to evaluate")
    # calculate precision and recall
    if gold and profile:
        gold_tags, gold_tags_len, gold_ignored = prepare_tags(gold, args=args, nonsense=args.nonsense)
        profile_tags, profile_tags_len, profile_ignored = prepare_tags(profile, args=args, nonsense=args.nonsense)
        if gold_tags_len == 0:
            rp.print("WARNING: There was no tag found in the gold profile. Please make sure that the tags for comparison are *sentence level* tags")
        if profile_tags_len == 0:
            rp.print("WARNING: There was no tag found in the evaluating profile. Please make sure that the tags for comparison are *sentence level* tags")
        getLogger().debug("Gold tags: {}".format(gold_tags_len))
        getLogger().debug(list(gold_tags.items())[:5])
        getLogger().debug("Profile tags: {}".format(profile_tags_len))
        getLogger().debug(list(profile_tags.items())[:5])
        true_positive, false_negative = score(gold_tags, profile_tags, args=args)
        precision = len(true_positive) / profile_tags_len
        recall = len(true_positive) / gold_tags_len
        f1 = 2 * precision * recall / (precision + recall)
        getLogger().debug("TP: {}".format(len(true_positive)))
        getLogger().debug("FN: {}".format(len(false_negative)))
        getLogger().debug("Recall (TP/Gtags): {}".format(recall))
        getLogger().debug("Precision (TP/Ptags): {}".format(precision))
        getLogger().debug("F1 (2*p*r/(p+r)): {}".format(f1))
        rc_text = "{:.2f}%".format(recall * 100)
        pr_text = "{:.2f}%".format(precision * 100)
        f1_text = "{:.2f}%".format(f1 * 100)
        if not args.batch:
            rp.print("True positive: {}".format(len(true_positive)))
            rp.print("False Negative: {}".format(len(false_negative)))
            rp.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            rp.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            rp.print("Recall:    {}".format(rc_text))
            rp.print("Precision: {}".format(pr_text))
            rp.print("F1       : {}".format(f1_text))
        if args.org:
            # output org-mode
            columns = [rc_text, pr_text, f1_text]
            if args.cols:
                columns = args.cols + columns
            rp.print('| {} |'.format(' | '.join(columns)))
        if args.debug:
            if not args.batch:
                print("Debug file: {}".format(args.debug))
            debugfile = TextReport(args.debug)
            debugfile.print(".:: Table of content ::.")
            debugfile.print("")
            debugfile.print("[Misisng senses]")
            debugfile.print("[By classes]")
            debugfile.print("[Summary]")
            debugfile.print("")
            ss_map = {}
            debugfile.header("[Missing senses]")
            for sid, cfrom, cto, label in sorted(false_negative):
                if label not in ss_map:
                    ss = omw.get_synset(label, ctx=ctx)
                    ss_map[label] = ss
                else:
                    ss = ss_map[label]
                # get the surface form
                surface = gold.get(sid).text[int(cfrom):int(cto)]
                debugfile.print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(sid, cfrom, cto, surface, label, ss.definition, ss.lemmas))
            # by classes
            c = Counter()
            c.update(synsetID for sentID, cfrom, cto, synsetID in false_negative)
            debugfile.header("[By classes]")
            for synsetID, freq in c.most_common():
                ss = ss_map[synsetID]
                debugfile.print("{}: {} | ({}) - {}".format(synsetID, freq, ', '.join(ss.lemmas), ss.definition))
            # summary
            debugfile.header("[Summary]")
            debugfile.print("True positive: {}".format(len(true_positive)))
            debugfile.print("False positive: {}".format(len(false_negative)))
            debugfile.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            debugfile.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            debugfile.print("Recall (TP/Gtags)   : {}".format(rc_text))
            debugfile.print("Precision (TP/Ptags): {}".format(pr_text))
            debugfile.print("F1  (2*p*r/(p+r))   : {}".format(f1_text))
    ctx.close()
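The scoring at the end reduces to precision, recall, and F1 over the matched tag sets; a small helper doing the same arithmetic, with guards for the zero-count cases:

def prf1(tp, gold_total, predicted_total):
    """Precision, recall and F1 from a true-positive count and the two totals."""
    precision = tp / predicted_total if predicted_total else 0.0
    recall = tp / gold_total if gold_total else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1

# Example: 80 correct senses out of 100 predicted, against 120 gold senses
p, r, f = prf1(80, 120, 100)
print("Precision: {:.2f}% | Recall: {:.2f}% | F1: {:.2f}%".format(p * 100, r * 100, f * 100))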