def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    if args.topk and args.topk <= 0:
        topk = None
        cli.logger.warning("Invalid k will be ignored (k should be greater than or equal to 1)")
    else:
        topk = args.topk
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))

def gen_mfs_5500(cli, args):
    ''' Generate 3rd round treebanking data '''
    rp = TextReport(args.output)
    topk_synsets = topk_mfs(5500)
    # finished treebanking
    first_round = read_lines('data/omw3000_synsets.txt')
    second_round = read_lines('data/omw5000_synsets.txt')
    done_synsets = set(first_round + second_round)
    # new
    third_round = topk_synsets.difference(done_synsets)
    # report
    print("All  :", len(topk_synsets))
    print("Done :", len(done_synsets))
    print("New  :", len(third_round))
    # write to a synset file
    with open('data/omw5300_synsets.txt', 'w') as outfile:
        outfile.write('\n'.join(third_round))
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        profile = 'omw5300'
        filename = 'omw5300A'
        for idx, sid in enumerate(third_round):
            ss = omw.get_synset(sid, ctx=ctx)
            hub[profile].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub[profile].print(d, level=1)
        rp.header("Generated files")
        for f in hub.files.keys():
            rp.print(hub[f].path)

def convert(cli, args):
    ''' Convert patches from CSV format to YAML '''
    rp = TextReport()
    # validate input file
    if not args.input:
        patch_path = os.path.join(DATA_FOLDER, 'patches', '20171112_Wn31_glitches_def.csv')
    else:
        patch_path = args.input
    if not os.path.isfile(patch_path):
        raise Exception("File {} does not exist.".format(patch_path))
    # validate output file
    out_path = args.output if args.output else None
    if out_path == '*.yaml':
        out_path = FileHelper.replace_ext(patch_path, 'yaml')
    rp.print("Input:", patch_path)
    rp.print("Output:", out_path if out_path else '*stdout*')
    # convert patches
    patches = read_csv(patch_path)
    json_patches = [p.to_json() for p in patches]
    yaml_str = yaml.dump(json_patches, default_flow_style=False)
    # dump output
    if out_path:
        with open(out_path, 'w') as outfile:
            outfile.write(yaml_str)
        if args.echo:
            print(yaml_str)
    else:
        print(yaml_str)

def map_preds(cli, args):
    rp = TextReport(args.output)
    ctx = PredSense.wn.ctx()
    not_found = []
    pred_file = 'data/erg_preds_interesting.txt'
    if args.all:
        pred_file = 'data/erg_preds_sorted.txt'
    name, ext = os.path.splitext(pred_file)
    not_found_file = name + "_notfound" + ext
    with open(pred_file, 'r') as infile:
        for p_str in infile.read().splitlines():
            p = Predicate.from_string(p_str)
            candidates = None
            if p.pos == 'x' and p.sense == 'subord':
                continue  # ignore these for now
            # if (p.pos == 'x' and p.sense == 'deg') or p.pos == 'p':
            if args.all or (p.pos and p.pos in 'xpq'):
                rp.header(p, p.lemma, p.pos, p.sense)
                candidates = PredSense.search_pred_string(p, ctx=ctx)
                for c in candidates:
                    rp.print(c.ID, c.lemmas, c.definition)
            if not candidates:
                not_found.append(p_str)
    with TextReport(not_found_file, 'w') as outfile:
        for p in not_found:
            outfile.print(p)
    if args.output:
        print("Written to: {}".format(args.output))
    print("Done")

def read_nttat(cli, args):
    ''' Convert NTTAT patch to JSON '''
    stdout = TextReport()
    ext = 'json'
    rp = TextReport("{}_1.{}".format(args.output, ext))
    rp2 = TextReport("{}_2.{}".format(args.output, ext))
    gwn = get_gwn()
    data = []
    with open(args.input, 'r') as infile, gwn.ctx() as ctx:
        ssids = re.findall(r'\d{8}-[nvarx]', infile.read())
        print(len(ssids))
        print(ssids)
        for sid in ssids:
            ss = gwn.get_synset(sid, ctx=ctx)
            sdef = fix_gwn_def(ss.definition)
            stdout.header(sid, "Lemmas: {}".format(", ".join(ss.lemmas)))
            stdout.print(sdef)
            data.append({"synset": sid,
                         "lemmas": ss.lemmas,
                         "definition": sdef})
    cut = int(len(data) / 2)
    # first half
    first_half = json.dumps(data[:cut], indent=2)
    rp.write(first_half)
    # second half
    second_half = json.dumps(data[cut:], indent=2)
    rp2.write(second_half)

def import_data(cli, args):
    '''Import XML data into SQLite database'''
    rp = TextReport()
    t = Timer(report=rp)
    db_loc = os.path.abspath(os.path.expanduser(args.jdb))
    rp.print("Jamdict DB location        : {}".format(db_loc))
    rp.print("JMDict XML file location   : {}".format(args.jmdxml))
    rp.print("Kanjidic2 XML file location: {}".format(args.kd2xml))
    jam = get_jam(cli, args)
    if args and (args.jdb or args.kd2):
        if os.path.isfile(db_loc):
            if not confirm("Database file exists. Do you want to overwrite it (this action cannot be undone)? yes/no "):
                cli.logger.warning("Program aborted.")
                exit()
            else:
                os.unlink(db_loc)
        # perform import
        t.start("Creating Jamdict SQLite database. This process may take a very long time ...")
        jam.import_data()
        t.stop()
    else:
        print("Database paths were not provided. Process aborted.")

def omw_fix_dup(cli, args):
    rp = TextReport(args.output)
    omw = get_omw()
    c = Counter()
    with omw.ctx() as ctx:
        senses = ctx.sense.select(limit=args.topk, columns=('synset',))
        synsetids = {s.synset for s in senses}
        rp.print("-- OMW synsets: {}\n".format(len(synsetids)))
        for sid in synsetids:
            try:
                sid = SynsetID.from_string(sid)
            except:
                cli.logger.warning("Ignored synset ID: {}".format(sid))
                continue
            ss = omw.get_synset(sid, ctx=ctx)
            fixed_def, dup_defs = join_definitions(ss)
            if dup_defs:
                c.count("Duplicated")
                rp.print("-- Original {}: {}".format(ss.ID, ss.definition))
                rp.print("-- Fixed    {}: {}".format(ss.ID, fixed_def))
                for dup in dup_defs:
                    rp.print("DELETE FROM synset_def WHERE synset='{}' and def='{}';".format(ss.ID, to_sqlite_string(dup)))
                rp.print()
    c.summarise()

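# Illustrative sketch only: to_sqlite_string() above is assumed to escape string
# values for the generated DELETE statements. A minimal stand-in could look like
# the hypothetical helper below (not the project's actual implementation).
def to_sqlite_string_sketch(text):
    # SQLite escapes a single quote inside a string literal by doubling it
    return text.replace("'", "''")
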
def list_gpreds(cli, args):
    rp = TextReport(args.output)
    with open('data/erg_preds_sorted.txt', 'r') as infile:
        sorted_preds = (Predicate.from_string(l) for l in infile)
        for pred in sorted_preds:
            if pred.ptype == Predicate.GRAMMARPRED:
                rp.print(pred)

def list_preds(cli, args):
    rp = TextReport(args.output)
    lexdb = read_erg_lex()
    keyrels = set(l.keyrel for l in lexdb if l.keyrel)
    preds = [Predicate.from_string(p) for p in keyrels]
    sorted_preds = sorted(preds, key=lambda x: x.pos or '')
    # All preds
    with open('data/erg_preds_sorted.txt', 'w') as outfile:
        for pred in sorted_preds:
            outfile.write('{}\n'.format(pred))
    poses = set(p.pos for p in preds)
    trivial_preds = [p for p in preds if p.pos and p.pos in TRIVIAL_POS]
    if not args.trivial:
        preds = [p for p in preds if not p.pos or p.pos not in TRIVIAL_POS]
    interesting_poses = set(p.pos for p in preds)
    # write interesting preds to file
    c = Counter()
    with open('data/erg_preds_interesting.txt', 'w') as outfile:
        for pred in sorted(preds, key=lambda x: "cqpx".index(x.pos) if x.pos else 0):
            c.count(pred.pos if pred.pos else 'NONE')
            outfile.write('{}\n'.format(pred))
    # report
    rp.print("Interesting preds: {}".format(len(preds)))
    rp.print("Trivial preds: {}".format(len(trivial_preds)))
    rp.print("POS: {}".format(poses))
    rp.print("Interesting POS: {}".format(interesting_poses))
    c.summarise(rp)

def lookup(cli, args):
    '''Lookup words by kanji/kana'''
    jam = get_jam(cli, args)
    results = jam.lookup(args.query, strict_lookup=args.strict)
    report = TextReport(args.output)
    if args.format == 'json':
        report.print(json.dumps(results.to_json(),
                                ensure_ascii=args.ensure_ascii,
                                indent=args.indent if args.indent else None))
    else:
        if args.compact:
            report.print(results.text(separator='\n------\n', entry_sep='\n'))
        else:
            dump_result(results, report=report)

def create_ewdb(cli, args):
    db = EWDB(args.db)
    c = Counter()
    rp = TextReport()
    rp.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        for pos in 'nvar':
            file_name = 'data/tsdb/skeletons/omw_{}.txt'.format(pos)
            rp.print("Reading file: {}".format(file_name))
            for idx, row in enumerate(iter_tsv(file_name)):
                lemma, sid, sdef = row
                db.add_sense(sid, lemma, pos, sdef, ctx=ctx)
                c.count("Added")
    c.summarise()

def order_preds(cli, args):
    doc = Document.from_file(args.gold)
    output = TextReport(args.output)
    if not args.ident:
        output.print("No ident was provided")
    for ident in args.ident:
        sent = doc.by_ident(ident, default=None)
        if sent is None:
            output.print("Sent #{} is missing".format(ident))
        else:
            output.print(sent)
            eps = sent[0].dmrs().obj().eps()
            sort_eps(eps)
            output.print(["{}<{}:{}>".format(str(x.pred), x.cfrom, x.cto) for x in eps])
    output.print("Done")

def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    output = TextReport(args.output) if 'output' in args else TextReport()
    output.header("Jamdict | {} - Version: {}".format(version_info.__description__,
                                                      version_info.__version__), level='h0')
    output.header("Basic configuration")
    output.print("JAMDICT_HOME: {}".format(config.home_dir()))
    output.print("Configuration location: {}".format(config._get_config_manager().locate_config()))
    output.header("Data files")
    output.print("Jamdict DB location: {} - {}".format(args.jdb, file_status(args.jdb)))
    output.print("JMDict XML file    : {} - {}".format(args.jmdxml, file_status(args.jmdxml)))
    output.print("KanjiDic2 XML file : {} - {}".format(args.kd2xml, file_status(args.kd2xml)))

def manual_patch(cli, args):
    rp = TextReport()
    omw = get_omw()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Input file could not be found")
    with open(args.input, 'r') as infile, omw.ctx() as ctx:
        synsets = json.loads(infile.read())
        # for ss in synsets:
        #     rp.print(ss['synset'], ss['definition'])
        # rp.print("Found synsets:", len(synsets))
        for sinfo in synsets:
            sid, fixed_def = sinfo['synset'], sinfo['definition']
            ss = omw.get_synset(sid, ctx=ctx)
            orig_def = remove_puncs(ss.definition)
            if remove_puncs(fixed_def) != orig_def:
                rp.header("WARNING:", sid)
                rp.print(ss.definition)
                rp.print(fixed_def)

def doc_stats(cli, args):
    ''' Show document statistics '''
    doc = Document.from_file(args.path)  # input
    output = TextReport(args.output)     # output
    stats = Counter()
    pred_counter = Counter()
    empty_sentences = []
    unknown_preds = Counter()
    all_pos = Counter()
    not_found = None
    if args.ttl:
        ttl_doc = ttl.Document.read_ttl(args.ttl)
        not_found = set(s.ID for s in ttl_doc).difference(s.ident for s in doc)
    for sent in doc:
        stats.count("Sentences")
        if not len(sent):
            stats.count("Sentences-empty")
            empty_sentences.append(sent.ident)
        for reading in sent:
            stats.count("Readings")
            stats['Predicates'] += len(reading.dmrs().layout.nodes)
            # pred_counter.update(n.predstr for n in reading.dmrs().layout.nodes)
            for n in reading.dmrs().layout.nodes:
                if n.pred.pos == 'u' and n.pred.sense == 'unknown':
                    stats.count("Unknown predicates")
                    if '/' in n.pred.lemma:
                        try:
                            lemma, pos = n.pred.lemma.rsplit('/', 1)
                        except:
                            getLogger().warning("Invalid unknown pred: {}".format(n.pred))
                            raise
                        all_pos.count(pos)
                        unknown_preds.count((str(n.pred), lemma, pos))
                    else:
                        stats.count("UFO")
                else:
                    stats.count("Known predicates")
                    pred_counter.count(n.predstr)
    output.header("Summary", level="h0")
    stats.summarise(output)
    output.header("Empty sentences")
    output.print("\n".join(empty_sentences))
    if not_found is not None:
        output.header("Missing from TTL")
        for sid in not_found:
            output.print(sid)
    output.header("Unknown preds POS")
    for pos, count in all_pos.most_common():
        output.print(pos, count, separator='\t')
    output.header("Unknown preds")
    for (pred, lemma, pos), count in unknown_preds.most_common():
        output.print(pred, lemma, pos, count, separator='\t')
    output.header("Known preds", level="h1")
    pred_counter.summarise(output)

def gen_mfs_3000(cli, args):
    rp = TextReport(args.output)
    ssids = list(topk_mfs(3000))
    random.shuffle(ssids)
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw3000A'
        for idx, sid in enumerate(ssids):
            ss = omw.get_synset(sid, ctx=ctx)
            if idx > len(ssids) / 2:
                filename = 'omw3000B'
            hub['omw3000'].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub['omw3000'].print(d, level=1)
        rp.header("Generated files")
        for f in hub.files.keys():
            rp.print(hub[f].path)

def lookup(cli, args):
    '''Lookup words by kanji/kana'''
    jam = get_jam(cli, args)
    if jam.ready:
        results = jam.lookup(args.query, strict_lookup=args.strict)
        report = TextReport(args.output)
        if args.format == 'json':
            report.print(json.dumps(results.to_dict(),
                                    ensure_ascii=args.ensure_ascii,
                                    indent=args.indent if args.indent else None))
        else:
            if args.compact:
                report.print(results.text(separator='\n------\n', entry_sep='\n'))
            else:
                dump_result(results, report=report)
    else:
        getLogger().warning(
            f"Jamdict database is not available.\nThere are 3 ways to install data: \n 1) install jamdict_data via PyPI using `pip install jamdict_data` \n 2) download prebuilt dictionary database file from: {jamdict.__url__}, \n 3) or build your own database file from XML source files.")

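# Illustrative sketch (not part of the CLI above): the same lookup can be done
# directly through the jamdict library. The query string is only an example and
# this assumes a jamdict database is installed (e.g. via `pip install jamdict_data`).
from jamdict import Jamdict

_jam = Jamdict()
if _jam.ready:
    _result = _jam.lookup('食べる', strict_lookup=False)
    for _entry in _result.entries:
        print(_entry)
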
def gen_mfs_5000(cli, args):
    rp = TextReport(args.output)
    from omwtk.wn_ntumc_top3000 import WN_NTUMC_TOP3000
    first_round = set(x['synset'] for x in WN_NTUMC_TOP3000)
    top5000 = topk_mfs(5000)
    round2 = list(top5000.difference(first_round))
    random.shuffle(round2)
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw5000A'
        for idx, sid in enumerate(round2):
            ss = omw.get_synset(sid, ctx=ctx)
            if idx > 200:
                filename = 'omw5000B'
            hub['omw5000'].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub['omw5000'].print(d, level=1)
        rp.header("Generated files")
        for f in hub.files.keys():
            rp.print(hub[f].path)

def extract_wn31(cli, args):
    c = Counter()
    rp = TextReport()
    entries = []
    infile = FileHelper.abspath(args.input)
    if not os.path.isfile(infile):
        rp.print("File not found")
    else:
        rp.print("Processing {}".format(infile))
        tree = etree.iterparse(infile)
        for event, element in tree:
            if event == 'end' and element.tag == 'Synset':
                for child in element:
                    if child.tag == 'Definition':
                        entries.append((element.get('id'), element.get('ili'), child.text))
                        c.count('Definition')
                c.count("Synset")
                element.clear()
        c.summarise(report=rp)
        # Format: wn31sid ili definition
        CSV.write_tsv(args.output, entries)

def find_lesk_candidates(cli, args):
    doc = Document.from_file(args.gold)
    ne = 0
    for s in doc:
        if len(s):
            ne += 1
    print("Gold ISF: {} | not empty sents: {}".format(args.gold, ne))
    # candidates = dd(lambda: dd(set))
    notfound = dd(list)
    ident_sent_map = {}
    all_preds = Counter()
    missing_preds = Counter()
    found_preds = Counter()
    with PredSense.wn.ctx() as ctx:
        for idx, sent in enumerate(doc):
            if not len(sent):
                continue
            elif args.ident and sent.ident not in args.ident:
                continue
            if args.topk and args.topk < idx:
                break
            print(sent)
            ident_sent_map[sent.ident] = sent
            dmrs = sent[0].dmrs()
            if dmrs.tags:
                for ep in dmrs.get_lexical_preds():
                    all_preds.count(str(ep.pred))
                    if ep.nodeid in dmrs.tags:
                        # if there is a tag for this node
                        ep_synsets = PredSense.search_ep(ep, ctx=ctx)  # returns a SynsetCollection()
                        for tag in dmrs.tags[ep.nodeid]:
                            if tag.synset.ID not in ep_synsets:
                                notfound[sent.ident].append((ep.nodeid, str(ep.pred),
                                                             tag.synset.ID, tag.synset.lemma,
                                                             [(x.ID, x.lemma) for x in ep_synsets]))
                                missing_preds.count(str(ep.pred))
                            else:
                                found_preds.count(str(ep.pred))
    output = TextReport(args.output)
    # summarise
    total_found = sum(c for pred, c in found_preds.most_common())
    total_missing = sum(c for pred, c in missing_preds.most_common())
    output.print("Found    : {}".format(total_found))
    output.print("Not found: {}".format(total_missing))
    ratio = (total_missing * 100) / (total_found + total_missing)
    output.print("Missing %: {}".format(ratio))
    # preds by sentences
    output.header("By sentences")
    for sid in sorted(notfound.keys()):
        sent = ident_sent_map[sid]
        output.print((sid, sent.text))
        items = notfound[sid]
        for item in items:
            output.print(item)
        output.print()
    # by preds
    output.header("By preds")
    for pred, occurrence in missing_preds.most_common():
        output.print("{}: {}".format(pred, occurrence))
    print("Done")

def map_predsense(cli, args):
    ''' Pred-Sense Mapping (gold DMRSes, gold Senses) '''
    rp = TextReport(args.output) if args.output else TextReport()
    rp.header("Pred-Sense mapping / strategy = {}".format(args.strat))
    if args.gold:
        sents = Document.from_file(args.gold)
        if args.patchsid:
            patch_gold_sid(sents)
    else:
        sents = read_gold_mrs()
        patch_gold_sid(sents)
    # ignore empty sentences
    empty_sents = [s for s in sents if not len(s)]
    not_empty_sents = [s for s in sents if len(s)]
    rp.print("MRS-Sents: {}".format(len(sents)))
    rp.print("MRS-Sents not empty: {}".format(len(not_empty_sents)))
    if args.ttl:
        doc = ttl.read(args.ttl, mode=args.ttl_format)
    else:
        # [XXX] using gold by default is bad ...
        doc = ttl.Document(name='gold', path='data').read()
    rp.print("TTL-Sents: {}".format(len(doc)))
    found_sents = 0
    for sent in not_empty_sents:
        if doc.get(sent.ident) is None:
            cli.logger.warning("Sentence {} could not be found".format(sent.ident))
        else:
            found_sents += 1
    rp.print("Matched: {}".format(found_sents))
    rp.print("Empty sentences: {}".format([s.ident for s in empty_sents]))
    # Now mapping is possible
    # ----------------------------------------
    ct = Counter()   # total
    cm = Counter()   # matched
    cnm = Counter()  # not matched
    cig = Counter()  # ignored
    sense_lemmas = dd(set)  # sense -> lemma map
    sense_sents = dd(set)   # not-matched senses to sentences
    lemma_sents = dd(set)   # not-matched lemmas to sentences
    rp.print("Performing Pred-Sense Mapping")
    sents_to_map = not_empty_sents[:args.topk] if args.topk else not_empty_sents
    for sent in sents_to_map:
        sent.shallow = doc.get(sent.ident)
        for m, nm, ig in import_shallow(sent, mode=args.strat, no_small_sense=args.noss,
                                        fix_token=args.fixtoken, no_nonsense=args.nononsense):
            for c, nid, pred in m:
                ct.count(c.tag)
                cm.count(c.tag)
            for c in ig:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cig.count(c.tag)
            for c in nm:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cnm.count(c.tag)
                sense_sents[c.tag].add(sent)
                lemma_sents[c.clemma].add(sent)
            # print("Sent #{} - Not matched: {}".format(sent.ident, nm))
            # print("           Matched    : {}".format(len(m)))
    rp.header("Not matched", level='h0')
    for sid, c in cnm.most_common():
        rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
    rp.header("Not matched (by lemma)", level='h0')
    for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
        rp.print("{}: {} | sents: {}".format(clemma, len(sents), [s.ident for s in sents]))
    if args.matched:
        rp.header("Total", level='h0')
        ct.summarise()
        rp.header("Ignored", level='h0')
        for sid, c in cig.most_common():
            rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
    # show sense - sentences
    rp.header("Sense - Sentences", level='h0')
    for sid, c in cnm.most_common():
        sents = sense_sents[sid]
        rp.header("{} - {}".format(sid, sense_lemmas[sid]), level='h2')
        for sent in sents:
            ttl_sent = doc.get(sent.ident)
            rp.print(ttl_sent)
            for concept in ttl_sent.concepts:
                if concept.tag == sid:
                    rp.print('  -> {}'.format(concept))
    rp.header("Lemma - Sentences", level='h0')
    for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
        rp.header("#{}".format(clemma))
        for sent in sents:
            ttl_sent = doc.get(sent.ident)
            rp.print(ttl_sent)
            for concept in ttl_sent.concepts:
                if concept.clemma == clemma:
                    rp.print('  -> {}'.format(concept))
        rp.print()
    # Show final numbers
    total_concepts = sum(x[1] for x in ct.most_common())
    total_matched = sum(x[1] for x in cm.most_common())
    total_notmatched = sum(x[1] for x in cnm.most_common())
    total_ignored = sum(x[1] for x in cig.most_common())
    rp.header("Summarise")
    rp.print("Total concepts: {}".format(total_concepts))
    rp.print("Matched: {}".format(total_matched))
    rp.print("Not matched: {}".format(total_notmatched))
    rp.print("Ignored: {}".format(total_ignored))
    if args.output:
        print("Total concepts: {}".format(total_concepts))
        print("Matched: {}".format(total_matched))
        print("Not matched: {}".format(total_notmatched))
        print("Ignored: {}".format(total_ignored))
        print("Output file: {}".format(args.output))
    print("Done!")
    return total_concepts, total_matched, total_notmatched, total_ignored

def verify_patch(cli, args):
    rp = TextReport()
    c = Counter()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Patch file not found")
    # load patches
    with open(args.input) as infile:
        patches = [DefPatch.from_dict(p) for p in yaml.safe_load(infile)]
    rp.print("Found {} patches.".format(len(patches)))
    # Validate against GWN-30
    # gwn = get_gwn()  # don't use GWN, for now
    omw = get_omw()
    wn = get_wn()
    with omw.ctx() as ctx, wn.ctx() as wnctx:
        for patch in patches:
            try:
                sid = wn.sk2sid(patch.sensekey, ctx=wnctx)
                if not sid:
                    raise Exception("sensekey `{}' does not exist.".format(patch.sensekey))
                ss = omw.get_synset(sid, ctx=ctx)
                ssdef = ss.definition[:-1] if ss.definition.endswith(';') else ss.definition
                if patch.orig_def == ssdef:
                    c.count("Found")
                    rp.print("-", "{} [{}]".format(patch.orig_def, patch.sensekey))
                    rp.print(" ", patch.new_def)
                    if patch.comment:
                        rp.print("C", patch.comment)
                else:
                    c.count("Found - diff")
                    rp.print("[DIFF]", "{} [{}]".format(patch.orig_def, patch.sensekey))
                    rp.print("New:  ", "{} [{}]".format(patch.new_def, patch.sensekey))
                    rp.print("      ", ssdef)
                    rp.print("Note: ", patch.comment)
            except:
                getLogger().warning("sensekey `{}' couldn't be found".format(patch.sensekey))
                c.count("Not found")
                continue
    c.summarise(report=rp)

def isf_to_ukb(cli, args):
    ''' ISF to UKB '''
    doc = Document.from_file(args.input)
    output = TextReport(args.output)
    tokenfile = TextReport(args.output + '.tokens.txt')
    report = TextReport(args.report)
    report.print("Output file: {}".format(args.output))
    processed = 0
    if not args.ident:
        report.print("No ident was provided")
    for idx, sent in enumerate(doc):
        # sent = doc.by_ident(ident, default=None)
        if args.topk and idx > args.topk:
            break
        if args.ident and sent.ident not in args.ident:
            continue
        if sent is None:
            report.print("Sent #{} is missing".format(sent.ident))
        elif len(sent) == 0:
            report.print("Sent #{} is empty (i.e. there is no parse)".format(sent.ident))
        else:
            sentid = sent.ID if sent.ID else sent.ident
            report.print("Processing {}".format(sentid))
            tokens = sent.readings[0].dmrs().tokenize_pos(strict=args.strict)
            if not tokens:
                report.print("Empty DMRS: {} (no pred???)".format(sentid))
                continue
            # sentence is OK ...
            output.print(sentid)
            for idx, (isf_lemma, pos, cfrom, cto) in enumerate(tokens):
                # in UKB lemmas, use _ to represent a space
                lemma = isf_lemma.replace('+', '_')
                output.write("{text}#{p}#w{wid}#1 ".format(text=lemma, p=pos, wid=idx))
                tokenfile.writeline('\t'.join((str(sentid), str(idx), str(cfrom), str(cto))))
            output.write('\n\n')
            processed += 1
    report.print("Processed {} sentence(s)".format(processed))
    report.print("Done")

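# Note on the emitted format (an assumption about UKB's expected input, not
# stated in the code above): UKB's WSD tool reads contexts as a context-id line
# followed by a line of lemma#pos#wordid#flag tokens, e.g.
#
#   10322
#   dog#n#w0#1 bark#v#w1#1 loud#a#w2#1
#
# which matches what isf_to_ukb() writes: the sentence id, then one
# "{lemma}#{pos}#w{idx}#1" token per predicate, then a blank line, while the
# companion .tokens.txt file keeps the character spans for mapping results back.
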
def test_tagging_all(self):
    getLogger().debug("Tagging everything ...")
    sents = self.gold()
    smap = {str(s.ident): s for s in sents}
    # read tags
    doc = ttl.Document('gold', TEST_GOLD_DIR).read()
    filter_wrong_senses(doc)
    count_good_bad = Counter()
    perfects = []
    to_be_checked = dd(list)
    tbc_concepts = dd(list)
    concept_count = Counter()
    fix_texts = []
    instances = Counter()
    tag_map = dd(set)
    report = TextReport('data/gold_report.txt')
    matched_report = TextReport('data/gold_matched.txt')
    not_matched_report = TextReport('data/gold_notmatched.txt')
    for s in sents[:5]:
        sid = str(s.ident)
        if not doc.has_id(sid):
            raise Exception("Cannot find sentence {}".format(sid))
        elif len(s) == 0:
            logging.warning("Empty sentence: {}".format(s))
        else:
            tagged = doc.get(sid)
            if s.text != tagged.text:
                fix_texts.append((s.ident, s.text, tagged.text))
            # try to tag ...
            dmrs = s[0].dmrs()
            matched, not_matched, ignored = tag_gold(dmrs, tagged, s.text, mode=Lexsem.ROBUST)
            if not not_matched:
                count_good_bad.count("Perfect")
                perfects.append((s, matched))
            else:
                for nm in not_matched:
                    tag_map[nm.tag].add(nm.clemma)
                    tbc_concepts[nm.tag].append(s.ident)
                    concept_count.count(nm.tag)
                    instances.count('instances')
                    to_be_checked[s.ident].append(nm)
                count_good_bad.count("To be checked")
    # report matched
    for sent, m in perfects:
        tagged = doc.get(str(sent.ident))
        matched_report.header("#{}: {}".format(sent.ident, sent.text), "h0")
        matched_report.writeline(sent[0].dmrs())
        matched_report.header("Concepts")
        for c, nid, pred in m:
            matched_report.writeline("{} ===> {}:{}".format(c, nid, pred))
        matched_report.writeline()
        matched_report.writeline()
    # report not matched
    not_matched_report.header("By senses", "h0")
    for k, v in concept_count.most_common():
        sids = ' '.join(["#{}".format(x) for x in tbc_concepts[k]])
        not_matched_report.print("{}: {} | {} => {}".format(k, v, sids, tag_map[k]))
    not_matched_report.header("By sentences", "h0")
    for sid, nm in to_be_checked.items():
        not_matched_report.print("#{}: {} | {}".format(sid, nm, smap[str(sid)].text))
    # full details
    for sid, nm in to_be_checked.items():
        sent = smap[str(sid)]
        tagged = doc.get(str(sid))
        not_matched_report.header("#{}: {}".format(sid, sent.text))
        not_matched_report.writeline(sent[0].dmrs())
        for n in nm:
            not_matched_report.writeline(n)
    # for i, t1, t2 in fix_texts:
    #     getLogger().debug(i)
    #     getLogger().debug(t1)
    #     getLogger().debug(t2)
    count_good_bad.summarise(report=report)
    instances.summarise(report=report)

rp3 = TextReport.null()  # output to /dev/null, i.e. nowhere
rp3.write("This line goes nowhere")
rp4 = TextReport.string()  # output to a string. Call rp4.content() to get the string
rp4.write("This line will be stored in a string buffer")
rp5 = TextReport(TextReport.STRINGIO)  # same as above
rp5.write("This line will also be stored in a string buffer")

# TextReport closes the output stream automatically when used in a with statement
with TextReport.string() as rp:
    rp.header("Lorem Ipsum Analysis", level="h0")
    rp.header("Raw", level="h1")
    rp.print(LOREM_IPSUM)
    rp.header("Character Frequency")
    ct.summarise(report=rp)
    print(rp.content())

# ------------------------------------------------------------------------------
# Web fetcher
# ------------------------------------------------------------------------------
from chirptext import WebHelper

web = WebHelper('~/tmp/webcache.db')
data = web.fetch('https://letuananh.github.io/test/data.json')
print(data)
data_json = web.fetch_json('https://letuananh.github.io/test/data.json')
print(data_json)

def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    output = TextReport(args.output) if 'output' in args else TextReport()
    if args.config:
        jamdict.config.read_config(args.config)
    output.print("Jamdict " + jamdict.version_info.__version__)
    output.print(jamdict.version_info.__description__)
    jam = get_jam(cli, args)
    output.header("Basic configuration")
    jamdict_home = jamdict.config.home_dir()
    if not os.path.isdir(os.path.expanduser(jamdict_home)):
        jamdict_home += " [Missing]"
    else:
        jamdict_home += " [OK]"
    output.print(f"JAMDICT_HOME: {jamdict_home}")
    if jamdict.util._JAMDICT_DATA_AVAILABLE:
        import jamdict_data
        data_pkg = f"version {jamdict_data.__version__} [OK]"
    else:
        data_pkg = "Not installed"
    output.print(f"jamdict-data: {data_pkg}")
    if args.config:
        _config_path = args.config + " [Custom]"
        if not os.path.isfile(args.config):
            _config_path += " [Missing]"
    else:
        _config_path = jamdict.config._get_config_manager().locate_config()
        if not _config_path:
            _config_path = "Not available.\n Run `python3 -m jamdict config` to create configuration file if needed."
    output.print(f"Config file : {_config_path}")
    output.header("Data files")
    output.print(f"Jamdict DB location: {jam.db_file} - {file_status(jam.db_file)}")
    output.print(f"JMDict XML file    : {jam.jmd_xml_file} - {file_status(jam.jmd_xml_file)}")
    output.print(f"KanjiDic2 XML file : {jam.kd2_xml_file} - {file_status(jam.kd2_xml_file)}")
    output.print(f"JMnedict XML file  : {jam.jmnedict_xml_file} - {file_status(jam.jmnedict_xml_file)}")
    if jam.ready:
        output.header("Jamdict database metadata")
        try:
            for meta in jam.jmdict.meta.select():
                output.print(f"{meta.key}: {meta.value}")
        except Exception as e:
            print(e)
            output.print("Error happened while retrieving database meta data")
    output.header("Others")
    output.print(f"puchikarui: version {puchikarui_version}")
    output.print(f"chirptext : version {chirptext_version}")
    output.print(f"lxml      : {jamdict.jmdict._LXML_AVAILABLE}")

# ------------------------------------------------------------------------------
# Sample text report
# ------------------------------------------------------------------------------
# a string report
rp = TextReport()  # by default, TextReport writes to standard output, i.e. the terminal
rp = TextReport(TextReport.STDOUT)  # same as above
rp = TextReport('~/tmp/my-report.txt')  # output to a file
rp = TextReport.null()  # output to /dev/null, i.e. nowhere
rp = TextReport.string()  # output to a string. Call rp.content() to get the string
rp = TextReport(TextReport.STRINGIO)  # same as above

# TextReport closes the output stream automatically when used in a with statement
with TextReport.string() as rp:
    rp.header("Lorem Ipsum Analysis", level="h0")
    rp.header("Raw", level="h1")
    rp.print(LOREM_IPSUM)
    rp.header("Character Frequency")
    ct.summarise(report=rp)
    print(rp.content())

# ------------------------------------------------------------------------------
# Web fetcher
# ------------------------------------------------------------------------------
from chirptext import WebHelper

web = WebHelper('~/tmp/webcache.db')
data = web.fetch('https://letuananh.github.io/test/data.json')
print(data)
data_json = web.fetch_json('https://letuananh.github.io/test/data.json')
print(data_json)

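# Minimal sketch of the objects the samples above assume: LOREM_IPSUM is just a
# sample string and ct a chirptext Counter tallying its characters. Both names
# are stand-ins defined here for illustration, not taken from the original sample;
# Counter/TextReport are assumed to be importable from chirptext's top level,
# consistent with the `from chirptext import WebHelper` line above.
from chirptext import Counter, TextReport

LOREM_IPSUM = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
ct = Counter()
for char in LOREM_IPSUM:
    if char != ' ':
        ct.count(char)  # tally each non-space character
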
def dump_result(results, report=None):
    if report is None:
        report = TextReport()
    if results.entries:
        report.print("=" * 40)
        report.print("Found entries")
        report.print("=" * 40)
        for e in results.entries:
            kj = ', '.join([k.text for k in e.kanji_forms])
            kn = ', '.join([k.text for k in e.kana_forms])
            report.print("Entry: {} | Kj: {} | Kn: {}".format(e.idseq, kj, kn))
            report.print("-" * 20)
            for idx, s in enumerate(e.senses):
                report.print("{idx}. {s}".format(idx=idx + 1, s=s))
            report.print('')
    else:
        report.print("No dictionary entry was found.")
    if results.chars:
        report.print("=" * 40)
        report.print("Found characters")
        report.print("=" * 40)
        for c in results.chars:
            report.print("Char: {} | Strokes: {}".format(c, c.stroke_count))
            report.print("-" * 20)
            for rmg in c.rm_groups:
                report.print("Readings:", ", ".join([r.value for r in rmg.readings]))
                report.print("Meanings:", ", ".join([m.value for m in rmg.meanings
                                                     if not m.m_lang or m.m_lang == 'en']))
    else:
        report.print("No character was found.")

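# A minimal usage sketch for dump_result() (assumptions: a jamdict database is
# installed so Jamdict() is ready, TextReport is importable from chirptext, and
# the query below is only an example). The result is rendered into a string
# buffer via TextReport.string(), as in the sample report section above.
from chirptext import TextReport
from jamdict import Jamdict

_jam2 = Jamdict()
if _jam2.ready:
    with TextReport.string() as _rp:
        dump_result(_jam2.lookup('水'), report=_rp)
        print(_rp.content())
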
def remove_msw_ttl(cli, args):
    doc = read_ttl(args.path)
    rp = TextReport(args.debug)
    rp.print("Doc size: {}".format(len(doc)))
    orig_tag_count = 0
    orig_concept_count = 0
    for s in doc:
        orig_concept_count += len(s.concepts)
        orig_tag_count += len(s.tags)
    print("# tags: {}".format(orig_tag_count))
    print("# concepts: {}".format(orig_concept_count))
    manual = dd(lambda: dd(dict))
    nonsenses = set()  # just ignore any tag with these sense IDs
    if args.manual:
        entries = CSV.read_tsv(args.manual)
        for sid, wid, tag, keep, lemma in entries:
            sid, wid, keep = int(sid), int(wid), int(keep)
            if (sid, wid, keep, lemma) == (-1, -1, -1, 'U'):
                nonsenses.add(tag)
            if not lemma:
                manual[sid][wid][tag] = keep
            else:
                manual[sid][wid][(tag, lemma)] = keep
    wn = get_wn()
    ctx = wn.ctx()
    nope_synsets = set()
    ok_synsets = set()
    if args.wn30:
        rp.print("WN30 filter is activated")
    for sidx, sent in enumerate(doc):
        if args.topk and sidx > int(args.topk):
            break
        getLogger().debug("Processing sentence {}/{}".format(sidx + 1, len(doc)))
        getLogger().debug("Before concepts: {}".format(sent.concepts))
        getLogger().debug("Before tags: {}".format(sent.tags))
        # remove concepts that are not in PWN 3.0
        if args.wn30:
            remove_tags = set()
            for tag in sent.tags:
                if tag.tagtype == 'OMW' or tag.label in nonsenses:
                    remove_tags.add(tag)
            for tag in remove_tags:
                sent.tags.remove(tag)
            remove_concepts = set()
            for c in sent.concepts:
                if c.tag in ok_synsets:
                    pass
                elif c.tag in nope_synsets:
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                elif wn.get_synset(c.tag, ctx=ctx) is None:
                    # remove it
                    nope_synsets.add(c.tag)
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                else:
                    ok_synsets.add(c.tag)
            for c in remove_concepts:
                pop_concept(sent, c)
        msw = list(sent.msw())
        tcmap = sent.tcmap()
        # remove_tags = set()
        if msw:
            keep_remove = []
            for w in msw:
                max_len = 0
                keep = []
                remove = set()
                wid = sent.tokens.index(w)
                for c in tcmap[w]:
                    if c.tag in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][c.tag]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif (c.tag, c.clemma) in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][(c.tag, c.clemma)]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif len(c.tokens) == 1 or len(c.tokens) < max_len:
                        remove.add(c)
                    elif c.tag in nonsenses:
                        remove.add(c)
                    else:
                        max_len = len(c.tokens)
                        keep.append(c)
                if len(keep) != 1:
                    keep_remove.append((w, keep, remove))
                else:
                    # everything is OK, remove them now
                    for c in remove:
                        if args.debug:
                            rp.print("Removing concept {} from {}".format(c, sent.ID))
                        getLogger().debug("Removing concept {} from {}".format(c, sent.ID))
                        pop_concept(sent, c)
            if keep_remove:
                rp.header(sent)
                for w, keep, remove in keep_remove:
                    rp.write(w)
                    rp.writeline(" - Keep: {} | Remove: {}".format(keep, remove))
        # remove sent's tags
        # for tag in remove_tags:
        #     getLogger().debug("removing tag: {}".format(tag))
        #     sent.tags.remove(tag)
        getLogger().debug("After concepts: {}".format(sent.concepts))
        getLogger().debug("After tags: {}".format(sent.tags))
    if nope_synsets:
        rp.print("Noped synsets: {}".format(nope_synsets))
    if args.output:
        doc_path = os.path.dirname(args.output)
        doc_name = os.path.basename(args.output)
        new_doc = ttl.Document(doc_name, doc_path)
        sents = doc if not args.topk else list(doc)[:int(args.topk)]
        for s in sents:
            new_doc.add_sent(s)
        tag_count = 0
        concept_count = 0
        for s in sents:
            concept_count += len(s.concepts)
            tag_count += len(s.tags)
        # baking ...
        if args.bake:
            print("Baking doc ...")
            bake_doc(new_doc)
        print("[New] # tags: {}".format(tag_count))
        print("[New] # concepts: {}".format(concept_count))
        rp.print("Writing fixed TTL to {}".format(new_doc.sent_path))
        new_doc.write_ttl()

def compare_ttls(cli, args):
    ''' Compare TTL to gold '''
    rp = TextReport()
    omw = get_omw()
    ctx = omw.ctx()
    gold = None
    profile = None
    ignored_ids = []
    if args.ignore:
        ignored_ids = [x.strip() for x in read_file(args.ignore).splitlines() if x.strip()]
        getLogger().debug("Ignored sentence IDs: {}".format(', '.join(ignored_ids)))
    if args.gold_profile:
        gold = read_ttl(args.gold_profile, ttl_format=args.ttl_format)
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                gold.pop(sid, default=None)
        if not args.batch:
            rp.header("Gold sentences: {} | Loc: {}".format(len(gold), args.gold_profile))
        if args.verbose and not args.batch:
            for s in gold:
                rp.print("Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no gold!")
    # read profile
    if args.profile:
        profile = read_ttl(args.profile, ttl_format=args.ttl_format)
        if not args.batch:
            rp.header("Profile sentences: {} | Loc: {}".format(len(profile), args.profile))
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                profile.pop(sid, default=None)
            if not args.batch:
                rp.header("Profile sentences: {} (ignored: {}) | Loc: {}".format(len(profile), len(ignored_ids), args.profile))
        if args.verbose and not args.batch:
            for s in profile:
                getLogger().debug("Profile/Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no profile to evaluate")
    # calculate precision and recall
    if gold and profile:
        gold_tags, gold_tags_len, gold_ignored = prepare_tags(gold, args=args, nonsense=args.nonsense)
        profile_tags, profile_tags_len, profile_ignored = prepare_tags(profile, args=args, nonsense=args.nonsense)
        if gold_tags_len == 0:
            rp.print("WARNING: There was no tag found in the gold profile. Please make sure that the tags for comparison are *sentence level* tags")
        if profile_tags_len == 0:
            rp.print("WARNING: There was no tag found in the evaluating profile. Please make sure that the tags for comparison are *sentence level* tags")
        getLogger().debug("Gold tags: {}".format(gold_tags_len))
        getLogger().debug(list(gold_tags.items())[:5])
        getLogger().debug("Profile tags: {}".format(profile_tags_len))
        getLogger().debug(list(profile_tags.items())[:5])
        true_positive, false_negative = score(gold_tags, profile_tags, args=args)
        precision = len(true_positive) / profile_tags_len
        recall = len(true_positive) / gold_tags_len
        f1 = 2 * precision * recall / (precision + recall)
        getLogger().debug("TP: {}".format(len(true_positive)))
        getLogger().debug("FN: {}".format(len(false_negative)))
        getLogger().debug("Recall (TP/Gtags): {}".format(recall))
        getLogger().debug("Precision (TP/Ptags): {}".format(precision))
        getLogger().debug("F1 (2*p*r/(p+r)): {}".format(f1))
        rc_text = "{:.2f}%".format(recall * 100)
        pr_text = "{:.2f}%".format(precision * 100)
        f1_text = "{:.2f}%".format(f1 * 100)
        if not args.batch:
            rp.print("True positive: {}".format(len(true_positive)))
            rp.print("False negative: {}".format(len(false_negative)))
            rp.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            rp.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            rp.print("Recall:    {}".format(rc_text))
            rp.print("Precision: {}".format(pr_text))
            rp.print("F1       : {}".format(f1_text))
        if args.org:
            # output org-mode
            columns = [rc_text, pr_text, f1_text]
            if args.cols:
                columns = args.cols + columns
            rp.print('| {} |'.format(' | '.join(columns)))
        if args.debug:
            if not args.batch:
                print("Debug file: {}".format(args.debug))
            debugfile = TextReport(args.debug)
            debugfile.print(".:: Table of content ::.")
            debugfile.print("")
            debugfile.print("[Missing senses]")
            debugfile.print("[By classes]")
            debugfile.print("[Summary]")
            debugfile.print("")
            ss_map = {}
            debugfile.header("[Missing senses]")
            for sid, cfrom, cto, label in sorted(false_negative):
                if label not in ss_map:
                    ss = omw.get_synset(label, ctx=ctx)
                    ss_map[label] = ss
                else:
                    ss = ss_map[label]
                # get the surface form
                surface = gold.get(sid).text[int(cfrom):int(cto)]
                debugfile.print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(sid, cfrom, cto, surface, label, ss.definition, ss.lemmas))
            # by classes
            c = Counter()
            c.update(synsetID for sentID, cfrom, cto, synsetID in false_negative)
            debugfile.header("[By classes]")
            for synsetID, freq in c.most_common():
                ss = ss_map[synsetID]
                debugfile.print("{}: {} | ({}) - {}".format(synsetID, freq, ', '.join(ss.lemmas), ss.definition))
            # summary
            debugfile.header("[Summary]")
            debugfile.print("True positive: {}".format(len(true_positive)))
            debugfile.print("False negative: {}".format(len(false_negative)))
            debugfile.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            debugfile.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            debugfile.print("Recall (TP/Gtags)   : {}".format(rc_text))
            debugfile.print("Precision (TP/Ptags): {}".format(pr_text))
            debugfile.print("F1 (2*p*r/(p+r))    : {}".format(f1_text))
    ctx.close()

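# Worked example of the scoring arithmetic above (numbers are made up for
# illustration): with TP = 80, gold tags = 100, predicted tags = 90,
#   recall    = 80 / 100 = 0.80            -> "80.00%"
#   precision = 80 / 90  ≈ 0.8889          -> "88.89%"
#   f1        = 2 * 0.8889 * 0.80 / (0.8889 + 0.80) ≈ 0.8421 -> "84.21%"
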