def read_nttat(cli, args):
    ''' Convert NTTAT patch to JSON '''
    stdout = TextReport()
    ext = 'json'
    rp = TextReport("{}_1.{}".format(args.output, ext))
    rp2 = TextReport("{}_2.{}".format(args.output, ext))
    gwn = get_gwn()
    data = []
    with open(args.input, 'r') as infile, gwn.ctx() as ctx:
        # collect synset IDs (e.g. 12345678-n) from the patch file
        ssids = re.findall(r'\d{8}-[nvarx]', infile.read())
        print(len(ssids))
        print(ssids)
        for sid in ssids:
            ss = gwn.get_synset(sid, ctx=ctx)
            sdef = fix_gwn_def(ss.definition)
            stdout.header(sid, "Lemmas: {}".format(", ".join(ss.lemmas)))
            stdout.print(sdef)
            data.append({"synset": sid,
                         "lemmas": ss.lemmas,
                         "definition": sdef})
    # split the patch into two JSON files
    cut = len(data) // 2
    # first half
    first_half = json.dumps(data[:cut], indent=2)
    rp.write(first_half)
    # second half
    second_half = json.dumps(data[cut:], indent=2)
    rp2.write(second_half)
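
# A minimal invocation sketch (not part of the original module): read_nttat()
# only reads args.input and args.output, so it can be driven directly with an
# argparse Namespace. The file names below are assumptions for illustration.
def _demo_read_nttat():
    from argparse import Namespace
    read_nttat(cli=None, args=Namespace(input='nttat_patch.txt', output='nttat'))
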
def isf_to_ukb(cli, args):
    ''' ISF to UKB '''
    doc = Document.from_file(args.input)
    output = TextReport(args.output)
    tokenfile = TextReport(args.output + '.tokens.txt')
    report = TextReport(args.report)
    report.print("Output file: {}".format(args.output))
    processed = 0
    if not args.ident:
        report.print("No ident was provided")
    for idx, sent in enumerate(doc):
        # sent = doc.by_ident(ident, default=None)
        if args.topk and idx > args.topk:
            break
        if args.ident and sent.ident not in args.ident:
            continue
        if sent is None:
            report.print("Sent #{} is missing".format(idx))
        elif len(sent) == 0:
            report.print("Sent #{} is empty (i.e. there is no parse)".format(sent.ident))
        else:
            sentid = sent.ID if sent.ID else sent.ident
            report.print("Processing {}".format(sentid))
            tokens = sent.readings[0].dmrs().tokenize_pos(strict=args.strict)
            if not tokens:
                report.print("Empty DMRS: {} (no pred???)".format(sentid))
                continue
            # sentence is OK ...
            output.print(sentid)
            for widx, (isf_lemma, pos, cfrom, cto) in enumerate(tokens):
                # in UKB's lemmas, use _ to represent a space
                lemma = isf_lemma.replace('+', '_')
                output.write("{text}#{p}#w{wid}#1 ".format(text=lemma, p=pos, wid=widx))
                tokenfile.writeline('\t'.join((str(sentid), str(widx), str(cfrom), str(cto))))
            output.write('\n\n')
            processed += 1
    report.print("Processed {} sentence(s)".format(processed))
    report.print("Done")
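
# For reference (a sketch; the IDs and tokens below are illustrative): the
# context file written above contains one context-ID line per sentence,
# followed by one whitespace-separated `lemma#pos#wID#1` entry per token,
# with a blank line between contexts, e.g.:
#
#   1001
#   dog#n#w0#1 bark#v#w1#1 loud#a#w2#1
#
# The companion .tokens.txt file maps each wID back to its cfrom/cto offsets.
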
def remove_msw_ttl(cli, args):
    ''' Remove surplus concepts from multi-sense words (msw) in a TTL doc '''
    doc = read_ttl(args.path)
    rp = TextReport(args.debug)
    rp.print("Doc size: {}".format(len(doc)))
    orig_tag_count = 0
    orig_concept_count = 0
    for s in doc:
        orig_concept_count += len(s.concepts)
        orig_tag_count += len(s.tags)
    print("# tags: {}".format(orig_tag_count))
    print("# concepts: {}".format(orig_concept_count))
    manual = dd(lambda: dd(dict))
    nonsenses = set()  # just ignore any tag with these sense IDs
    if args.manual:
        entries = CSV.read_tsv(args.manual)
        for sid, wid, tag, keep, lemma in entries:
            sid, wid, keep = int(sid), int(wid), int(keep)
            if (sid, wid, keep, lemma) == (-1, -1, -1, 'U'):
                nonsenses.add(tag)
            if not lemma:
                manual[sid][wid][tag] = keep
            else:
                manual[sid][wid][(tag, lemma)] = keep
    wn = get_wn()
    ctx = wn.ctx()
    nope_synsets = set()
    ok_synsets = set()
    if args.wn30:
        rp.print("WN30 filter is activated")
    for sidx, sent in enumerate(doc):
        if args.topk and sidx > int(args.topk):
            break
        getLogger().debug("Processing sentence {}/{}".format(sidx + 1, len(doc)))
        getLogger().debug("Before concepts: {}".format(sent.concepts))
        getLogger().debug("Before tags: {}".format(sent.tags))
        # remove concepts that are not in PWN 3.0
        if args.wn30:
            remove_tags = set()
            for tag in sent.tags:
                if tag.tagtype == 'OMW' or tag.label in nonsenses:
                    remove_tags.add(tag)
            for tag in remove_tags:
                sent.tags.remove(tag)
            remove_concepts = set()
            for c in sent.concepts:
                if c.tag in ok_synsets:
                    pass
                elif c.tag in nope_synsets:
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                elif wn.get_synset(c.tag, ctx=ctx) is None:
                    # remove it
                    nope_synsets.add(c.tag)
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                else:
                    ok_synsets.add(c.tag)
            for c in remove_concepts:
                pop_concept(sent, c)
        msw = list(sent.msw())
        tcmap = sent.tcmap()
        # remove_tags = set()
        if msw:
            keep_remove = []
            for w in msw:
                max_len = 0
                keep = []
                remove = set()
                wid = sent.tokens.index(w)
                for c in tcmap[w]:
                    if c.tag in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][c.tag]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif (c.tag, c.clemma) in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][(c.tag, c.clemma)]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif len(c.tokens) == 1 or len(c.tokens) < max_len:
                        remove.add(c)
                    elif c.tag in nonsenses:
                        remove.add(c)
                    else:
                        max_len = len(c.tokens)
                        keep.append(c)
                if len(keep) != 1:
                    keep_remove.append((w, keep, remove))
                else:
                    # everything is OK, remove them now
                    for c in remove:
                        if args.debug:
                            rp.print("Removing concept {} from {}".format(c, sent.ID))
                        getLogger().debug("Removing concept {} from {}".format(c, sent.ID))
                        pop_concept(sent, c)
            if keep_remove:
                rp.header(sent)
                for w, keep, remove in keep_remove:
                    rp.write(w)
                    rp.writeline(" - Keep: {} | Remove: {}".format(keep, remove))
        # remove sent's tags
        # for tag in remove_tags:
        #     getLogger().debug("removing tag: {}".format(tag))
        #     sent.tags.remove(tag)
        getLogger().debug("After concepts: {}".format(sent.concepts))
        getLogger().debug("After tags: {}".format(sent.tags))
    if nope_synsets:
        rp.print("Noped synsets: {}".format(nope_synsets))
    if args.output:
        doc_path = os.path.dirname(args.output)
        doc_name = os.path.basename(args.output)
        new_doc = ttl.Document(doc_name, doc_path)
        sents = doc if not args.topk else list(doc)[:int(args.topk)]
        for s in sents:
            new_doc.add_sent(s)
        tag_count = 0
        concept_count = 0
        for s in sents:
            concept_count += len(s.concepts)
            tag_count += len(s.tags)
        # baking ...
        if args.bake:
            print("Baking doc ...")
            bake_doc(new_doc)
        print("[New] # tags: {}".format(tag_count))
        print("[New] # concepts: {}".format(concept_count))
        rp.print("Writing fixed TTL to {}".format(new_doc.sent_path))
        new_doc.write_ttl()
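
# Sketch of the TSV format expected via args.manual above (column names are
# assumptions, reconstructed from the parsing code in remove_msw_ttl):
#
#   <sent-id> TAB <word-id> TAB <sense-tag> TAB <keep: 1|0> TAB <lemma>
#
# A special row `-1  -1  <tag>  -1  U` marks <tag> as a nonsense sense ID
# that is always removed.
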
# (Reconstructed preamble: the start of this sample is missing. `ct` and `vc`
# are assumed to be chirptext-style Counter objects, and the sample string is
# an illustration only.)
from chirptext import Counter, TextReport

ct = Counter()  # character counter
vc = Counter()  # character-class counter
for char in 'loremipsum':
    ct.count(char)
    vc.count("Letters")
    if char in 'auieo':
        vc.count("Vowels")
    else:
        vc.count("Consonants")
vc.summarise()
ct.summarise(byfreq=True, limit=5)

# ------------------------------------------------------------------------------
# Sample text report
# ------------------------------------------------------------------------------
rp = TextReport()  # by default, TextReport will write to standard output, i.e. terminal
rp.write("This line goes to standard output")

rp1 = TextReport(TextReport.STDOUT)  # same as above
rp1.write("This line goes to standard output")

rp2 = TextReport('~/tmp/my-report.txt')  # output to a file
rp2.write("This is a line in my-report.txt")

rp3 = TextReport.null()  # output to /dev/null, i.e. nowhere
rp3.write("This line goes nowhere")

# a string report
rp4 = TextReport.string()  # output to a string. Call rp4.content() to get the string
rp4.write("This line will be stored in a string buffer")

rp5 = TextReport(TextReport.STRINGIO)  # same as above
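
# Closing the loop on the string-backed reports above: per the note on rp4,
# content() retrieves whatever has been written to the in-memory buffer.
print(rp4.content())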