# Example no. 1
def read_nttat(cli, args):
    ''' Convert an NTTAT patch file to JSON.

    Scans args.input for synset IDs (8 digits + POS letter, e.g. 12345678-n),
    looks each one up in GWN, and writes the collected records split into two
    JSON files: <args.output>_1.json and <args.output>_2.json.
    '''
    stdout = TextReport()
    ext = 'json'
    rp = TextReport("{}_1.{}".format(args.output, ext))
    rp2 = TextReport("{}_2.{}".format(args.output, ext))
    gwn = get_gwn()
    data = []
    with open(args.input, 'r') as infile, gwn.ctx() as ctx:
        # FIX: use a raw string -- '\d' in a normal string literal is an
        # invalid escape sequence (DeprecationWarning now, an error in
        # future Python versions)
        ssids = re.findall(r'\d{8}-[nvarx]', infile.read())
        print(len(ssids))
        print(ssids)
        for sid in ssids:
            ss = gwn.get_synset(sid, ctx=ctx)
            sdef = fix_gwn_def(ss.definition)
            stdout.header(sid, "Lemmas: {}".format(", ".join(ss.lemmas)))
            stdout.print(sdef)
            data.append({
                "synset": sid,
                "lemmas": ss.lemmas,
                "definition": sdef
            })
    # split the collected records into two halves, one per output file
    cut = int(len(data) / 2)
    # first half
    first_half = json.dumps(data[:cut], indent=2)
    rp.write(first_half)
    # second half
    second_half = json.dumps(data[cut:], indent=2)
    rp2.write(second_half)
# Example no. 2
def isf_to_ukb(cli, args):
    ''' Convert an ISF document into UKB input format.

    Writes the UKB token stream to args.output, a token offset map to
    args.output + '.tokens.txt', and a processing log to args.report.
    Sentences can be limited by args.topk (count) and args.ident (ID filter).
    '''
    doc = Document.from_file(args.input)
    output = TextReport(args.output)
    tokenfile = TextReport(args.output + '.tokens.txt')
    report = TextReport(args.report)
    report.print("Output file: {}".format(args.output))
    processed = 0
    if not args.ident:
        report.print("No ident was provided")
    for idx, sent in enumerate(doc):
        if args.topk and idx > args.topk:
            break
        # BUG FIX: the None guard must run before anything dereferences
        # sent (the original read sent.ident both in the ident filter and
        # in the "missing" message, raising AttributeError when sent was
        # None); report the sentence index instead
        if sent is None:
            report.print("Sent #{} is missing".format(idx))
            continue
        if args.ident and sent.ident not in args.ident:
            continue
        if len(sent) == 0:
            report.print("Sent #{} is empty (i.e. there is no parse)".format(sent.ident))
        else:
            sentid = sent.ID if sent.ID else sent.ident
            report.print("Processing {}".format(sentid))
            tokens = sent.readings[0].dmrs().tokenize_pos(strict=args.strict)
            if not tokens:
                report.print("Empty DMRS: {} (no pred???)".format(sentid))
                continue
            # sentence is OK ...
            output.print(sentid)
            # NOTE: use a distinct name (tid) for the token index so it
            # does not shadow the outer sentence index (idx)
            for tid, (isf_lemma, pos, cfrom, cto) in enumerate(tokens):
                # In UKB's lemmas, use _ to represent a space
                lemma = isf_lemma.replace('+', '_')
                output.write("{text}#{p}#w{wid}#1 ".format(text=lemma, p=pos, wid=tid))
                tokenfile.writeline('\t'.join((str(sentid), str(tid), str(cfrom), str(cto))))
            output.write('\n\n')
            processed += 1
    report.print("Processed {} sentence(s)".format(processed))
    report.print("Done")
# Example no. 3
def remove_msw_ttl(cli, args):
    ''' Remove redundant concept tags on multi-sense words (MSW) in a TTL doc.

    Reads the TTL document at args.path, optionally applies a manual
    keep/remove decision table (args.manual, TSV) and a PWN 3.0 synset
    filter (args.wn30), then writes the cleaned document to args.output.
    Ambiguous words (not exactly one surviving concept) are only reported,
    not fixed.
    '''
    doc = read_ttl(args.path)
    # debug report; TextReport(args.debug) writes to that path (or stdout)
    rp = TextReport(args.debug)
    rp.print("Doc size: {}".format(len(doc)))
    # count tags/concepts before any filtering, for the before/after summary
    orig_tag_count = 0
    orig_concept_count = 0
    for s in doc:
        orig_concept_count += len(s.concepts)
        orig_tag_count += len(s.tags)
    print("# tags: {}".format(orig_tag_count))
    print("# concepts: {}".format(orig_concept_count))
    # manual[sent_ID][word_idx][tag or (tag, lemma)] -> truthy keep flag
    manual = dd(lambda: dd(dict))
    nonsenses = set()  # just ignore any tag with these sense IDs
    if args.manual:
        entries = CSV.read_tsv(args.manual)
        for sid, wid, tag, keep, lemma in entries:
            sid, wid, keep = int(sid), int(wid), int(keep)
            # sentinel row (-1, -1, -1, 'U') marks this tag as a nonsense ID
            if (sid, wid, keep, lemma) == (-1, -1, -1, 'U'):
                nonsenses.add(tag)
            # key by tag alone, or by (tag, lemma) when a lemma is given
            if not lemma:
                manual[sid][wid][tag] = keep
            else:
                manual[sid][wid][(tag, lemma)] = keep
    wn = get_wn()
    # NOTE(review): this context is never closed in this function -- confirm
    # whether wn.ctx() needs explicit cleanup
    ctx = wn.ctx()
    # caches shared across sentences so each synset is looked up only once
    nope_synsets = set()
    ok_synsets = set()
    if args.wn30:
        rp.print("WN30 filter is activated")
    for sidx, sent in enumerate(doc):
        if args.topk and sidx > int(args.topk):
            break
        getLogger().debug("Processing sentence {}/{}".format(sidx + 1, len(doc)))
        getLogger().debug("Before concepts: {}".format(sent.concepts))
        getLogger().debug("Before tags: {}".format(sent.tags))
        # remove concepts that are not in PWN 3.0
        if args.wn30:
            # drop sentence-level OMW tags and tags flagged as nonsense
            remove_tags = set()
            for tag in sent.tags:
                if tag.tagtype == 'OMW' or tag.label in nonsenses:
                    remove_tags.add(tag)
            for tag in remove_tags:
                sent.tags.remove(tag)
            # drop concepts whose synset cannot be resolved in PWN 3.0
            remove_concepts = set()
            for c in sent.concepts:
                if c.tag in ok_synsets:
                    pass
                elif c.tag in nope_synsets:
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                elif wn.get_synset(c.tag, ctx=ctx) is None:
                    # remove it
                    nope_synsets.add(c.tag)
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                else:
                    ok_synsets.add(c.tag)
            for c in remove_concepts:
                pop_concept(sent, c)
        # msw: words carrying more than one concept; tcmap: token -> concepts
        msw = list(sent.msw())
        tcmap = sent.tcmap()
        # remove_tags = set()
        if msw:
            # words we could not auto-resolve (kept != exactly 1 concept)
            keep_remove = []
            for w in msw:
                max_len = 0
                keep = []
                remove = set()
                wid = sent.tokens.index(w)
                for c in tcmap[w]:
                    # manual decision keyed by tag alone takes precedence
                    if c.tag in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][c.tag]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    # then manual decision keyed by (tag, lemma)
                    elif (c.tag, c.clemma) in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][(c.tag, c.clemma)]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    # heuristic: prefer the longest multi-token concept;
                    # single-token or shorter-than-best concepts are dropped
                    elif len(c.tokens) == 1 or len(c.tokens) < max_len:
                        remove.add(c)
                    elif c.tag in nonsenses:
                        remove.add(c)
                    else:
                        max_len = len(c.tokens)
                        keep.append(c)
                if len(keep) != 1:
                    # ambiguous -- defer to the report instead of guessing
                    keep_remove.append((w, keep, remove))
                else:
                    # everything is OK, remove them now
                    for c in remove:
                        if args.debug:
                            rp.print("Removing concept {} from {}".format(c, sent.ID))
                        getLogger().debug("Removing concept {} from {}".format(c, sent.ID))
                        pop_concept(sent, c)
            if keep_remove:
                rp.header(sent)
                for w, keep, remove in keep_remove:
                    rp.write(w)
                    rp.writeline(" - Keep: {} | Remove: {}".format(keep, remove))
        # remove sent's tags
        # for tag in remove_tags:
        #     getLogger().debug("removing tag: {}".format(tag))
        #     sent.tags.remove(tag)
        getLogger().debug("After concepts: {}".format(sent.concepts))
        getLogger().debug("After tags: {}".format(sent.tags))
    if nope_synsets:
        rp.print("Noped synsets: {}".format(nope_synsets))
    # write the cleaned document out, if an output path was given
    if args.output:
        doc_path = os.path.dirname(args.output)
        doc_name = os.path.basename(args.output)
        new_doc = ttl.Document(doc_name, doc_path)
        sents = doc if not args.topk else list(doc)[:int(args.topk)]
        for s in sents:
            new_doc.add_sent(s)
        # recount tags/concepts for the after-summary
        tag_count = 0
        concept_count = 0
        for s in sents:
            concept_count += len(s.concepts)
            tag_count += len(s.tags)
        # baking ...
        if args.bake:
            print("Baking doc ...")
            bake_doc(new_doc)
        print("[New] # tags: {}".format(tag_count))
        print("[New] # concepts: {}".format(concept_count))
        rp.print("Writing fixed TTL to {}".format(new_doc.sent_path))
        new_doc.write_ttl()
# Example no. 4
    ct.count(char)
    vc.count("Letters")
    if char in 'auieo':
        vc.count("Vowels")
    else:
        vc.count("Consonants")
vc.summarise()
ct.summarise(byfreq=True, limit=5)

# ------------------------------------------------------------------------------
# Sample text report
# ------------------------------------------------------------------------------
# Demonstrates the different output sinks TextReport supports.
# a string report
rp = TextReport(
)  # by default, TextReport will write to standard output, i.e. terminal
rp.write("This line goes to standard output")

rp1 = TextReport(TextReport.STDOUT)  # same as above
rp1.write("This line goes to standard output")

rp2 = TextReport('~/tmp/my-report.txt')  # output to a file
rp2.write("This is a line in my-report.txt")

rp3 = TextReport.null()  # output to /dev/null, i.e. nowhere
rp3.write("This line goes no where")

rp4 = TextReport.string(
)  # output to a string. Call rp.content() to get the string
rp4.write("This line will be stored in a string buffer")

rp5 = TextReport(TextReport.STRINGIO)  # same as above