Exemplo n.º 1
0
def main():
    """Join guessed lexical data onto a Finnish TSV database.

    Parses the command line, loads the join table given with ``--join``
    (keyed by its ``new_paras`` column with surrounding ``[]`` stripped),
    then reads ``--input`` row by row.  For every row a wordmap is built,
    extended with the matching join record, passed through the guessing and
    pre-processing helpers, split by the ``particle``, ``subcat`` and
    ``symbol`` fields, and written to ``--output``.

    The function never returns normally: it exits with status 1 when at
    least one row had a ``new_paras`` key that is missing from the join
    table, and through a plain ``exit()`` otherwise.
    """
    # initialise argument parser
    ap = argparse.ArgumentParser(description="Guess more data for Finnish TSV databases")
    ap.add_argument("--quiet", "-q", action="store_false", dest="verbose",
            default=False,
            help="do not print output to stdout while processing")
    ap.add_argument("--verbose", "-v", action="store_true", default=False,
            help="print each step to stdout while processing")
    ap.add_argument("--input", "-i", action="store", required=True,
            metavar="IFILE", help="read data from IFILE")
    # NOTE(review): no version= string is supplied for the "version" action;
    # confirm that -V behaves as intended.
    ap.add_argument("--version", "-V", action="version")
    ap.add_argument("--output", "-o", action="store", required=True,
            metavar="OFILE", help="write data to OFILE")
    # type=int: the value is compared against len() of each row below, which
    # fails with TypeError if a command-line value is left as a string.
    ap.add_argument("--fields", "-f", action="store", default=2, type=int,
            metavar="N", help="read N fields from master")
    ap.add_argument("--join", "-j", action="store", required=True,
            metavar="JFILE", help="read join fields from JFILE")
    ap.add_argument("--separator", "-s", action="store", default="\t",
            metavar="SEP", help="use SEP as separator")
    # NOTE(review): args.comment is parsed but never read in this function.
    ap.add_argument("--comment", "-C", action="append", default=["#"],
            metavar="COMMENT", help="skip lines starting with COMMENT that "
                "do not have SEPs")
    ap.add_argument("--strip", "-S", action="store",
            metavar="STRIP", help="strip STRIP characters")
    args = ap.parse_args()

    # a quote character given with --strip switches the csv dialect to
    # fully quoted fields; anything else disables quoting
    if args.strip == '"' or args.strip == "'":
        quoting = csv.QUOTE_ALL
        quotechar = args.strip
    else:
        quoting = csv.QUOTE_NONE
        quotechar = None

    errors = False
    joinmap = dict()
    # read joins from file if any
    with open(args.join, 'r', newline='') as joins:
        join_reader = csv.DictReader(joins, delimiter=args.separator,
                quoting=quoting, escapechar='\\', strict=True)
        for join_parts in join_reader:
            if len(join_parts) < 3:
                print("Must have at leas N separators in joins; skipping",
                        join_parts)
                continue
            key = join_parts['new_paras'].strip('[]')
            joinmap[key] = join_parts

    # read from csv files
    with open(args.output, 'w', newline='') as output:
        tsv_writer = csv.DictWriter(output,
                fieldnames=get_wordmap_fieldnames(),
                delimiter=args.separator, quoting=quoting,
                escapechar='%', quotechar=quotechar, strict=True)
        tsv_writer.writeheader()
        with open(args.input, 'r', newline='') as infile:
            tsv_reader = csv.reader(infile, delimiter=args.separator,
                    quoting=quoting, escapechar='\\', strict=True)
            for linecount, tsv_parts in enumerate(tsv_reader, start=1):
                if args.verbose and (linecount % 10000 == 0):
                    print(linecount, "...", sep='', end='\r')
                if len(tsv_parts) < args.fields:
                    print("Must have at least N separators on each",
                            "non-comment non-empty line; skipping:",
                            tsv_parts, file=stderr)
                    continue
                # here starts the guessworks
                # the aim is to fill dict wordmap with data necessary to
                # generate a lexc line
                wordmap = init_wordmap()
                wordmap = parse_defaults_from_tsv(wordmap, tsv_parts)
                wordmap = parse_extras_from_tsv(wordmap, tsv_parts)
                # Extend from known new paras
                joinkey = ",".join(wordmap['new_paras'])
                if joinkey not in joinmap:
                    print("\033[93mMissing!\033[0m",
                          "new para not in join data:", joinkey)
                    errors = True
                    continue
                for k, v in joinmap[joinkey].items():
                    if k == 'new_paras':
                        continue
                    # the join file stores everything as text; map the
                    # spelled-out constants and the numeric class back
                    if v == "False":
                        wordmap[k] = False
                    elif v == "None":
                        wordmap[k] = None
                    elif k == 'kotus_tn':
                        wordmap[k] = int(v)
                    else:
                        wordmap[k] = v

                # Guess works in order
                wordmap = guess_stem_features_ktn(wordmap)
                wordmap = guess_pronunciation(wordmap)
                wordmap = guess_grade_dir_from_ktn(wordmap)
                wordmap = guess_harmony(wordmap)
                wordmap = guess_new_class(wordmap)
                # here is actual python code doing the pre-processing
                wordmap = plurale_tantum_get_singular_stem(wordmap)
                wordmap = gradation_make_morphophonemes(wordmap)
                wordmap = stub_all_ktn(wordmap)
                # suffixes can be id'd by the - in beginning. They need an own lexicon
                wordmap = guess_bound_morphs(wordmap)
                if wordmap['is_suffix']:
                    wordmap['real_pos'] = wordmap['pos']
                    wordmap['pos'] = 'SUFFIX'
                # put interjections in separate lexicon to allow chaining them
                if "'PCLE_HAH'" in wordmap['new_paras']:
                    wordmap['real_pos'] = wordmap['pos']
                    wordmap['pos'] = 'INTERJECTION'
                # split multiple particle or subcat definitions to distinct lexemes
                wordmaps = [wordmap]
                for split_field in ('particle', 'subcat', 'symbol'):
                    wordmaps = [m for wm in wordmaps
                                for m in split_wordmap_by_field(wm, split_field)]
                # print result
                tsv_writer.writerows(wordmaps)
    if errors:
        print("you must fix database integrity or hack the scripts",
              "before continuing")
        exit(1)

    exit()
Exemplo n.º 2
0
def main():
    """Convert kotus-csv lexical data into newpara-tsv lines.

    Reads quoted, comma-separated lines from ``--input`` (standard input
    when omitted), skips comment, header and too-short lines, builds a
    wordmap from the first four fields, lets the guessing helpers fill in
    the rest and prints ``lemma<TAB>new_paras[<TAB>extras]`` to ``--output``
    (standard output when omitted).  An optional ``--plt-file`` supplies
    plurale tantum values keyed by the quoted first four fields.

    Verbose diagnostics go to standard error so they cannot mix with the
    converted data.  The function never returns normally; it ends with
    ``exit()``.
    """
    # local import: only this entry point needs it
    from contextlib import ExitStack

    ap = argparse.ArgumentParser(
        description=
        "Converts Omorfi's lexical data from old kotus-csv format to newpara-tsv "
        "with possible attribute fields")

    ap.add_argument("--input",
                    "-i",
                    metavar="INFILE",
                    help="read data from INFILE")
    ap.add_argument('--output',
                    '-o',
                    metavar="OUTFILE",
                    help="write converted stuff to OUTFILE")
    ap.add_argument('--plt-file',
                    '-p',
                    metavar="PLTFILE",
                    help="read plurale tantum info (csv) from PLTFILE")
    ap.add_argument('--verbose',
                    '-v',
                    action="store_true",
                    help="Print verbosely while processing")
    # NOTE(review): no version= string is supplied for the "version" action;
    # confirm that -V behaves as intended.
    ap.add_argument("--version", "-V", action="version")
    args = ap.parse_args()

    plt_info = defaultdict(lambda: False)
    if args.plt_file:
        if args.verbose:
            print("Reading plurale tantum data from",
                  args.plt_file,
                  file=stderr)
        with open(args.plt_file, 'r', newline='') as plt_in:
            headers_skipped = False
            for line in plt_in:
                if headers_skipped:
                    # drop the line terminator first, otherwise the closing
                    # quote is not at the end and strip('"') leaves it in
                    lex, plt = line.rstrip('\r\n').rsplit(',', 1)
                    plt_info[lex] = plt.strip('"')
                elif 'HEADERS' in line:
                    headers_skipped = True

    # Only files opened here are registered for closing; the standard
    # streams are left open.
    with ExitStack() as stack:
        if args.input:
            infile = stack.enter_context(open(args.input, 'r', newline=''))
        else:
            infile = stdin
        if args.output:
            outfile = stack.enter_context(open(args.output, 'w', newline=''))
        else:
            outfile = stdout

        for line in infile:
            if line.startswith('#') or '<-HEADERS' in line:
                continue
            fields = line.strip('"\n').split('","')
            if len(fields) < 4:
                if args.verbose:
                    print("Skipping too short line:", line, file=stderr)
                continue
            wordmap = init_wordmap()
            wordmap['stub'] = wordmap['lemma'] = fields[0]
            if args.verbose:
                print(wordmap['lemma'], file=stderr)
            wordmap['kotus_tn'] = int(fields[1])
            wordmap['kotus_av'] = fields[2]
            if wordmap['kotus_av'] == '0':
                wordmap['kotus_av'] = False
            wordmap['pos'] = fields[3]
            wordmap = expand_pos(wordmap)
            if plt_info:
                # .get() keeps the table from growing by one entry per word
                plt_key = '"' + '","'.join(fields[0:4]) + '"'
                wordmap['plurale_tantum'] = plt_info.get(plt_key, False)
            # everything after the fourth field is a free-form attribute
            extras = fields[4:]
            for idx, extra in enumerate(extras):
                if extra.startswith('plt='):
                    wordmap['plurale_tantum'] = extra
                elif extra.startswith('boundaries='):
                    extras[idx] = extra.replace('|', '{WB}')
                    wordmap['stub'] = wordmap['boundaries'] = extras[idx]
            wordmap = guess_stem_features_ktn(wordmap)
            wordmap = guess_pronunciation(wordmap)
            wordmap = guess_grade_dir_from_ktn(wordmap)
            wordmap = guess_harmony(wordmap)
            wordmap = guess_new_class(wordmap)

            wordmap['extras'] = '\t'.join(extras)
            if wordmap['extras']:
                wordmap['extras'] = '\t' + wordmap['extras']

            if args.verbose:
                print("Guessed new para: %(new_paras)r" % (wordmap),
                      file=stderr)
            print("%(lemma)s\t%(new_paras)r%(extras)s" % (wordmap),
                  file=outfile)

    exit()
Exemplo n.º 3
0
def main():
    """Convert kotus-csv lexical data into newpara-tsv lines.

    Reads quoted, comma-separated lines from ``--input`` (standard input
    when omitted), skips comment, header and too-short lines, builds a
    wordmap from the first four fields, runs the guessing helpers on it and
    prints ``lemma<TAB>new_paras[<TAB>extras]`` to ``--output`` (standard
    output when omitted).  An optional ``--plt-file`` supplies plurale
    tantum values keyed by the quoted first four fields.

    Verbose diagnostics go to standard error so they cannot mix with the
    converted data.  The function never returns normally; it ends with
    ``exit()``.
    """
    # local import: only this entry point needs it
    from contextlib import ExitStack

    ap = argparse.ArgumentParser(
        description="Converts Omorfi's lexical data from old kotus-csv format to newpara-tsv "
        "with possible attribute fields"
    )

    ap.add_argument("--input", "-i", metavar="INFILE", help="read data from INFILE")
    ap.add_argument("--output", "-o", metavar="OUTFILE", help="write converted stuff to OUTFILE")
    ap.add_argument("--plt-file", "-p", metavar="PLTFILE", help="read plurale tantum info (csv) from PLTFILE")
    ap.add_argument("--verbose", "-v", action="store_true", help="Print verbosely while processing")
    # NOTE(review): no version= string is supplied for the "version" action;
    # confirm that -V behaves as intended.
    ap.add_argument("--version", "-V", action="version")
    args = ap.parse_args()

    plt_info = defaultdict(lambda: False)
    if args.plt_file:
        if args.verbose:
            print("Reading plurale tantum data from", args.plt_file, file=stderr)
        with open(args.plt_file, "r", newline="") as plt_in:
            headers_skipped = False
            for line in plt_in:
                if headers_skipped:
                    # drop the line terminator first, otherwise the closing
                    # quote is not at the end and strip('"') leaves it in
                    lex, plt = line.rstrip("\r\n").rsplit(",", 1)
                    plt_info[lex] = plt.strip('"')
                elif "HEADERS" in line:
                    headers_skipped = True

    # Only files opened here are registered for closing; the standard
    # streams are left open.
    with ExitStack() as stack:
        if args.input:
            infile = stack.enter_context(open(args.input, "r", newline=""))
        else:
            infile = stdin
        if args.output:
            outfile = stack.enter_context(open(args.output, "w", newline=""))
        else:
            outfile = stdout

        for line in infile:
            if line.startswith("#") or "<-HEADERS" in line:
                continue
            fields = line.strip('"\n').split('","')
            if len(fields) < 4:
                if args.verbose:
                    print("Skipping too short line:", line, file=stderr)
                continue
            wordmap = init_wordmap()
            wordmap["stub"] = wordmap["lemma"] = fields[0]
            if args.verbose:
                print(wordmap["lemma"], file=stderr)
            wordmap["kotus_tn"] = int(fields[1])
            wordmap["kotus_av"] = fields[2]
            if wordmap["kotus_av"] == "0":
                wordmap["kotus_av"] = False
            wordmap["pos"] = fields[3]
            wordmap = expand_pos(wordmap)
            if plt_info:
                # .get() keeps the table from growing by one entry per word
                plt_key = '"' + '","'.join(fields[0:4]) + '"'
                wordmap["plurale_tantum"] = plt_info.get(plt_key, False)

            wordmap = guess_stem_features_ktn(wordmap)
            wordmap = guess_pronunciation(wordmap)
            wordmap = guess_grade_dir_from_ktn(wordmap)
            wordmap = guess_harmony(wordmap)
            wordmap = guess_new_class(wordmap)

            # everything after the fourth field is passed through verbatim
            wordmap["extras"] = "\t".join(fields[4:])
            if wordmap["extras"]:
                wordmap["extras"] = "\t" + wordmap["extras"]

            if args.verbose:
                print("Guessed new para: %(new_paras)r" % (wordmap), file=stderr)
            print("%(lemma)s\t%(new_paras)r%(extras)s" % (wordmap), file=outfile)

    exit()
Exemplo n.º 4
0
def main():
    """Convert kotus-csv lexical data into newpara-tsv lines.

    Reads quoted, comma-separated lines from ``--input`` (standard input
    when omitted), skips comment, header and too-short lines, builds a
    wordmap from the first four fields, lets the guessing helpers fill in
    the rest and prints ``lemma<TAB>new_paras[<TAB>extras]`` to ``--output``
    (standard output when omitted).  An optional ``--plt-file`` supplies
    plurale tantum values keyed by the quoted first four fields.

    Verbose diagnostics go to standard error so they cannot mix with the
    converted data.  The function never returns normally; it ends with
    ``exit()``.
    """
    # local import: only this entry point needs it
    from contextlib import ExitStack

    ap = argparse.ArgumentParser(description=
            "Converts Omorfi's lexical data from old kotus-csv format to newpara-tsv "
            "with possible attribute fields")

    ap.add_argument("--input", "-i", metavar="INFILE",
            help="read data from INFILE")
    ap.add_argument('--output', '-o', metavar="OUTFILE",
            help="write converted stuff to OUTFILE")
    ap.add_argument('--plt-file', '-p', metavar="PLTFILE",
            help="read plurale tantum info (csv) from PLTFILE")
    ap.add_argument('--verbose', '-v', action="store_true",
            help="Print verbosely while processing")
    # NOTE(review): no version= string is supplied for the "version" action;
    # confirm that -V behaves as intended.
    ap.add_argument("--version", "-V", action="version")
    args = ap.parse_args()

    plt_info = defaultdict(lambda: False)
    if args.plt_file:
        if args.verbose:
            print("Reading plurale tantum data from", args.plt_file, file=stderr)
        with open(args.plt_file, 'r', newline='') as plt_in:
            headers_skipped = False
            for line in plt_in:
                if headers_skipped:
                    # drop the line terminator first, otherwise the closing
                    # quote is not at the end and strip('"') leaves it in
                    lex, plt = line.rstrip('\r\n').rsplit(',', 1)
                    plt_info[lex] = plt.strip('"')
                elif 'HEADERS' in line:
                    headers_skipped = True

    # Only files opened here are registered for closing; the standard
    # streams are left open.
    with ExitStack() as stack:
        if args.input:
            infile = stack.enter_context(open(args.input, 'r', newline=''))
        else:
            infile = stdin
        if args.output:
            outfile = stack.enter_context(open(args.output, 'w', newline=''))
        else:
            outfile = stdout

        for line in infile:
            if line.startswith('#') or '<-HEADERS' in line:
                continue
            fields = line.strip('"\n').split('","')
            if len(fields) < 4:
                if args.verbose:
                    print("Skipping too short line:", line, file=stderr)
                continue
            wordmap = init_wordmap()
            wordmap['stub'] = wordmap['lemma'] = fields[0]
            if args.verbose:
                print(wordmap['lemma'], file=stderr)
            wordmap['kotus_tn'] = int(fields[1])
            wordmap['kotus_av'] = fields[2]
            if wordmap['kotus_av'] == '0':
                wordmap['kotus_av'] = False
            wordmap['pos'] = fields[3]
            wordmap = expand_pos(wordmap)
            if plt_info:
                # .get() keeps the table from growing by one entry per word
                plt_key = '"'+'","'.join(fields[0:4])+'"'
                wordmap['plurale_tantum'] = plt_info.get(plt_key, False)
            # everything after the fourth field is a free-form attribute
            extras = fields[4:]
            for idx, extra in enumerate(extras):
                if extra.startswith('plt='):
                    wordmap['plurale_tantum'] = extra
                elif extra.startswith('boundaries='):
                    extras[idx] = extra.replace('|', '{WB}')
                    wordmap['stub'] = wordmap['boundaries'] = extras[idx]
            wordmap = guess_stem_features_ktn(wordmap)
            wordmap = guess_pronunciation(wordmap)
            wordmap = guess_grade_dir_from_ktn(wordmap)
            wordmap = guess_harmony(wordmap)
            wordmap = guess_new_class(wordmap)

            wordmap['extras'] = '\t'.join(extras)
            if wordmap['extras']:
                wordmap['extras'] = '\t' + wordmap['extras']

            if args.verbose:
                print("Guessed new para: %(new_paras)r" %(wordmap), file=stderr)
            print("%(lemma)s\t%(new_paras)r%(extras)s" %(wordmap), file=outfile)

    exit()