Example #1
import argparse
import csv
from sys import argv, exit, stderr
from time import strftime

# NB: OmorFormatter, Ftb3Formatter, ApertiumFormatter, GiellaFormatter,
# NoTagsFormatter and LabeledSegmentsFormatter are assumed to be importable
# from the omorfi codebase; adjust the import paths to your omorfi version.


def main():
    # defaults
    curr_lexicon = dict()
    # initialise argument parser
    ap = argparse.ArgumentParser(
        description=
        "Convert Finnish dictionary TSV data into xerox/HFST lexc format")
    ap.add_argument("--quiet",
                    "-q",
                    action="store_false",
                    dest="verbose",
                    default=False,
                    help="do not print output to stdout while processing")
    ap.add_argument("--verbose",
                    "-v",
                    action="store_true",
                    default=False,
                    help="print each step to stdout while processing")
    ap.add_argument("--master",
                    "-m",
                    action="append",
                    required=True,
                    dest="masterfilenames",
                    metavar="MFILE",
                    help="read lexical roots from MFILEs")
    ap.add_argument("--stemparts",
                    "-p",
                    action="append",
                    required=True,
                    dest='spfilenames',
                    metavar="SPFILE",
                    help="read lexical roots from SPFILEs")
    ap.add_argument("--inflection",
                    "-i",
                    action="append",
                    required=True,
                    dest='inffilenames',
                    metavar="INFFILE",
                    help="read inflection from INFFILEs")
    ap.add_argument("--exclude-pos",
                    "-x",
                    action="append",
                    metavar="XPOS",
                    help="exclude all XPOS parts of speech from generation")
    ap.add_argument("--include-lemmas",
                    "-I",
                    action="append",
                    type=open,
                    metavar="ILFILE",
                    help="read lemmas to include from ILFILE")
    ap.add_argument("--exclude-blacklisted",
                    "-B",
                    action="append",
                    type=str,
                    metavar="BLIST",
                    help="exclude lemmas in BLIST blacklist",
                    choices=[
                        "FGK", "PROPN-BLOCKING", "NOUN-BLOCKING-PROPN",
                        "TOOSHORTFORCOMPOUND"
                    ])
    ap.add_argument("--version", "-V", action="version")
    ap.add_argument("--output",
                    "-o",
                    "--one-file",
                    "-1",
                    type=argparse.FileType("w"),
                    required=True,
                    metavar="OFILE",
                    help="write output to OFILE")
    ap.add_argument("--fields",
                    "-F",
                    action="store",
                    default=2,
                    metavar="N",
                    help="read N fields from master")
    ap.add_argument("--separator",
                    action="store",
                    default="\t",
                    metavar="SEP",
                    help="use SEP as separator")
    ap.add_argument("--comment",
                    "-C",
                    action="append",
                    default=["#"],
                    metavar="COMMENT",
                    help="skip lines starting with COMMENT that"
                    "do not have SEPs")
    ap.add_argument("--strip",
                    action="store",
                    metavar="STRIP",
                    help="strip STRIP from fields before using")
    ap.add_argument("--format",
                    "-f",
                    action="store",
                    default="omor",
                    help="use specific output format for lexc data",
                    choices=[
                        "omor", "giella", "ftb3", "ftb1", "none", "apertium",
                        "labelsegments"
                    ])
    ap.add_argument("--omor-new-para",
                    action="store_true",
                    default=False,
                    help="include NEW_PARA= in raw analyses")
    ap.add_argument("--omor-allo",
                    action="store_true",
                    default=False,
                    help="include ALLO= in raw analyses")
    ap.add_argument("--omor-props",
                    action="store_true",
                    default=False,
                    help="include PROPER= in raw analyses")
    ap.add_argument("--omor-sem",
                    action="store_true",
                    default=False,
                    help="include SEM= in raw analyses")
    ap.add_argument("--none-lemmas",
                    action="store_true",
                    default=False,
                    help="include lemmas in raw analyses")
    ap.add_argument("--none-segments",
                    action="store_true",
                    default=False,
                    help="include segments in raw analyses")
    args = ap.parse_args()

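    # pick the formatter implementation matching the requested output format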
    formatter = None
    if args.format == 'omor':
        formatter = OmorFormatter(args.verbose,
                                  new_para=args.omor_new_para,
                                  allo=args.omor_allo,
                                  props=args.omor_props,
                                  sem=args.omor_sem)
    elif args.format == 'ftb3':
        formatter = Ftb3Formatter(args.verbose)
    elif args.format == 'apertium':
        formatter = ApertiumFormatter(args.verbose)
    elif args.format == 'giella':
        formatter = GiellaFormatter(args.verbose)
    elif args.format == 'none':
        formatter = NoTagsFormatter(args.verbose,
                                    lemmatise=args.none_lemmas,
                                    segment=args.none_segments)
    elif args.format == 'labelsegments':
        formatter = LabeledSegmentsFormatter(args.verbose)
    else:
        print("No formatter implemented yet for", args.format, file=stderr)
        exit(1)
    # check args
    if args.strip == '"' or args.strip == "'":
        quoting = csv.QUOTE_ALL
        quotechar = args.strip
    else:
        quoting = csv.QUOTE_NONE
        quotechar = None
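    # read optional lemma inclusion lists into memory up front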
    lemmas = []
    if args.include_lemmas:
        for lemma_file in args.include_lemmas:
            if args.verbose:
                print("including only lemmas from", lemma_file.name)
            for line in lemma_file:
                lemmas.append(line.rstrip('\n'))
            lemma_file.close()
    if not args.exclude_pos:
        args.exclude_pos = []
    # setup files
    if args.verbose:
        print("Writing everything to", args.output.name)
        if args.exclude_pos:
            print("Not writing closed parts-of-speech data in",
                  ",".join(args.exclude_pos))
    # print definitions to rootfile
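    # (a lexc file starts with Multichar_Symbols and a Root lexicon,
    # followed by named LEXICON blocks)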
    print(formatter.copyright_lexc(), file=args.output)
    if args.verbose:
        print("Creating Multichar_Symbols and Root")
    print(formatter.multichars_lexc(), file=args.output)
    print(formatter.root_lexicon_lexc(), file=args.output)
    # read from csv files
    for tsv_filename in args.masterfilenames:
        if args.verbose:
            print("Reading from", tsv_filename)
        linecount = 0
        print("! Omorfi stubs generated from",
              tsv_filename,
              "\n! date:",
              strftime("%Y-%m-%d %H:%M:%S+%Z"),
              "\n! params: ",
              ' '.join(argv),
              file=args.output)
        print(formatter.copyright_lexc(), file=args.output)
        curr_lexicon = ""
        # for each line
        with open(tsv_filename, "r", newline='') as tsv_file:
            tsv_reader = csv.DictReader(tsv_file,
                                        delimiter=args.separator,
                                        quoting=quoting,
                                        escapechar='%',
                                        quotechar=quotechar,
                                        strict=True)
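            # suffix and abbreviation entries are buffered here and emitted
            # into their own LEXICON blocks after the main pass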
            postponed_suffixes = list()
            postponed_abbrs = {'ABBREVIATION': list(), 'ACRONYM': list()}
            for tsv_parts in tsv_reader:
                linecount += 1
                if args.verbose and (linecount % 10000 == 0):
                    print(linecount, "...", sep='', end='\r')
                if len(tsv_parts) < 18:
                    print("Too few tabs on line",
                          linecount,
                          "skipping following line completely:",
                          file=stderr)
                    print(tsv_parts, file=stderr)
                    continue
                # read data from database
                wordmap = tsv_parts
                # exclusions
                if args.exclude_pos:
                    if wordmap['pos'] in args.exclude_pos:
                        continue
                if args.include_lemmas:
                    if wordmap['lemma'] not in lemmas:
                        continue
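                # blacklisted entries are rerouted to a sink paradigm
                # instead of being dropped outright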
                if args.exclude_blacklisted:
                    if wordmap['blacklist'] in args.exclude_blacklisted:
                        wordmap['new_para'] = 'XXX_BLACKLISTED_SINK'
                # choose correct lexicon
                incoming_lexicon = tsv_parts['upos']
                if tsv_parts['is_suffix']:
                    postponed_suffixes.append(tsv_parts)
                    continue
                elif tsv_parts['abbr']:
                    postponed_abbrs[tsv_parts['abbr']].append(tsv_parts)
                    continue
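                # start a new LEXICON block whenever the target lexicon changes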
                if curr_lexicon != incoming_lexicon:
                    print("\nLEXICON",
                          incoming_lexicon,
                          end="\n\n",
                          file=args.output)
                    curr_lexicon = incoming_lexicon
                # switch back to real POS when possible suffix lexicon has been
                # selected
                if wordmap['real_pos']:
                    wordmap['pos'] = wordmap['real_pos']
                # format output
                print(formatter.wordmap2lexc(wordmap), file=args.output)
            if len(postponed_suffixes) > 0:
                print("\nLEXICON SUFFIX\n\n", file=args.output)
                for suffix in postponed_suffixes:
                    print(formatter.wordmap2lexc(suffix), file=args.output)
            for key, words in sorted(postponed_abbrs.items()):
                print("\nLEXICON", key, "\n\n", file=args.output)
                for word in words:
                    print(formatter.wordmap2lexc(word), file=args.output)
        if args.verbose:
            print("\n", linecount, " entries in master db")
    # print stem parts
    for tsv_filename in args.spfilenames:
        if args.verbose:
            print("Reading from", tsv_filename)
        linecount = 0
        print("! Omorfi stemparts generated from",
              tsv_file.name,
              "! date:",
              strftime("%Y-%m-%d %H:%M:%S+%Z"),
              "! params: ",
              ' '.join(argv),
              file=args.output)
        print(formatter.copyright_lexc(), file=args.output)
        curr_lexicon = ""
        with open(tsv_filename, 'r', newline='') as tsv_file:
            tsv_reader = csv.reader(tsv_file,
                                    delimiter=args.separator,
                                    strict=True)
            for tsv_parts in tsv_reader:
                linecount += 1
                if len(tsv_parts) < 3:
                    print(tsv_filename,
                          linecount,
                          "Too few tabs on line",
                          "skipping following fields:",
                          tsv_parts,
                          file=stderr)
                    continue
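                # the continuation class encodes the POS as the prefix before
                # the first underscore, e.g. NOUN_...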
                pos = tsv_parts[0].split("_")[0]
                if pos not in [
                        "ADJ", "NOUN", "VERB", "PROPN", "NUM", "PRON", "ADP",
                        "ADV", "SYM", "PUNCT", "INTJ", "X", "DIGITS", "CONJ",
                        "SCONJ", "AUX", "DET"
                ]:
                    print("Cannot deduce pos from incoming cont:",
                          tsv_parts[0], "Skipping")
                    continue
                if pos in args.exclude_pos:
                    continue
                # format output
                if curr_lexicon != tsv_parts[0]:
                    print("\nLEXICON",
                          tsv_parts[0],
                          end="\n\n",
                          file=args.output)
                    curr_lexicon = tsv_parts[0]
                for cont in tsv_parts[3:]:
                    print(formatter.continuation2lexc(tsv_parts[1],
                                                      tsv_parts[2], cont),
                          file=args.output)
    # print inflections
    for tsv_filename in args.inffilenames:
        if args.verbose:
            print("Reading from", tsv_filename)
        linecount = 0
        print("! Omorfi inflects generated from",
              tsv_file.name,
              "! date:",
              strftime("%Y-%m-%d %H:%M:%S+%Z"),
              "! params: ",
              ' '.join(argv),
              file=args.output)
        print(formatter.copyright_lexc(), file=args.output)
        curr_lexicon = ""
        # for each line
        with open(tsv_filename, 'r', newline='') as tsv_file:
            tsv_reader = csv.reader(tsv_file,
                                    delimiter=args.separator,
                                    strict=True)
            for tsv_parts in tsv_reader:
                linecount += 1
                if len(tsv_parts) < 3:
                    print(tsv_filename,
                          linecount,
                          "Too few tabs on line",
                          "skipping following fields:",
                          tsv_parts,
                          file=stderr)
                    continue
                pos = tsv_parts[0].split("_")[0]
                if pos not in [
                        "ADJ", "NOUN", "VERB", "PROPN", "NUM", "PRON", "ADP",
                        "ADV", "SYM", "PUNCT", "INTJ", "X", "DIGITS", "CONJ",
                        "SCONJ"
                ]:
                    print("Cannot deduce pos from incoming cont:",
                          tsv_parts[0], "Skipping")
                    continue
                if pos in args.exclude_pos:
                    continue
                # format output
                if curr_lexicon != tsv_parts[0]:
                    print("\nLEXICON",
                          tsv_parts[0],
                          end="\n\n",
                          file=args.output)
                    curr_lexicon = tsv_parts[0]
                for cont in tsv_parts[3:]:
                    print(formatter.continuation2lexc(tsv_parts[1],
                                                      tsv_parts[2], cont),
                          file=args.output)
    exit(0)
Example #2
import argparse
import csv
from sys import argv, exit, stderr
from time import strftime

# NB: OmorFormatter, Ftb3Formatter, ApertiumFormatter, GiellaFormatter,
# NoTagsFormatter and LabeledSegmentsFormatter are assumed to be importable
# from the omorfi codebase; adjust the import paths to your omorfi version.


def main():
    # defaults
    curr_lexicon = dict()
    # initialise argument parser
    ap = argparse.ArgumentParser(
        description="Convert Finnish dictionary TSV data into xerox/HFST lexc format")
    ap.add_argument("--quiet", "-q", action="store_false", dest="verbose",
                    default=False,
                    help="do not print output to stdout while processing")
    ap.add_argument("--verbose", "-v", action="store_true", default=False,
                    help="print each step to stdout while processing")
    ap.add_argument("--stemparts", "-p", action="append", required=True,
                    dest='spfilenames',
                    metavar="SPFILE", help="read lexical roots from SPFILEs")
    ap.add_argument("--inflection", "-i", action="append", required=True,
                    dest='inffilenames',
                    metavar="INFFILE", help="read inflection from INFFILEs")
    ap.add_argument("--suffix-regexes", "-r", action="append", required=True,
                    dest='refilenames',
                    metavar="REFILE", help="read suffix regexes from REFILEs")
    ap.add_argument("--stub-deletions", "-d", action="append", required=True,
                    dest='sdfilenames',
                    metavar="SDFILE", help="read stub deletions from SDFILEs")
    ap.add_argument("--exclude-pos", "-x", action="append",
                    metavar="XPOS",
                    help="exclude all XPOS parts of speech from generation")
    ap.add_argument("--version", "-V", action="version")
    ap.add_argument("--output", "-o", "--one-file", "-1",
                    type=argparse.FileType("w"), required=True,
                    metavar="OFILE", help="write output to OFILE")
    ap.add_argument("--fields", "-F", action="store", default=1,
                    metavar="N", help="require N fields for tables")
    ap.add_argument("--separator", action="store", default="\t",
                    metavar="SEP", help="use SEP as separator")
    ap.add_argument("--comment", "-C", action="append", default=["#"],
                    metavar="COMMENT", help="skip lines starting with COMMENT that"
                    "do not have SEPs")
    ap.add_argument("--strip", action="store",
                    metavar="STRIP", help="strip STRIP from fields before using")
    ap.add_argument("--format", "-f", action="store", default="omor",
                    help="use specific output format for lexc data",
                    choices=["omor", "giella", "ftb3", "ftb1", "none", "apertium",
                             "labelsegments"])
    args = ap.parse_args()

    formatter = None
    if args.format == 'omor':
        formatter = OmorFormatter(args.verbose, new_para=False,
                                  allo=False, props=False, sem=False)
    elif args.format == 'ftb3':
        formatter = Ftb3Formatter(args.verbose)
    elif args.format == 'apertium':
        formatter = ApertiumFormatter(args.verbose)
    elif args.format == 'giella':
        formatter = GiellaFormatter(args.verbose)
    elif args.format == 'none':
        # NB: unlike Example #1, this parser defines no --none-lemmas or
        # --none-segments options, so lemmatisation and segmentation are off
        formatter = NoTagsFormatter(args.verbose,
                                    lemmatise=False, segment=False)
    elif args.format == 'labelsegments':
        formatter = LabeledSegmentsFormatter(args.verbose)
    else:
        print("No formatter implemented yet for", args.format, file=stderr)
        exit(1)
    # check args
    if args.strip == '"' or args.strip == "'":
        quoting = csv.QUOTE_ALL
    else:
        quoting = csv.QUOTE_NONE
    # setup files
    if args.verbose:
        print("Writing everything to", args.output.name)
        if args.exclude_pos:
            print("Not writing closed parts-of-speech data in",
                  ",".join(args.exclude_pos))
    # find deletions to map
    deletions = dict()
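    # map each paradigm (new_para) to its stub deletion string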
    for tsv_filename in args.sdfilenames:
        if args.verbose:
            print("Reading suffix mutations from", tsv_filename)
        linecount = 0
        with open(tsv_filename, 'r', newline='') as tsvfile:
            tsv_reader = csv.DictReader(tsvfile, delimiter=args.separator,
                                        quoting=quoting, escapechar='\\',
                                        strict=True)
            for tsv_parts in tsv_reader:
                linecount += 1
                if args.verbose and (linecount % 1000 == 0):
                    print(linecount, "...", sep='', end='\r')
                if len(tsv_parts) < 2:
                    print("Too few fields on line", linecount,
                          "skipping:", tsv_parts, file=stderr)
                    continue
                # DictReader rows always contain every header key, so test
                # the value rather than key membership
                if tsv_parts.get('deletion'):
                    deletions[tsv_parts['new_para']] = tsv_parts['deletion']
                else:
                    deletions[tsv_parts['new_para']] = ''

    # print definitions to rootfile
    print(formatter.copyright_lexc(), file=args.output)
    if args.verbose:
        print("Creating Multichar_Symbols and Root")
    print(formatter.multichars_lexc(), file=args.output)
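    # the Root lexicon merely forwards everything to the GUESSERS lexicon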
    print("LEXICON Root", file=args.output)
    print("0 GUESSERS ;", file=args.output)

    # print guesser entries built from suffix regexes
    for tsv_filename in args.refilenames:
        if args.verbose:
            print("Reading from", tsv_filename)
        linecount = 0
        print("! Omorfi guessers generated from", tsv_filename,
              "! date:", strftime("%Y-%m-%d %H:%M:%S+%Z"),
              "! params: ", ' '.join(argv), file=args.output)
        print(formatter.copyright_lexc(), file=args.output)
        curr_lexicon = ""
        print("LEXICON GUESSERS", file=args.output)
        with open(tsv_filename, 'r', newline='') as tsv_file:
            tsv_reader = csv.reader(tsv_file, delimiter=args.separator,
                                    strict=True)
            for tsv_parts in tsv_reader:
                linecount += 1
                if len(tsv_parts) < 1:
                    print(tsv_filename, linecount,
                          "Too few tabs on line",
                          "skipping following fields:",
                          tsv_parts, file=stderr)
                    continue
                pos = tsv_parts[0].split("_")[0]
                if pos not in ["ADJ", "NOUN", "VERB", "PROPN", "NUM",
                               "PRON", "ADP", "ADV", "SYM", "PUNCT", "INTJ", "X",
                               "DIGITS", "CONJ", "SCONJ", "AUX", "DET"]:
                    print("Cannot deduce pos from incoming cont:",
                          tsv_parts[0], "Skipping")
                    continue
                # format output
                if tsv_parts[0] not in deletions:
                    print("DATOISSA VIRHE!", tsv_parts[0], "not in",
                          args.sdfilenames)
                    continue
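                # a second field, when present, carries the suffix regex for
                # the guesser entry; otherwise no regex is passed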
                if len(tsv_parts) == 2:
                    print(formatter.guesser2lexc(
                        tsv_parts[1], deletions[tsv_parts[0]], tsv_parts[0]),
                        file=args.output)
                else:
                    print(formatter.guesser2lexc(
                        None, deletions[tsv_parts[0]], tsv_parts[0]),
                        file=args.output)
    # FOLLOWING IS SHARED WITH generate-lexcies
    # print stem parts
    for tsv_filename in args.spfilenames:
        if args.verbose:
            print("Reading from", tsv_filename)
        linecount = 0
        print("! Omorfi stemparts generated from", tsv_file.name,
              "! date:", strftime("%Y-%m-%d %H:%M:%S+%Z"),
              "! params: ", ' '.join(argv), file=args.output)
        print(formatter.copyright_lexc(), file=args.output)
        curr_lexicon = ""
        with open(tsv_filename, 'r', newline='') as tsv_file:
            tsv_reader = csv.reader(tsv_file, delimiter=args.separator,
                                    strict=True)
            for tsv_parts in tsv_reader:
                linecount += 1
                if len(tsv_parts) < 3:
                    print(tsv_filename, linecount,
                          "Too few tabs on line",
                          "skipping following fields:",
                          tsv_parts, file=stderr)
                    continue
                pos = tsv_parts[0].split("_")[0]
                if pos not in ["ADJ", "NOUN", "VERB", "PROPN", "NUM",
                               "PRON", "ADP", "ADV", "SYM", "PUNCT", "INTJ", "X",
                               "DIGITS", "CONJ", "SCONJ", "AUX", "DET"]:
                    print("Cannot deduce pos from incoming cont:",
                          tsv_parts[0], "Skipping")
                    continue
                # format output
                if curr_lexicon != tsv_parts[0]:
                    print("\nLEXICON", tsv_parts[0], end="\n\n",
                          file=args.output)
                    curr_lexicon = tsv_parts[0]
                for cont in tsv_parts[3:]:
                    print(formatter.continuation2lexc(
                        tsv_parts[1], tsv_parts[2], cont),
                        file=args.output)
    # print inflections
    for tsv_filename in args.inffilenames:
        if args.verbose:
            print("Reading from", tsv_filename)
        linecount = 0
        print("! Omorfi inflects generated from", tsv_file.name,
              "! date:", strftime("%Y-%m-%d %H:%M:%S+%Z"),
              "! params: ", ' '.join(argv), file=args.output)
        print(formatter.copyright_lexc(), file=args.output)
        curr_lexicon = ""
        # for each line
        with open(tsv_filename, 'r', newline='') as tsv_file:
            tsv_reader = csv.reader(tsv_file, delimiter=args.separator,
                                    strict=True)
            for tsv_parts in tsv_reader:
                linecount += 1
                if len(tsv_parts) < 3:
                    print(tsv_filename, linecount,
                          "Too few tabs on line",
                          "skipping following fields:", tsv_parts,
                          file=stderr)
                    continue
                pos = tsv_parts[0].split("_")[0]
                if pos not in ["ADJ", "NOUN", "VERB", "PROPN", "NUM",
                               "PRON", "ADP", "ADV", "SYM", "PUNCT", "INTJ",
                               "X", "DIGITS", "CONJ", "SCONJ"]:
                    print("Cannot deduce pos from incoming cont:",
                          tsv_parts[0], "Skipping")
                    continue
                # format output
                if curr_lexicon != tsv_parts[0]:
                    print("\nLEXICON", tsv_parts[0], end="\n\n",
                          file=args.output)
                    curr_lexicon = tsv_parts[0]
                for cont in tsv_parts[3:]:
                    print(formatter.continuation2lexc(
                        tsv_parts[1], tsv_parts[2], cont),
                        file=args.output)
    exit(0)