def modify(lines, modifiers):
    """Filter and rewrite expanded lines according to *modifiers*.

    When ``modifiers`` is empty the input list is returned untouched.
    Otherwise every line must be accepted by ``filter_word``; accepted lines
    get their part-of-speech token replaced when ``modifiers`` has a ``"pos"``
    entry, and their ``m``/``f``/``n`` gender tag replaced when it has a
    ``"force_gen"`` entry (lines containing ``:patr`` keep their gender).

    Raises:
        Exception: if no line survives the filtering.
    """
    if len(modifiers) == 0:
        return lines

    kept = []
    for entry in lines:
        if filter_word(entry, modifiers):
            if "pos" in modifiers:
                new_pos = " " + modifiers["pos"] + ":"
                entry = re_sub(" [^ :]+:", new_pos, entry)

            if "force_gen" in modifiers and ":patr" not in entry:
                gender = modifiers["force_gen"]
                entry = re_sub(":[mfn](:|$)", ":" + gender + "\\1", entry)
                logger.debug("gen repl: %s in %s", gender, entry)

            kept.append(entry)
        else:
            logger.debug("skip %s %s", entry, modifiers)

    if not kept:
        raise Exception("emtpy output for "+ str(lines) + " and " + str(modifiers))

    return kept
def adjustCommonFlag(affixFlag2):
    """Strip the class number from affix flags that share a common rule set.

    Rewrites performed (each only when its marker substring is present):

    * ``v1.cf`` .. ``v4.cf`` and ``vr1.cf`` .. ``vr4.cf`` become ``v.cf`` / ``vr.cf``
    * ``v<1-9>.impers`` and ``vr<1-9>.impers`` become ``v.impers`` / ``vr.impers``
    * ``n<digits>.patr`` becomes ``n.patr``

    Returns:
        The rewritten flag string; input without those markers is returned
        unchanged.
    """
    # The patterns are raw strings: a "\." inside a normal string literal is
    # an invalid escape sequence (SyntaxWarning on Python 3.12+).
    if ".cf" in affixFlag2:
        # v5.cf is special, so only classes 1-4 are collapsed
        affixFlag2 = re_sub(r"(vr?)[1-4]\.cf", r"\1.cf", affixFlag2)
    if ".impers" in affixFlag2:
        affixFlag2 = re_sub(r"(vr?)[1-9]\.impers", r"\1.impers", affixFlag2)
    if ".patr" in affixFlag2:
        affixFlag2 = re_sub(r"n[0-9]+\.patr", "n.patr", affixFlag2)
    return affixFlag2
def expand_subposition_adv(last_adv, line, extra_tags):
    """Build the comparative and superlative adverb lines for a subposition.

    The adverb is derived by replacing the last two characters of the base
    form with "е". The base form is the value of ``+cs=`` in *line* when
    present. Any ``adjp`` tag group is stripped from *extra_tags* first.

    Returns:
        A list of four ``compose_compar`` results: the comparative, then the
        superlatives prefixed with "най", "щонай" and "якнай".
    """
    if " +cs=" in line:
        base_form = re_match(r" \+cs=([^ ]+)", line).group(1)
    else:
        # NOTE(review): main_word is not a parameter of this function; it is
        # presumably expected as a module-level name - verify, otherwise this
        # branch raises NameError.
        base_form = main_word
    adverb = base_form[:-2] + "е"

    if "adjp" in extra_tags:
        extra_tags = re_sub(r":&?adjp(:pasv|:actv|:pres|:past|:perf|:imperf)+", "", extra_tags)

    comparative = compose_compar(adverb, last_adv, "adv:compr" + extra_tags)
    superlatives = [
        compose_compar(prefix + adverb, last_adv, "adv:super" + extra_tags)
        for prefix in ("най", "щонай", "якнай")
    ]

    return [comparative] + superlatives
def expand_subposition(main_word, line, extra_tags, idx):
    """Expand the comparative and superlative adjective forms of *main_word*.

    Only subposition lines starting with `` +cs`` are supported. The
    comparative lemma is taken from ``+cs=<word>`` when present, otherwise it
    is *main_word* with its last two characters replaced by "іший". The
    superlatives are that lemma prefixed with "най", "щонай" and "якнай".

    Args:
        main_word: lemma of the base adjective.
        line: the subposition line being expanded.
        extra_tags: tags appended to every generated form; a ``:&adjp...``
            tag group is stripped before use.
        idx: subposition index, embedded in the tags as a ``:xx<idx>`` marker.

    Returns:
        The list of expanded forms. Unless ``-corp`` is on the command line
        each form is passed through ``replace_base`` with *main_word*.

    Raises:
        Exception: if *line* is not a `` +cs`` subposition.
    """
    idx = ":xx" + str(idx)
    logger.debug("expanding sub " + idx + " " + main_word + ": " + line)

    if not line.startswith(" +cs"):
        # Raising a plain string is itself a TypeError in Python 3, so the
        # message must be wrapped in an exception object.
        raise Exception("Unknown subposition for " + line + "(" + main_word + ")")

    if " +cs=" in line:
        word = re_match(r" \+cs=([^ ]+)", line).group(1)
    else:
        word = main_word[:-2] + "іший"

    if "&adjp" in extra_tags:
        extra_tags = re_sub(r":&adjp(:pasv|:actv|:pres|:past|:perf|:imperf)+", "", extra_tags)

    word_forms = expand(word, "/adj :compr" + idx + extra_tags, flush_stdout)

    superlative = "най" + word
    for prefix in ("", "що", "як"):
        word_forms.extend(
            expand(prefix + superlative, "/adj :super" + idx + extra_tags, flush_stdout)
        )

    if "-corp" not in sys.argv:
        word_forms = [replace_base(form, main_word) for form in word_forms]

    return word_forms
def get_modifiers(mod_flags, flags, word):
    """Build the modifier mapping for a dictionary entry.

    Args:
        mod_flags: space-separated modifier tokens. Recognised tokens are
            ``^<pos>[:<gender>]``, ``g=<genders>``, ``p=<person>`` and
            ``tag=<tags>``.
        flags: the affix flags of the entry.
        word: the entry's word; only its ending is inspected.

    Returns:
        A dict that may contain the keys ``"pos"``, ``"gen"``, ``"pers"``,
        ``"tag"`` and ``"force_gen"``.

    Raises:
        Exception: if a ``^noun:<gender>`` override is not exactly one
            character long.
    """
    mods = {}

    if "/adj" in flags and "<" in flags:
        mods["pos"] = "noun"

        if "=" not in mod_flags:
            # No explicit g=/p=/tag= token: the gender set is derived from the
            # word ending and the "<+" marker alone, and nothing else applies.
            # (This branch always returns, so the former follow-up
            # 'if not "=" in mod_flags' fallback was unreachable and is gone.)
            ends_with_a = word.endswith("а")
            if "<+" in flags:
                mods["gen"] = "f" if ends_with_a else "mfp"
            else:
                mods["gen"] = "fp" if ends_with_a else "mp"
            return mods

    for mod in mod_flags.split():
        if mod[0] == "^":
            if mod.startswith("^adjp"):
                # the whole token after "^" is the POS, sub-tags included
                mods["pos"] = mod[1:]
            else:
                mod_tags = mod[1:].split(":")
                mods["pos"] = mod_tags[0]
                if len(mod_tags) > 1 and mod_tags[0] == "noun":
                    if len(mod_tags[1]) != 1:
                        raise Exception("Bad gender override: " + str(mod) + " -- " + str(mod_tags))
                    mods["force_gen"] = mod_tags[1]
        elif mod[:2] == "g=":
            mods["gen"] = re_sub("g=([^ ])", "\\1", mod)
        elif mod[:2] == "p=":
            mods["pers"] = mod[2:3]
        elif mod.startswith("tag="):
            mods["tag"] = mod[4:]

    if "<+m" in flags or "<m" in flags:
        mods["force_gen"] = "m"
        if "n2adj" in flags:
            mods["gen"] = "m"

    return mods
def cleanup(line):
    """Return *line* with every ``:xx`` marker removed.

    Each occurrence of ``:xx`` is deleted together with the single character
    that follows it.
    """
    # NOTE(review): only one character after ":xx" is removed, so a marker
    # with a longer index would leave residue - confirm indices stay
    # single-character.
    marker_pattern = ":xx."
    stripped = re_sub(marker_pattern, "", line)
    return stripped
def post_process(lines):
    """Apply the final, output-mode dependent tag rewrites to expanded lines.

    Behaviour depends on switches looked up in ``sys.argv``:

    * without ``--uncontr``, lines containing ``uncontr`` are dropped;
    * with ``-corp``, tags are reordered per part of speech, and adjectives
      tagged ``:&adjp`` additionally yield a separate ``adjp`` line that is
      emitted before the adjective line itself;
    * without ``-corp``, ``:ranim`` / ``:rinanim`` are removed and the
      comparison tag of ``:&adjp`` adjectives is moved forward.

    Plain adverb lines (no ``advp``, ``:compr`` or ``:super``) are always
    passed through ``promote``. Every emitted line finally goes through
    ``util.tail_tag``.
    """
    corp_mode = "-corp" in sys.argv
    keep_uncontr = "--uncontr" in sys.argv

    result = []
    for entry in lines:
        is_plain_adverb = (
            " adv" in entry
            and "advp" not in entry
            and ":compr" not in entry
            and ":super" not in entry
        )
        if is_plain_adverb:
            entry = promote(entry)

        # we don't need uncontr for rules LT
        if not keep_uncontr and "uncontr" in entry:
            continue

        if corp_mode:
            if "advp" in entry:
                entry = promote(entry)
            elif "noun" in entry:
                if ":anim" in entry:
                    entry = entry.replace(":anim", "").replace("noun:", "noun:anim:")
                elif ":inanim" in entry:
                    entry = entry.replace(":inanim", "").replace("noun:", "noun:inanim:")
                elif "&pron" not in entry:
                    entry = entry.replace("noun:", "noun:inanim:")
            elif "verb" in entry:
                entry = re_sub("(verb(?::rev)?)(.*)(:(im)?perf)", "\\1\\3\\2", entry)
            elif "adj" in entry:
                if ":comp" in entry or ":super" in entry:
                    entry = re_sub(" (adj:)(.*):(comp[br]|super)(.*)", " \\1\\3:\\2\\4", entry)
                if ":&adjp" in entry:
                    # emit the participle reading as its own line first
                    participle = re.sub(" (adj(?::compb|:compr|:super)?)(.*):&(adjp(?::pasv|:actv|:past|:pres|:perf|:imperf)+)(.*)", " \\3\\2\\4", entry)
                    result.append(participle)
                    entry = re.sub(":&adjp(:pasv|:actv|:past|:pres|:perf|:imperf)+", "", entry)
        else:
            # we can't handle these yet in LT
            if "ranim" in entry:
                entry = entry.replace(":ranim", "")
            elif "rinanim" in entry:
                entry = entry.replace(":rinanim", "")

            if ":&adjp" in entry and ":comp" in entry:
                entry = re_sub(" (adj:.:v_...:)(.*):(comp[br]|super)(.*)", " \\1\\3:\\2\\4", entry)

        # TODO: extra :coll
        result.append(entry)

    tail_tags = (":v-u", ":bad", ":slang", ":rare", ":coll", ":abbr")  # TODO: add ":alt"
    return [util.tail_tag(entry, tail_tags) for entry in result]
def preprocess2(line):
    """Split or normalise one dictionary source line before expansion.

    Depending on the markers found in *line* it returns:

    * ``/<`` entries: a feminine and/or a masculine ``noun:<g>:nv:np`` line,
      tagged ``:anim:lname`` when ``<+`` is present and ``:anim:fname``
      otherwise;
    * ``/n2`` entries with ``<+``: the line plus a feminine last-name line
      when ``util.dual_last_name_ending`` accepts it and ``<+m`` is absent;
    * ``/n1`` entries with ``<+`` and no explicit gender: the line plus a copy
      with ``<+m``;
    * ``/np`` entries: the line with ``:ns`` appended;
    * ``:imperf:perf`` entries: one line per aspect;
    * ``:&adj`` glued to the preceding text: the line with a space inserted;
    * anything else: the line unchanged.

    Returns:
        A list of lines (possibly empty for a contradictory ``/<`` entry).
    """
    if "/<" in line:
        name_tag = ":anim:lname" if "<+" in line else ":anim:fname"
        variants = []
        if "<m" not in line and "<+m" not in line:
            variants.append(re_sub("/<\\+?f?", "noun:f:nv:np" + name_tag, line))
        if "<f" not in line and "<+f" not in line:
            variants.append(re_sub("/<\\+?m?", "noun:m:nv:np" + name_tag, line))
        return variants

    if "/n2" in line and "<+" in line:
        if "<+m" not in line and util.dual_last_name_ending(line):
            feminine_last_name = line.split()[0] + " noun:f:nv:np:anim:lname"
            return [line, feminine_last_name]
        return [line]

    if "/n1" in line and "<+" in line:
        if "<+f" not in line and "<+m" not in line:
            return [line, line.replace("<+", "<+m")]
        return [line]

    if "/np" in line:
        separator = "" if (" :" in line or " /" not in line) else " "
        return [line + separator + ":ns"]

    if ":imperf:perf" in line:
        imperfective = line.replace(":perf", "")
        # dropping .cf so we don't get two identical advp:perf lines
        perfective = line.replace(":imperf", "").replace(".cf", "")
        return [imperfective, perfective]

    if ":&adj" in line and " :&adj" not in line:
        return [line.replace(":&adj", " :&adj")]

    return [line]
def adjust_affix_tags(lines, main_flag, flags, modifiers):
    """Post-adjust the case tags of expanded lines for one affix flag.

    Each line is rewritten with plain string replacements that merge or drop
    case tags (``v_rod``, ``v_zna``, ``v_dav``, ``v_naz``, ``v_kly``),
    depending on *main_flag*, on markers found in *flags* and on the
    ``util.istota`` / ``util.person`` predicates.

    Args:
        lines: expanded lines; the code reads the first whitespace-separated
            token as the word form and the second as the base word.
        main_flag: main affix flag; its second character selects the noun
            branch and a ``/adj`` prefix selects the adjective branch.
        flags: the full flag string of the entry.
        modifiers: not used by this function.

    Returns:
        A new list with the adjusted lines. Adjective lines containing
        ``:uncontr`` are omitted when *flags* has ``<`` or ``^noun``.
    """
    lines2 = []

    for line in lines:
        # DL-
        if main_flag[1] == "n":

            if main_flag.startswith("/n2") and re_search("^/n2[01234]", main_flag):
#                base_word = lines[0].split()[0]
                base_word = line.split()[1]

                if util.istota(flags):
                    # add v_zna next to m:v_rod unless the line already lists it
                    if "m:v_rod" in line and not "/v_zna" in line:
                        line = line.replace("m:v_rod", "m:v_rod/v_zna")

                # base word not ending in one of the listed letters and no ".a" flag
                if not base_word[-1:] in "аеєиіїоюя" and not ".a" in flags:
#                    util.dbg("```", main_flag, line)
                    word = line.split()[0]
                    if word[-1:] in "ую":
                        logger.debug("u/rod %s - %s", line, base_word)
                        line = line.replace("v_dav", "v_rod/v_dav")

            if main_flag.startswith("/n2") and "@" in flags:
                word = line.split(" ", 1)[0]
                if word[-1:] in "ая" and "m:v_rod" in line:
                    line = line.replace("m:v_rod", "m:v_rod/v_zna")

            if not "np" in main_flag and not ".p" in main_flag and not "n2adj" in flags:
                if ":p:" in line:
                    # NOTE(review): despite the message the line is only
                    # logged here, not dropped - confirm whether the
                    # filtering happens elsewhere.
                    logger.debug("skipping line with p: " + line)
                elif "//p:" in line:
                    # cut everything from "//p:" to the end of the line
                    line = re_sub("//p:.*", "", line)
                    logger.debug("removing //p from: " + line)

            if "/v_kly" in line:
                if main_flag.startswith("/n1"): # e.g. Єремія /n10.ko.patr.<
                    base_word = line.split()[1]

                # The last clause reads base_word only when main_flag starts
                # with "/n20", where it was bound in the /n2[0-4] branch above.
                if ("<+" in flags and not ":p:" in line) or not util.person(flags) \
                        or (not ":patr" in line and re_search("\\.k[eo]", flags)) \
                        or (":m:" in line and ("<+" in flags)) \
                        or (main_flag.startswith("/n20") and base_word.endswith("ло") and "v_dav" in line):
                    logger.debug("removing v_kly from: %s, %s", line, flags)
                    line = line.replace("/v_kly", "")

            if ".p" in main_flag or "np" in main_flag:
                if util.person(flags):
                    line = line.replace("p:v_naz", "p:v_naz/v_kly")

                if util.istota(flags):
                    line = line.replace("p:v_rod", "p:v_rod/v_zna")
                    if ">" in flags: # animal
                        line = line.replace("p:v_naz", "p:v_naz/v_zna")
                else:
                    line = line.replace("p:v_naz", "p:v_naz/v_zna")

        elif ":perf" in flags and ":pres" in line:
            # entries flagged :perf get :futr in place of :pres
            line = line.replace(":pres", ":futr")

        elif main_flag.startswith("/adj"):
            if "<" in flags or "^noun" in flags:
                # such entries never keep :uncontr lines
                if ":uncontr" in line:
                    continue

            if "<" in flags:
                if not ">" in flags and ":p:v_naz/v_zna" in line:
                    line = line.replace("v_naz/v_zna", "v_naz/v_kly")
                if ":m:v_naz" in line and not "<+" in flags:
                    line = line.replace("v_naz", "v_naz/v_kly")
            elif "^noun" in flags:
                # reduce the combined v_rod/v_zna tag to v_rod alone
                if ":m:v_rod/v_zna" in line:
                    line = line.replace("v_rod/v_zna", "v_rod")
                elif ":p:v_rod/v_zna" in line:
                    line = line.replace("v_rod/v_zna", "v_rod")

#            if "<" in flags:
#                if util.person(flags):
#                    line = line.replace("p:v_naz", "p:v_naz/v_kly")
#
#                if util.istota(flags):
#                    line = line.replace("p:v_rod", "p:v_rod/v_zna")
#                    if ">" in flags: # animal
#                        line = line.replace("p:v_naz", "p:v_naz/v_zna")
#                else:
#                    line = line.replace("p:v_naz", "p:v_naz/v_zna")

        lines2.append(line)

    return lines2
def post_expand(lines, flags):
    """Append the entry's extra flags to expanded lines and add gender variants.

    The extra flags come from ``get_extra_flags(flags)``. When there are
    none, *lines* is returned unchanged. Otherwise each line gets the extra
    flags appended, adjusted per line, and lines produced for the ``:+m`` /
    ``:+f`` markers are collected separately and appended after all regular
    lines.

    Args:
        lines: expanded lines; must not be empty.
        flags: the flag string of the entry.

    Returns:
        ``lines`` itself when there are no extra flags, otherwise a new list.

    Raises:
        Exception: if *lines* is empty.
    """
    if len(lines) == 0:
        raise Exception("emtpy lines")

    extra_flags = get_extra_flags(flags)

    if extra_flags:
        first_name_base = util.firstname(lines[0], flags)

        out_lines = []
        extra_out_lines = []  # additional gender variants, emitted last

        for line in lines:
            extra_flags2 = extra_flags  # per-line copy of the extra flags

            if first_name_base and not ":patr" in line:
                extra_flags2 += ":fname"

            if " advp" in line:
                if ":imperf" in line:
                    # the line carries the aspect itself; drop it from the extras
                    extra_flags2 = re_sub(":(im)?perf", "", extra_flags2)
                else:
                    line = line.replace(":perf", "")
            elif "adj.adv" in flags and " adv" in line:
                # adverb lines do not take the adjp tag group
                extra_flags2 = re_sub(r":&?adjp(:pasv|:actv|:pres|:past|:perf|:imperf)+", "", extra_flags2)
            elif ":+m" in extra_flags:
                extra_flags2 = extra_flags2.replace(":+m", "")

                if ":f:" in line:
                    masc_line = line.replace(":f:", ":m:") + extra_flags2
                    extra_out_lines.append(masc_line)
                elif ":n:" in line:
                    masc_line = line.replace(":n:", ":m:") + extra_flags2

                    if util.istota(flags):
                        if "m:v_rod" in masc_line:
                            # also emit the line retagged as m:v_zna
                            masc_line2 = masc_line.replace("m:v_rod", "m:v_zna")
                            extra_out_lines.append(masc_line2)
                        elif "m:v_zna" in masc_line:
                            # blank the line so it is not emitted below
                            masc_line = ""
                        if "m:v_kly" in masc_line:
                            # replace the last letter of the word form with "е";
                            # expects exactly three whitespace-separated fields
                            word, lemma, tags = masc_line.split()
                            masc_line = word[:-1]+"е " + lemma + " " + tags

                    if masc_line:
                        extra_out_lines.append(masc_line)
            elif ":+f" in extra_flags:
                extra_flags2 = extra_flags2.replace(":+f", "")

                # the variable keeps the name masc_line but holds the :f: variant here
                if ":m:" in line:
                    masc_line = line.replace(":m:", ":f:") + extra_flags2
                    extra_out_lines.append(masc_line)
                elif ":n:" in line:
                    masc_line = line.replace(":n:", ":f:") + extra_flags2

#                    if util.istota(flags):
#                        if "m:v_rod" in masc_line:
#                            masc_line2 = masc_line.replace("m:v_rod", "m:v_zna")
#                            extra_out_lines.append(masc_line2)
#                        elif "m:v_zna" in masc_line:
#                            masc_line = ""

                    if masc_line:
                        extra_out_lines.append(masc_line)
            elif ":patr" in line and ":anim" in extra_flags2:
                # put :anim in front of :patr instead of appending it at the end
                line = line.replace(":patr", ":anim:patr")
                extra_flags2 = extra_flags2.replace(":anim", "")

            out_lines.append(line + extra_flags2)

        out_lines.extend(extra_out_lines)

        return out_lines

    return lines