예제 #1
0
def test_asco():

    wordlist_data = """\
asca {n-meta} :: x
asca {n-forms} :: pl=ascas
asca {m} [mycology] | teca :: ascus
asco {n-meta} :: x
asco {n-forms} :: pl=ascos
asco {m} :: disgust
asco {m} :: nausea
asco {n-meta} :: x
asco {n-forms} :: pl=ascos
asco {m} :: alternative form of "asca"
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("asco") == ['n|asco', 'n|asca']
    assert freq.get_lemmas("asco", "n") == ["asca", "asco"]
    assert freq.get_best_lemma("asco", ["asca", "asco"], "n") == "asco"

    flist_data = """\
asco 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
예제 #2
0
def test_diva():

    wordlist_data = """\
_____
diva
pos: adj
  meta: {{head|es|adjective form}}
  gloss: adjective form of "divo"
pos: n
  meta: {{es-noun|f|m=divo}}
  g: f
  gloss: diva
_____
divo
pos: adj
  meta: {{es-adj}}
  gloss: star (famous)
pos: n
  meta: {{es-noun|m|f=diva}}
  g: m
  gloss: star, celeb
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("diva") == ['adj|divo', 'n|divo']
    assert freq.get_lemmas("diva", "n") == ["divo"]

    flist_data = """\
diva 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
예제 #3
0
def test_izquierdas():
    wordlist_data = """\
_____
izquierda
pos: adj
  meta: {{head|es|adjective form|g=f-s}}
  g: f-s
  gloss: adjective form of "izquierdo"
pos: n
  meta: {{es-noun|f|-}}
  g: f
  gloss: left (side, direction)
  gloss: left
    q: politics
_____
izquierdas
pos: adj
  meta: {{head|es|adjective form}}
  gloss: adjective form of "izquierdo"
pos: n
  meta: {{head|es|noun form|g=f-p}}
  g: f-p
  gloss: plural of "izquierda"
_____
izquierdo
pos: adj
  meta: {{es-adj}}
  gloss: left; on the left side or toward the left; the opposite of right
    syn: siniestro
  gloss: left-handed
  gloss: crooked
_____
izquierdos
pos: adj
  meta: {{head|es|adjective form|g=m-p}}
  g: m-p
  gloss: plural of "izquierdo"
_____
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    print(allforms.all_forms)

    assert freq.get_lemmas("izquierdas", "n") == ["izquierda"]
    assert freq.get_lemmas("izquierdo", "adj") == ["izquierdo"]
    assert freq.get_lemmas("izquierdos", "adj") == ["izquierdo"]
    assert freq.get_lemmas("izquierdas", "adj") == ["izquierdo"]
    assert freq.get_ranked_pos("izquierda") == ['n', 'adj']
    assert freq.get_ranked_pos("izquierdas") == ['n', 'adj']

    flist_data = """\
izquierda 34629
izquierdo 8150
izquierdas 436
izquierdos 234
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
예제 #4
0
def wordlist():
    """Build a small ``Wordlist`` from five inline dictionary entries.

    The entries (actor, actriz, alegre, dentista, rojo) are written in the
    multi-line ``_____``-separated format and passed to ``Wordlist`` as a
    list of lines.
    """
    entries = """\
_____
actor
pos: n
  meta: {{es-noun|m|f=actriz|f2=+}}
  g: m
  etymology: From Latin "actor".
  gloss: An actor (person who performs in a theatrical play or movie)
_____
actriz
pos: n
  meta: {{es-noun|f|m=actor}}
  gloss: actress
_____
alegre
pos: adj
  meta: {{es-adj}}
  gloss: joyful, cheerful
_____
dentista
pos: n
  meta: {{es-noun|mf}}
  g: mf
  etymology: diente + -ista
  gloss: dentist
_____
rojo
pos: adj
  meta: {{es-adj}}
  gloss: red
"""
    entry_lines = entries.splitlines()
    return Wordlist(entry_lines)
예제 #5
0
def main():
    """Print every verb lemma that has a sense without a type label.

    Reads the wordlist named on the command line, skips non-verbs, entries
    without senses and "verb form" entries, and prints (sorted) each remaining
    word with at least one sense whose qualifier is empty or matches none of
    transitive / reflexive / pronominal.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="List verbs missing a type label in at least one sense")
    cli.add_argument("wordlist", help="wordlist")
    options = cli.parse_args()

    type_label = r"(transitive|reflexive|pronominal)"

    def lacks_type_label(sense):
        # A sense is labelled only when it has a qualifier containing a type
        return not (sense.qualifier and re.search(type_label, sense.qualifier))

    unlabeled = set()
    for entry in Wordlist.from_file(options.wordlist).iter_all_words():
        if entry.pos != "v" or not len(entry.senses):
            continue

        # Inflected forms are not expected to carry their own type labels
        if entry.meta and "verb form" in entry.meta:
            continue

        if any(lacks_type_label(sense) for sense in entry.senses):
            unlabeled.add(entry.word)

    for verb in sorted(unlabeled):
        print(verb)
예제 #6
0
def test_aquellos():

    wordlist_data = """\
aquél {pron-meta} :: {{head|es|pronoun|demonstrative, feminine|aquélla|neuter|aquello|masculine plural|aquéllos|feminine plural|aquéllas|g=m}}
aquél {pron-forms} :: demonstrative_feminine=aquélla; feminine_plural=aquéllas; masculine_plural=aquéllos; neuter=aquello
aquél {pron} [demonstrative] :: that one (far from speaker and listener)
aquéllos {pron-meta} :: {{head|es|pronoun|demonstrative|g=m-p}}
aquéllos {pron} :: plural of "aquél"; those ones (far from speaker and listener)
aquel {pron-meta} :: {{head|es|pronoun|g=m|feminine|aquella|neutrum|aquello|masculine plural|aquellos|neutrum plural|aquellos|feminine plural|aquellas}}
aquel {pron-forms} :: feminine=aquella; feminine_plural=aquellas; masculine_plural=aquellos; neutrum=aquello; neutrum_plural=aquellos
aquel {pron} [demonstrative] :: alternative spelling of "aquél"
aquellos {pron-meta} :: {{head|es|pronoun|demonstrative|g=m-p}}
aquellos {pron} :: alternative spelling of "aquéllos"; those ones (over there; implying some distance). The unaccented form can function as a pronoun if it can be unambiguously deduced as such from context.
aquellos {pron-meta} :: {{head|es|pronoun|g=n-p}}
aquellos {pron} :: Those ones. (over there; implying some distance)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_lemmas("aquellos", "pron") == ['aquellos', 'aquél']

    assert freq.get_best_lemma("aquellos", ['aquellos', 'aquél'],
                               "pron") == "aquél"

    flist_data = """\
aquellos 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
예제 #7
0
def test_format_def():
    """Check ``DeckBuilder.get_usage`` grouping and ``format_def`` HTML output."""

    entries = """\
_____
rendir
pos: v
  meta: {{es-verb|rend|ir|pres=rindo}} {{es-conj-ir|r|nd|p=e-i|combined=1}}
  gloss: to conquer
    q: transitive
  gloss: to tire, exhaust
    q: transitive
  gloss: to yield, pay, submit, pass down
    q: ditransitive
  gloss: to vomit
    q: intransitive
  gloss: to make headway
    q: intransitive
  gloss: to surrender, give in, give up
    q: reflexive
  gloss: to be paid (homage or tribute)
    q: reflexive
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)

    # No sentences and an empty ignore list, so nothing is filtered out
    builder = DeckBuilder(words, None, [], forms)

    usage = builder.get_usage("rendir", "v")
    print(usage)
    expected_usage = {
        'v': {'ditransitive': ['to yield, pay, submit, pass down']},
        'vi': {'': ['to vomit', 'to make headway']},
        'vr': {'': ['to surrender, give in, give up',
                    'to be paid (homage or tribute)']},
        'vt': {'': ['to conquer', 'to tire, exhaust']},
    }
    assert usage == expected_usage

    # (usage item, extra keyword arguments, expected HTML)
    render = DeckBuilder.format_def
    cases = [
        (
            {'m/f': {'': ['retiree, pensioner (retired person)']}},
            {'hide_word': "jubilado"},
            '<span class="pos n m_f mf">{mf} <span class="usage">retiree, pensioner (retired person)</span></span>',
        ),
        (
            {"m": {"tag": ["def1", "def2"]}},
            {},
            '<span class="pos n m"> <span class="tag">[tag]:</span><span class="usage">def1; def2</span></span>',
        ),
        (
            {"m": {"Peru": ["def1", "def2"]}},
            {},
            '<span class="pos n m only-latin-america only-peru only-south-america peru"> <span class="tag">[Peru]:</span><span class="usage">def1; def2</span></span>',
        ),
        (
            {"m": {"South America": ["def1", "def2"]}},
            {},
            '<span class="pos n m only-latin-america only-south-america south-america"> <span class="tag">[South America]:</span><span class="usage">def1; def2</span></span>',
        ),
        (
            {'f': {'': ['sewer, storm drain'], 'zoology': ['cloaca']}},
            {'hide_word': 'cloaca'},
            '<span class="pos n f"> <span class="usage">sewer, storm drain</span></span>',
        ),
    ]
    for item, extra, expected_html in cases:
        assert render(item, **extra) == expected_html
예제 #8
0
def test_filters():
    """Exercise ``DeckBuilder`` gloss filtering with and without an ignore list.

    First builds a deck with an empty ignore list and checks the unfiltered
    usage for "abuelo", then reloads with ``ignore_data`` and checks that the
    ignored senses are dropped (``filter_gloss`` returns ``None`` for them).
    """
    ignore_data = """\
# comment
#
- abuela {f}
- abuelo {m} :: loose tufts
"""

    wordlist_data = """\
abuela {n-meta} :: {{es-noun|m=abuelo}}
abuela {f} :: grandmother, female equivalent of "abuelo"
abuela {f} [colloquial] :: old woman
abuela {f} [Mexico] :: a kind of flying ant
abuelo {n-meta} :: {{es-noun|f=abuela}}
abuelo {m} :: grandfather
abuelo {m} [colloquial, endearing] :: an elderly person
abuelo {m} | tolano :: loose tufts of hair in the nape when one's hair is messed up
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    sentences = None
    ignore = []
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)
    assert deck.filter_gloss("abuela", "", "", "grandmother") == "grandmother"
    assert deck.filter_gloss("abuela", "", "", 'grandmother, female equivalent of "abuelo"') == "grandmother"

    usage = deck.get_usage("abuelo", "n")
    assert usage == {
        'm/f':
        {'f': ['grandmother'],
         'f, colloquial': ['old woman'],
         'f, Mexico': ['a kind of flying ant'],
         'm': ['grandfather', "loose tufts of hair in the nape when one's hair is messed up"],
         'm, colloquial, endearing': ['an elderly person']
        }}

    # With ignore list
    ignore = DeckBuilder.load_ignore_data(ignore_data.splitlines())
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    # A non-matching POS ("x") is not affected by the "abuela {f}" ignore rule
    assert deck.filter_gloss("abuela", "x", "", "grandmother") == "grandmother"
    # Filtered glosses come back as None; compare by identity, not equality
    assert deck.filter_gloss("abuela", "f", "", "grandmother") is None
    assert deck.filter_gloss("abuela", "f", "colloquial", "old woman") is None
    assert deck.filter_gloss("abuelo", "m", "", "loose tufts of hair") is None
    assert deck.filter_gloss("abuelo", "m", "", "grandfather") == "grandfather"

    usage = deck.get_usage("abuelo", "n")
    assert usage == {
        'm/f':
        {
         '': ['grandfather'],
         'colloquial, endearing': ['an elderly person']
        }}
예제 #9
0
def test_lemma_filters():

    wordlist_data = """\
_____
ir
pos: v
  meta: {{es-verb}} {{es-conj}} {{es-conj|irse}}
  gloss: to go (away from speaker and listener)
    q: intransitive
  gloss: to come (towards or with the listener)
    q: intransitive
  gloss: to be going to (near future), to go (+ a + infinitive)
    q: auxiliary
  gloss: to go away, to leave, to be off (see irse)
    q: reflexive
_____
irse
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to go away, to leave, to depart, to go (when the destination is not essential; when something or someone is going somewhere else)
    syn: andarse; marcharse
  gloss: to leak out (with liquids and gasses), to boil away, to go flat (gas in drinks)
  gloss: to overflow
  gloss: to go out (lights)
  gloss: to finish, to wear out, to disappear (e.g. money, paint, pains, mechanical parts)
  gloss: to die
  gloss: to break wind, to fart
    q: informal
  gloss: to wet/soil oneself (i.e., urinate or defecate in one's pants)
    q: informal
  gloss: to come, to cum, to e*******e, to o****m
    q: vulgar
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    print(allforms.all_forms["nos vamos"])

    assert freq.all_forms.get_lemmas("vamos") == ['v|ir']
    assert freq.all_forms.get_lemmas("nos vamos") == ['v|ir', 'v|irse']
    assert freq.get_lemmas("vamos", "v") == ["ir"]
    assert freq.get_lemmas("ir", "v") == ["ir"]

    assert freq.include_word("vamos", "v") == True
    assert freq.filter_pos("vamos", ["v"]) == ["v"]
    #    assert len(freq.wordlist.get_words("vamos", "v")) > 0
    assert freq.get_ranked_pos("vamos") == ["v"]
    assert freq.get_lemmas("vamos", "v") == ["ir"]

    flist_data = """\
vamos 10
va 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
예제 #10
0
def test_filters2():
    """Check usage grouping for a noun with separate m and f entries.

    Without an ignore list both genders are reported under ``'m-f'``; with
    ``- test {f}`` ignored only the masculine sense remains, under ``'m'``.
    """
    ignore_data = """\
- test {f}
"""

    wordlist_data = """\
test {n-meta} :: x
test {n-forms} :: pl=tests
test {m} :: masculine
test {n-meta} :: x
test {n-forms} :: pl=tests
test {f} :: feminine
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    sentences = None
    ignore = []
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    usage = deck.get_usage("test", "n")
    assert usage == {
        'm-f':
        {'f': ['feminine'],
         'm': ['masculine']
        }}

    # With ignore list
    ignore = DeckBuilder.load_ignore_data(ignore_data.splitlines())
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    usage = deck.get_usage("test", "n")
    assert usage == {
        'm':
        {
         '': ['masculine'],
        }}
예제 #11
0
    def __init__(self, lang_id, wordlist=None, debug=()):
        """Set up language identifiers, bookkeeping containers and the wordlist.

        ``lang_id`` is looked up in ``lang_ids`` to get the section name.
        ``wordlist`` may be a ready object (or ``None``), or a string, in
        which case it is loaded through ``Wordlist.from_file``.
        ``debug`` is an iterable stored as a set in ``_debug_fix``.
        """
        # Resolve the section first so an unknown lang_id fails before any load
        section = lang_ids[lang_id]
        self.LANG_SECTION = section
        self.LANG_ID = lang_id

        self._problems, self._stats = {}, {}
        self._debug_fix = set(debug)
        self.fixes = set()

        # A string is treated as a file name; anything else is stored as given
        if isinstance(wordlist, str):
            wordlist = Wordlist.from_file(wordlist)
        self.wordlist = wordlist
예제 #12
0
def test_veros():
    """``veros`` should rank verb before noun given entries for ver and vero."""

    entries = """\
ver {v-meta} :: {{es-verb}} {{es-conj}}
ver {v} :: x
vero {n-meta} :: {{es-noun|m}}
vero {m} [heraldry] :: vair
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    ranked = flist.get_ranked_pos("veros")
    assert ranked == ["v", "n"]
예제 #13
0
def test_filters():
    """An adjective that is only an 'obsolete form of' is dropped from the POS list."""

    entries = """\
test {n-meta} :: x
test {m} :: test
test {adj-meta} :: x
test {adj} :: obsolete form of "test"
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    kept = flist.filter_pos("test", ["n", "adj"])
    assert kept == ["n"]

    ranked = flist.get_ranked_pos("test")
    assert ranked == ["n"]
예제 #14
0
def test_piernas():
    """``piernas`` resolves to both lemmas, with ``pierna`` chosen as best."""
    entries = """\
pierna {n-meta} :: {{es-noun|f}}
pierna {n-forms} :: pl=piernas
pierna {f} | pata :: leg (lower limb of a human)
piernas {n-meta} :: {{es-noun|m|piernas}}
piernas {n-forms} :: pl=piernas
piernas {m} [dated] :: twit; idiot
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    candidates = ['pierna', 'piernas']
    found = flist.get_lemmas("piernas", "n")
    assert found == candidates

    best = flist.get_best_lemma("piernas", candidates, "n")
    assert best == "pierna"
예제 #15
0
def test_hamburguesa():
    """``hamburguesa`` resolves to both lemmas, and is itself the best one."""
    entries = """\
hamburgués {n-meta} :: {{es-noun|m|hamburgueses|f=hamburguesa|fpl=hamburguesas}}
hamburgués {n-forms} :: f=hamburguesa; fpl=hamburguesas; pl=hamburgueses
hamburgués {m} :: Hamburger, a person from Hamburg
hamburguesa {n-meta} :: {{es-noun|f}}
hamburguesa {n-forms} :: pl=hamburguesas
hamburguesa {f} :: hamburger
hamburguesa {f} :: female equivalent of "hamburgués"; Hamburger
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    candidates = ['hamburguesa', 'hamburgués']
    found = flist.get_lemmas("hamburguesa", "n")
    assert found == candidates

    best = flist.get_best_lemma("hamburguesa", candidates, "n")
    assert best == "hamburguesa"
예제 #16
0
def test_simple2():
    """``roja`` should rank adjective before noun."""

    entries = """\
rojo {adj-meta} :: {{es-adj|f=roja}}
rojo {adj} :: red (colour)
rojo {n-meta} :: {{es-noun|m}}
rojo {m} :: red (colour)
rojo {m} [Costa Rica] :: a 1000 colón bill
rojo {m} [Spain, derogatory] :: a left-wing, especially communist
roja {n-meta} :: {{es-noun|f|m=rojo}}
roja {f} :: Red (Communist)
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    ranked = flist.get_ranked_pos("roja")
    assert ranked == ["adj", "n"]
예제 #17
0
def test_microondas():
    """``microondas`` resolves to both lemmas, and is itself the best one."""

    entries = """\
microonda {n-meta} :: {{es-noun|f}}
microonda {n-forms} :: pl=microondas
microonda {f} :: microwave (electromagnetic wave)
microondas {n-meta} :: {{es-noun|m|microondas}}
microondas {n-forms} :: pl=microondas
microondas {m} | horno de microondas :: microwave oven, microwave
microondas {m} :: necklacing (execution by burning tyre)
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    candidates = ["microonda", "microondas"]
    found = flist.get_lemmas("microondas", "n")
    assert found == candidates

    best = flist.get_best_lemma("microondas", candidates, "n")
    assert best == "microondas"
예제 #18
0
def test_rasguno():

    wordlist_data = """\
rasguñar {v-meta} :: {{es-verb}} {{es-conj}}
rasguñar {vt} | arañar; rascar :: to scratch
rasguño {n-meta} :: {{es-noun}}
rasguño {m} | arañazo :: scratch
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("rasguño") == ['v|rasguñar', 'n|rasguño']
    assert freq.get_ranked_pos("rasguño") == ["n", "v"]

    flist_data = """\
rasguño 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
예제 #19
0
def test_veras():
    """``veras`` resolves to ``vera`` and ``veras``, with ``veras`` chosen as best."""

    entries = """\
vera {n-meta} :: {{es-noun|f}}
vera {n-forms} :: pl=veras
vera {f} [poetic] | lado :: side, face
vera {n-meta} :: {{es-noun|f}}
vera {n-forms} :: pl=veras
vera {f} :: verawood (Bulnesia arborea)
veras {n-meta} :: {{es-noun|f-p}}
veras {fp} :: truth; reality
veras {fp} :: serious things
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    found = flist.get_lemmas("veras", "n")
    assert found == ["vera", "veras"]

    best = flist.get_best_lemma("veras", ["vera", "veras"], "n")
    assert best == "veras"
예제 #20
0
def test_vete():
    """``vete`` resolves to ir, ver and vetar, with ``ir`` chosen as best."""

    entries = """\
ir {v-meta} :: {{es-verb}} {{es-conj}} {{es-conj|irse}}
ir {v} :: x
ver {v-meta} :: {{es-verb}} {{es-conj}}
ver {v} :: x
verse {v-meta} :: {{es-verb}} {{es-conj}}
verse {v} :: x
vetar {v-meta} :: {{es-verb}} {{es-conj}}
vetar {v} :: x
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    found = flist.get_lemmas("vete", "v")
    assert found == ['ir', 'ver', 'vetar']

    best = flist.get_best_lemma("vete", ['ir', 'ver', 'vetar'], "v")
    assert best == "ir"
예제 #21
0
def main():
    """Run ``check_word`` over the wordlist and save the logger's results.

    Command-line options: ``--wordlist`` (required), ``--limit`` to stop after
    N words, ``--progress`` to print a running count to stderr, and ``--save``
    to publish through ``WikiSaver`` with the given commit message instead of
    saving through ``FileSaver``.

    Raises:
        FileNotFoundError: if the ``--wordlist`` path is not a file.
    """

    import argparse
    cli = argparse.ArgumentParser(
        description="Find lemmas with only 'form of' senses")
    cli.add_argument("--wordlist", help="wordlist to load", required=True)
    cli.add_argument("--limit", type=int,
                     help="Limit processing to first N articles")
    cli.add_argument("--progress", help="Display progress",
                     action='store_true')
    cli.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    options = cli.parse_args()

    if not os.path.isfile(options.wordlist):
        raise FileNotFoundError(f"Cannot open: {options.wordlist}")

    words = Wordlist.from_file(options.wordlist)

    # ``seen`` is the number of words already checked before this one
    for seen, entry in enumerate(words.iter_all_words()):

        if options.progress and seen % 1000 == 0:
            print(seen, end='\r', file=sys.stderr)

        # A limit of 0 or None means "no limit"
        if options.limit and seen >= options.limit:
            break

        check_word(entry)

    if not options.save:
        logger.save("", FileSaver)
    else:
        logger.save("User:JeffDoozan/lists", WikiSaver,
                    commit_message=options.save)
예제 #22
0
def test_bienes():

    wordlist_data = """\
bien {n-meta} :: {{es-noun|m|bienes}}
bien {m} :: good (as opposed to evil)
bienes {n-meta} :: {{es-noun|m-p}}
bienes {mp} :: goods (that which is produced, traded, bought or sold)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("bienes") == ['n|bien', 'n|bienes']
    assert freq.get_lemmas("bienes", "n") == ["bien", "bienes"]
    assert freq.get_best_lemma("bienes", ["bien", "bienes"], "n") == "bienes"

    flist_data = """\
bienes 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
예제 #23
0
def main():
    """Log every single-word verb ending in "r" that also has a "-se" verb entry.

    Results are saved through ``WikiSaver`` when ``--save`` supplies a commit
    message, otherwise through ``FileSaver``.
    """
    cli = argparse.ArgumentParser(description="Find verbs with split data")
    cli.add_argument("--dictionary", help="Dictionary file name", required=True)
    cli.add_argument("--save", help="Save to wiktionary with specified commit message")
    options = cli.parse_args()

    words = Wordlist.from_file(options.dictionary)

    for entry in words.iter_all_words():
        # Only single-word verbs ending in "r" are candidates
        is_candidate = (
            entry.pos == "v"
            and " " not in entry.word
            and entry.word.endswith("r")
        )
        if is_candidate and words.has_word(entry.word + "se", "v"):
            log(entry.word)

    if not options.save:
        logger.save("", FileSaver)
    else:
        logger.save("User:JeffDoozan/lists", WikiSaver, commit_message=options.save)
예제 #24
0
def test_hijo():

    wordlist_data = """\
hija {n-meta} :: x
hija {n-forms} :: m=hijo; mpl=hijos; pl=hijas
hija {f} :: daughter; feminine noun of "hijo"
hijo {n-meta} :: x
hijo {n-forms} :: f=hija; fpl=hijas; pl=hijos
hijo {m} :: son
hijo {m} :: child (when the gender of the child is unknown)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("hijo") == ['n|hijo']
    assert freq.get_lemmas("hijo", "n") == ["hijo"]

    flist_data = """\
hijo 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
예제 #25
0
def test_dios():

    wordlist_data = """\
dios {n-meta} :: {{es-noun|m|dioses|f=diosa}}
dios {n-forms} :: f=diosa; fpl=diosas; pl=dioses
dios {m} :: god
diosa {n-meta} :: {{es-noun|f|m=dios}}
diosa {n-forms} :: m=dios; mpl=dios; pl=diosas
diosa {f} :: goddess
diosa {n-meta} :: {{es-noun|f}}
diosa {n-forms} :: pl=diosas
diosa {f} [biochemistry] :: diose
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_lemmas("dioses", "n") == ["dios"]
    assert freq.get_lemmas("diosas", "n") == ["dios", "diosa"]
    assert freq.get_lemmas("diosa", "n") == ["dios", "diosa"]

    assert freq.get_best_lemma("diosa", ["dios", "diosa"], "n") == "dios"

    #    assert list(freq.all_forms.get_lemmas("dios", {})) == ['n:dios:m']
    #    assert list(freq.all_forms.get_lemmas("dioses", {})) == ['n:dios:pl']
    #    assert list(freq.all_forms.get_lemmas("diosa", {})) == ["n:dios:f"]
    #    assert list(freq.all_forms.get_lemmas("diosas", {})) == ["n:diosa:pl"]

    flist_data = """\
dios 10
dioses 10
diosa 10
diosas 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
예제 #26
0
    )
    parser.add_argument("--low-mem", help="Use less memory", action='store_true', default=False)
    args = parser.parse_args()

    if not args.sentences:
        args.sentences = "sentences.tsv"

    if not args.data_dir:
        args.data_dir = os.environ.get("SPANISH_DATA_DIR", "spanish_data")

    if not args.custom_dir:
        args.custom_dir = os.environ.get("SPANISH_CUSTOM_DIR", "spanish_custom")

    with open(args.dictionary) as wordlist_data:
        cache_words = not args.low_mem
        wordlist = Wordlist(wordlist_data, cache_words=cache_words)

    print("wordlist", mem_use(), file=sys.stderr)
    ignore_data = open(args.ignore) if args.ignore else []

    if args.allforms:
        allforms = AllForms.from_file(args.allforms)
    else:
        allforms = AllForms.from_wordlist(wordlist)
    print("all_forms", mem_use(), file=sys.stderr)

    sentences = spanish_sentences(
        sentences=args.sentences, data_dir=args.data_dir, custom_dir=args.custom_dir
    )

    flist = FrequencyList(wordlist, allforms, sentences)
예제 #27
0
def _noun_usage(total, form_count):
    """Return the usage count to compare for a form, or ``None`` to reject it.

    Args:
        total: overall usage count of the form.
        form_count: mapping of POS to usage count; may be empty.

    Returns:
        ``total`` when there is no per-POS data; the ``"n"`` count when the
        first POS listed is ``"n"`` and nouns make up at least 60% of
        ``total``; ``None`` otherwise.
    """
    # Forms without any detected POS are judged on their total usage
    if not form_count:
        return total

    # Only accept forms whose first listed POS is noun
    # (presumably the mapping is ordered most-frequent first; verify)
    if next(iter(form_count)) != "n":
        return None

    # And only when the noun usage is at least 60% of total usage
    noun_usage = form_count.get("n", 0)
    if noun_usage / total < .6:
        return None
    return noun_usage


def main():
    """Log nouns whose plural is used at least as often as the singular.

    For each form in the ngram probability data that qualifies as a noun (see
    ``_noun_usage``) and has noun entries in the dictionary, every distinct
    "pl" form is checked the same way; the pair is logged when the plural's
    usage is greater than or equal to the singular's. Results are saved
    through ``WikiSaver`` when ``--save`` is given, else through ``FileSaver``.
    """
    parser = argparse.ArgumentParser(description="Find usually plural nouns")
    parser.add_argument("--dictionary",
                        help="Dictionary file name",
                        required=True)
    parser.add_argument("--ngprobs", help="Ngram probability data file")
    parser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    args = parser.parse_args()

    probs = NgramPosProbability(args.ngprobs)
    wordlist = Wordlist.from_file(args.dictionary)

    for form in probs.form_probs:
        s_total, s_form_count = probs.get_data(form)
        s_usage = _noun_usage(s_total, s_form_count)
        # Rejected (None) or never used (0)
        if not s_usage:
            continue

        words = wordlist.get_words(form, "n")
        if not words:
            continue

        plurals = [pl for word in words for pl in word.forms.get("pl", [])]
        for plural in plurals:
            # Invariant nouns (plural identical to singular) are not interesting
            if plural == form:
                continue

            pl_total, pl_form_count = probs.get_data(plural)
            if not pl_total:
                continue

            pl_usage = _noun_usage(pl_total, pl_form_count)
            if pl_usage is None:
                continue

            if pl_usage >= s_usage:
                log(form, plural, s_usage, pl_usage)

    if args.save:
        base_url = "User:JeffDoozan/lists"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        dest = ""
        logger.save(dest, FileSaver)
예제 #28
0
)
parser.add_argument("--tags", nargs=1, help="Merged tagged data with original data")
parser.add_argument("--dictionary", help="Dictionary file", required=True)
parser.add_argument("--allforms", help="Load word forms from file")
parser.add_argument("--low-mem", help="Use less memory", action='store_true', default=False)
args = parser.parse_args()

# Validate the input paths up front so a bad path fails with a clear message
if not os.path.isfile(args.sentences):
    raise FileNotFoundError(f"Cannot open: {args.sentences}")

# --tags is declared with nargs=1, so args.tags is a one-element list;
# report the path that was actually checked rather than the list itself
if args.tags and not os.path.isfile(args.tags[0]):
    raise FileNotFoundError(f"Cannot open: {args.tags[0]}")

# --low-mem disables word caching in the wordlist
cache_words = not args.low_mem
with open(args.dictionary) as infile:
    wordlist = Wordlist(infile, cache_words=cache_words)

# Load word forms from the given file when provided, otherwise build them
# from the wordlist
if args.allforms:
    all_forms = AllForms.from_file(args.allforms)
else:
    all_forms = AllForms.from_wordlist(wordlist)

def tag_to_pos(tag, word):

    lemma = tag["lemma"]
    ctag = tag["ctag"]

    pos = None
    if ctag.startswith("A"):  # and lemma not in ["el", "la", "uno"]:
        pos = "adj"
    elif ctag.startswith("C"):  # and lemma not in ["si", "que"]:
예제 #29
0
def test_simple():

    wordlist_data = """\
_____
protector
pos: n
  meta: {{es-noun|m|f=+|f2=protectriz}}
  g: m
  gloss: protector (someone who protects or guards)
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: protector (a device or mechanism which is designed to protect)
_____
protectora
pos: n
  meta: {{es-noun|f|m=protector}}
  forms: m=protector; mpl=protectores; pl=protectoras
  g: f
  gloss: female equivalent of "protector"
pos: n
  meta: {{es-noun|f}}
  forms: pl=protectoras
  g: f
  gloss: animal shelter (an organization that provides temporary homes for stray pet animals)
    syn: protectora de animales
_____
protectoras
pos: n
  meta: {{head|es|noun plural form|g=f-p}}
  g: f-p
  gloss: inflection of "protector"
_____
protectores
pos: n
  meta: {{head|es|noun plural form|g=m-p}}
  g: m-p
  gloss: inflection of "protector"
_____
protectrices
pos: n
  meta: {{head|es|noun plural form|g=f-p}}
  g: f-p
  gloss: inflection of "protector"
_____
protectriz
pos: n
  meta: {{es-noun|f|m=protector}}
  forms: m=protector; mpl=protectores; pl=protectrices
  g: f
  gloss: alternative form of "protectora"
    q: uncommon
"""

    flist_data = """\
protector 10
protectora 10
protectoras 10
protectores 10
protectriz 10
protectrices 10
unknown 10
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.wordlist.has_lemma("protectora", "n") == False

    assert freq.get_lemmas("protectores", "n") == ["protector"]
    assert freq.get_lemmas("protectoras", "n") == ["protector", "protectora"]
    assert freq.get_lemmas("notaword", "n") == ["notaword"]

    assert freq.get_ranked_pos("protectoras") == ["n"]

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
예제 #30
0
def wordlist():
    """Return a ``Wordlist`` built from a fixed block of sample entries.

    The sample text below is split into lines and handed to ``Wordlist``
    unchanged.  Entries are introduced by a ``_____`` line followed by the
    headword, then ``pos:`` sections carrying ``meta:``, ``g:``,
    ``etymology:`` and ``gloss:`` fields.
    """
    # NOTE(review): a few entries are irregular and are kept verbatim here:
    # the separator ahead of "caldeo" carries a leading space, the "gloss:"
    # lines of "sumar" and "sumir" are not indented, and the second
    # "aparecido" sense is tagged "pos: adj" with an es-noun meta line.
    # Confirm whether these are intentional before normalizing them.
    sample_text = """\
_____
Señor
pos: n
  meta: {{es-noun|m|f=+}}
  g: m
  gloss: alternative letter-case form of "señor", used before a name (also Sr.)
_____
Señora
pos: n
  meta: {{es-noun|f|m=Señor}}
  g: f
  gloss: alternative letter-case form of "señora", used before a name
_____
ababillarse
pos: v
  meta: {{es-verb}} {{es-conj|nocomb=1}}
  etymology: From a- + babilla ("the stifle (as of a horse)") + -ar.
  gloss: to be sick with the stifle (of horses and other quadrupeds)
    q: veterinary medicine, Chile, Mexico
_____
abjad
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: abjad (writing system)
    q: linguistics
_____
aborregarse
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: verb
_____
aborrascarse
pos: v
  meta: {{es-verb}} {{es-conj}}
  etymology: a + borrasca
  gloss: to get stormy
    q: reflexive
_____
abuelito
pos: n
  meta: {{es-noun|m|f=abuelita}}
  g: m
  gloss: diminutive of "abuelo", grandfather, gramps, grandpa
_____
abyad
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: alternative form of "abjad"
_____
académico
pos: adj
  meta: {{es-adj}}
  gloss: academic
pos: n
  meta: {{es-noun|m|f=académica}}
  g: m
  gloss: academician, academic
_____
accidentar
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to cause an accident
_____
accidentarse
pos: v
  meta: {{es-verb}} {{es-conj|nocomb=1}}
  gloss: to have an accident, get into an accident, crash
_____
actor
pos: n
  meta: {{es-noun|m|f=actriz|f2=+}}
  g: m
  etymology: From Latin "actor".
  gloss: An actor (person who performs in a theatrical play or movie)
pos: n
  meta: {{es-noun|m|f=+}}
  g: m
  etymology: From Latin "actor".
  gloss: A defendant
    q: law
_____
aduanero
pos: adj
  meta: {{es-adj}}
  etymology: From aduana + -ero.
  gloss: customs
    q: relational
    syn: aduanal
pos: n
  meta: {{es-noun|mf|f=aduanera}}
  g: mf
  etymology: From aduana + -ero.
  gloss: customs officer
_____
alegre
pos: adj
  meta: {{es-adj}}
  gloss: joyful, cheerful
_____
ambos
pos: adj
  meta: {{head|es|adjective|g=m-p|feminine plural|ambas}}
  g: m-p
  gloss: both
    syn: los dos, las dos
pos: num
  meta: {{head|es|numeral}}
  gloss: both
pos: pron
  meta: {{head|es|pronoun}}
  gloss: both
_____
amigar
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to cause (people) to be friends
_____
amigue
pos: n
  meta: {{es-noun|m|g2=f|m=amigo|f=amiga}}
  g: m; f
  gloss: friend
    q: gender-neutral, neologism
_____
aparecido
pos: adj
  meta: {{es-adj}}
  gloss: appeared
pos: adj
  meta: {{es-noun|m}}
  g: m
  gloss: ghost, apparition, revenant
_____
aquél
pos: pron
  meta: {{head|es|pronoun|demonstrative||feminine|aquélla|neuter|aquello|masculine plural|aquéllos|feminine plural|aquéllas|g=m}}
  g: m
  gloss: that one (far from speaker and listener)
_____
ayuda
pos: n
  meta: {{es-noun|f}}
  g: f
  etymology: From ayudar (“to help”).
  gloss: help, aid, assistance
    syn: asistencia
pos: n
  meta: {{es-noun|mf}}
  g: mf
  etymology: From ayudar (“to help”).
  gloss: helper
    syn: ayudante
_____
bosniaca
pos: n
  meta: {{es-noun|f|m=bosniaco}}
  g: f
  gloss: female equivalent of "bosniaco"
_____
bosniaco
pos: n
  meta: {{es-noun|m|f=bosniaca}}
  g: m
  gloss: alternative spelling of "bosníaco"
_____
cabra
pos: n
  meta: {{es-noun|f|m=cabro}}
  g: f
  gloss: goat (unknown gender)
 _____
caldeo
pos: adj
  meta: {{es-adj}}
  etymology: From Latin "Chaldaeus", from Ancient Greek "Χαλδαῖος", from Akkadian "𒅗𒀠𒌅".
  gloss: Chaldean
pos: n
  meta: {{es-noun|m|f=caldea}}
  g: m
  etymology: From Latin "Chaldaeus", from Ancient Greek "Χαλδαῖος", from Akkadian "𒅗𒀠𒌅".
  gloss: Chaldean
pos: v
  meta: {{head|es|verb form}}
  etymology: See caldear
  gloss: inflection of "caldear"
_____
chama
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: chama
_____
chamo
pos: n
  meta: {{es-noun|m|f=chama}}
  g: m
  gloss: kid, child
    q: Venezuela, colloquial
_____
comer
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to eat
_____
comida
pos: n
  meta: {{es-noun|f}}
  g: f
  gloss: food
_____
comidas
pos: n
  meta: {{head|es|noun form|g=f-p}}
  g: f-p
  gloss: plural of "comida"
_____
comido
pos: v
  meta: {{es-past participle|comid}}
  gloss: pp_ms of "comer"
_____
crudívoro
pos: adj
  meta: {{es-adj}}
  gloss: crudivorous
pos: n
  meta: {{es-noun|m|f=crudívora}}
  g: m
  gloss: crudivore
_____
del mismo
pos: adj
  meta: {{es-adj|f=de la misma|mpl=de los mismos|fpl=de las mismas}}
  gloss: of it, them (substantive, refers back to a previous word in the text [see usage notes])
_____
dentista
pos: n
  meta: {{es-noun|mf}}
  g: mf
  etymology: diente + -ista
  gloss: dentist
_____
descomer
pos: v
  meta: {{es-verb}} {{es-conj|nocomb=1}}
  etymology: des + comer
  gloss: to defecate
    q: euphemistic
_____
descomedirse
pos: v
  meta: {{es-verb|<i>}} {{es-conj|<i>}}
  gloss: to be rude or disrespectful
    q: reflexive
_____
errar
pos: v
  meta: {{es-verb|<ye[Spain],+[Latin America]>}} {{es-conj|<ye[Spain],+[Latin America]>}}
  gloss: to miss
_____
estanciera
pos: n
  meta: {{es-noun|f|m=estanciero}}
  g: f
  gloss: ranch owner
_____
exconseller
pos: n
  meta: {{es-noun|m|+|pl2=exconsellers}}
  g: m
  etymology: ex- + conseller
  gloss: former conseller
_____
fulano
pos: prop
  meta: {{head|es|proper noun|g=m|plural|fulanos|feminine|fulana|feminine plural|fulanas}}
  g: m
  gloss: alternative letter-case form of "Fulano", what's-his-name, so-and-so
_____
gongo
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: alternative form of "gong"
  gloss: bell or cowbell
    q: Puerto Rico
    syn: campana; cencerro
_____
granado
pos: adj
  meta: {{es-adj}}
  gloss: grained
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: pomegranate tree
_____
hijodalgo
pos: n
  meta: {{es-noun|m|hijosdalgo|f=hijadalgo|fpl=hijasdalgo|pl2=hijosdalgos}}
  g: m
  etymology: contraction of "hijo de algo"
  gloss: alternative form of "hidalgo"
_____
huila
pos: n
  meta: {{es-noun|f}}
  g: f
  etymology: From Mapudungun.
  gloss: rags (tattered clothes)
    q: colloquial, Chile
pos: n
  meta: {{es-noun|f}}
  g: f
  gloss: female equivalent of "huilo"
pos: adj
  meta: {{head|es|adjective form}}
  gloss: feminine singular of "huilo"
_____
huilo
pos: adj
  meta: {{es-adj}}
  gloss: crippled
    q: colloquial, Mexico
    syn: tullido
pos: n
  meta: {{es-noun|m|f=huila}}
  g: m
  gloss: a crippled person
    q: colloquial, Mexico
_____
kirguiso
pos: adj
  meta: {{es-adj}}
  gloss: of Kyrgyzstan; Kyrgyzstani (of or relating to Kyrgyzstan)
pos: n
  meta: {{es-noun|m|f=kirguisa}}
  g: m
  gloss: Kyrgyzstani (native or inhabitant of Kyrgyzstan)
_____
kirguís
pos: adj
  meta: {{es-adj}}
  gloss: Kyrgyz (Turkic ethnic group)
  gloss: alternative form of "kirguiso"
pos: n
  meta: {{es-noun|m|f=+}}
  g: m
  gloss: Kyrgyz (Turkic ethnic group)
  gloss: alternative form of "kirguiso" (inhabitant)
_____
malayo
pos: adj
  meta: {{es-adj}}
  gloss: Malay (from Malaysia)
pos: n
  meta: {{es-noun|m|f=+}}
  g: m
  gloss: Malay (person)
pos: n
  meta: {{es-noun|m|-}}
  g: m
  gloss: Malay (language)
_____
parada
pos: n
  meta: {{es-noun|f}}
  g: f
  etymology: From the feminine past participle of parar.
  gloss: stop (the act of stopping)
  gloss: station (a location where things stop)
pos: n
  meta: {{es-noun|f|m=parado}}
  g: f
  etymology: From the feminine past participle of parar.
  gloss: female equivalent of "parado"
_____
parado
pos: n
  meta: {{es-noun|m|f=parada}}
  g: m
  gloss: unemployed person
    syn: desempleado; cesante
_____
sumar
pos: v
  meta: {{es-verb}} {{es-conj}}
gloss: to add
_____
sumir
pos: v
  meta: {{es-verb}} {{es-conj}}
gloss: to submerge
_____
vosotres
pos: pron
  meta: {{head|es|pronoun|masculine|vosotros|feminine|vosotras|g=m|g2=f}}
  g: m; f
  gloss: you (plural)
    q: gender-neutral, neologism
"""

    # Wordlist is fed the individual lines, without their line terminators.
    entry_lines = sample_text.splitlines()
    return Wordlist(entry_lines)