예제 #1
0
def extract_lemmas(lang):
    """
    Build word -> lemma and lemma -> word lookup tables for language 'lang'.

    Supported values of 'lang':

    'it'
        Reads the tab-separated file '../data/lemmatizer_unique.txt'
        (decoded as latin-1). Only lines with exactly three fields are used:
        field 0 is stored as the word, field 1 as the lemma, and field 2 is
        parsed as 'POS[-POS...][:FEAT[+FEAT...]]'. Entries are 3-tuples
        (lemma_or_word, pos, features) where 'pos' is a set of strings and
        'features' is a set of strings, or None when field 2 has no ':'.

    'de'
        Runs Analyzer(char_subs_allowed=True).analyze() on every item of
        'vocab'. Both 'Analyzer' and 'vocab' are names resolved outside this
        function. Each analysis is converted with
        ast.literal_eval(str(analysis)) and must provide a 'LEMMA' key.
        Entries are 2-tuples (lemma_or_word, features). Words whose analysis
        raises are skipped.

    Returns:
        (words, lemmas): two defaultdict(list) objects. 'words' maps each
        word to its list of lemma entries; 'lemmas' maps each lemma to its
        list of word entries. Note the order: words first, lemmas second.

    Raises:
        ValueError: if 'lang' is not one of the supported languages.
    """
    # Fail early with a clear message; previously an unsupported language
    # fell through both branches and crashed on the unbound return values.
    if lang not in ('it', 'de'):
        raise ValueError(
            "unsupported language %r (expected 'it' or 'de')" % (lang,))

    words = defaultdict(list)
    lemmas = defaultdict(list)

    if lang == 'it':
        with open('../data/lemmatizer_unique.txt', 'r',
                  encoding='latin-1') as f:
            for line in f:
                fields = line.strip().split('\t')
                # Ignore malformed lines instead of failing the whole load.
                if len(fields) != 3:
                    continue
                word, lemma, tag = fields
                atts = tag.split(':')
                # Everything before ':' is the POS part, after it the
                # optional '+'-separated features.
                pos = set(atts[0].split('-'))
                features = set(atts[1].split('+')) if len(atts) > 1 else None
                words[word].append((lemma, pos, features))
                lemmas[lemma].append((word, pos, features))
    else:  # lang == 'de'
        analyzer = Analyzer(char_subs_allowed=True)
        for w in vocab:
            try:
                analyses = analyzer.analyze(w)
            except Exception:
                # Best effort: skip words the analyzer cannot handle, but
                # let KeyboardInterrupt / SystemExit propagate.
                continue
            # An empty result simply contributes no entries.
            for analysis in analyses:
                # The analysis object is round-tripped through its string
                # form to obtain a plain literal (indexed by 'LEMMA' below).
                features = ast.literal_eval(str(analysis))
                lemma = features['LEMMA']
                words[w].append((lemma, features))
                lemmas[lemma].append((w, features))

    return words, lemmas
예제 #2
0
# Persist the words cache as JSON.
with open(cache_path, 'w') as fc:
    json.dump(cache, fc)
# Log only once the `with` block has closed the file, so the message is
# not emitted while the data may still be buffered.
msg.debug("Words cache saved")

# Generate docs
# %-format templates for the pieces of each generated .adoc file:
# title line, one ":name: value" header property, the end-of-header
# marker, and the body sentence.
TITLE = "= %s\n\n"
PROP = ":%s:\t\t%s\n"
EOHMARK = "// END-OF-HEADER. DO NOT MODIFY OR DELETE THIS LINE\n\n"
BODY = "_%s_ is a _%s_. Its lema is _%s_."

# FIXME: Write Nouns in capital
# FIXME: Check suffixes
# Write one 'docs/<key>.adoc' file per cache entry. Each entry is read
# through the keys 'word', 'pos', 'lema', 'prefix' and 'suffix'.
for key in cache:
    doc_path = os.path.join('docs', "%s.adoc" % key)
    entry = cache[key]
    # Explicit UTF-8 so the generated docs do not depend on the platform's
    # default locale encoding.
    with open(doc_path, 'w', encoding='utf-8') as fdp:
        # NOTE(review): the analysis result is not used by the code visible
        # here (it was only consumed by debug prints); the call is kept
        # in place in case later code relies on `s` or on the call itself.
        s = analyzer.analyze(key)
        fdp.write(TITLE % entry['word'])
        fdp.write(PROP % ("Part Of Speech", entry['pos']))
        fdp.write(PROP % ("Lema", entry['lema']))
        fdp.write(PROP % ("Prefix", entry['prefix']))
        fdp.write(PROP % ("Suffix", entry['suffix']))
        fdp.write(EOHMARK)
        fdp.write(BODY % (entry['word'], entry['pos'], entry['lema']))

pd.missing()
msg.info("Ending Deutschkurs")