# Stdlib imports used below; project-local helpers (morph_map, shuffled,
# exclude_item, CelexDB, ENGLISH) are assumed to be defined or imported
# earlier in this module.
from collections import Counter, defaultdict
import sys


def count_morphs(in_file):
    """Count morphs in in_file."""
    _, _, suffix_words = morph_map(in_file)
    # Tally how many distinct words each suffix appears in.
    suffix_counts = Counter({suffix: len(words)
                             for suffix, words in suffix_words.iteritems()})
    print "Suffixes:"
    for suffix, count in suffix_counts.most_common():
        # Show up to 25 randomly chosen example words per suffix.
        print "{}\t{}\t{}".format(
            suffix, count, ", ".join(shuffled(suffix_words[suffix])[:25]))
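# A minimal, hypothetical usage sketch for count_morphs. The file name below
# is illustrative only. morph_map (defined elsewhere in this project) is
# assumed to return a triple whose second element maps roots to their words
# and whose third maps suffixes to their words; the first element is unused
# in this module.
def _demo_count_morphs():
    with open('elp_analyses.txt', 'Ur') as in_file:
        count_morphs(in_file)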
def main(verbose):
    """Merge analyses from CELEX into ELP."""
    # Get arguments
    try:
        elp_analyses = sys.argv[1]
        celex_root = sys.argv[2]
        out_path = sys.argv[3]
    except IndexError:
        print >> sys.stderr, "Usage: merge_analyses elp_analyses celex_root output"
        sys.exit(64)

    # Get the ELP analyses
    _, elp_root_words, _ = morph_map(open(elp_analyses, 'Ur'))

    # Read in CELEX
    creader = CelexDB(celex_root, ENGLISH)
    clx_root_words = dict(creader.lemma_map)

    # Perform some cleanup on both
    print "Excluding proper nouns and words with apostrophes and hyphens..."
    for root_words, name in ((clx_root_words, "CELEX"), (elp_root_words, "ELP")):
        # Sanitize the words in the roots
        n_word_deletions = 0
        for root, words in root_words.iteritems():
            clean_words = [word for word in words if not exclude_item(word)]
            if len(clean_words) < len(words):
                n_word_deletions += len(words) - len(clean_words)
                root_words[root] = clean_words
                if verbose:
                    excluded_words = set(words) - set(clean_words)
                    for word in excluded_words:
                        print "Excluded word {}".format(word)
        print "Excluded {} words from {}.".format(n_word_deletions, name)

        # Exclude any empty or exclusion-worthy roots
        root_deletions = [root for root, words in root_words.iteritems()
                          if not words or exclude_item(root)]
        for root in root_deletions:
            if verbose:
                print "Excluded root {}".format(root)
            del root_words[root]
        print "Excluded {} roots from {}.".format(len(root_deletions), name)
    print

    # Map each word back to its roots
    clx_word_roots = defaultdict(set)
    for root, words in clx_root_words.iteritems():
        for word in words:
            clx_word_roots[word].add(root)

    # Remove any useless singleton lemmas where a word maps to multiple
    # lemmas and one of those lemmas is headed by that word and only
    # consists of the word itself. For example, the 'abandoned' lemma:
    #   abandon: abandon, abandoned, abandoning, abandonment, abandons
    #   abandoned: abandoned
    # We perform deletions after iteration so we can do so safely.
    print "Removing useless singleton lemmas in CELEX..."
    deletions = [(word, roots) for word, roots in clx_word_roots.iteritems()
                 if len(roots) > 1 and word in roots
                 and len(clx_root_words[word]) == 1]
    for word, roots in deletions:
        if verbose:
            print "Removed useless singleton", word
        # Remove the singleton entry
        del clx_root_words[word]
        # Remove the word as its own root
        roots.remove(word)
    print "Removed {} useless singleton lemmas from CELEX.".format(len(deletions))
    print

    # Calculate how many forms have multiple roots associated with them.
    clx_total_words = len(clx_word_roots)
    clx_ambig_words = sum(1 for word, roots in clx_word_roots.iteritems()
                          if len(roots) > 1)

    # Get basic counts
    elp_words = set(word for words in elp_root_words.values() for word in words)
    clx_words = set(clx_word_roots)
    print "{} roots, {} words in ELP.".format(len(elp_root_words), len(elp_words))
    print "{} roots, {} words in CELEX.".format(len(clx_root_words), clx_total_words)
    print "CELEX provides {:2.2f}% coverage of the ELP words.".format(
        len(clx_words & elp_words) / float(len(elp_words)) * 100)
    # This is a sanity check that doesn't affect anything.
    print "{} of {} words ({:2.2f}%) in CELEX have multiple roots.".format(
        clx_ambig_words, clx_total_words,
        clx_ambig_words / float(clx_total_words) * 100)
    print

    shared_roots = set(clx_root_words).intersection(elp_root_words)
    print "{} roots shared between ELP and CELEX.".format(len(shared_roots))
    unshared_roots = set(clx_root_words).symmetric_difference(elp_root_words)
    print "{} roots appear in only one of ELP and CELEX.".format(len(unshared_roots))

    # Augment the ELP analyses
    aug_root_words = {}
    for root in elp_root_words:
        if root in clx_root_words:
            # Add extra items, but only if they are not already analyzed
            aug_root_words[root] = list(set(elp_root_words[root]).union(
                [word for word in clx_root_words[root] if word not in elp_words]))
            if verbose:
                additional_words = (set(aug_root_words[root]) -
                                    set(elp_root_words[root]))
                if additional_words:
                    print "Root {} gains {}".format(root, ", ".join(additional_words))
        else:
            aug_root_words[root] = list(elp_root_words[root])

    # Reverse the mapping and sort out items with multiple roots
    aug_word_roots = defaultdict(set)
    for root, words in aug_root_words.iteritems():
        for word in words:
            aug_word_roots[word].add(root)
    aug_ambig_words = [(word, roots) for word, roots in aug_word_roots.iteritems()
                       if len(roots) > 1]
    for word, roots in aug_ambig_words:
        for root in roots:
            aug_root_words[root].remove(word)
        if verbose:
            print "Removed word {} from roots {}".format(word, ", ".join(roots))
    print "Removed {} ambiguously-rooted words in augmented ELP.".format(
        len(aug_ambig_words))

    # Count augmented forms
    aug_word_count = len(set(word for words in aug_root_words.values()
                             for word in words))
    print
    print "{} roots, {} words in augmented ELP.".format(
        len(aug_root_words), aug_word_count)
    print "ELP analyses now cover {:2.2f}% more words.".format(
        (aug_word_count - len(elp_words)) / float(len(elp_words)) * 100)

    print "Writing output to {}...".format(out_path)
    with open(out_path, 'w') as out_file:
        for root, words in sorted(aug_root_words.items()):
            if not words:
                print >> sys.stderr, "Empty root {}".format(root)
                continue
            print >> out_file, "{}\t{}".format(root, ",".join(sorted(words)))
    print "Done."