# Stdlib imports used below; project-local helpers (morph_map, shuffled,
# exclude_item, CelexDB, ENGLISH) are assumed to be defined or imported
# earlier in this module.
from collections import Counter, defaultdict
import sys


def count_morphs(in_file):
    """Count morphs in in_file."""
    _, _, suffix_words = morph_map(in_file)
    # Tally how many distinct words each suffix appears in.
    suffix_counts = Counter({suffix: len(words)
                             for suffix, words in suffix_words.iteritems()})
    print "Suffixes:"
    for suffix, count in suffix_counts.most_common():
        # Show up to 25 randomly chosen example words per suffix.
        print "{}\t{}\t{}".format(
            suffix, count, ", ".join(shuffled(suffix_words[suffix])[:25]))
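# A minimal, hypothetical usage sketch for count_morphs. The file name below
# is illustrative only. morph_map (defined elsewhere in this project) is
# assumed to return a triple whose second element maps roots to their words
# and whose third maps suffixes to their words; the first element is unused
# in this module.
def _demo_count_morphs():
    with open('elp_analyses.txt', 'Ur') as in_file:
        count_morphs(in_file)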
def main(verbose):
    """Merge analyses from CELEX into ELP."""
    # Get arguments
    try:
        elp_analyses = sys.argv[1]
        celex_root = sys.argv[2]
        out_path = sys.argv[3]
    except IndexError:
        print >> sys.stderr, "Usage: merge_analyses elp_analyses celex_root output"
        sys.exit(64)

    # Get the ELP analyses
    _, elp_root_words, _ = morph_map(open(elp_analyses, 'Ur'))

    # Read in CELEX
    creader = CelexDB(celex_root, ENGLISH)
    clx_root_words = dict(creader.lemma_map)

    # Perform some cleanup on both
    print "Excluding proper nouns and words with apostrophes and hyphens..."
    for root_words, name in ((clx_root_words, "CELEX"), (elp_root_words, "ELP")):
        # Sanitize the words in the roots
        n_word_deletions = 0
        for root, words in root_words.iteritems():
            clean_words = [word for word in words if not exclude_item(word)]
            if len(clean_words) < len(words):
                n_word_deletions += len(words) - len(clean_words)
                root_words[root] = clean_words
                if verbose:
                    excluded_words = set(words) - set(clean_words)
                    for word in excluded_words:
                        print "Excluded word {}".format(word)
        print "Excluded {} words from {}.".format(n_word_deletions, name)

        # Exclude any empty or exclusion-worthy roots
        root_deletions = [root for root, words in root_words.iteritems()
                          if not words or exclude_item(root)]
        for root in root_deletions:
            if verbose:
                print "Excluded root {}".format(root)
            del root_words[root]
        print "Excluded {} roots from {}.".format(len(root_deletions), name)
    print

    # Map each word back to its roots
    clx_word_roots = defaultdict(set)
    for root, words in clx_root_words.iteritems():
        for word in words:
            clx_word_roots[word].add(root)

    # Remove any useless singleton lemmas where a word maps to multiple
    # lemmas and one of those lemmas is headed by that word and only
    # consists of the word itself. For example, the 'abandoned' lemma:
    #   abandon: abandon, abandoned, abandoning, abandonment, abandons
    #   abandoned: abandoned
    # We perform deletions after iteration so we can do so safely.
    print "Removing useless singleton lemmas in CELEX..."
    deletions = [(word, roots) for word, roots in clx_word_roots.iteritems()
                 if len(roots) > 1 and word in roots
                 and len(clx_root_words[word]) == 1]
    for word, roots in deletions:
        if verbose:
            print "Removed useless singleton", word
        # Remove the singleton entry
        del clx_root_words[word]
        # Remove the word as its own root
        roots.remove(word)
    print "Removed {} useless singleton lemmas from CELEX.".format(len(deletions))
    print

    # Calculate how many forms have multiple roots associated with them.
    clx_total_words = len(clx_word_roots)
    clx_ambig_words = sum(1 for word, roots in clx_word_roots.iteritems()
                          if len(roots) > 1)

    # Get basic counts
    elp_words = set(word for words in elp_root_words.values() for word in words)
    clx_words = set(clx_word_roots)
    print "{} roots, {} words in ELP.".format(len(elp_root_words), len(elp_words))
    print "{} roots, {} words in CELEX.".format(len(clx_root_words), clx_total_words)
    print "CELEX provides {:2.2f}% coverage of the ELP words.".format(
        len(clx_words & elp_words) / float(len(elp_words)) * 100)
    # This is a sanity check that doesn't affect anything.
    print "{} of {} words ({:2.2f}%) in CELEX have multiple roots.".format(
        clx_ambig_words, clx_total_words,
        clx_ambig_words / float(clx_total_words) * 100)
    print

    shared_roots = set(clx_root_words).intersection(elp_root_words)
    print "{} roots shared between ELP and CELEX.".format(len(shared_roots))
    unshared_roots = set(clx_root_words).symmetric_difference(elp_root_words)
    print "{} roots appear in only one of ELP and CELEX.".format(len(unshared_roots))

    # Augment the ELP analyses
    aug_root_words = {}
    for root in elp_root_words:
        if root in clx_root_words:
            # Add extra items, but only if they are not already analyzed
            aug_root_words[root] = list(set(elp_root_words[root]).union(
                [word for word in clx_root_words[root] if word not in elp_words]))
            if verbose:
                additional_words = (set(aug_root_words[root]) -
                                    set(elp_root_words[root]))
                if additional_words:
                    print "Root {} gains {}".format(root, ", ".join(additional_words))
        else:
            aug_root_words[root] = list(elp_root_words[root])

    # Reverse the mapping and sort out items with multiple roots
    aug_word_roots = defaultdict(set)
    for root, words in aug_root_words.iteritems():
        for word in words:
            aug_word_roots[word].add(root)
    aug_ambig_words = [(word, roots) for word, roots in aug_word_roots.iteritems()
                       if len(roots) > 1]
    for word, roots in aug_ambig_words:
        for root in roots:
            aug_root_words[root].remove(word)
        if verbose:
            print "Removed word {} from roots {}".format(word, ", ".join(roots))
    print "Removed {} ambiguously-rooted words in augmented ELP.".format(
        len(aug_ambig_words))

    # Count augmented forms
    aug_word_count = len(set(word for words in aug_root_words.values()
                             for word in words))
    print
    print "{} roots, {} words in augmented ELP.".format(
        len(aug_root_words), aug_word_count)
    print "ELP analyses now cover {:2.2f}% more words.".format(
        (aug_word_count - len(elp_words)) / float(len(elp_words)) * 100)

    print "Writing output to {}...".format(out_path)
    with open(out_path, 'w') as out_file:
        for root, words in sorted(aug_root_words.items()):
            if not words:
                print >> sys.stderr, "Empty root {}".format(root)
                continue
            print >> out_file, "{}\t{}".format(root, ",".join(sorted(words)))
    print "Done."