def process(dict_names, gram_fname, xml, encoding):
    """Parse toolbox dictionary files and print a structural analysis report.

    Parses ``dict_names`` against the chunk grammar read from ``gram_fname``,
    then prints the grammar, per-structure pattern frequencies, and per-marker
    counts to stdout.

    :param dict_names: list of dictionary file names to analyse
    :param gram_fname: path of the chunk-grammar file
    :param xml: output path for an XML dump of the parsed lexicon,
        or a false value to skip XML output
    :param encoding: character encoding used to read the dictionaries
        (undecodable bytes are replaced, via errors="replace")
    """
    # Context manager guarantees the grammar file is closed even if read() fails.
    with open(gram_fname, "r") as gram_file:
        gram = gram_file.read()
    lexicon = parse_corpus(dict_names, grammar=gram, encoding=encoding,
                           errors="replace")
    mkr_counts, nonblank_mkr_counts = count_mkrs(lexicon)
    analysis = analyse_dict(lexicon)
    if xml:
        indent(lexicon)
        with open(xml, "w") as out_file:
            out_file.write(ET.tostring(lexicon, encoding="UTF-8"))
    # Single-argument print() is valid and identical in Python 2 and 3.
    print("analysing files\n%s\n" % "\n".join(dict_names))
    if xml:
        print('XML lexicon output in file "%s"\n' % xml)
    print("====chunk grammar====")
    print(gram)
    print("\n")
    max_positions = 30  # cap on how many entry positions are listed per pattern
    for structure, patt_dict in analysis.items():
        print("\n\n===%s===: total= %d" % (structure, pattern_count(patt_dict)))
        # Most frequent patterns first; ties broken alphabetically.
        for pattern, positions in sorted(patt_dict.items(),
                                         key=lambda t: (-len(t[1]), t[0])):
            if len(positions) <= max_positions:
                pos_str = "Entries: %s" % ", ".join(positions)
            else:
                pos_str = "Too many entries to list."
            print("\t%5d: %s %s" % (len(positions), ":".join(pattern), pos_str))
    # NOTE(review): original indentation was lost; the marker summary is
    # assumed to follow the pattern loops at function level — confirm.
    print("\n\n")
    print("mkr\tcount\tnonblank")
    for mkr in mkr_counts:
        print("%s\t%5d\t%5d" % (mkr, mkr_counts.get(mkr, 0),
                                nonblank_mkr_counts.get(mkr, 0)))
def process(dict_names, gram_fname, xml, encoding):
    """Parse toolbox dictionaries and report marker/pattern statistics.

    Reads a chunk grammar from ``gram_fname``, parses every file in
    ``dict_names`` with it, optionally dumps the parsed lexicon as XML to
    ``xml``, and writes a plain-text analysis report to stdout.

    NOTE(review): this appears to be a second, duplicate definition of
    ``process`` (the same function occurs earlier with double-quoted
    strings); whichever is defined later shadows the other — confirm one
    copy can be removed.

    :param dict_names: dictionary file names to parse
    :param gram_fname: path to the chunk-grammar file
    :param xml: XML output path, or a false value for no XML output
    :param encoding: encoding used when reading the dictionaries
    """
    # with-statement closes the grammar file on all paths (was a bare
    # open/read/close that leaked on error).
    with open(gram_fname, 'r') as gram_file:
        gram = gram_file.read()
    lexicon = parse_corpus(dict_names, grammar=gram, encoding=encoding,
                           errors='replace')
    mkr_counts, nonblank_mkr_counts = count_mkrs(lexicon)
    analysis = analyse_dict(lexicon)
    if xml:
        indent(lexicon)
        with open(xml, "w") as out_file:
            out_file.write(ET.tostring(lexicon, encoding='UTF-8'))
    # Parenthesized single-argument print works identically in Python 2 and 3.
    print('analysing files\n%s\n' % '\n'.join(dict_names))
    if xml:
        print('XML lexicon output in file "%s"\n' % xml)
    print('====chunk grammar====')
    print(gram)
    print('\n')
    max_positions = 30  # list entries explicitly only up to this many
    for structure, patt_dict in analysis.items():
        print('\n\n===%s===: total= %d' % (structure, pattern_count(patt_dict)))
        # Descending frequency, then alphabetical pattern order.
        for pattern, positions in sorted(patt_dict.items(),
                                         key=lambda t: (-len(t[1]), t[0])):
            if len(positions) <= max_positions:
                pos_str = 'Entries: %s' % ', '.join(positions)
            else:
                pos_str = 'Too many entries to list.'
            print("\t%5d: %s %s" % (len(positions), ':'.join(pattern), pos_str))
    # NOTE(review): indentation was lost in this copy too; the marker table
    # is assumed to print once, after the loops — confirm.
    print("\n\n")
    print('mkr\tcount\tnonblank')
    for mkr in mkr_counts:
        print('%s\t%5d\t%5d' % (mkr, mkr_counts.get(mkr, 0),
                                nonblank_mkr_counts.get(mkr, 0)))
#!/usr/bin/env python
"""
Build the corpus package index.

Usage: build_pkg_index.py <path-to-packages> <base-url> <output-file>

Scans the package directory, builds an index ElementTree via
``nltk.downloader.build_index``, and writes it (with an XSL-stylesheet
prolog) to the given output file.
"""

import sys

from nltk.downloader import build_index
from nltk.etree import ElementTree

# XML prolog prepended to the serialized index so browsers apply index.xsl.
xml_header = """<?xml version="1.0"?>
<?xml-stylesheet href="index.xsl" type="text/xsl"?>
"""

if len(sys.argv) != 4:
    # Single-argument print() is identical in Python 2 and 3.
    print("Usage: ")
    print("build_pkg_index.py <path-to-packages> <base-url> <output-file>")
    sys.exit(-1)

ROOT, BASE_URL, OUT = sys.argv[1:]

index = build_index(ROOT, BASE_URL)
s = ElementTree.tostring(index)

# Context manager closes (and flushes) the output file on all paths;
# the original open/write/close leaked the handle on a write error.
with open(OUT, 'w') as out:
    out.write(xml_header)
    out.write(s)