def build_meaning(tree, parent=None, parts=[], cptotalentropy=None, cpcolumns=None, cwtotalentropy=None, cwcolumns=None, threshold=0.75): cptotalentropy,cpcolumns,cwtotalentropy,cwcolumns,printlength = get_total_entropy() lhs = tree.node if isinstance(tree[0], ParentedTree): rhs = ' '.join(n.node for n in tree) else: rhs = ' '.join(n for n in tree) print '+++',lhs,'-->',rhs,'+++' if lhs in NONTERMINALS: if not lhs == 'LOCATION-PHRASE': if lhs == 'RELATION': parts.append( ('relation',Counter()) ) elif lhs == parent == 'LANDMARK-PHRASE': parts.append( ('parent-landmark',Counter()) ) elif lhs == 'LANDMARK-PHRASE': parts.append( ('landmark',Counter()) ) cp_db = CProduction.get_production_counts(lhs=lhs,rhs=rhs) totalss = get_query_totalss(cp_db,cpcolumns) for name,totals in zip(cpcolumns[:-1],totalss): ent = entropy_of_counts( totals.values() ) totent = cptotalentropy[name.name] if ent < threshold*totent: parts[-1][1][ "%s = %s" % (name.name, max(zip(*reversed(zip(*totals.items()))))[1]) ]+=1 for subtree in tree: parts = build_meaning(subtree, lhs, parts, cptotalentropy, cpcolumns, cwtotalentropy, cwcolumns, threshold) else: cw_db = CWord.get_word_counts(pos=lhs,word=rhs) totalss = get_query_totalss(cw_db,cwcolumns) for name,totals in zip(cwcolumns[:-1],totalss): ent = entropy_of_counts( totals.values() ) totent = cwtotalentropy[name.name] if ent < threshold*totent: parts[-1][1][ "%s = %s" % (name.name, max(zip(*reversed(zip(*totals.items()))))[1]) ]+=1 return parts
def print_totalss_entropy(totalss,totalentropy,columns,printlength): print rjust('column',printlength), rjust('context',7), rjust('overall',7), 'best' print rjust('',printlength), rjust('entropy',7), rjust('entropy',7) for name,totals in zip(columns[:-1],totalss): print rjust(name.name,printlength), \ rjust("%02.4f" % entropy_of_counts( totals.values() ),7), \ rjust("%02.4f" % totalentropy[name.name],7), \ zip(*sorted(zip(*reversed(zip(*totals.items()))),reverse=True))[1] print print
def get_total_entropy():
    """Compute the overall entropy of every feature column of both tables.

    For ``CProduction`` and ``CWord`` alike, the first three table columns are
    skipped; the remaining columns are passed to ``get_query_totalss``
    together with the table's unfiltered query, and an entropy is stored for
    each of them except the last one.

    Returns:
        A 5-tuple ``(cptotalentropy, cpcolumns, cwtotalentropy, cwcolumns,
        printlength)`` where the ``*totalentropy`` dicts map column name to
        entropy, the ``*columns`` lists hold the column objects from index 3
        onward, and ``printlength`` is the length of the longest feature
        column name across both tables.
    """
    def entropy_by_column(query, columns):
        # Pair each column (the last one excluded) with its totals and reduce
        # those totals to a single entropy value.
        totalss = get_query_totalss(query, columns)
        return dict(
            (column.name, entropy_of_counts(counts.values()))
            for column, counts in zip(columns[:-1], totalss)
        )

    production_query = CProduction.query
    cpcolumns = list(CProduction.__table__.columns)[3:]
    cptotalentropy = entropy_by_column(production_query, cpcolumns)

    word_query = CWord.query
    cwcolumns = list(CWord.__table__.columns)[3:]
    cwtotalentropy = entropy_by_column(word_query, cwcolumns)

    # Width needed to right-justify any feature column name in a report.
    feature_columns = (list(CProduction.__table__.columns)[3:-1] +
                       list(CWord.__table__.columns)[3:-1])
    printlength = max(len(column.name) for column in feature_columns)

    return cptotalentropy, cpcolumns, cwtotalentropy, cwcolumns, printlength