def readandfixRound1(taglength=2):
    """Load two pretagged data sets, merge them and run the fix-up passes.

    The first data set comes from ``readconll()`` and the second from
    ``tb.readtb()``.  Each pretagged set is summarised separately with
    ``tagsandstuff`` and the two are then concatenated (first set, then
    second).  ``fixnames``, ``fixdates`` and ``fixlower`` are applied to the
    concatenated data; their return values are ignored, so they presumably
    modify it in place -- confirm against their definitions.

    ``taglength`` is accepted but not used anywhere in this function.

    Returns a 5-tuple::

        (pretagged, [contexts1, contexts2], tags, [tagset1, tagset2], names)
    """
    # Only the pretagged half of each reader's result is used below; the
    # sentence lists are read and then dropped.
    _conll_sentences, conll_pretagged = readconll()
    _tb_sentences, tb_pretagged = tb.readtb()

    conll_tagset, conll_contexts = tagsandstuff(conll_pretagged)
    tb_tagset, tb_contexts = tagsandstuff(tb_pretagged)

    # A fresh list is handed to alltags so that the list returned to the
    # caller below is a different object.
    tags = alltags([conll_tagset, tb_tagset])
    merged = conll_pretagged + tb_pretagged
    names = reallynames(tags)

    # Order of the fix-up passes is kept exactly: names, dates, lower-casing.
    fixnames(merged, names)
    fixdates(merged, tags)
    fixlower(merged)

    per_corpus_contexts = [conll_contexts, tb_contexts]
    per_corpus_tagsets = [conll_tagset, tb_tagset]
    return merged, per_corpus_contexts, tags, per_corpus_tagsets, names
def _elapsed_seconds(start):
    """Return the whole number of seconds elapsed since datetime *start*."""
    # timedelta.seconds is only the seconds *component* (0..86399) and drops
    # whole days; total_seconds() covers the full duration.
    return int((datetime.datetime.now() - start).total_seconds())


def comp34411(metering=0, brilltagger=False, sentences=False, pretagged=False,
              qwindow=2, stackwindow=False, folds=5, tagsize=2, atagsize=False,
              forced=False, precision=0.97, threshold=500):
    """Run the tagger-training / forced-parse / n-fold experiment.

    Steps, in order:

    1. Load ``sentences`` and ``pretagged`` with ``tb.readtb()`` unless
       ``sentences`` was supplied.
    2. Train a tagger with ``tbl.trainwithmxlandbrill`` unless ``brilltagger``
       was supplied.
    3. Call ``fp.forceparseall`` on the sentences unless ``forced`` is true.
    4. Run ``fp.nfold`` and print the two accuracy figures it returns.

    Parameters:
        metering: value assigned to the ``METERING`` attribute of ``tb``,
            ``tbl``, ``tbl.tag`` and ``fp``.
        brilltagger: an existing tagger; when falsy a new one is trained.
        sentences, pretagged: existing data; when ``sentences`` is falsy both
            are replaced by the result of ``tb.readtb()``.
        qwindow: passed to ``fp.forceparseall`` and ``fp.nfold``.
        stackwindow: passed to the same calls; falls back to ``qwindow``
            when left at its default.
        folds: passed to ``fp.nfold`` as ``n``.
        tagsize: passed to the trainer (as ``mxltagsize`` and ``tagsize``)
            and to both ``fp`` calls.
        atagsize: passed to the trainer; falls back to ``tagsize`` when falsy.
        forced: when true, the ``fp.forceparseall`` step is skipped.
        precision, threshold: passed to ``fp.nfold``.

    Returns:
        ``(sentences, pretagged, brilltagger, parser, training, testing)``,
        where the last three come from ``fp.nfold``.
    """
    print("reading data")
    tb.METERING = metering
    if not atagsize:
        atagsize = tagsize
    # NOTE(review): ``== False`` is also true for 0, so stackwindow=0 is
    # replaced by qwindow as well.  Kept as-is to preserve behaviour.
    if stackwindow == False:
        stackwindow = qwindow
    if not sentences:
        sentences, pretagged = tb.readtb()
    N = 10000
    K = 10000
    print("initial values for pretagged[:5], sentences[0]")
    print(pretagged[:5])
    sentences[0].showDTree()

    tbl.METERING = metering
    tbl.tag.METERING = metering
    if not brilltagger:
        print("making tagger")
        t0 = datetime.datetime.now()
        brilltagger = tbl.trainwithmxlandbrill(pretagged=pretagged,
                                               N=N,
                                               K=K,
                                               mxltagsize=tagsize,
                                               tagsize=tagsize,
                                               atagsize=atagsize)
        print("training the tagger took %s seconds" % (_elapsed_seconds(t0)))
        print("pretagged[:5], sentences[0] after training the tagger")
        print(pretagged[:5])
        sentences[0].showDTree()

    fp.METERING = metering
    if not forced:
        print("forceparse")
        t0 = datetime.datetime.now()
        fp.forceparseall(sentences,
                         stackwindow=stackwindow,
                         qwindow=qwindow,
                         tagsize=tagsize)
        print("pretagged[:5], sentences[0] after forceparseall")
        print("forced parsing took %s seconds" % (_elapsed_seconds(t0)))

    print("nfold %s (precision = %s)" % (folds, precision))
    # The third value returned by fp.nfold is not used here.
    a, ca, _, parser, training, testing = fp.nfold(sentences,
                                                   n=folds,
                                                   stackwindow=stackwindow,
                                                   qwindow=qwindow,
                                                   tagsize=tagsize,
                                                   precision=precision,
                                                   threshold=threshold)
    print("average accuracy over all folds (precision = %s): %.3f\naverage classifier accuracy %.3f" % (precision, float(a), float(ca)))
    # This message is printed at runtime, not just a comment; it describes
    # what the last three returned values are meant to be.
    print(""" return the parser from the last fold & the training and testing sets for that parser so we can do subsequent experiments soundly """)
    return sentences, pretagged, brilltagger, parser, training, testing