def main():
    """Label input text with a pickled model and write one row per label.

    Command-line options:
      --infile/-i     input text file (default: stdin)
      --modelfile/-m  pickled model file (default: stdin)
      --handlabel/-H  call the model's ``handlabeldata`` instead of ``labeldata``
      --outfile/-o    output file (default: stdout)
      --debug/-d      debug flag, forwarded to ``hkmc.prepdata``

    The pickled object is indexed with the keys ``'settings'``, ``'feats'``
    and ``'model'``.  Each output row is ``ln<TAB>offset<TAB>label`` where
    ``ln`` and ``offset`` come from the info record paired with that label.
    """
    parser = argparse.ArgumentParser(
        description="label data given model file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # NOTE(review): --infile and --modelfile both default to sys.stdin, so
    # presumably at least one of them has to be given explicitly - confirm.
    parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin, help="input text file")
    parser.add_argument("--modelfile", "-m", nargs='?', type=argparse.FileType('rb'),
                        default=sys.stdin, help="input model file")
    parser.add_argument("--handlabel", "-H", action='store_true', default=False,
                        help="use hand labels if available")
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout, help="output file")
    parser.add_argument("--debug", "-d", action='store_true', default=False,
                        help="debug mode")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    infile = prepfile(args.infile, 'r')
    modelfile = prepfile(args.modelfile, 'rb')
    outfile = prepfile(args.outfile, 'w')

    # SECURITY: unpickling can execute arbitrary code embedded in the file;
    # only load model files that come from a trusted source.
    fullmodel = pickle.load(modelfile)
    settings = fullmodel['settings']
    features, tokfeatures = hkmc.prepfeatures(settings)

    # The third value returned by prepdata is not needed here.
    data, info, _ = hkmc.prepdata(infile, features, tokfeatures, args.debug,
                                  settings, dv=fullmodel['feats'])

    model = fullmodel['model']
    if args.handlabel:
        labels = model.handlabeldata(data)
    else:
        labels = model.labeldata(data)

    # Builtin zip pairs labels with info records the same way izip did and
    # does not rely on a Python-2-only itertools name.
    for label, theinfo in zip(labels, info):
        outfile.write("%d\t%d\t%s\n" % (theinfo['ln'], theinfo['offset'], label))
def main():
    """Attach hand labels to a pickled model using gold-annotated data.

    Command-line options:
      --modelfile/-m  pickled model file (default: stdin)
      --infile/-i     untokenized input text (default: stdin)
      --goldfile/-g   gold labels, whitespace-separated per line (default: stdin)
      --outfile/-o    where to pickle the model afterwards (optional)
      --annfile/-a    annotation output passed to ``classifydata`` (optional)
      --refine/-r     parsed but not acted on in this function (TODO)
      --debug/-d      debug flag, forwarded to ``hkmc.prepdata``
      --thresh/-t     forwarded to ``classifydata`` as ``thresh``

    For every info record returned by ``hkmc.prepdata`` the gold label is
    looked up as ``golddata[ln][offset]``; the resulting array is handed to
    the model's ``classifydata``.  If ``--outfile`` was given, the whole
    ``fullmodel`` object is pickled to it afterwards.
    """
    parser = argparse.ArgumentParser(
        description="add hand labels on model file given annotated data",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # NOTE(review): --modelfile, --infile and --goldfile all default to
    # sys.stdin, so presumably at most one can really be left unset - confirm.
    parser.add_argument("--modelfile", "-m", nargs='?', type=argparse.FileType('rb'),
                        default=sys.stdin, help="input model file")
    parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin, help="input untokenized text")
    parser.add_argument("--goldfile", "-g", nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin, help="input gold labels")
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('wb'),
                        default=None, help="output model file")
    parser.add_argument("--annfile", "-a", nargs='?', type=argparse.FileType('w'),
                        default=None, help="output annotation file")
    # TODO! (--refine is accepted but not used below)
    parser.add_argument("--refine", "-r", action='store_true', default=False,
                        help="dynamically refine")
    parser.add_argument("--debug", "-d", action='store_true', default=False,
                        help="debug mode")
    parser.add_argument("--thresh", "-t", type=float, default=0.75,
                        help="how pure a class has to be")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    modelfile = prepfile(args.modelfile, 'rb')
    infile = prepfile(args.infile, 'r')
    goldfile = prepfile(args.goldfile, 'r')
    outfile = prepfile(args.outfile, 'wb') if args.outfile is not None else None
    annfile = prepfile(args.annfile, 'w') if args.annfile is not None else None

    # SECURITY: unpickling can execute arbitrary code embedded in the file;
    # only load model files that come from a trusted source.
    fullmodel = pickle.load(modelfile)
    settings = fullmodel['settings']
    features, tokfeatures = hkmc.prepfeatures(settings)

    # Label codes, for reference (previously an unused local list):
    #   AA  attach both sides (noop)
    #   DD  disconnect both sides
    #   AD  disconnect right only
    #   DA  disconnect left only
    #   AN  attach left, newline right
    #   DN  disconnect left, newline right

    # The third value returned by prepdata is not needed here.
    data, info, _ = hkmc.prepdata(infile, features, tokfeatures, args.debug,
                                  settings, dv=fullmodel['feats'])

    # split() with no arguments already ignores leading/trailing whitespace,
    # so no separate strip() is required.
    golddata = [line.split() for line in goldfile.readlines()]
    goldlabels = np.array(
        [golddata[infoblock['ln']][infoblock['offset']] for infoblock in info])

    # NOTE(review): presumably classifydata updates the model in place, since
    # the same fullmodel object is what gets re-pickled below - confirm.
    fullmodel['model'].classifydata(data, info, goldlabels, annfile,
                                    thresh=args.thresh)
    if outfile is not None:
        pickle.dump(fullmodel, outfile)
def main():
    """Cluster the prepared data and optionally write a test-on-train report.

    Command-line options are collected into a ``settings`` dict that is
    passed to ``hkmc.prepfeatures``.  The model class is ``MiniBatchKMeans``
    with ``n_clusters=--kclusters`` by default, or ``DBSCAN`` with
    ``eps=0.2`` when ``--dbscan`` is given.  If ``--paramnames`` is
    non-empty, the keyword arguments are replaced entirely by the
    name/value pairs from ``--paramnames``/``--paramvals`` (every value is
    converted with ``float``).

    When ``--tontfile`` is given, the report contains, for each distinct
    label, a ``label<TAB>count`` row followed by one
    ``label<TAB>context<TAB>feats`` row per member.  When it is omitted the
    model is still fitted but nothing is written.

    NOTE(review): ``--handlabel`` is parsed but not used in this function.
    """
    parser = argparse.ArgumentParser(
        description="k means clustering for periods. see unitok/scripts/learntok for some inspiration",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin, help="input file")
    parser.add_argument("--tontfile", "-t", nargs='?', type=argparse.FileType('w'),
                        default=None, help="test on train output file")
    parser.add_argument("--unicodepossibles", "-u", action='store_true', default=False,
                        help="interpret possibles list as unicode class prefixes")
    parser.add_argument("--kclusters", "-k", default=2, type=int,
                        help="number of clusters per layer")
    parser.add_argument("--layers", "-y", default=2, type=int,
                        help="number of layers")
    parser.add_argument("--minclustersize", "-z", default=10.0, type=float,
                        help="no cluster splitting below this pct of training data")
    parser.add_argument("--leftcontext", "-l", default=5, type=int,
                        help="make features for this number of previous characters")
    parser.add_argument("--rightcontext", "-r", default=0, type=int,
                        help="make features for this number of next characters")
    parser.add_argument("--nochar", "-n", action='store_false', dest='charfeature',
                        default=True, help="no character features (class only)")
    parser.add_argument("--possibles", "-p", nargs='+', default=['.'],
                        help="set of characters to possibly split on")
    parser.add_argument("--handlabel", "-H", action='store_true', default=False,
                        help="do hand labeling after training")
    parser.add_argument("--dbscan", action='store_true', default=False,
                        help="try dbscan instead of kmeans")
    parser.add_argument("--debug", "-d", action='store_true', default=False,
                        help="debug mode")
    parser.add_argument("--banned", nargs='+', default=[],
                        help='tok-based features to remove')
    parser.add_argument("--paramnames", nargs='+', default=[],
                        help='algorithm parameter names')
    parser.add_argument("--paramvals", nargs='+', default=[],
                        help='algorithm parameter values')

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    # zip() would silently drop unmatched names or values; reject the
    # mismatch up front instead of fitting with a partial parameter set.
    if args.paramnames and len(args.paramnames) != len(args.paramvals):
        parser.error("--paramnames and --paramvals must have the same number of entries")

    infile = prepfile(args.infile, 'r')
    tontfile = prepfile(args.tontfile, 'w') if args.tontfile is not None else None

    settings = {
        'kclusters': args.kclusters,
        'layers': args.layers,
        'minclustersize': args.minclustersize,
        'leftcontext': args.leftcontext,
        'rightcontext': args.rightcontext,
        'possibles': args.possibles,
        'unicodepossibles': args.unicodepossibles,
        'charfeature': args.charfeature,
        'banned': args.banned,
    }
    features, tokfeatures = hkmc.prepfeatures(settings)

    # The third value returned by prepdata is not needed here.
    data, info, _ = hkmc.prepdata(infile, args.possibles, features, tokfeatures,
                                  args.debug, isTargetPunc=args.unicodepossibles)
    if args.debug:
        print(data)

    modeltype = MiniBatchKMeans
    modelkwargs = {'n_clusters': args.kclusters}
    if args.dbscan:
        modeltype = DBSCAN
        modelkwargs = {'eps': 0.2}
    if args.paramnames:
        # Explicit parameters replace (not extend) the defaults chosen above.
        modelkwargs = dict(zip(args.paramnames, map(float, args.paramvals)))
        print(modelkwargs)

    modelTree = ModelTree(modeltype, data, info, modelkwargs=modelkwargs)
    labels = modelTree.model.fit_predict(modelTree.data)

    # --tontfile is optional; without it there is nowhere to write the report.
    # Previously this fell through to tontfile.write and raised AttributeError.
    if tontfile is None:
        return

    for label in set(labels):
        members = modelTree.info[labels == label]
        tontfile.write("%s\t%d\n" % (label, len(members)))
        for elem in members:
            tontfile.write("%s\t%s\t%s\n" % (label, hkmc.formatContext(elem), str(elem['feats'])))
def main():
    """Label input text with a pickled model and write one row per label.

    Command-line options:
      --infile/-i     input text file (default: stdin)
      --modelfile/-m  pickled model file (default: stdin)
      --handlabel/-H  call the model's ``handlabeldata`` instead of ``labeldata``
      --outfile/-o    output file (default: stdout)
      --debug/-d      debug flag, forwarded to ``hkmc.prepdata``

    The pickled object is indexed with the keys ``'settings'``, ``'feats'``
    and ``'model'``.  Each output row is ``ln<TAB>offset<TAB>label`` where
    ``ln`` and ``offset`` come from the info record paired with that label.
    """
    parser = argparse.ArgumentParser(
        description="label data given model file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # NOTE(review): --infile and --modelfile both default to sys.stdin, so
    # presumably at least one of them has to be given explicitly - confirm.
    parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin, help="input text file")
    parser.add_argument("--modelfile", "-m", nargs='?', type=argparse.FileType('rb'),
                        default=sys.stdin, help="input model file")
    parser.add_argument("--handlabel", "-H", action='store_true', default=False,
                        help="use hand labels if available")
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout, help="output file")
    parser.add_argument("--debug", "-d", action='store_true', default=False,
                        help="debug mode")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    infile = prepfile(args.infile, 'r')
    modelfile = prepfile(args.modelfile, 'rb')
    outfile = prepfile(args.outfile, 'w')

    # SECURITY: unpickling can execute arbitrary code embedded in the file;
    # only load model files that come from a trusted source.
    fullmodel = pickle.load(modelfile)
    settings = fullmodel['settings']
    features, tokfeatures = hkmc.prepfeatures(settings)

    # The third value returned by prepdata is not needed here.
    data, info, _ = hkmc.prepdata(infile, features, tokfeatures, args.debug,
                                  settings, dv=fullmodel['feats'])

    # Pick the labelling method once, then apply it.
    model = fullmodel['model']
    labeller = model.handlabeldata if args.handlabel else model.labeldata
    labels = labeller(data)

    # Builtin zip pairs labels with info records the same way izip did and
    # does not rely on a Python-2-only itertools name.
    for label, theinfo in zip(labels, info):
        outfile.write("%d\t%d\t%s\n" % (theinfo['ln'], theinfo['offset'], label))