def main(fmodel, fvocab, rpath, wpath):
    """Predict labels for every ``*conll`` file in ``rpath`` and write results.

    A classifier is loaded once from ``fmodel`` and the vocab once from
    ``fvocab``. Each matching file is then read, turned into a sample matrix,
    labelled by the classifier, post-processed and handed to ``writedoc``.

    Args:
        fmodel: Model file passed to ``Classifier.loadmodel``.
        fvocab: Gzip-compressed pickle file holding the vocab.
        rpath: Directory scanned for files whose name ends with ``'conll'``.
        wpath: Forwarded to ``writedoc`` (presumably the output directory --
            confirm against ``writedoc``).
    """
    clf = Classifier()
    dr = DocReader()
    clf.loadmodel(fmodel)
    flist = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith('conll')]
    # NOTE: unpickling executes arbitrary code; only load trusted vocab files.
    # The context manager closes the gzip handle, which was previously leaked.
    with gzip.open(fvocab) as fin:
        vocab = load(fin)
    for fname in flist:
        print("Processing file: {}".format(fname))
        doc = dr.read(fname, withboundary=False)
        # A fresh generator per file keeps each document's samples separate.
        sg = SampleGenerator(vocab)
        sg.build(doc)
        M, _ = sg.getmat()
        predlabels = clf.predict(M)
        doc = postprocess(doc, predlabels)
        writedoc(doc, fname, wpath)
def main(rpath, thresh, fvocab):
    """Build a vocab from every ``*merge`` file in ``rpath`` and save it.

    All matching files are fed to one ``VocabGenerator``; the generator is
    then filtered and its vocab is saved with ``VocabGenerator.savevocab``.
    A plain-text dump (``feature<TAB>index`` per line) is also written to
    ``vocab.txt`` in the current working directory.

    Args:
        rpath: Directory scanned for files whose name ends with ``'merge'``.
        thresh: Forwarded to ``VocabGenerator(thresh=...)`` (presumably a
            frequency cut-off applied by ``filter()`` -- confirm there).
        fvocab: Output file name; ``'.pickle.gz'`` is appended when missing.
    """
    vg = VocabGenerator(thresh=thresh)
    dr = DocReader()
    flist = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith('merge')]
    for fname in flist:
        print("Reading file: {}".format(fname))
        doc = dr.read(fname)
        vg.build(doc)
    # Filtering runs once, after every document has been counted.
    vg.filter()
    vocab = vg.getvocab()
    print("Vocab size = {}".format(len(vocab)))
    if not fvocab.endswith('.pickle.gz'):
        fvocab += '.pickle.gz'
    vg.savevocab(fvocab)
    with open('vocab.txt', 'w') as fout:
        # items() works on both Python 2 and 3, unlike iteritems().
        # Assumes vocab is a dict-like mapping -- confirm against getvocab().
        for feat, idx in vocab.items():
            fout.write("{}\t{}\n".format(feat, idx))
def main(rpath, fdata, fvocab):
    """Create the sample matrix from ``*merge`` files and pickle it.

    Every matching file in ``rpath`` is read and added to a single
    ``SampleGenerator``. The resulting matrix and labels are stored as
    ``{'data': M, 'labels': labels}`` in a gzip-compressed pickle.

    Args:
        rpath: Directory scanned for files whose name ends with ``'merge'``.
        fdata: Output file name; ``'.pickle.gz'`` is appended when missing.
        fvocab: Gzip-compressed pickle file holding the vocab.
    """
    print('Load vocab ...')
    # NOTE: unpickling executes arbitrary code; only load trusted vocab files.
    # The context manager closes the gzip handle, which was previously leaked.
    with gzip.open(fvocab) as fin:
        vocab = load(fin)
    dr = DocReader()
    sg = SampleGenerator(vocab)
    flist = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith('merge')]
    for fname in flist:
        doc = dr.read(fname)
        # The shared generator accumulates samples across all documents.
        sg.build(doc)
    M, labels = sg.getmat()
    print('M.shape = {}, len(labels) = {}'.format(M.shape, len(labels)))
    data = {'data': M, 'labels': labels}
    if not fdata.endswith('.pickle.gz'):
        fdata += '.pickle.gz'
    with gzip.open(fdata, 'w') as fout:
        dump(data, fout)
    print('Save data into file: {}'.format(fdata))
def main(rpath, fdata, fvocab):
    """Create the sample matrix from ``*merge`` files and pickle it.

    Every matching file in ``rpath`` is read and added to a single
    ``SampleGenerator``. The resulting matrix and labels are stored as
    ``{'data': M, 'labels': labels}`` in a gzip-compressed pickle.

    Args:
        rpath: Directory scanned for files whose name ends with ``'merge'``.
        fdata: Output file name; ``'.pickle.gz'`` is appended when missing.
        fvocab: Gzip-compressed pickle file holding the vocab.
    """
    print('Load vocab ...')
    # NOTE: unpickling executes arbitrary code; only load trusted vocab files.
    # The context manager closes the gzip handle, which was previously leaked.
    with gzip.open(fvocab) as fin:
        vocab = load(fin)
    dr = DocReader()
    sg = SampleGenerator(vocab)
    flist = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith('merge')]
    for fname in flist:
        doc = dr.read(fname)
        # The shared generator accumulates samples across all documents.
        sg.build(doc)
    M, labels = sg.getmat()
    print('M.shape = {}, len(labels) = {}'.format(M.shape, len(labels)))
    data = {'data': M, 'labels': labels}
    if not fdata.endswith('.pickle.gz'):
        fdata += '.pickle.gz'
    with gzip.open(fdata, 'w') as fout:
        dump(data, fout)
    print('Save data into file: {}'.format(fdata))