示例#1
0
文件: buildedu.py 项目: OlafLee/DPLP
def main(fmodel, fvocab, rpath, wpath):
    """Predict labels for every ``conll`` file in ``rpath`` and write them out.

    Loads the classifier model from ``fmodel`` and the gzip-compressed pickled
    vocab from ``fvocab``. Then, for each file in ``rpath`` whose name ends
    with ``conll``: reads the document, builds its sample matrix, predicts
    labels with the classifier, runs ``postprocess`` on the document with
    those labels and hands the result to ``writedoc``.

    :param fmodel: path passed to ``Classifier.loadmodel``
    :param fvocab: path of the gzip-compressed pickle holding the vocab
    :param rpath: directory scanned for input files ending in ``conll``
    :param wpath: forwarded unchanged to ``writedoc``
    """
    clf = Classifier()
    dr = DocReader()
    clf.loadmodel(fmodel)
    flist = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith('conll')]
    # Close the vocab file right after unpickling instead of leaking the
    # handle for the lifetime of the process.
    # NOTE(review): unpickling can execute arbitrary code -- only point this
    # at vocab files from a trusted source.
    with gzip.open(fvocab) as fin:
        vocab = load(fin)
    for fname in flist:
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("Processing file: {}".format(fname))
        doc = dr.read(fname, withboundary=False)
        # A new SampleGenerator is created per file, so getmat() is called on
        # a generator that has only been fed this document.
        sg = SampleGenerator(vocab)
        sg.build(doc)
        M, _ = sg.getmat()
        predlabels = clf.predict(M)
        doc = postprocess(doc, predlabels)
        writedoc(doc, fname, wpath)
示例#2
0
def main(rpath, thresh, fvocab):
    """Build a vocab from the ``merge`` files in ``rpath`` and save it.

    Every file in ``rpath`` whose name ends with ``merge`` is read and fed to
    a ``VocabGenerator`` created with ``thresh``. After ``vg.filter()`` the
    vocab is saved through ``vg.savevocab`` and additionally dumped as
    tab-separated ``feature<TAB>index`` lines to ``vocab.txt`` in the current
    working directory.

    :param rpath: directory scanned for input files ending in ``merge``
    :param thresh: threshold handed to ``VocabGenerator``
        (presumably a minimum feature count applied by ``filter()`` --
        TODO confirm against ``VocabGenerator``)
    :param fvocab: output path; ``.pickle.gz`` is appended when missing
    """
    vg = VocabGenerator(thresh=thresh)
    dr = DocReader()
    flist = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith('merge')]
    for fname in flist:
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("Reading file: {}".format(fname))
        doc = dr.read(fname)
        vg.build(doc)
    vg.filter()
    vocab = vg.getvocab()
    print("Vocab size = {}".format(len(vocab)))
    if not fvocab.endswith('.pickle.gz'):
        fvocab += '.pickle.gz'
    vg.savevocab(fvocab)
    with open('vocab.txt', 'w') as fout:
        # items() exists on both Python 2 and 3 mappings; iteritems() is
        # Python-2-only and raises AttributeError on Python 3.
        for (feat, idx) in vocab.items():
            fout.write(str(feat) + '\t' + str(idx) + '\n')
示例#3
0
def main(rpath, fdata, fvocab):
    """Create the sample matrix from ``rpath`` and dump it into a pickle file.

    Loads the gzip-compressed pickled vocab from ``fvocab``, feeds every file
    in ``rpath`` whose name ends with ``merge`` into one shared
    ``SampleGenerator``, then calls ``getmat()`` once and stores the result
    as ``{'data': M, 'labels': labels}`` in a gzip-compressed pickle.

    :param rpath: directory scanned for input files ending in ``merge``
    :param fdata: output path; ``.pickle.gz`` is appended when missing
    :param fvocab: path of the gzip-compressed pickle holding the vocab
    """
    print('Load vocab ...')
    # Close the vocab file right after unpickling instead of leaking the
    # handle; this mirrors how the output file is handled below.
    # NOTE(review): unpickling can execute arbitrary code -- only point this
    # at vocab files from a trusted source.
    with gzip.open(fvocab) as fin:
        vocab = load(fin)
    dr = DocReader()
    sg = SampleGenerator(vocab)
    flist = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith('merge')]
    for fname in flist:
        doc = dr.read(fname)
        # The same generator receives every document; the matrix is only
        # requested after the loop.
        sg.build(doc)
    M, labels = sg.getmat()
    print('M.shape = {}, len(labels) = {}'.format(M.shape, len(labels)))
    data = {'data': M, 'labels': labels}
    if not fdata.endswith('.pickle.gz'):
        fdata += '.pickle.gz'
    with gzip.open(fdata, 'w') as fout:
        dump(data, fout)
    print('Save data into file: {}'.format(fdata))
示例#4
0
def main(rpath, fdata, fvocab):
    """Create the sample matrix from ``rpath`` and dump it into a pickle file.

    Loads the gzip-compressed pickled vocab from ``fvocab``, feeds every file
    in ``rpath`` whose name ends with ``merge`` into one shared
    ``SampleGenerator``, then calls ``getmat()`` once and stores the result
    as ``{'data': M, 'labels': labels}`` in a gzip-compressed pickle.

    :param rpath: directory scanned for input files ending in ``merge``
    :param fdata: output path; ``.pickle.gz`` is appended when missing
    :param fvocab: path of the gzip-compressed pickle holding the vocab
    """
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('Load vocab ...')
    # Close the vocab file right after unpickling instead of leaking the
    # handle; this mirrors how the output file is handled below.
    # NOTE(review): unpickling can execute arbitrary code -- only point this
    # at vocab files from a trusted source.
    with gzip.open(fvocab) as fin:
        vocab = load(fin)
    dr = DocReader()
    sg = SampleGenerator(vocab)
    flist = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith('merge')]
    for fname in flist:
        doc = dr.read(fname)
        # The same generator receives every document; the matrix is only
        # requested after the loop.
        sg.build(doc)
    M, labels = sg.getmat()
    print('M.shape = {}, len(labels) = {}'.format(M.shape, len(labels)))
    data = {'data': M, 'labels': labels}
    if not fdata.endswith('.pickle.gz'):
        fdata += '.pickle.gz'
    with gzip.open(fdata, 'w') as fout:
        dump(data, fout)
    print('Save data into file: {}'.format(fdata))