Example #1
def asyn_framework(corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    # configs
    thread_batch = minibatch // nthread   # documents per worker batch
    # ids
    doc_id = 0
    batch_id = 0
    # temp data
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {0: 0}
    # global data
    lockedEta = LockedEta({}, Lock())

    # process control
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())
    nBatch = LockedSum(0, Lock())
    results = []
    for doc in corpus:

        # accumulate one batch of documents and the vocabulary ids they touch
        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)

        if doc_id % thread_batch == thread_batch - 1:
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)

            while True: # wait until a worker slot is free
                if nActPro.get_value() < nthread:
                    break
                
            cb = lambda x: callback(x,lockedEta,nActPro,nBatch,var_path,nthread,thread_batch,old_doc_seen)
            result = pool.apply_async(asyn_workder,(doc_buffer,eta_temp,etaSum,alpha),callback = cb)
            results.append(result)
            nActPro.add_value(1)
            
            # clear buffer
            doc_buffer = []
            voc_temp = set()
            batch_id += 1
            
        doc_id += 1

    # documents left in the buffer form one last partial batch

    if len(doc_buffer) > 0:

        while True: # wait until a worker slot is free
            if nActPro.get_value() < nthread:
                break
                
        cb = lambda x: callback(x,lockedEta,nActPro,nBatch,var_path,nthread,thread_batch,old_doc_seen)
        result = pool.apply_async(asyn_workder,(doc_buffer,eta_temp,etaSum,alpha),callback = cb)
        results.append(result)
        nActPro.add_value(1)
        batch_id += 1

    for r in results:
        r.wait()

    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value // nthread)
        path = os.path.join(var_path,fn)
        lockedEta.write_eta(path)
        
    return lockedEta.eta
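
The LockedSum counter used above (and in the examples below) is defined elsewhere in the project and is not shown on this page. A minimal sketch that is consistent with the calls the examples make (LockedSum(0, Lock()), get_value(), add_value()) might look like the following; the counter only needs to be thread-safe, since Pool's apply_async callbacks run in a helper thread of the parent process rather than in the workers.

from multiprocessing import Lock

class LockedSum(object):
    """Lock-guarded counter (sketch only, not the project's actual implementation)."""

    def __init__(self, value, lock):
        self.value = value
        self.lock = lock

    def get_value(self):
        # read the current value under the lock
        with self.lock:
            return self.value

    def add_value(self, delta):
        # adjust the value under the lock; delta may be negative
        with self.lock:
            self.value += delta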
Example #2
def batch_generator(corpus, k, V, thread_batch, lockedEta):
    # ids
    doc_id = 0
    batch_id = 0
    round_id = 0
    # temp data
    doc_buffer = []
    voc_temp = set()

    # process control
    nBatch = LockedSum(0, Lock())

    for doc in corpus:

        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)

        if doc_id % thread_batch == thread_batch - 1:
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)

            yield (doc_buffer, eta_temp, etaSum, alpha, batch_id)

            # clear buffer
            doc_buffer = []
            voc_temp = set()
            batch_id += 1

        doc_id += 1

    # documents left in the buffer form one last partial batch
    if len(doc_buffer) > 0:
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)

        yield (doc_buffer, eta_temp, etaSum, alpha, batch_id)

        batch_id += 1
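
batch_generator only prepares (doc_buffer, eta_temp, etaSum, alpha, batch_id) tuples; it does not dispatch them. A hypothetical driver that consumes the generator and hands each batch to a worker pool, mirroring the dispatch step of asyn_framework above, could look roughly like this (run_with_generator is an invented name, and the nActPro throttling and merging callback of the full framework are omitted for brevity):

from multiprocessing import Pool

def run_with_generator(corpus, k, V, nthread, thread_batch, lockedEta):
    # hypothetical driver; asyn_workder is the project's worker function
    pool = Pool(processes=nthread)
    results = []
    for doc_buffer, eta_temp, etaSum, alpha, batch_id in batch_generator(
            corpus, k, V, thread_batch, lockedEta):
        # dispatch one prepared batch to the pool, as asyn_framework does inline
        results.append(pool.apply_async(asyn_workder,
                                        (doc_buffer, eta_temp, etaSum, alpha)))
    for r in results:
        r.wait()
    pool.close()
    pool.join()
    return lockedEta.eta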
Example #3
import os
from multiprocessing import Pool, Lock

# LockedEta, LockedSum, callback, asyn_workder and _mea are defined elsewhere in the project.
def asyn_framework(corpus,k,V,nthread,minibatch,var_path,record_eta = False):
    # configs
    thread_batch = minibatch // nthread   # documents per worker batch
    # ids 
    doc_id = 0
    batch_id = 0
    round_id = 0
    # temp data
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {}
    old_doc_seen[0] = 0
    # global data
    lockedEta = LockedEta({},Lock())
    
    # process control
    pool = Pool(processes = nthread)
    nActPro = LockedSum(0,Lock())
    nBatch = LockedSum(0,Lock())
    results = []
    for doc in corpus:
        
        for vid,count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)

        if doc_id % thread_batch == thread_batch - 1:
            eta_temp = lockedEta.get_eta(k,voc_temp)
            etaSum = lockedEta.get_eta_sum(k,V)
            alpha = _mea.get_alpha(k)
            while True: # wait until a worker slot is free
                if nActPro.get_value() < nthread:
                    break
                
            cb = lambda x: callback(x,lockedEta,nActPro,nBatch,var_path,nthread,thread_batch,old_doc_seen)
            result = pool.apply_async(asyn_workder,(doc_buffer,eta_temp,etaSum,alpha),callback = cb)
            results.append(result)
            nActPro.add_value(1)
            
            # clear buffer
            doc_buffer = []
            voc_temp = set()
            batch_id += 1

            
        doc_id += 1

    # documents left in the buffer form one last partial batch
    if len(doc_buffer) > 0:
        eta_temp = lockedEta.get_eta(k,voc_temp)
        etaSum = lockedEta.get_eta_sum(k,V)
        alpha = _mea.get_alpha(k)
        while True: # wait until a worker slot is free
            if nActPro.get_value() < nthread:
                break
                
        cb = lambda x: callback(x,lockedEta,nActPro,nBatch,var_path,nthread,thread_batch,old_doc_seen)
        result = pool.apply_async(asyn_workder,(doc_buffer,eta_temp,etaSum,alpha),callback = cb)
        results.append(result)
        nActPro.add_value(1)
        batch_id += 1

    for r in results:
        r.wait()

    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value // nthread)
        path = os.path.join(var_path,fn)
        lockedEta.write_eta(path)
        
    return lockedEta.eta
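
The callback handed to apply_async is not shown on this page. Judging only from how asyn_framework uses it, it has to decrement nActPro (otherwise the busy-wait above would never release a slot), bump nBatch, and fold the worker's result back into lockedEta; the periodic eta.{round}.pickle write implied by the leftover write at the end of the framework would also live here. A rough, hypothetical sketch (merge_eta is an invented method name, and the old_doc_seen bookkeeping is left out):

import os

def callback(result, lockedEta, nActPro, nBatch, var_path,
             nthread, thread_batch, old_doc_seen):
    # fold the worker's eta statistics back into the shared state;
    # merge_eta is hypothetical, the real merge lives in the project's LockedEta
    lockedEta.merge_eta(result)
    nBatch.add_value(1)      # one more batch finished
    nActPro.add_value(-1)    # free a slot so the busy-wait in asyn_framework can proceed
    # checkpoint eta once per round of nthread batches, matching the leftover
    # write at the end of asyn_framework
    nBatch_value = nBatch.get_value()
    if nBatch_value % nthread == 0:
        path = os.path.join(var_path, 'eta.{}.pickle'.format(nBatch_value // nthread))
        lockedEta.write_eta(path)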
Example #4
def asyn_framework(corpus,
                   k,
                   V,
                   nthread,
                   minibatch,
                   var_path,
                   record_eta=False):
    # configs
    thread_batch = minibatch // nthread   # documents per worker batch
    # ids
    doc_id = 0
    batch_id = 0
    round_id = 0
    # temp data
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {}
    old_doc_seen[0] = 0
    # global data
    lockedEta = LockedEta({}, Lock())

    # process control
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())
    nBatch = LockedSum(0, Lock())
    results = []
    for doc in corpus:

        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)

        if doc_id % thread_batch == thread_batch - 1:
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)
            while True:  # wait until a worker slot is free
                if nActPro.get_value() < nthread:
                    break

            cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                    nthread, thread_batch, old_doc_seen)
            result = pool.apply_async(asyn_workder,
                                      (doc_buffer, eta_temp, etaSum, alpha),
                                      callback=cb)
            results.append(result)
            nActPro.add_value(1)

            # clear buffer
            doc_buffer = []
            voc_temp = set()
            batch_id += 1

        doc_id += 1

    # documents left in the buffer form one last partial batch
    if len(doc_buffer) > 0:
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        while True:  # wait until a worker slot is free
            if nActPro.get_value() < nthread:
                break

        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                nthread, thread_batch, old_doc_seen)
        result = pool.apply_async(asyn_workder,
                                  (doc_buffer, eta_temp, etaSum, alpha),
                                  callback=cb)
        results.append(result)
        nActPro.add_value(1)
        batch_id += 1

    for r in results:
        r.wait()

    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value // nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)

    return lockedEta.eta
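
A hypothetical invocation of asyn_framework with toy values. The corpus format, lists of (word_id, count) pairs per document, is inferred from the `for vid, count in doc` loop above; the __main__ guard matters because multiprocessing.Pool re-imports the module on platforms that spawn worker processes.

if __name__ == '__main__':
    # toy corpus: each document is a bag of (word_id, count) pairs
    corpus = [
        [(0, 2), (3, 1)],
        [(1, 1), (2, 4)],
        [(0, 1), (4, 2)],
        [(2, 3), (3, 1)],
    ]
    eta = asyn_framework(corpus,
                         k=5,            # number of topics (example value)
                         V=6,            # vocabulary size (example value)
                         nthread=2,
                         minibatch=4,    # thread_batch = minibatch // nthread = 2
                         var_path='./var')   # directory must already exist for write_eta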