# Example 1
def master_process(comm,
                   status,
                   tags,
                   corpus,
                   k,
                   V,
                   nthread,
                   minibatch,
                   var_path,
                   record_eta=False):
    """Coordinate MPI workers for distributed (LDA-style) variational updates.

    Hands out minibatch work items to workers that report READY, accumulates
    their delta-eta results into a shared ``LockedEta``, and periodically
    checkpoints eta to ``var_path`` (roughly every 100k documents seen).

    Args:
        comm: MPI communicator (e.g. ``MPI.COMM_WORLD``); rank 0 is the master.
        status: reusable ``MPI.Status`` object used to read source/tag of recvs.
        tags: enum-like object with ``READY``/``START``/``DONE``/``EXIT`` tags.
        corpus: iterable of documents fed to ``batch_generator``.
        k: number of topics.
        V: vocabulary size.
        nthread: number of worker ranks expected to do real work.
        minibatch: total minibatch size; split evenly across ``nthread``.
        var_path: directory where eta pickles are written.
        record_eta: accepted for interface compatibility; not used here.

    Returns:
        The final accumulated eta dict from ``lockedEta``.
    """
    num_workers = comm.size - 1
    closed_workers = 0
    logging.info('Master start with {} works'.format(num_workers))

    # Integer division: with '/', Python 3 would make thread_batch a float,
    # which propagates into doc counts and pickle filenames ('eta.1.0.pickle').
    thread_batch = minibatch // nthread
    lockedEta = LockedEta({}, Lock())
    bg = batch_generator(corpus, k, V, thread_batch, lockedEta)
    last_doc_seen = 0

    while closed_workers < num_workers:
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        source = status.Get_source()
        tag = status.Get_tag()

        if tag == tags.READY:
            try:
                # Ranks above nthread are surplus; send them straight to EXIT.
                if source <= nthread:
                    doc_buffer, eta_temp, etaSum, alpha, batch_id = next(bg)
                    comm.send((doc_buffer, eta_temp, etaSum, alpha, batch_id),
                              dest=source,
                              tag=tags.START)
                else:
                    comm.send(None, dest=source, tag=tags.EXIT)
            except StopIteration:
                # Corpus exhausted: tell the worker to shut down.
                comm.send(None, dest=source, tag=tags.EXIT)
        elif tag == tags.DONE:
            delta_eta, nBatch_value = data
            nBatch_value += 1
            logging.info('Got batch {} results from worker {}'.format(
                nBatch_value, source))
            lockedEta.add_eta(delta_eta)
            doc_seen = nBatch_value * thread_batch
            # Checkpoint eta roughly every 100k documents processed.
            if doc_seen - last_doc_seen >= 100000:
                # '//' keeps the round index an int in the filename.
                fn = 'eta.{}.pickle'.format(nBatch_value // nthread - 1)
                path = os.path.join(var_path, fn)
                lockedEta.write_eta(path)
                logging.info('round:{}, batch:{}'.format(
                    nBatch_value // nthread - 1, nBatch_value))
                last_doc_seen = doc_seen

        elif tag == tags.EXIT:
            logging.info('Worker {} exited.'.format(source))
            closed_workers += 1

    return lockedEta.eta
# Example 2 (same function as Example 1, unformatted)
def master_process(comm, status, tags, corpus, k, V, nthread, minibatch,
                   var_path, record_eta=False):
    """MPI master loop: dispatch minibatches to workers, collect eta deltas.

    Sends a batch from ``batch_generator`` to each READY worker, folds DONE
    results into the shared ``LockedEta``, and checkpoints eta to ``var_path``
    about every 100k documents. Terminates once all workers report EXIT.

    Args:
        comm: MPI communicator; ``comm.size - 1`` workers are expected.
        status: reusable ``MPI.Status`` for inspecting received messages.
        tags: object exposing READY/START/DONE/EXIT message tags.
        corpus: document iterable consumed by ``batch_generator``.
        k: topic count.
        V: vocabulary size.
        nthread: number of ranks that should receive work.
        minibatch: total minibatch size, divided evenly over ``nthread``.
        var_path: output directory for eta pickle checkpoints.
        record_eta: kept for interface compatibility; unused in this body.

    Returns:
        The accumulated eta dict.
    """
    num_workers = comm.size - 1
    closed_workers = 0
    logging.info('Master start with {} works'.format(num_workers))

    # '//' not '/': true division would make thread_batch a float in Python 3
    # and leak floats into doc counts and checkpoint filenames.
    thread_batch = minibatch // nthread
    lockedEta = LockedEta({}, Lock())
    bg = batch_generator(corpus, k, V, thread_batch, lockedEta)
    last_doc_seen = 0

    while closed_workers < num_workers:
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        source = status.Get_source()
        tag = status.Get_tag()

        if tag == tags.READY:
            try:
                if source <= nthread:
                    doc_buffer, eta_temp, etaSum, alpha, batch_id = next(bg)
                    comm.send((doc_buffer, eta_temp, etaSum, alpha, batch_id),
                              dest=source, tag=tags.START)
                else:
                    # Surplus rank: no work will ever be assigned to it.
                    comm.send(None, dest=source, tag=tags.EXIT)
            except StopIteration:
                comm.send(None, dest=source, tag=tags.EXIT)
        elif tag == tags.DONE:
            delta_eta, nBatch_value = data
            nBatch_value += 1
            logging.info('Got batch {} results from worker {}'.format(
                nBatch_value, source))
            lockedEta.add_eta(delta_eta)
            doc_seen = nBatch_value * thread_batch
            if doc_seen - last_doc_seen >= 100000:
                # Integer round index keeps the filename clean ('eta.3.pickle').
                fn = 'eta.{}.pickle'.format(nBatch_value // nthread - 1)
                path = os.path.join(var_path, fn)
                lockedEta.write_eta(path)
                logging.info('round:{}, batch:{}'.format(
                    nBatch_value // nthread - 1, nBatch_value))
                last_doc_seen = doc_seen

        elif tag == tags.EXIT:
            logging.info('Worker {} exited.'.format(source))
            closed_workers += 1

    return lockedEta.eta
# Example 3
def asyn_framework(corpus, k, V, nthread, minibatch, var_path,
                   record_eta=False):
    """Run asynchronous minibatch updates over ``corpus`` with a process pool.

    Buffers documents into batches of ``minibatch // nthread``, submits each
    batch to ``asyn_workder`` via ``pool.apply_async`` (throttled so at most
    ``nthread`` batches are in flight), and lets ``callback`` fold results
    into the shared ``LockedEta``. Writes a final eta checkpoint if the last
    round was incomplete.

    Args:
        corpus: iterable of documents, each an iterable of (vocab_id, count).
        k: topic count.
        V: vocabulary size.
        nthread: pool size / max concurrent batches.
        minibatch: total minibatch size, split across ``nthread``.
        var_path: directory for eta pickle checkpoints.
        record_eta: kept for interface compatibility; unused in this body.

    Returns:
        The accumulated eta dict.
    """
    import time  # local import: only needed for the dispatch back-off sleep

    # '//' keeps thread_batch an int; '/' would make doc bookkeeping and
    # checkpoint filenames floats under Python 3.
    thread_batch = minibatch // nthread
    doc_id = 0
    batch_id = 0
    # temp data
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {0: 0}
    # global data shared with callbacks
    lockedEta = LockedEta({}, Lock())

    # process control
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())
    nBatch = LockedSum(0, Lock())
    results = []

    def _dispatch(batch_docs, batch_voc):
        """Submit one batch, waiting (politely) for a free worker slot."""
        eta_temp = lockedEta.get_eta(k, batch_voc)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        # Was a hot busy-wait (100% CPU); sleep briefly between checks.
        while nActPro.get_value() >= nthread:
            time.sleep(0.001)
        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                nthread, thread_batch, old_doc_seen)
        results.append(pool.apply_async(asyn_workder,
                                        (batch_docs, eta_temp, etaSum, alpha),
                                        callback=cb))
        nActPro.add_value(1)

    for doc in corpus:
        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)

        # Every thread_batch-th document closes the current batch.
        if doc_id % thread_batch == thread_batch - 1:
            _dispatch(doc_buffer, voc_temp)
            doc_buffer = []
            voc_temp = set()
            batch_id += 1

        doc_id += 1

    # Flush the final partial batch, if any.
    if doc_buffer:
        _dispatch(doc_buffer, voc_temp)
        batch_id += 1

    for r in results:
        r.wait()

    # If the last round was incomplete, the periodic checkpoint in callback
    # presumably did not fire — write a final eta snapshot here.
    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value // nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)

    return lockedEta.eta
# Example 4 (same function as Example 3, auto-formatted)
def asyn_framework(corpus,
                   k,
                   V,
                   nthread,
                   minibatch,
                   var_path,
                   record_eta=False):
    """Asynchronous minibatch driver using a multiprocessing pool.

    Groups documents into batches of ``minibatch // nthread``, dispatches
    each batch to ``asyn_workder`` with at most ``nthread`` batches in
    flight, and relies on ``callback`` to merge results into ``lockedEta``.
    A trailing partial batch is flushed, and a final eta checkpoint is
    written when the last round is incomplete.

    Args:
        corpus: iterable of documents; each doc yields (vocab_id, count).
        k: number of topics.
        V: vocabulary size.
        nthread: pool size and in-flight batch limit.
        minibatch: total minibatch size split over ``nthread``.
        var_path: checkpoint output directory.
        record_eta: accepted for interface compatibility; not used here.

    Returns:
        The accumulated eta dict.
    """
    import time  # local import: used only to back off while the pool is full

    # Integer division: '/' would yield a float thread_batch in Python 3,
    # corrupting the modulo batch boundary test and checkpoint filenames.
    thread_batch = minibatch // nthread
    doc_id = 0
    batch_id = 0
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {0: 0}
    lockedEta = LockedEta({}, Lock())

    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())
    nBatch = LockedSum(0, Lock())
    results = []

    def _submit_batch(batch_docs, batch_voc):
        """Send one batch to the pool once a worker slot frees up."""
        eta_temp = lockedEta.get_eta(k, batch_voc)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        # Replaces the original hot spin loop with a short sleep per check.
        while nActPro.get_value() >= nthread:
            time.sleep(0.001)
        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                nthread, thread_batch, old_doc_seen)
        results.append(pool.apply_async(asyn_workder,
                                        (batch_docs, eta_temp, etaSum, alpha),
                                        callback=cb))
        nActPro.add_value(1)

    for doc in corpus:
        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)

        if doc_id % thread_batch == thread_batch - 1:
            _submit_batch(doc_buffer, voc_temp)
            # clear per-batch buffers
            doc_buffer = []
            voc_temp = set()
            batch_id += 1

        doc_id += 1

    # Some remaining docs may not fill a whole batch — flush them too.
    if doc_buffer:
        _submit_batch(doc_buffer, voc_temp)
        batch_id += 1

    for r in results:
        r.wait()

    # Final checkpoint for an incomplete last round (the periodic write in
    # callback presumably fires only on full rounds — verify against callback).
    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value // nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)

    return lockedEta.eta