Exemplo n.º 1
0
 def parse_gutenberg(self, workers=None, verbose=False):
     '''Parse and store the docs into a doctable using worker processes.

     Applies ``self.parse_guten_chunk`` to ``self.metadata`` through
     ``doctable.Distribute.map_chunk``; ``self.dbfname`` and ``verbose``
     are forwarded as the extra positional arguments of that call.

     Args:
         workers (int or None): number of processes to create for parsing;
             handed directly to ``doctable.Distribute``.
         verbose (bool): forwarded unchanged to the chunk parser.

     Returns:
         The value returned by ``map_chunk``.
     '''
     with doctable.Distribute(workers) as distributor:
         parsed = distributor.map_chunk(
             self.parse_guten_chunk,
             self.metadata,
             self.dbfname,
             verbose,
         )
     return parsed
Exemplo n.º 2
0
def test_basic():
    '''Check Distribute.map_chunk and Distribute.map against doubled inputs.

    For 1 and 3 cores, both calls must return one result per input, and
    every result must equal twice the corresponding input.
    '''
    inputs = list(range(100))
    expected = [2 * value for value in inputs]

    for n_cores in (1, 3):
        with doctable.Distribute(n_cores) as pool:
            chunked = pool.map_chunk(chunk_thread, inputs)
            mapped = pool.map(thread_func, inputs)

        # results are checked after the pool has been closed
        assert len(inputs) == len(chunked)
        assert len(inputs) == len(mapped)
        assert all(want == got for want, got in zip(expected, chunked))
        assert all(want == got for want, got in zip(expected, mapped))
Exemplo n.º 3
0
def test_play(n=1000):
    '''Tests ability to solve tasks when tasks take an
        unequal amount of time to execute.

    Scratch/benchmark harness: stages are switched on and off by editing
    the literal ``if False`` / ``elif True`` conditions below.

    Args:
        n (int): number of elements built with ``range(n)`` for the
            workloads that follow the initial smoke run.
    '''

    # Timer whose step() calls label each stage of the run.
    timer = doctable.Timer(logfile='logs/parallel_primefinder.log')

    timer.step('making elements')

    # Smoke run: map TestWorker over three elements with a 24-process pool.
    elements = list(range(3))
    with multiprocessing.Pool(24) as p:
        out = p.map(TestWorker, elements)

    print(out)

    # NOTE(review): exit() stops the interpreter here, so everything below
    # this line is currently unreachable. Presumably a leftover debugging
    # stop -- confirm before removing it.
    exit()

    # --- workload selection (only the branch with a True condition runs) ---
    if False:
        # Disabled: shuffled range(n) fed to find_prime_long.
        test_func = find_prime_long
        elements = list(range(n))
        random.shuffle(elements)

    elif False:
        # Disabled: ten arrays of ones of growing length, first entry zeroed.
        test_func = array_test

        import numpy as np
        elements = [np.ones((int(5e7) * (i + 1), )) for i in range(10)]
        for a in elements:
            a[0] = 0
        print(len(elements), elements[0].shape)

    elif True:
        # Enabled: shuffled range(n) fed to timed_step.
        test_func = timed_step
        elements = list(range(n))
        random.shuffle(elements)

    timer.step('check ram')

    if False:
        # Disabled: single-process baseline using the builtin map.
        timer.step('single-core eval')
        prime_single = list(map(test_func, elements))

    # Reference result from the standard-library pool.
    timer.step('multiprocessing.Pool')
    with multiprocessing.Pool(24) as p:
        prime_multi = p.map(test_func, elements)

    if False:
        # Disabled: other multiprocessing.Pool entry points (6 processes).
        timer.step('map_async')
        with multiprocessing.Pool(6) as p:
            prime_async = list(p.map_async(test_func, elements).get())

        timer.step('imap')
        with multiprocessing.Pool(6) as p:
            prime_imap = list(p.imap(test_func, elements, 100))

        timer.step('imap_unordered')
        with multiprocessing.Pool(6) as p:
            prime_unordered = list(p.imap_unordered(test_func, elements, 100))

    if False:
        # Disabled: doctable.Distribute.map_chunk variant.
        timer.step('doctable.Distribute')
        with doctable.Distribute(24) as d:
            prime_distribute = d.map_chunk(test_func, elements)

    # Result under test; compared with prime_multi in the assert below.
    timer.step('doctable.WorkerPool')
    with doctable.WorkerPool(24) as p:
        prime_async = p.map(test_func, elements)
        print(f'av efficiency: {p.av_efficiency()}')

    timer.step('annnnndddd time!')

    # WorkerPool output must equal the multiprocessing.Pool output.
    assert (prime_multi == prime_async)

    timer.step('done')
Exemplo n.º 4
0
                        tripdb.insert(
                            {
                                'subject':
                                subj.tok if subj is not None else None,
                                'verb': verb.tok,
                                'object': obj.tok if obj is not None else None,
                                'docid': docid,
                                'gutenid': gid,
                                'language': lang,
                            },
                            ifnotunique='replace')


if __name__ == '__main__':
    # Database files; both paths are also passed on to map_chunk below.
    docdb_fname = 'db/gutenberg_24.db'
    tripdb_fname = 'db/actornet1.db'

    gutendb = GutenDocsDB(docdb_fname)
    tripdb = SubjVerbObjDB(tripdb_fname)

    # Work list: gutenids selected from the docs db that are not yet among
    # the gutenids selected from the triplet db.
    all_ids = set(gutendb.select('gutenid'))
    finished_ids = set(tripdb.select('gutenid'))
    ids = list(all_ids.difference(finished_ids))

    print('parsing {} docs (of {} total)'.format(len(ids), len(all_ids)))

    # 50 workers requested, with the max-core limit explicitly overridden.
    with doctable.Distribute(50, override_maxcores=True) as distributor:
        distributor.map_chunk(
            insert_triplets, ids, docdb_fname, tripdb_fname)

    print('finished')
Exemplo n.º 5
0
if __name__ == '__main__':
    # Author's measurements -- presumably list length (millions of ints)
    # versus observed memory use; TODO confirm.
    # 10 mil / 512 MB
    # 20 mil / 900 MB
    # 30 mil / 1285 MB
    # 40 mil / 1672 MB
    # 50 mil / 20158 MB
    # NOTE(review): 20158 MB breaks the trend of the rows above; possibly a
    # typo -- confirm.
    with doctable.Timer('Making big array'):
        m = [i for i in range(30000000)]
    # Pause for 20 s -- presumably to inspect memory use externally; confirm.
    time.sleep(20)

    # Create an actor from this class
    print('the ray way')
    # m is rebound to whatever ray.put returns, not the original list.
    m = ray.put(m)
    # Five Counter actors, each constructed with m.
    counters = [Counter.remote(m) for _ in range(5)]
    print(ray.get([c.shape.remote() for c in counters]))
    del m
    del counters
    # NOTE(review): exit() ends the script here, so the doctable comparison
    # below never runs. If it is re-enabled, note that m was deleted above
    # and m.copy() would raise NameError.
    exit()

    # alternatively, use doctable
    print('old school way')
    mats = [m.copy() for _ in range(5)]
    with doctable.Distribute(1) as d:
        shapes = d.map_chunk(test_thread, mats)
    print(shapes)

    #counter = Counter.remote()
    #counter.increment.remote(10)
    #print(ray.get(counter.read.remote()))