def format_to_bert(args):
    for corpus_type in ["train", "test"]:
        a_lst = []
        for json_f in glob.glob(pjoin(args.raw_path, "*" + corpus_type + "*.json")):
            real_name = json_f.split("/")[-1]
            a_lst.append(
                (
                    corpus_type,
                    json_f,
                    args,
                    pjoin(args.save_path, real_name.replace("json", "bert.pt")),
                )
            )
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()
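Every format_to_bert variant in this listing follows the same shape: collect one argument tuple per JSON shard, then fan the tuples out with Pool.imap. The worker _format_to_bert itself is not part of these snippets; the stub below is a hypothetical stand-in that only shows the expected call signature (the real worker tokenizes each shard with a BERT tokenizer and writes the .bert.pt file):

# Hypothetical stand-in for the worker consumed by pool.imap above;
# the real _format_to_bert is defined elsewhere in these projects.
def _format_to_bert(params):
    # One task tuple, as built in the loop above.
    corpus_type, json_f, args, save_file = params
    print('would convert {} -> {} ({})'.format(json_f, save_file, corpus_type))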
Example no. 2
def format_to_bert(args):
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']
    for corpus_type in datasets:
        a_lst = []
        for json_f in glob.glob(
                pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            real_name = json_f.split('/')[-1]
            a_lst.append((json_f, args,
                          pjoin(args.save_path,
                                real_name.replace('json', 'bert.pt'))))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()
Example no. 3
def main():
    config = {
        "data_level": 0.5,
        "tolerance": 1.0,
        "seg_threshold": 0.5,
        "min_area": 10
    }
    # --- Process args --- #
    args = get_args()

    pool = Pool()
    list(
        tqdm(pool.imap(
            partial(run_one,
                    out_dirpath=args.out_dirpath,
                    config=config,
                    im_dirpath=args.im_dirpath,
                    out_ext=args.out_ext,
                    bbox=args.bbox), args.seg_filepath),
             desc="Simple poly.",
             total=len(args.seg_filepath)))
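Two details make this pattern work: functools.partial pre-binds the keyword arguments, so each worker call receives a single positional argument (one element of args.seg_filepath), which is the call shape imap expects; and wrapping the iterator in list(tqdm(...)) forces the lazy imap iterator to be fully drained while the progress bar advances.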
Example no. 4
def format_to_bert_w_scores(args):
    os.makedirs(args.save_path, exist_ok=True)
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']
    for corpus_type in datasets:
        a_lst = []
        for json_f in glob.glob(
                pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            real_name = os.path.basename(json_f)
            a_lst.append((json_f, args,
                          pjoin(args.save_path,
                                real_name.replace('json', 'sent_score.pt'))))
        logger.info(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert_w_scores, a_lst):
            pass

        pool.close()
        pool.join()
Example no. 5
def run_fit_two_poolable(outfile, cores, tasks, two_pulse_fit, pulse_params,
                         height_th, sigma0):
    """
    Append new pandas row to an outfile.
    """
    p = Pool(cores)
    file_exists = os.path.isfile(outfile)
    with open(outfile, 'w+') as f:
        if not file_exists:
            """create header"""
            df = p.map(
                lambda f: fit_two_poolable(f, two_pulse_fit, pulse_params,
                                           height_th, sigma0), tasks[0:1])
            df[0].to_csv(f, header=True)
        for df in p.imap(
                lambda f: fit_two_poolable(f, two_pulse_fit, pulse_params,
                                           height_th, sigma0), tasks):
            try:
                df.to_csv(f, header=False)
            except:
                pass
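A portability caveat for this example: the standard library's multiprocessing.Pool serializes tasks with pickle, which cannot handle lambdas, so the calls above only run under a dill-based fork such as multiprocess or pathos. With stock multiprocessing the usual workaround is a module-level function bound with functools.partial. A minimal sketch, reusing the (assumed) names from the snippet above:

from functools import partial
from multiprocessing import Pool

def _fit_one(task, two_pulse_fit, pulse_params, height_th, sigma0):
    # Module-level functions, unlike lambdas, are picklable.
    return fit_two_poolable(task, two_pulse_fit, pulse_params,
                            height_th, sigma0)

worker = partial(_fit_one, two_pulse_fit=two_pulse_fit,
                 pulse_params=pulse_params,
                 height_th=height_th, sigma0=sigma0)
with open(outfile, 'a') as out, Pool(cores) as p:
    for df in p.imap(worker, tasks):
        df.to_csv(out, header=False)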
Example no. 6
def format_to_bert(args):
    """ Transforms words to ids with BERT tokenizer. """

    # Create folders
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']

    # Multiprocessing for _format_to_bert()
    for corpus_type in datasets:
        if not args.debug:
            a_lst = []
            for json_f in glob.glob(
                    pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
                real_name = json_f.split('/')[-1]
                a_lst.append((corpus_type, json_f, args,
                              pjoin(args.save_path,
                                    real_name.replace('json', 'bert.pt'))))
            print("Processing {} dataset...".format(corpus_type))
            pool = Pool(args.n_cpus)
            for d in pool.imap(_format_to_bert, a_lst):
                pass

            pool.close()
            pool.join()
        else:
            # NOTE: debug without multiprocessing
            print("Processing {} dataset...".format(corpus_type))
            for json_f in glob.glob(
                    pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
                real_name = json_f.split('/')[-1]
                _format_to_bert((corpus_type, json_f, args,
                                 pjoin(args.save_path,
                                       real_name.replace('json', 'bert.pt'))))
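The serial else-branch is a useful debugging pattern: an exception raised inside a Pool worker is pickled back to the parent and re-raised only when the imap iterator is advanced, with a traceback routed through the pool machinery, whereas calling _format_to_bert directly in-process produces an ordinary, steppable stack trace.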
Example no. 7
def format_to_bert(args):
    """ 
    Function to create dataset in bert format --- main function is _format_to_bert which create gold summaries using greedy_selection
    """
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']
    for corpus_type in datasets:
        a_lst = []
        for json_f in glob.glob(
                pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            real_name = json_f.split('/')[-1]
            a_lst.append((corpus_type, json_f, args,
                          pjoin(args.save_path,
                                real_name.replace('json', 'bert.pt'))))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()
Example no. 8
def format_xlnet(args):
    if args.dataset != '':
        data_type = [args.dataset]
    else:
        data_type = ['train', 'valid', 'test']

    for corpus_type in data_type:
        a_lst = []
        for json_f in glob.glob(
                join(args.json_path, '*' + corpus_type + '.*.json')):
            real_name = os.path.basename(json_f)
            print(real_name)
            a_lst.append((json_f, args,
                          join(args.save_path,
                               real_name.replace('json', 'xlnet.pt'))))
        print(a_lst)

        pool = Pool(args.n_cpus)
        for _ in pool.imap(_format_xlnet, a_lst):
            pass

        pool.close()
        pool.join()
Example no. 9

def proc(i, json, day, pd, time):
    url = "https://coinmarketcap.com/currencies/" + json[i][
        "slug"] + "/historical-data/?start=20130428&end=" + day
    try:
        r = pd.read_html(url)[0]
    except Exception:
        # Retry once after a pause in case the request was throttled.
        time.sleep(10)
        r = pd.read_html(url)[0]
    r["Name"] = json[i]["name"]
    r["Symbol"] = json[i]["symbol"]
    return r


def calculate(args):
    return args[0](*args[1])
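
calculate is the classic star-dispatch helper: imap delivers exactly one picklable object per task, so packing (function, argument_list) tuples and unpacking them with args[0](*args[1]) lets a single top-level callable dispatch arbitrary work. The calculatestar helper in the test() example further down plays the same trick.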


num_tasks = len(json)
pool = Pool(processes=cpu_count())
results = pool.imap(calculate, [(proc, [i, json, day, pd, time])
                                for i in range(num_tasks)])
la = []
for i, r in enumerate(results):
    la.append(r)
    sys.stderr.write('\rdone {0:%}'.format((i + 1) / num_tasks))
da = pd.concat(la)

da.replace("-", 0, inplace=True)
da.to_csv("Put your address here", index=False, encoding="utf-8")
Example no. 10
from multiprocess import Pool


def say(s):
    print(s)


pool = Pool(1)
for d in pool.imap(say, 'hello'):
    pass

pool.close()
pool.join()
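Note that imap accepts any iterable, so the string 'hello' is consumed one character at a time and say is called once per letter, printing h, e, l, l, o on separate lines.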
Example no. 11
def test():
    print('cpu_count() = %d\n' % cpu_count())

    #
    # Create pool
    #

    PROCESSES = 4
    print('Creating pool with %d processes\n' % PROCESSES)
    pool = Pool(PROCESSES)

    #
    # Tests
    #

    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    results = [pool.apply_async(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imap_unordered(calculatestar, TASKS)

    print('Ordered results using pool.apply_async():')
    for r in results:
        print('\t', r.get())
    print()

    print('Ordered results using pool.imap():')
    for x in imap_it:
        print('\t', x)
    print()

    print('Unordered results using pool.imap_unordered():')
    for x in imap_unordered_it:
        print('\t', x)
    print()

    print('Ordered results using pool.map() --- will block till complete:')
    for x in pool.map(calculatestar, TASKS):
        print('\t', x)
    print()

    #
    # Simple benchmarks
    #

    N = 100000
    print('def pow3(x): return x**3')

    t = time.time()
    A = list(map(pow3, range(N)))
    print('\tmap(pow3, range(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))

    t = time.time()
    B = pool.map(pow3, range(N))
    print('\tpool.map(pow3, range(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))

    t = time.time()
    C = list(pool.imap(pow3, range(N), chunksize=N // 8))
    print('\tlist(pool.imap(pow3, range(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t))

    assert A == B == C, (len(A), len(B), len(C))
    print()

    L = [None] * 1000000
    print('def noop(x): pass')
    print('L = [None] * 1000000')

    t = time.time()
    A = list(map(noop, L))
    print('\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))

    t = time.time()
    B = pool.map(noop, L)
    print('\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L) // 8))
    print('\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t))

    assert A == B == C, (len(A), len(B), len(C))
    print()

    del A, B, C, L

    #
    # Test error handling
    #

    print('Testing error handling:')

    try:
        print(pool.apply(f, (5, )))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.apply()')
    else:
        raise AssertionError('expected ZeroDivisionError')

    try:
        print(pool.map(f, range(10)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.map()')
    else:
        raise AssertionError('expected ZeroDivisionError')

    try:
        print(list(pool.imap(f, range(10))))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from list(pool.imap())')
    else:
        raise AssertionError('expected ZeroDivisionError')

    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError('expected ZeroDivisionError')

    assert i == 9
    print('\tGot ZeroDivisionError as expected from IMapIterator.next()')
    print()

    #
    # Testing timeouts
    #

    print('Testing ApplyResult.get() with timeout:', end='')
    res = pool.apply_async(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()

    print('Testing IMapIterator.next() with timeout:', end='')
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()

    #
    # Testing callback
    #

    print('Testing callback:')

    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

    r = pool.apply_async(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.map_async(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print('\tcallbacks succeeded\n')
    else:
        print('\t*** callbacks failed\n\t\t%s != %s\n' % (A, B))

    #
    # Check there are no outstanding tasks
    #

    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print('Testing close():')

    for worker in pool._pool:
        assert worker.is_alive()

    result = pool.apply_async(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tclose() succeeded\n')

    #
    # Check terminate() method
    #

    print('Testing terminate():')

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tterminate() succeeded\n')

    #
    # Check garbage collection
    #

    print('Testing garbage collection:')

    pool = Pool(2)
    processes = pool._pool

    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]

    del results, pool

    time.sleep(0.2)

    for worker in processes:
        assert not worker.is_alive()

    print('\tgarbage collection succeeded\n')
Example no. 12
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)


# Read corpus and extract key phrases.
def worker(text):
    doc = nlp(text)
    phrases = [p.text for p in doc._.phrases]
    num_words = len(doc)
    return phrases, num_words


p = Pool(options.nproc)
total_words = 0
vocab = collections.Counter()
for phrases, num_words in tqdm(p.imap(worker, corpus)):
    # Note: this count includes punctuation as well as words.
    total_words += num_words

    # examine the top-ranked phrases in the document
    seen = 0
    for phrase in phrases:
        if len(phrase.split()) == 1:
            continue
        # phrases are plain strings here; worker() already pulled .text
        vocab[phrase] += 1
        seen += 1
        if options.maxphrases_per_doc > 0 and seen == options.maxphrases_per_doc:
            break

for k in sorted(vocab.keys()):
    print(k, vocab[k])
Example no. 13
                            == 'abstractive')):
                    with open(
                            os.path.join(args.output_path, split_name,
                                         str(file_id) + '.txt'),
                            'w') as out_file:
                        # Write article
                        art = data[contents[0]].numpy().decode('utf-8')
                        out_file.write(art)

                        # Write summary
                        summ = data[contents[1]].numpy().decode('utf-8')
                        out_file.write('\n@highlight\n')
                        out_file.write(summ)

    # Sentence split for dataset
    if args.mode == 'ssplit':
        for split_name, data_num in zip(split_names, data_nums):
            files = os.listdir(os.path.join(args.output_path, split_name))
            file_list = [
                os.path.join(args.output_path, split_name, f) for f in files
            ]
            parallel_func = partial(_reseperate_sentence, args.ssplit_target)
            pool = Pool(args.cpu_num)
            for d in tqdm(pool.imap(parallel_func, file_list),
                          total=data_num,
                          desc=split_name,
                          unit=' file'):
                pass
            pool.close()
            pool.join()
Example no. 14
def format_to_bert(args):
    test_kws = pd.read_csv('csv_files/train_papers_sect8.csv')

    kws = {
        'intro': [kw.strip() for kw in test_kws['intro'].dropna()],
        'related': [kw.strip() for kw in test_kws['related work'].dropna()],
        'exp': [kw.strip() for kw in test_kws['experiments'].dropna()],
        'res': [kw.strip() for kw in test_kws['results'].dropna()],
        'conclusion': [kw.strip() for kw in test_kws['conclusion'].dropna()]
    }

    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['test']


    if len(args.sent_numbers_file) > 0:
        sent_numbers = pickle.load(open(args.sent_numbers_file, "rb"))
    else:
        sent_numbers = None

    # ARXIVIZATION
    bart = args.bart
    check_path_existence(args.save_path)
    for corpus_type in datasets:
        a_lst = []
        c = 0
        for json_f in glob.glob(pjoin(args.raw_path, corpus_type + '.*.json')):
            real_name = json_f.split('/')[-1]
            c += 1
            a_lst.append(
                (corpus_type, json_f, args, pjoin(args.save_path, real_name.replace('json', 'bert.pt')), kws, bart,
                 sent_numbers, 1))
        print("Number of files: " + str(c))

        ##########################
        ###### <DEBUGGING> #######
        ##########################

        # for a in a_lst:
        #     _format_to_bert(a)


        # single
        # json_f = args.raw_path + '/train.6.json'
        # _format_to_bert(('val', str(json_f), args, pjoin(args.save_path, str(json_f).replace('json', 'bert.pt')), kws, bart,
        #          sent_numbers, 25))

        ##########################
        ###### </DEBUGGING> ######
        ##########################

        pool = Pool(args.n_cpus)
        print('Processing {} set with {} json files...'.format(corpus_type, len(a_lst)))
        all_papers_count = 0
        all_paper_ids = {}
        for d in tqdm(pool.imap(_format_to_bert, a_lst), total=len(a_lst), desc=''):
            all_paper_ids[d[0]] = d[1]
            all_papers_count += d[2]

        pool.close()
        pool.join()
Example no. 15

    def runInParallel(*proc):
        # Start every process before joining any of them; calling
        # start() and join() in the same loop would run the
        # processes one after another.
        for p in proc:
            p.start()
        for p in proc:
            p.join()
        return

    # thread1.start()
    # thread1.join()

    # thread2.start()
    # thread1.join()

    # multiprocessing.Process objects are not picklable, so handing
    # them to pool.imap cannot work; run the helper in-process instead.
    runInParallel(p1, p2)

    print("done with parallel")
    print("data loaded")

# def runInParallel(*funcs):
# 	proc = []
# 	for fn in funcs:
# 		p = Process(target=fn[0], args=fn[1])
# 		p.start()
# 		proc.append(p)
# 	for p in proc:
# 		p.join()
# dill.dump_session("sample_pregen_dat.out")
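For reference, here is a working version of the commented-out Process helper above; the point is to start every process before joining any of them (the (fn, args) pairs follow the commented snippet and are otherwise hypothetical):

from multiprocessing import Process

def run_in_parallel(*funcs):
    # funcs: (callable, args_tuple) pairs, as in the commented
    # snippet above. Start everything first, then join; calling
    # start() and join() inside one loop would serialize the work.
    procs = [Process(target=fn, args=args) for fn, args in funcs]
    for p in procs:
        p.start()
    for p in procs:
        p.join()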