Example #1
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 18:28:37 2019

@author: Guanglin Kuang
"""

from multiprocessing import Pool
import time 

def doubler(number):
    return number * 2
 
if __name__ == '__main__':
    numbers = range(50000000)
    start_time = time.time()
    
    pool = Pool(processes=8)
    result = pool.map(doubler, numbers)
    
    end_time = time.time()
    
    print("Parallel time: {:.2f} seconds".format(end_time - start_time))
Example #2
pid_array = loadtxt(data_path + 'samples_100_order_pids.txt.gz')
pid_array = pid_array.flatten()
pid_array = pid_array.astype(np.int64)

test_list_pid = pid_array*1

gc.collect()

I_song = array(train_list_song_co_occur.sum(axis=1)).flatten()
I_list = array(train_list_song_co_occur.sum(axis=0)).flatten()

alpha = 0.9
beta = 0.9

print("Predictions begin:")

start = time()
pool = Pool(20)
results = pool.map(cf_predict_evaluation_slim, [index for index in test_list_pid])
pool.close()
pool.join()
print('Time taken:', time() - start)

prediction_result = zeros((len(test_list_pid),max_n_predictions+1))
prediction_result[:,0] = test_list_pid

for i in range(len(results)):
    prediction_result[i,range(1,501)] = results[i]

savetxt('../submit/cf2submit_v9.txt', prediction_result, delimiter=',')
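The worker cf_predict_evaluation_slim used above depends on module-level state (the co-occurrence matrices, alpha and beta) that each forked worker inherits implicitly. A hedged sketch of making that dependence explicit with a Pool initializer; predict_one and the toy dictionary are hypothetical stand-ins, not the original code.

from multiprocessing import Pool

_shared = {}

def _init_worker(co_occur, alpha, beta):
    # Runs once in every worker process and stores read-only state for later calls.
    _shared['co_occur'] = co_occur
    _shared['alpha'] = alpha
    _shared['beta'] = beta

def predict_one(pid):
    # Hypothetical prediction: look the playlist id up in the shared structure.
    return _shared['co_occur'].get(pid, [])

if __name__ == '__main__':
    co_occur = {0: [1, 2], 1: [3]}  # hypothetical toy data
    with Pool(4, initializer=_init_worker, initargs=(co_occur, 0.9, 0.9)) as pool:
        results = pool.map(predict_one, [0, 1])
    print(results)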
Example #3
for i1, task in enumerate(filter(isdir, os.listdir(base_path))):
    models_path = os.path.join(base_path, task)
    make_title(i1, task)

    i2 = 0
    for model in os.listdir(models_path):
        frameworks_path = os.path.join(models_path, model)

        if os.path.isdir(frameworks_path):
            for script_name in os.listdir(frameworks_path):
                download_script = os.path.join(frameworks_path, script_name)

                if download_script.endswith('download_dataset.sh'):
                    if args.sequential:
                        run_script(download_script)
                    else:
                        scripts.append(download_script)


if not args.sequential:
    from multiprocessing import Pool, cpu_count

    pool = Pool(cpu_count())
    pool.map(run_script, scripts)


data = f'Total Time {time.time() - start_all}'
print(data)
experiment.write(data)
experiment.close()
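run_script is defined elsewhere in the original project; a minimal sketch of one plausible implementation (an assumption, not the original helper) using the standard library:

import subprocess

def run_script(path):
    # Run a download_dataset.sh script and raise if it exits with a non-zero status.
    subprocess.run(['bash', path], check=True)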
Example #4
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_file', required=True)
    parser.add_argument('--save_dir', required=True)
    parser.add_argument('--mode', type=str, default='train')  # 'train'
    parser.add_argument('--ncpu', type=int, default=8)
    parser.add_argument('--split_len', type=int, default=10000)  # the surplus will be evenly allocated to each split
    args = parser.parse_args()

    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)

    if args.mode == 'train':
        # dataset contains molecule pairs
        with open(args.data_file) as f:
            data = f.readlines()[1:]

        with Pool(args.ncpu) as pool:
            data = pool.map(process, data)
        # data = [process(item) for item in data]

        num_splits = len(data) // args.split_len
        num_splits = 1 if num_splits == 0 else num_splits
        le = (len(data) + num_splits - 1) // num_splits
        for split_id in range(num_splits):
            st = split_id * le
            sub_data = data[st: st + le]

            with open(os.path.join(args.save_dir, f'tensors-{split_id}.pkl'), 'wb') as f:
                pickle.dump(sub_data, f, pickle.HIGHEST_PROTOCOL)
Example #5
def validator(proxy, q):
    'Validate proxy availability by connecting to Baidu through the proxy'
    url = 'https://www.baidu.com'
    proxies = {
        'http': proxy,
        'https': proxy,
    }
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        if (r.status_code == requests.codes.ok):
            print('valid proxy: ' + proxy)
            q.append(proxy)
    except Exception as e:
        print('error:', proxy, e)
    time.sleep(0.5)


if __name__ == '__main__':
    num = int(input('High-anonymity proxies, 100 per page; enter the number of pages to crawl: '))
    getXiCi(num)

    m = Manager()
    q = m.list()
    p = Pool(len(proxyList) // 2)
    for proxy in proxyList:
        p.apply_async(validator, args=(proxy, q))
    p.close()
    p.join()
    print('Available proxies:')
    print(q)
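An alternative sketch (not the original code): apply_async already returns AsyncResult handles, so the valid proxies can be collected without a Manager list. The validator below is a trivial stand-in for the real network check.

from multiprocessing import Pool

def check(proxy):
    # Stand-in validator: return the proxy on success, None on failure.
    return proxy if proxy.startswith('http') else None

if __name__ == '__main__':
    proxy_list = ['http://1.2.3.4:80', 'not-a-proxy']  # hypothetical input
    with Pool(2) as p:
        handles = [p.apply_async(check, (proxy,)) for proxy in proxy_list]
        checked = [h.get() for h in handles]
    print([proxy for proxy in checked if proxy is not None])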
Example #6
def squad_questions_to_metas(self, pretense, file):
    p = Pool(40)
    queries = pickle.load(open(file, 'rb'))
    queries_meta = p.starmap(Metatext, zip([len(q.question.split()) for q in queries], [q.question for q in queries], [q.paragraph_num for q in queries]))
    pickle.dump([queries_meta], open('../../pickled_squad/' + pretense + 'queries_meta.pickle', 'wb'))
    return queries_meta
Example #7
results = []
fullmeasuredPeriod = []
fullPeriod = []
fullPower = []
fullSigLevel = []
fullMag = []
MagRangearray = np.linspace(17,24,maglength)
MagRange = [x for x in MagRangearray]
maglist = []
for x in range(len(MagRange)):
    maglist.append([MagRange[x]]*7)


newlist = Magnitudes.mag1929

pool = Pool(processors)
for h in range(startnumber,endnumber):
    print(newlist[h])
    results.append(pool.map(partial(lombScargle, objectmag=newlist[h]),FrangeLoop))
    
    twoDlist = [[],[],[],[],[],[]]
    for X in range(len(results)):
        for Y in range(len(results[X])):
            twoDlist[0].append(results[X][Y][0])
            twoDlist[1].append(results[X][Y][1])
            twoDlist[2].append(results[X][Y][2])
            twoDlist[3].append(results[X][Y][3])
            twoDlist[4].append(results[X][Y][4])
            twoDlist[5].append(results[X][Y][5])
    with open(inFile, 'r') as istr:
        with open(outFile,'w') as ostr:
Example #8
assert len(val_idx) + len(test_idx) + len(train_idx) == 30000
assert len(val_idx) == 2993
assert len(test_idx) == 2824
assert len(train_idx) == 24183


def process(item):
    count, (d, idx, x) = item
    copyfile(os.path.join(s_label,
                          str(idx) + '.png'),
             os.path.join(d,
                          str(count) + '.png'))
    resize_and_write(idx, d, count)


with Pool(processes=os.cpu_count() // 2) as pool:
    for i in tqdm.tqdm(pool.imap_unordered(process, val_idx),
                       'val',
                       total=len(val_idx)):
        pass
    for i in tqdm.tqdm(pool.imap_unordered(process, test_idx),
                       'test',
                       total=len(test_idx)):
        pass
    for i in tqdm.tqdm(pool.imap_unordered(process, train_idx),
                       'train',
                       total=len(train_idx)):
        pass

# def process_one_line(idx, x):
# x = int(x)
Example #9
    # read all MPdata
    MPdata_all = pd.read_csv("./MPdata_all/MPdata_all.csv", sep=';', header=0, index_col=None)
    
    # show statistics of original data
    if False:
        print('show statistics of original data')
        check_properties(data=MPdata_all)
    else:
        print('size of original data:', MPdata_all.shape[0])

    # check crystal symmetry 
    if False:
        print('\nchecking crystal symmetry match on all {} data'.format(MPdata_all.shape[0]))
        sym_thresh = 0.1
        nworkers = multiprocessing.cpu_count()
        pool_Xsys = Pool(processes=nworkers)
        df_split = np.array_split(MPdata_all, nworkers)
        args = [(data, sym_thresh) for data in df_split]
        MPdata_all = pd.concat(pool_Xsys.starmap(check_crystal_system, args), axis=0)
        pool_Xsys.close()
        pool_Xsys.join()
        print('size of data with matched crystal symmetry:', MPdata_all.shape[0])
    else:
        print('\nskip checking..')
        drop_ids = ['mp-18828', 'mp-12843', 'mp-20811']
        MPdata_all = MPdata_all[~MPdata_all['material_id'].isin(drop_ids)]
        print('size of data with matched crystal system:', MPdata_all.shape[0])

    # make dataset directory
    root_dir = './datasets/'
    if not os.path.exists(root_dir):
Example #10
def main():
    print('\n******************************')
    print(' MultiFit v.' + defPar.version)
    print('******************************')

    try:
        opts, args = getopt.getopt(sys.argv[1:], "bftmitph:", [
            "batch", "file", "type", "map", "input-par", "test", "plot", "help"
        ])
    except getopt.GetoptError:
        # print help information and exit:
        usage()
        sys.exit(2)

    # If parameter file not present, make one
    if not exists(defPar.inputParFile):
        print('\n Init parameter not found. Generating a new one...')
        genInitPar()

    # If summary file is not present, make it and fill header
    makeHeaderSummary()

    if (defPar.multiproc == True):
        print('\n Multiprocessing enabled: ' + str(defPar.numProc) + '/' +
              str(mp.cpu_count()) + ' CPUs\n')
    else:
        print('\n Multiprocessing disabled\n')
    for o, a in opts:
        if o in ("-b", "--batch"):
            try:
                type = sys.argv[2]
            except:
                usage()
                sys.exit(2)
            type = int(sys.argv[2])
            i = 0
            if (defPar.multiproc == True):
                p = Pool(defPar.numProc)
                for f in glob.glob('*.txt'):
                    if (f != 'summary.txt'):
                        rs = readSingleSpectra(f)
                        p.apply_async(calculate,
                                      args=(rs.x, rs.y, '0', '0', f, type,
                                            False, False, i))
                        i += 1
                p.close()
                p.join()
            else:
                for f in glob.glob('*.txt'):
                    if (f != 'summary.txt'):
                        rs = readSingleSpectra(f)
                        calculate(rs.x, rs.y, '0', '0', f, type, False, False,
                                  i)
                        i += 1
            addBlankLine(defPar.summary)

        elif o in ("-f", "--file"):
            try:
                type = sys.argv[3]
            except:
                usage()
                sys.exit(2)
            file = str(sys.argv[2])
            type = int(sys.argv[3])
            rs = readSingleSpectra(file)
            calculate(rs.x, rs.y, '0', '0', file, type, False, True, '')

        elif o in ("-p", "--plot"):
            if (len(sys.argv) < 3):
                if (defPar.multiproc == True):
                    p = Pool(defPar.numProc)
                    for f in glob.glob('*.txt'):
                        if (f != 'summary.txt'):
                            rs = readSingleSpectra(f)
                            print("Saving plot for: " + f)
                            p.apply_async(plotData,
                                          args=(rs.x, rs.y, f, False))
                    p.close()
                    p.join()
                else:
                    for f in glob.glob('*.txt'):
                        if (f != 'summary.txt'):
                            rs = readSingleSpectra(f)
                            print("Saving plot for: " + f)
                            plotData(rs.x, rs.y, f, False)
            else:
                file = str(sys.argv[2])
                rs = readSingleSpectra(file)
                plotData(rs.x, rs.y, file, True)

        elif o in ("-m", "--map"):
            try:
                type = sys.argv[3]
            except:
                usage()
                sys.exit(2)

            file = str(sys.argv[2])
            type = int(sys.argv[3])
            rm = readMap(file)
            map = Map()
            i = 0
            if (defPar.multiproc == True):
                p = Pool(defPar.numProc)
                for i in range(1, rm.num_lines):
                    p.apply_async(calculate,
                                  args=(rm.x, rm.y[i], rm.x1[i], rm.y1[i],
                                        file, type, True, False, i))
                p.close()
                p.join()
            #map.draw(os.path.splitext(file)[0] + '_map.txt', True)

            else:
                for i in range(1, rm.num_lines):
                    calculate(rm.x, rm.y[i], rm.x1[i], rm.y1[i], file, type,
                              True, False, i)
                #map.draw(os.path.splitext(file)[0] + '_map.txt', True)

        elif o in ("-t", "--test"):
            file = str(sys.argv[2])
            map = Map()
            #map.readCoord(os.path.splitext(file)[0] + '_map.txt')
            map.draw(os.path.splitext(file)[0] + '_map.txt', True)

        elif o in ("-i", "--input-par"):
            genInitPar()

        else:
            usage()
            sys.exit(2)
Example #11
from multiprocessing import Pool
import time


def sayHi(num):
    num += 1
    print "process %d start" % num
    time.sleep(10 / num)
    print "process %d end" % num
    return num * num


p = Pool(processes=5)

result_list = []
for i in range(30):
    result_list.append(p.apply_async(sayHi, [i]))

# p.close()
# p.join()
id = 0
for res in result_list:
    print "wait %d" % id
    print res.get()
    id += 1
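# Note: each res.get() above blocks until its task has finished, so the commented-out
# close()/join() calls are not needed just to collect the results; they only matter if
# you want to be certain that all worker processes have exited.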
Example #12
    # =========================================
    # Filtering II - Position wise filtering
    # =========================================

    pars_toprint = '-m{}-d{}-b{}-c{}-p{}'.format(int(args.m), int(args.d), int(args.b), int(args.c), float(args.p))
    filt_folder = args.projdir + '/filtered' + pars_toprint + '/'

    if not os.path.exists(filt_folder):
        os.makedirs(filt_folder)
        os.makedirs(filt_folder + '/pop/')
    else:
        shutil.rmtree(filt_folder)
        os.makedirs(filt_folder)
        os.makedirs(filt_folder + '/pop/')

    p = Pool(processes=args.n_threads)
    partial_Div = partial(filter_two,
                          args=args,
                          snp_files=glob.glob(args.projdir + '/snpCaller/called*'),
                          outdir=filt_folder + '/pop')
    p.map(partial_Div, samples_of_interest.keys())
    p.close()
    p.join()

    if args.ind:
        if not os.path.exists(filt_folder + '/ind/'):
            os.makedirs(filt_folder + '/ind/')
        p = Pool(processes=args.n_threads)
        partial_Div = partial(filter_two,
                              args=args,
                              snp_files=glob.glob(args.projdir + '/snpCaller/indiv*'),
Example #13
async def get_poi_information():
    """Retrieve POI information for an array of ids"""
    ids = await request.get_json()

    if len(ids) > 100:
        abort(400, description='You can send at most 100 ids at once.')

    pool = Pool(processes=math.ceil(len(ids) / 3))
    results = list()

    def parse_result(r):
        data = r['data'][6]
        name = get_nested_value(data, 11)
        place_id = get_nested_value(data, 78)
        lat = round(get_nested_value(data, 9, 2), 7)  # 7 digits equals a precision of 1 cm
        lng = round(get_nested_value(data, 9, 3), 7)  # 7 digits equals a precision of 1 cm
        # noinspection PyUnresolvedReferences
        h3_index = h3.geo_to_h3(lat, lng, POI_RESOLUTION)
        address = get_nested_value(data, 2)
        timezone = get_nested_value(data, 30)
        categories = [t[0] for t in (get_nested_value(data, 76) or [])]
        opening_hours = parse_opening_hours(get_nested_value(data, 34, 1))
        permanently_closed = get_nested_value(data, 88, 0) == 'CLOSED'
        temporarily_closed = get_nested_value(data, 96, 5, 0, 2) == 'Reopen this place' and not permanently_closed
        inside_of = get_nested_value(data, 93, 0, 0, 0, 1)
        phone = get_nested_value(data, 178, 0, 3)
        website = get_nested_value(data, 7, 0)
        rating_stars = get_nested_value(data, 4, 7)
        rating_number_of_reviews = get_nested_value(data, 4, 8)
        price_level = get_nested_value(data, 4, 2)
        popularity_data = get_nested_value(data, 84, 0)
        spending_time = parse_spending_time_data(get_nested_value(data, 117, 0))
        popularity, waiting_time = None, None

        if popularity_data:
            popularity, waiting_time = parse_popularity_data(popularity_data, timezone)

        return dict(
            id=r['id'],
            data=dict(
                name=name,
                placeID=place_id,
                location=dict(lat=lat, lng=lng),
                h3Index=h3_index,
                address=address,
                timezone=timezone,
                categories=categories,
                temporarilyClosed=temporarily_closed,
                permanentlyClosed=permanently_closed,
                insideOf=inside_of,
                contact=dict(phone=phone, website=website),
                openingHours=opening_hours,
                rating=dict(stars=rating_stars, numberOfReviews=rating_number_of_reviews),
                priceLevel=len(price_level) if price_level else None,
                popularity=popularity,
                waitingTime=waiting_time,
                spendingTime=spending_time
            )
        )

    for result in pool.imap(google.get_by_id, ids):
        results.append(parse_result(result))

    return jsonify({'success': True, 'data': results})
Example #14
    def update(self, corpus, chunks_as_numpy=False):
        """Train the model with new documents, by EM-iterating over `corpus` until the topics converge
        (or until the maximum number of allowed iterations is reached).

        Train the model with new documents, by EM-iterating over the corpus until the topics converge, or until
        the maximum number of allowed iterations is reached. `corpus` must be an iterable. The E step is distributed
        into the several processes.

        Notes
        -----
        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0].

        Parameters
        ----------
        corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
            Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`) used to update the
            model.
        chunks_as_numpy : bool
            Whether each chunk passed to the inference step should be a np.ndarray or not. Numpy can in some settings
            turn the term IDs into floats; these will be converted back into integers in inference, which incurs a
            performance hit. For distributed computing it may be desirable to keep the chunks as `numpy.ndarray`.

        """
        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            logger.warning("LdaMulticore.update() called with an empty corpus")
            return

        self.state.numdocs += lencorpus

        if not self.batch:
            updatetype = "online"
            updateafter = self.chunksize * self.workers
        else:
            updatetype = "batch"
            updateafter = lencorpus
        evalafter = min(lencorpus, (self.eval_every or 0) * updateafter)

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info(
            "running %s LDA training, %s topics, %i passes over the supplied corpus of %i documents, "
            "updating every %i documents, evaluating every ~%i documents, "
            "iterating %ix with a convergence threshold of %f",
            updatetype, self.num_topics, self.passes, lencorpus, updateafter,
            evalafter, self.iterations, self.gamma_threshold
        )

        if updates_per_pass * self.passes < 10:
            logger.warning(
                "too few updates, training might not converge; "
                "consider increasing the number of passes or iterations to improve accuracy"
            )

        job_queue = Queue(maxsize=2 * self.workers)
        result_queue = Queue()

        # rho is the "speed" of updating; TODO try other fncs
        # pass_ + num_updates handles increasing the starting t for each pass,
        # while allowing it to "reset" on the first pass of each update
        def rho():
            return pow(self.offset + pass_ + (self.num_updates / self.chunksize), -self.decay)

        logger.info("training LDA model using %i processes", self.workers)
        pool = Pool(self.workers, worker_e_step, (job_queue, result_queue,))
        for pass_ in xrange(self.passes):
            queue_size, reallen = [0], 0
            other = LdaState(self.eta, self.state.sstats.shape)

            def process_result_queue(force=False):
                """
                Clear the result queue, merging all intermediate results, and update the
                LDA model if necessary.

                """
                merged_new = False
                while not result_queue.empty():
                    other.merge(result_queue.get())
                    queue_size[0] -= 1
                    merged_new = True
                if (force and merged_new and queue_size[0] == 0) or (not self.batch and (other.numdocs >= updateafter)):
                    self.do_mstep(rho(), other, pass_ > 0)
                    other.reset()
                    if self.eval_every is not None and \
                            ((force and queue_size[0] == 0) or
                                 (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)):
                        self.log_perplexity(chunk, total_docs=lencorpus)

            chunk_stream = utils.grouper(corpus, self.chunksize, as_numpy=chunks_as_numpy)
            for chunk_no, chunk in enumerate(chunk_stream):
                reallen += len(chunk)  # keep track of how many documents we've processed so far

                # put the chunk into the workers' input job queue
                chunk_put = False
                while not chunk_put:
                    try:
                        job_queue.put((chunk_no, chunk, self), block=False, timeout=0.1)
                        chunk_put = True
                        queue_size[0] += 1
                        logger.info(
                            "PROGRESS: pass %i, dispatched chunk #%i = documents up to #%i/%i, "
                            "outstanding queue size %i",
                            pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0]
                        )
                    except queue.Full:
                        # in case the input job queue is full, keep clearing the
                        # result queue, to make sure we don't deadlock
                        process_result_queue()

                process_result_queue()
            # endfor single corpus pass

            # wait for all outstanding jobs to finish
            while queue_size[0] > 0:
                process_result_queue(force=True)

            if reallen != lencorpus:
                raise RuntimeError("input corpus size changed during training (don't use generators as input)")
        # endfor entire update

        pool.terminate()
Example #15
def current(alfa,k,suma,xmin,xmax,nx,ymin,ymax,ny):
    tic = time.clock()
    mypath = 'Results/' + 'alfa=' + alfa + '/' + 'k=' + k
    if not os.path.isdir(mypath):
        os.makedirs(mypath)

    path_x = mypath + '/' + 'x_coordinate_current.txt' # array of x coordinates saved in a file
    path_y = mypath + '/' + 'y_coordinate_current.txt' # array of y coordinates saved in a file
    path_current_x = mypath + '/' + 'current_x.txt' # array of x component of current for given coordinates (x,y) saved in a file
    path_current_y = mypath + '/' + 'current_y.txt' # array of y component of current for given coordinates (x,y) saved in a file

    #Initializing arrays of current with number of rows equal to nx and number of columns equal to ny, filled with 0's
    current_x = np.zeros((nx, ny), dtype=np.float64)
    current_y = np.zeros((nx, ny), dtype=np.float64)
    x = np.linspace(xmin, xmax, nx)
    y = np.linspace(ymin, ymax, ny)
    # Initializing list which will become a stack for tasks to be done
    stack = []                                                                          
    # Setting number of processes used in parallel computing
    nproc = 8
    # This condition is required on Windows
    if __name__ == '__main__':
        p = Pool(nproc)
        for h in range(0, nx):
            for j in range(0, ny):
                # Appending tasks to stack list
                stack.append((h,j))
        while len(stack):
            # Takes nproc last element from stack list
            temps = stack[-nproc:]
            # Creates a list of up to nproc parameter tuples
            xys = [(x[i], y[j], float(alfa), float(k), int(suma)) for i, j in temps]
            # Results of last nproc tasks evaluated in parallel
            results_x = p.map(current_x_formula, xys)
            results_y = p.map(current_y_formula, xys)
            for proc in range(len(temps)):
                # Storing the last nproc results in the 2D current arrays
                current_x[temps[proc][1], temps[proc][0]] = results_x[proc]
                current_y[temps[proc][1], temps[proc][0]] = results_y[proc]
            # Deleting last nproc from stack
            del stack[-nproc:]

        # Saving x coordinates, y coordinates, current matrices in txt files
        np.savetxt(path_x, x, delimiter=',')
        np.savetxt(path_y, y, delimiter=',')
        np.savetxt(path_current_x, current_x, delimiter=',')
        np.savetxt(path_current_y, current_y, delimiter=',')
        
        outfile = open(mypath + '/' + 'info_current.txt', 'w')
        outfile.write('alfa = ' + alfa + '\n')
        outfile.write('k = ' + k + '\n')
        outfile.write('Number of elements = ' + str(2*int(suma)+1) + '\n')
        outfile.write('xmin = ' + str(xmin) + '\n')
        outfile.write('xmax = ' + str(xmax) + '\n')
        outfile.write('Density of points on x axis = ' + str(nx) + '\n')
        outfile.write('ymin = ' + str(ymin) + '\n')
        outfile.write('ymax = ' + str(ymax) + '\n')
        outfile.write('Density of points on y axis = ' + str(ny) + '\n')

        toc = time.clock()
        outfile.write('Evaluating time = ' + str(toc-tic) + '\n')
        outfile.close()
        # Printing time elapsed on evaluating
        print(toc-tic)

        # Creates streamplot for current
        normalising_factor = np.sqrt(current_x**2 + current_y**2)
        plt.figure()
        plt.streamplot(x,y,current_x,current_y,color=normalising_factor,cmap=cm.jet,linewidth=2,arrowstyle='->',arrowsize=1.5)
        plt.colorbar()

        # Saves that plot
        plt.savefig(mypath + '/' + 'current.png')
        # And shows
        plt.show()
        print "==== pretending to run %s (%d entries, %s) ====" % (name, nev, fout)
        return (name,(nev,0))
    print "==== %s starting (%d entries) ====" % (name, nev)
    booker = Booker(fout)
    modulesToRun = MODULES
    if options.modules != []:
        toRun = {}
        for m,v in MODULES:
            for pat in options.modules:
                if re.match(pat,m):
                    toRun[m] = True 
        modulesToRun = [ (m,v) for (m,v) in MODULES if m in toRun ]
    el = EventLoop([ VariableProducer(options.treeDir,booker,options.region,sample_nevt,short,modulesToRun), ])
    el.loop([tb], eventRange=range)
    booker.done()
    fb.Close()
    time = timer.RealTime()
    print "=== %s done (%d entries, %.0f s, %.0f e/s) ====" % ( name, nev, time,(nev/time) )
    return (name,(nev,time))

if options.jobs > 0:
    from multiprocessing import Pool
    pool = Pool(options.jobs)
    ret  = dict(pool.map(_runIt, jobs)) if options.jobs > 0 else dict([_runIt(j) for j in jobs])
else:
    ret = dict(map(_runIt, jobs))
fulltime = maintimer.RealTime()
totev   = sum([ev   for (ev,time) in ret.itervalues()])
tottime = sum([time for (ev,time) in ret.itervalues()])
print "Done %d tasks in %.1f min (%d entries, %.1f min)" % (len(jobs),fulltime/60.,totev,tottime/60.)
Example #17
        reduce_contacts_by,
        prob_has_trace_app,
        backwards_trace,
        probable_infections_need_test
    ]
    return(parameters + simulation.inf_counts)

param_names = [
    "hazard_rate_scale",
    "infection_reporting_prob",
    "contact_tracing_success_prob",
    "contact_trace_delay_par",
    "global_contact_reduction",
    "prob_has_trace_app",
    "backwards_trace",
    "probable_infections_require_test",
    "backwards_tracing_time_limit"
]

col_names = param_names + [str(i) for i in range(days_to_simulate)]
col_names_dict = {}
for i in range(len(col_names)):
    col_names_dict.update({i: col_names[i]})

if __name__ == '__main__':
    with Pool(10) as p:
        results = p.map(run_simulation, range(repeats))
        results = pd.DataFrame(results)
        results = results.rename(columns=col_names_dict)
        results.to_excel("Data/Simulation Results/UK Model/no_time_lim.xlsx")
Example #18
from multiprocessing import Pool
import os
import time


def task(task_id):
    start_time = time.time()
    print 'task %s in process %s start on %s' % (task_id, os.getpid(), start_time)
    time.sleep(3)
    end_time = time.time()
    print 'task %s in process %s end on %s' % (task_id, os.getpid(), end_time)

pool = Pool(10)

for i in range(10):
    pool.apply(task, args=('task' + str(i+1),),)

pool.close()
pool.join()
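Pool.apply blocks until each call returns, so the loop above runs its ten tasks one after another despite the ten workers. A self-contained sketch (Python 3, not the original code) of the asynchronous variant that actually overlaps them:

from multiprocessing import Pool
import os
import time

def task(task_id):
    time.sleep(3)
    return '%s finished in process %s' % (task_id, os.getpid())

if __name__ == '__main__':
    with Pool(10) as pool:
        handles = [pool.apply_async(task, ('task' + str(i + 1),)) for i in range(10)]
        for h in handles:
            print(h.get())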

Example #19
        w = w.split(":")[0]
        all_read_ids.add(v)
        all_read_ids.add(w)

    for v, w in h_asm_G.sg_edges:
        if h_asm_G.sg_edges[ (v, w) ][-1] != "G":
            continue
        v = v.split(":")[0]
        w = w.split(":")[0]
        all_read_ids.add(v)
        all_read_ids.add(w)

    seqs = load_sg_seq(all_read_ids, fasta_fn)

    if ctg_id == "all":
        ctg_id_list = p_asm_G.ctg_data.keys()
    else:
        ctg_id_list = [ctg_id]

    exe_list = []
    for ctg_id in ctg_id_list:
        if ctg_id[-1] != "F":
            continue
        if ctg_id not in all_rid_to_phase:
            continue
        exe_list.append( (ctg_id, os.path.join(".", ctg_id)) )

    exec_pool = Pool(24)
    exec_pool.map( generate_haplotigs_for_ctg, exe_list)
    #map( generate_haplotigs_for_ctg, exe_list)
Example #20
    apply_async

"""

from multiprocessing import Pool, TimeoutError
import time
import os


def f(x):
    return x * x


if __name__ == '__main__':
    # create a pool of 4 worker processes
    with Pool(processes=4) as pool:

        # prints "[0, 1, 4, ..., 81]"
        print(pool.map(f, range(10)))

        # print the same numbers, in arbitrary order
        for i in pool.imap_unordered(f, range(10)):
            print(i)

        # evaluate "f(20)" asynchronously
        res = pool.apply_async(f, (20, ))  # runs in only one process
        print(res.get(timeout=1))  # prints "400"

        # evaluate "os.getpid()" asynchronously
        res = pool.apply_async(os.getpid, ())  # runs in only one process
        print(res.get(timeout=1))  # prints the PID of that process
Example #21
#%% construct word_dict with multi-process to speed up

# run this first if you want to execute the code in an IPython console, so that multiprocessing can spawn workers
if __name__ == '__main__':
    __spec__ = "ModuleSpec(name='builtins', loader=<class '_frozen_importlib.BuiltinImporter'>)"

DATA = pd.read_csv('train.csv')
n_workers = 4

chunks = [
    ' '.join(DATA['Sentences'][i:i + len(DATA['Sentences']) // n_workers])
    for i in range(0, len(DATA['Sentences']),
                   len(DATA['Sentences']) // n_workers)
]

pool = Pool(processes=n_workers)
result = pool.map_async(word_tokenize,
                        chunks)  # tokenize using nltk.word_tokenize
words = set(sum(result.get(), []))
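# Note: calling map_async(...) and then get() immediately, as above, behaves the same
# as a plain blocking pool.map(); the asynchronous variant only pays off when other
# work is done between submitting the chunks and calling get().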

word_dict = {'<pad>': 0}
for word in words:
    word_dict[word] = len(word_dict)
#%% Train w2v
TRAIN_corpus = DATA['TOKEN'].values
# setting
vector_dim = 64
window_size = 5
min_count = 1
training_iter = 20
Example #22
#print(input_, alg, output_path, dataset, procs)
df_input = pd.read_csv(input_, sep='|')
#df_input = df_input.sample(100)
#print(df_input)
data = df_input['bow_preproc'].apply(lambda x: np.str_(x))
labels = df_input['classes']
filenames = df_input['file_name']
#print("Database: ", np.shape(X), np.unique(y, return_counts=True))

# Non-random K-Fold
#kf = KFold(n_splits=10, shuffle = False)
# Stratified random K-Fold
# kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
# Non-stratified random K-Fold
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
p = Pool(processes=procs)

if (alg == 0):
    start_job(1, "tf", dataset, pos_label, neg_label)
    start_job(2, "tf", dataset, pos_label, neg_label)
    start_job(3, "tf", dataset, pos_label, neg_label)
    start_job(4, "tf", dataset, pos_label, neg_label)
    start_job(1, "tfidf", dataset, pos_label, neg_label)
    start_job(2, "tfidf", dataset, pos_label, neg_label)
    start_job(3, "tfidf", dataset, pos_label, neg_label)
    start_job(4, "tfidf", dataset, pos_label, neg_label)

else:
    start_job(alg, "tf", dataset, pos_label, neg_label)
    start_job(alg, "tfidf", dataset, pos_label, neg_label)
Example #23
def main(argv):
    g_resultNameBase = argv[0]
    new_command = argv[1]  #apply to all
    g_group_command_16nodes = argv[2]
    g_group_command_36nodes = argv[3]
    g_group_command_100nodes = argv[4]

    def options(sim_type):
        return {
            # --resultType=3 --noOfRuns=10 --ctrlNoOfRuns=1
            # 0: Grid, ndad 2
            0:
            g_common_commands + g_resultNameBase + "-grid-ndad2 " +
            '--topology=0 --nDADcount=2 ',
            # 1: Cross-Grid, ndad3
            1:
            g_common_commands + g_resultNameBase + "-crossGrid-ndad3 " +
            '--topology=1 --nDADcount=3 ',
            # 2: Uniform-Disc, ndad2
            2:
            g_common_commands + g_resultNameBase + "-uDisc-ndad2 " +
            '--topology=3 --nDADcount=2 ',
            # 3: Grid, ndad3
            3:
            g_common_commands + g_resultNameBase + "-grid-ndad3 " +
            '--topology=0 --nDADcount=3 ',
            # 4: Cross-Grid, ndad4
            4:
            g_common_commands + g_resultNameBase + "-crossGrid-ndad4 " +
            '--topology=1 --nDADcount=4 ',
            # 5: Uniform-Disc, ndad3
            5:
            g_common_commands + g_resultNameBase + "-uDisc-ndad3 " +
            '--topology=3 --nDADcount=3 ',
        }.get(sim_type, str('ERROR'))  # "ERROR" is default if x not found

    log_folder = g_pfp + "simul_logs/" + g_resultNameBase + "/"
    os.system("mkdir -p " + log_folder)

    # This is the list of parameters that will change between your simulations; it is split by the
    # multiprocessing library and fed to the Pool, which starts the simulations on as many worker
    # processes as you want.
    sim_command_list = []

    for x in range(0, 6):
        log_file = log_folder + "log_" + g_resultNameBase + "_" + str(
            x) + ".out"
        # touch the dump file where the log will be written
        os.system("touch " + log_file)
        if x < 3:
            temp_command = options(
                x
            ) + new_command + " " + "--resultGroupName=" + g_resultNameBase + " " + g_group_command_16nodes + ' " > ' + log_file + " 2>&1"
        elif x < 6:
            temp_command = options(
                x
            ) + new_command + " " + "--resultGroupName=" + g_resultNameBase + " " + g_group_command_36nodes + ' " > ' + log_file + " 2>&1"
        else:
            temp_command = options(
                x
            ) + new_command + " " + "--resultGroupName=" + g_resultNameBase + " " + g_group_command_100nodes + ' " > ' + log_file + " 2>&1"
        sim_command_list.append([temp_command, str(x)])

    # Specify here how many processes/threads you want
    pool = Pool(processes=6)  # start X workers processes

    # Give here the name of your function which can run one simulation with a given parameter, and a list of parameters which will be split among the threads.
    pool.map(run_single_sim, sim_command_list)
Example #24
def prepareTilemap():
	global mapData
	global heatFractal
	global moistureFractal
	global r_offset 
	global heat_random 
	global moisture_random

	r_offset = random.random()*1234
	heat_random = random.random()*1234
	moisture_random = random.random()*1234
	mapData = MapData(g.mapx,g.mapy)
	heatFractal = [[0 for _ in range(g.mapy)] for _ in range(g.mapx)]
	moistureFractal = [[0 for _ in range(g.mapy)] for _ in range(g.mapx)]

	tile_list = []
	for x in range(g.mapx):
		for y in range(g.mapy):
			tile_list.append((r_offset,heat_random,moisture_random,x,y))
	with Pool() as p:
		squares = p.starmap(setMapDataXY,tile_list)
	for N,sq in enumerate(squares):
		x = N//512
		y = N %512
		mapData.data[x][y] = sq[0]
		heatFractal[x][y] = sq[1]
		moistureFractal[x][y] = sq[2]
		if sq[0] > mapData.maxValue:
			mapData.maxValue = sq[0]
		if sq[0] < mapData.minValue:
			mapData.minValue = sq[0]
	timepunch("Initial map data: ")
	tile_list = []
	for x in range(g.mapx):
		for y in range(g.mapy):
			hval = (mapData.data[x][y] - mapData.minValue) / (mapData.maxValue - mapData.minValue)
			tile_list.append((hval,heatFractal[x][y],moistureFractal[x][y],x,y))
	with Pool() as p:
		tiles = p.starmap(setTile,tile_list)
	for N,tile in enumerate(tiles):
		x = N//512
		y = N %512
		g.tiles[x][y] = tile
	updateNeighbours()
	timepunch("Tile stuff: ")

	if g.have_savefile:
		f = get_tar_data('lands.dat')
		json_str = f.decode('utf-8')
		json_lands = json.loads(json_str)
		for land in json_lands:
			our_land = TileGroup()
			our_land.tiles = land['tiles']
			our_land.area = float(land['area'])
			g.lands.append(our_land)
		maxmin = json.loads(get_tar_data('map.dat').decode('utf-8'))
		mapData.maxValue = maxmin['max']
		mapData.minValue = maxmin['min']
		#updateBitmasks()
	else:
		#updateBitmasks()
		floodFill()
		timepunch("Flood filling: ")
		json_lands = []
		for land in g.lands:
			our_dict = {}
			our_dict['tiles'] = land.tiles
			our_dict['area'] = land.area
			json_lands.append(our_dict)
		json_str = json.dumps(json_lands)
		json_bytes = json_str.encode('utf-8')
		add_to_tarfile((json_bytes,"lands.dat"))
		json_str = json.dumps({'max':mapData.maxValue,'min':mapData.minValue})
		json_bytes = json_str.encode('utf-8')
		add_to_tarfile((json_bytes,"map.dat"))
Example #25
from multiprocessing import Pool, TimeoutError
import time
import os

def f(x):
    return x*x

if __name__ == '__main__':
    pool = Pool(processes=4)              # start 4 worker processes

    # print "[0, 1, 4,..., 81]"
    print pool.map(f, range(10))

    # print same numbers in arbitrary order
    for i in pool.imap_unordered(f, range(10)):
        print i,
    print

    # evaluate "f(20)" asynchronously
    res = pool.apply_async(f, (20,))      # runs in *only* one process
    print res.get(timeout=1)              # prints "400"

    # evaluate "os.getpid()" asynchronously
    res = pool.apply_async(os.getpid, ()) # runs in *only* one process
    print res.get(timeout=1)              # prints the PID of that process

    # launching multiple evaluations asynchronously *may* use more processes
    multiple_results = [pool.apply_async(os.getpid, ()) for i in range(4)]
    print [res.get(timeout=1) for res in multiple_results]

    # make a single worker sleep for 10 secs
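    # The snippet is cut off here; in the corresponding example from the official
    # documentation the sleeping worker is used to demonstrate TimeoutError, roughly:
    res = pool.apply_async(time.sleep, (10,))
    try:
        print res.get(timeout=1)
    except TimeoutError:
        print "We lacked patience and got a multiprocessing.TimeoutError"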
Example #26
    # plt.show()
    plt.close()
    return X, y, count


def get_features(args):
    i = args[0]
    data = args[1]
    img = ds.get_image(data[i][1])
    label = data[i][0]
    sift = cv2.xfeatures2d.SIFT_create(500)
    kp, des = sift.detectAndCompute(img, None)
    return [des, label]


pool = Pool(os.cpu_count())
print("Training")
features = pool.map(get_features, [(i, data_train)
                                   for i in range(len(data_train))])

X, y, count = vector_quantization_train(features)
print("Number of features: ", count)
clf = LinearSVC()
clf.fit(X, y)

print("Testing")
features = pool.map(get_features, [(i, data_test)
                                   for i in range(len(data_test))])
X, y, count = vector_quantization(features)
print("Number of features: ", count)
predictions = clf.predict(X)
Example #27
def main():
    pool = Pool(processes=3)
    for i in range(30):
        pool.apply(f, (i, ))
    pool.close()
    pool.join()
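# Note: Pool.apply is synchronous, so the loop above runs f(0)..f(29) strictly one at a
# time and only one of the three workers is ever busy; apply_async (as in the earlier
# examples) is needed to actually overlap the calls.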
Example #28
def make_graph(env,
               sampler,
               connection_radius,
               num_vertices,
               lazy=False,
               saveto='graph.pkl'):
    """
    Returns a graph on the passed environment.
    All vertices in the graph must be collision-free.

    Graph should have node attribute "config" which keeps a configuration in tuple.
    E.g., for adding vertex "0" with configuration np.array(0, 1),
    G.add_node(0, config=tuple(config))

    To add edges to the graph, call
    G.add_weighted_edges_from([edges])
    where edges is a list of tuples (node_i, node_j, weight),
    where weight is the distance between the two nodes.

    @param env: Map Environment for graph to be made on
    @param sampler: Sampler to sample configurations in the environment
    @param connection_radius: Maximum distance to connect vertices
    @param num_vertices: Minimum number of vertices in the graph.
    @param lazy: If true, edges are made without checking collision.
    @param saveto: File to save graph and the configurations
    @returns a undirected weighted graph G where each node is a tuple (x, y)
             and edge data the distance between nodes.
    """
    print 'dubins graph maker'
    G = nx.DiGraph()
    numberOfThreads = 1
    pool = Pool(processes=numberOfThreads)
    # Implement here

    # TODO: Code needs to be restructured, maybe vectorized?
    # 1. Sample vertices
    vertices = sampler.sample(num_vertices)
    edges = []
    # 2. Connect them with edges
    start_time = time.time()
    for vid in tqdm(range(len(vertices))):
        vertex = tuple(vertices[vid])
        G.add_node(vertex)
        #distances = env.compute_distances(vertices[vid], vertices)
        distances = compute_distance_parallel(numberOfThreads, pool,
                                              vertices[vid], vertices)
        for vid2 in range(len(vertices)):
            if vid != vid2:
                dist = distances[vid2]
                if (dist < connection_radius) and (
                        lazy or env.edge_validity_checker(
                            vertices[vid], vertices[vid2])[0]):
                    edges.append((vertex, tuple(vertices[vid2]), dist))
        if vid % 100 == 0:
            print 'cost time', time.time() - start_time
    G.add_weighted_edges_from(edges)

    # Check for connectivity.
    #num_connected_components = len(list(nx.connected_components(G)))
    #if not num_connected_components == 1:
    #    print ("warning, Graph has {} components, not connected".format(num_connected_components))

    # Save the graph to reuse.
    if saveto is not None:
        data = dict(G=G)
        pickle.dump(data, open(saveto, 'wb'))
        print('Saved the graph to {}'.format(saveto))
    return G
Example #29
def predict(ds, **kwargs):
    global db
    print(os.getcwd())
    data=[]
    ls=[]
    pred=False
    if pred:
        for doc in col.distinct('ad_creative_body',{'marked':0}):
            if len(data)<2000:
                data.append({'ad_creative_body':doc})
            else:
                break           

        if len(data)>0:
            print(len(data))
            df=pd.DataFrame(data)
            df['label']=0
            dev_df_bert = pd.DataFrame({
                'id':range(len(df)),
                'label':df['label'],
                'alpha':['a']*df.shape[0],
                'text': df['ad_creative_body'].replace(r'\n', ' ', regex=True)
            })

            dev_df_bert.to_csv('./home/jay/airflow/dags/data/dev.tsv', sep='\t', index=False, header=False)

            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            # This is where BERT will look for pre-trained models to load parameters from.
            CACHE_DIR = './home/jay/airflow/dags/cache/'

            # The maximum total input sequence length after WordPiece tokenization.
            # Sequences longer than this will be truncated, and sequences shorter than this will be padded.
            MAX_SEQ_LENGTH = 128

            TRAIN_BATCH_SIZE = 24
            EVAL_BATCH_SIZE = 8
            LEARNING_RATE = 2e-5
            RANDOM_SEED = 42
            GRADIENT_ACCUMULATION_STEPS = 1
            WARMUP_PROPORTION = 0.1
            OUTPUT_MODE = 'classification'
            NUM_TRAIN_EPOCHS = 1
            CONFIG_NAME = "config.json"
            WEIGHTS_NAME = "pytorch_model.bin"
            Data = 'FB16'
            DATA_DIR = "./home/jay/airflow/dags/data/"
            categories = ["Attack", "Advocacy", "CTA", "Issue","Image"]
    #         categories = ["Attack", "Advocacy", "Ceremonial", "CTA", "CI", "Image", "Issue"]
            for Category in categories:
                print(Category)
                TASK_NAME = Data+Category
                BERT_MODEL =  TASK_NAME+'.tar.gz'

                # The output directory where the fine-tuned model and checkpoints will be written.
                OUTPUT_DIR = './home/jay/airflow/dags/outputs/'+TASK_NAME+'/'
                tokenizer = BertTokenizer.from_pretrained(OUTPUT_DIR + 'vocab.txt', do_lower_case=False)
                processor = BinaryClassificationProcessor()
                eval_examples = processor.get_dev_examples(DATA_DIR)
                label_list = processor.get_labels() # [0, 1] for binary classification
                num_labels = len(label_list)
                eval_examples_len = len(eval_examples)


                label_map = {label: i for i, label in enumerate(label_list)}
                eval_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE) for example in eval_examples]

                process_count = cpu_count() - 1
    #             if __name__ ==  '__main__':
            #         print('Preparing to convert' {eval_examples_len} examples..')
            #         print(f'Spawning {process_count} processes..')
                with Pool(process_count) as p:
                    eval_features = list(p.imap(convert_example_to_feature, eval_examples_for_processing))

                all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=EVAL_BATCH_SIZE)


                # Load pre-trained model (weights)
                model = BertForSequenceClassification.from_pretrained(CACHE_DIR + BERT_MODEL, cache_dir=CACHE_DIR, num_labels=len(label_list))



                model.to(device)

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                preds = []

                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        logits = model(input_ids, segment_ids, input_mask, labels=None)

                    # create eval loss and other metric required by the task

                    loss_fct = CrossEntropyLoss()
                    tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1
                    if len(preds) == 0:
                        preds.append(logits.detach().cpu().numpy())
                    else:
                        preds[0] = np.append(
                            preds[0], logits.detach().cpu().numpy(), axis=0)

                eval_loss = eval_loss / nb_eval_steps
                preds = preds[0]
                preds = np.argmax(preds, axis=1)
                df[Category]=preds

            del df['label']

            dc=df.to_dict('records')

            for doc in dc:
                doc['class']=[]
                for c in categories:
                    if doc[c]==1:
                        doc['class'].append(c)
                    del doc[c]
            print(len(dc))
            print(dc[0])
            print("Pushing into DB")
            for doc in dc:
                for x in col.find({"ad_creative_body":doc['ad_creative_body'],'marked':0}):
                    x['marked']=1
                    x['class']=doc['class']
                    col.update_one({'_id': x['_id']},{"$set":x},True)    
            return "Done"
Example #30
def main():
    with Pool(POOL_SIZE) as pool:
        results = pool.map(fetch_rates, BASES)

    for result in results:
        present_result(*result)
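POOL_SIZE, BASES, fetch_rates and present_result are defined elsewhere in the project this snippet comes from; a self-contained sketch with placeholder stand-ins (no real exchange-rate API is queried here):

from multiprocessing import Pool

POOL_SIZE = 4
BASES = ['USD', 'EUR', 'GBP', 'CZK']

def fetch_rates(base):
    # Placeholder: a real implementation would query an exchange-rate service.
    rates = {symbol: 1.0 for symbol in BASES if symbol != base}
    return base, rates

def present_result(base, rates):
    for symbol, rate in sorted(rates.items()):
        print('1 %s = %.2f %s' % (base, rate, symbol))

def main():
    with Pool(POOL_SIZE) as pool:
        results = pool.map(fetch_rates, BASES)
    for result in results:
        present_result(*result)

if __name__ == '__main__':
    main()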