from PhysicsTools.PythonAnalysis.rootplot.core import report_progress
import ROOT
import multiprocessing as multi
from Queue import Empty
import os

os.chdir('..')  # return to the directory with the ROOT files

calls = []

calls.append("""
canvas, objects = plot('PileUp_2011_truth_finebin_64600microbarn.root',
                       'PileUp_2011_truth_finebin_68000microbarn.root',
                       'PileUp_2011_truth_finebin_71400microbarn.root',
                       'pileup',
                       ext='root',
                       xlabel='Number of interactions per crossing',
                       title='CMS Preliminary 5.1 fb^{-1} at #sqrt{s} = 7 TeV',
                       legend_entries='inelastic cross-section = 64600 #mub,inelastic cross-section = 68000 #mub,inelastic cross-section = 71400 #mub')
canvas.SaveAs('plots/pileup.root')
""")

queue = multi.JoinableQueue()
qglobals = multi.Manager().Namespace()
qglobals.nfinished = 0
qglobals.ntotal = len(calls)
for call in calls:
    queue.put(call)

def qfunc(queue, qglobals):
    while True:
        try:
            mycall = queue.get(timeout=5)
        except (Empty, IOError):
            break
        exec(mycall)
        ROOT.gROOT.GetListOfCanvases().Clear()
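Note that qfunc() above pulls calls from the JoinableQueue but never calls task_done(), so if the surrounding script relies on queue.join() it would block indefinitely. For reference, a minimal generic sketch (not the original code) of the usual JoinableQueue handshake, pairing every get() with task_done() so join() can return:

import multiprocessing

def worker(queue):
    # Consume items until the producer signals completion with None.
    while True:
        item = queue.get()
        try:
            if item is None:
                break
            print('processing', item)
        finally:
            # Acknowledge the item so queue.join() can eventually return.
            queue.task_done()

if __name__ == '__main__':
    queue = multiprocessing.JoinableQueue()
    procs = [multiprocessing.Process(target=worker, args=(queue,)) for _ in range(2)]
    for p in procs:
        p.start()
    for item in range(5):
        queue.put(item)
    for _ in procs:
        queue.put(None)   # one sentinel per worker
    queue.join()          # blocks until every queued item has been task_done()'d
    for p in procs:
        p.join()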
def parallel_preprocess_gene(ensembl,out_dir,n_processes,readcount_min,readcount_max,resume): # Create output paths and locks. out_paths,locks = dict(),dict() for out_filetype in ['json','index','log','readcount']: out_paths[out_filetype] = os.path.join(out_dir,'data.%s' %out_filetype) locks[out_filetype] = multiprocessing.Lock() # Writing the starting of the files. gene_ids_done = [] if resume and os.path.exists(out_paths['index']): df_index = pd.read_csv(out_paths['index'],sep=',') gene_ids_done = list(df_index['idx'].unique()) else: # with open(out_paths['json'],'w') as f: # f.write('{\n') # f.write('"genes":{') open(out_paths['json'],'w').close() with open(out_paths['index'],'w') as f: f.write('idx,start,end\n') # header with open(out_paths['readcount'],'w') as f: f.write('idx,n_reads\n') # header open(out_paths['log'],'w').close() # Create communication queues. task_queue = multiprocessing.JoinableQueue(maxsize=n_processes * 2) # Create and start consumers. consumers = [helper.Consumer(task_queue=task_queue,task_function=preprocess_gene,locks=locks) for i in range(n_processes)] for p in consumers: p.start() # Get all gene ids. gene_ids = set() tx_ensembl = dict() with h5py.File(os.path.join(out_dir,'eventalign.hdf5'),'r') as f: for tx_id in f.keys(): tx_id,tx_version = tx_id.split('.') # Based on Ensembl tx_ensembl[tx_id] = tx_version try: g_id = ensembl.transcript_by_id(tx_id).gene_id except ValueError: continue else: gene_ids = gene_ids.union([g_id]) # # Load tasks into task_queue. gene_ids_processed = [] with h5py.File(os.path.join(out_dir,'eventalign.hdf5'),'r') as f: for gene_id in gene_ids: if resume and (gene_id in gene_ids_done): continue # mapping a gene <-> transcripts tx_ids, t2g_mapping = t2g(gene_id,ensembl) # read_ids = [] data_dict = dict() n_reads = 0 for tx_id in tx_ids: if tx_id not in tx_ensembl: continue tx_id += '.' + tx_ensembl[tx_id] if tx_id not in f: # no eventalign for tx_id continue # n_reads += len(f[tx_id]) for read_id in f[tx_id].keys(): if (n_reads < readcount_max) and (read_id not in read_ids): data_dict[read_id] = f[tx_id][read_id]['events'][:] read_ids += [read_id] n_reads += 1 elif n_reads >= readcount_max: break if n_reads >= readcount_min: task_queue.put((gene_id,data_dict,t2g_mapping,out_paths)) # Blocked if necessary until a free slot is available. gene_ids_processed += [gene_id] # Put the stop task into task_queue. task_queue = helper.end_queue(task_queue,n_processes) # Wait for all of the tasks to finish. task_queue.join() # Write the ending of the json file. # with open(out_paths['json'],'a+') as f: # f.seek(0,2) # end of file # f.truncate(f.tell()-1) # f.write('\n}\n}\n') ### with open(out_paths['log'],'a+') as f: f.write('Total %d genes.\n' %len(gene_ids_processed)) f.write(helper.decor_message('successfully finished'))
def multi_process(func, data, num_process=None, verbose=True, **args):
    '''Use multiprocessing to process a pandas DataFrame.

    This function applies a function to each row of the input DataFrame
    using a pool of worker processes.

    Args:
        func (function): The function to apply to each row of the input
            DataFrame. `func` must accept a pandas.Series as its first
            positional argument and return a pandas.Series.
        data (pandas.DataFrame): The DataFrame to be processed.
        num_process (int, optional): The number of processes to run in
            parallel. Defaults to the number of CPUs of the machine.
        verbose (bool, optional): Set to False to disable verbose output.
        args (dict): Keyword arguments passed through to `func`.

    Returns:
        A DataFrame containing the results.
    '''
    # Check argument values
    assert isinstance(data, pd.DataFrame), \
        'Input data must be a pandas.DataFrame instance'
    if num_process is None:
        num_process = multiprocessing.cpu_count()

    # Establish communication queues
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    error_queue = multiprocessing.Queue()

    start_time = time.time()

    # Enqueue tasks
    num_task = len(data)
    for i in range(num_task):
        tasks.put(data.iloc[i, :])

    # Add a poison pill for each consumer
    for i in range(num_process):
        tasks.put(None)

    logger.info('Create {} processes'.format(num_process))
    consumers = [
        Consumer(func, tasks, results, error_queue, **args)
        for i in range(num_process)
    ]
    for w in consumers:
        w.start()

    # Add a task-tracking process
    task_tracker = TaskTracker(tasks, verbose)
    task_tracker.start()

    # Wait for all input data to be processed
    tasks.join()

    # If any process reported an error, output the error messages
    num_error = error_queue.qsize()
    if num_error > 0:
        for i in range(num_error):
            logger.error(error_queue.get())
        raise RuntimeError('Multi process jobs failed')
    else:
        # Collect results
        result_table = []
        while num_task:
            result_table.append(results.get())
            num_task -= 1
        df_results = pd.DataFrame(result_table)
        logger.info("Jobs finished in {0:.2f}s".format(time.time() - start_time))
        return df_results
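The Consumer and TaskTracker classes used above are defined elsewhere in that project and are not shown here. Purely as an illustration, a Consumer compatible with the call above might look roughly like the following sketch; the names and behaviour are assumptions, not the project's actual implementation:

import multiprocessing
import traceback

class Consumer(multiprocessing.Process):
    """Hypothetical worker: applies `func` to rows pulled from `tasks`."""

    def __init__(self, func, tasks, results, error_queue, **kwargs):
        super().__init__()
        self.func = func
        self.tasks = tasks            # multiprocessing.JoinableQueue of pandas.Series
        self.results = results        # multiprocessing.Queue for per-row results
        self.error_queue = error_queue
        self.kwargs = kwargs

    def run(self):
        while True:
            row = self.tasks.get()
            try:
                if row is None:       # poison pill -> stop this consumer
                    break
                self.results.put(self.func(row, **self.kwargs))
            except Exception:
                self.error_queue.put(traceback.format_exc())
            finally:
                self.tasks.task_done()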
def create_queue(self):
    return multiprocessing.JoinableQueue()
def main(): style = os.path.dirname(os.path.abspath(__file__))+"/osm.xml" dir = "tiles" type = "png" scale = 22800000 minzoom = 1 maxzoom = 6 threads = 1 context = 3 parser = OptionParser() parser.add_option("-s", "--style", action="store", type="string", dest="style", help="path to the mapnik stylesheet xml, defaults to: "+style) parser.add_option("-d", "--dir", action="store", type="string", dest="dir", help="path to the destination folder, defaults to "+type) parser.add_option("-t", "--type", action="store", type="string", dest="type", help="file type to render (png, png256, jpg), defaults to "+type) parser.add_option("-z", "--minzoom", action="store", type="int", dest="minzoom", help="minimum zoom level to render, defaults to "+str(minzoom)) parser.add_option("-Z", "--maxzoom", action="store", type="int", dest="maxzoom", help="maximum zoom level to render, defaults to "+str(maxzoom)) parser.add_option("-T", "--threads", action="store", type="int", dest="threads", help="number of threads to launch, defaults to "+str(threads)) parser.add_option("-i", "--only-interesting", action="store_true", dest="onlyinteresting", help="only render around interesting places (buildings, peaks, islands, ...)") parser.add_option("-c", "--only-interesting-context", action="store", type="int", dest="context", help="when rendering tiles around interesting places, how many tiles around those places should be rendered?"+ "0 means that only the tile with the interesting feature will be rendered; "+ "1 means that the 8 surrounding tiles will be rendered for each zoom level, too; "+ "2 adds 24 extra tiles; 3 adds 48 extra tiles; 4 adds 80 extra tiles; "+ "defaults to "+str(context)+", which should fill the most screens") parser.add_option("-l", "--only-interesting-list", action="store", type="string", dest="listfile", help="write a GeoJSON-List of interesting places") parser.add_option("-D", "--db", action="store", type="string", dest="dsn", default="", help="database connection string used for finding interesting places") parser.add_option("-e", "--skip-existing", action="store_true", dest="skipexisting", help="skip existing tiles, only render missing") (options, args) = parser.parse_args() if options.style: style = options.style if options.dir: dir = options.dir if options.type: type = options.type if options.minzoom: minzoom = options.minzoom if options.maxzoom: maxzoom = options.maxzoom if options.threads: threads = options.threads if options.context != None: context = options.context queue = multiprocessing.JoinableQueue(32) lock = multiprocessing.Lock() renderers = {} print "Starting %u render-threads" % (threads) for i in range(threads): renderer = RenderThread(i, queue, style, scale, dir, type, lock) render_thread = multiprocessing.Process(target=renderer.run) render_thread.start() renderers[i] = render_thread if options.onlyinteresting: import psycopg2 tileset = set() features = [] con = psycopg2.connect(options.dsn) sql = """ SELECT 'point' AS type, osm_id, name, ST_X(way), ST_Y(way), ST_X(ST_Transform(way, 3411)), ST_Y(ST_Transform(way, 3411)) FROM ant_point WHERE (place IS NOT NULL AND place IN ('hamlet', 'town', 'isolated_dwelling', 'cape', 'locality', 'island', 'islet')) OR building IS NOT NULL OR aeroway IS NOT NULL OR ("natural" IS NOT NULL AND "natural" IN ('volcano', 'ridge', 'cliff', 'cape', 'peak', 'valley', 'bay')) UNION ALL SELECT 'line' AS type, osm_id, name, ST_X(ST_Centroid(way)), ST_Y(ST_Centroid(way)), ST_X(ST_Transform(ST_Centroid(way), 3411)), ST_Y(ST_Transform(ST_Centroid(way), 
3411)) FROM ant_line WHERE (place IS NOT NULL AND place IN ('hamlet', 'town', 'isolated_dwelling', 'cape', 'locality', 'island', 'islet')) OR building IS NOT NULL OR aeroway IS NOT NULL UNION ALL SELECT 'polygon' AS type, osm_id, name, ST_X(ST_Centroid(way)), ST_Y(ST_Centroid(way)), ST_X(ST_Transform(ST_Centroid(way), 3411)), ST_Y(ST_Transform(ST_Centroid(way), 3411)) FROM ant_polygon WHERE (name IS NOT NULL AND place IS NOT NULL AND place IN ('hamlet', 'town', 'isolated_dwelling', 'cape', 'locality', 'island', 'islet')) OR building IS NOT NULL OR aeroway IS NOT NULL; """; cur = con.cursor() cur.execute(sql) lock.acquire() print "found %u interesting nodes" % (cur.rowcount) lock.release() i = 0 for record in cur: (obj_type, osm_id, name, lat, lng, xmeter, ymeter) = record lock.acquire() i += 1 print "found interesting %s %u of %u: #%u (%s)" % (obj_type, i, cur.rowcount, osm_id, name) lock.release() if(options.listfile): features += ({ "type": "Feature", "properties": { "osm_id": osm_id, "name": name }, "geometry": { "type": "Point", "coordinates" : [ lat, lng ] } },) for z in range(minzoom, maxzoom+1): n = 2**z n2 = n/2 tilesz = float(scale) / float(n) xoff = float(xmeter) / tilesz yoff = float(ymeter) / tilesz x = int(xoff + n2) y = int(n2 - yoff) for xctx in range(-context, context+1): for yctx in range(-context, context+1): absx = x+xctx absy = y+yctx t = (z, absx, absy) if absx >= 0 and absx < n and absy >= 0 and absy < n and not t in tileset: queue.put(t) tileset.add(t) if(options.listfile): import json f = open(options.listfile, "w") f.write(json.dumps({ "type": "FeatureCollection", "features": features } )) f.close() else: for z in range(minzoom, maxzoom+1): n = 2**z for x in range(0, n): for y in range(0, n): if options.skipexisting and os.path.exists(dir + "/" + str(z) + "/" + str(x) + "/" + str(y) + "." + type): continue t = (z, x, y) queue.put(t) # Signal render threads to exit by sending empty request to queue for i in range(threads): queue.put(None) # wait for pending rendering jobs to complete queue.join() for i in range(threads): renderers[i].join()
def spawn_core_test(self):
    """Spawn concurrent scale testing on all online cores."""
    def run_worker_process(_result_queue, affinity):
        """Instantiate and run the core test pinned to an individual core."""
        _worker = psutil.Process()
        # assign affinity, pin to core
        _worker.cpu_affinity(affinity)
        # instantiate core_test
        cpu_freq_ctest = CpuFreqCoreTest(affinity[0], _worker.pid)
        # execute freq scaling
        cpu_freq_ctest.scale_all_freq()
        # get results
        res_freq_map = cpu_freq_ctest.__call__()
        # place in result_queue
        _result_queue.put(res_freq_map)

    def process_rqueue(queue_depth, _result_queue):
        """Get and process core_test result_queue."""
        # get queued core_test results
        for _ in range(queue_depth):
            # pipe results from core_test
            worker_queue = _result_queue.get()
            # append to chainmap object
            self.freq_chainmap = self.freq_chainmap.new_child(worker_queue)
            # signal processing complete
            _result_queue.task_done()
        logging.info('----------------------------')
        logging.info('* joining and closing queues')
        # nicely join and close queue
        try:
            _result_queue.join()
        finally:
            _result_queue.close()

    worker_list = []  # track spawned multiproc processes
    pid_list = []     # track spawned multiproc pids
    online_cores = self._get_cores('online')
    # delegate & spawn tests on other cores first,
    # then run core 0 last (main() thread)
    online_cores.append(online_cores.pop(0))

    # create queue for piping results
    result_queue = multiprocessing.JoinableQueue()

    # assign affinity and spawn core_test
    for core in online_cores:
        affinity = [int(core)]
        affinity_dict = dict(affinity=affinity)
        worker = multiprocessing.Process(target=run_worker_process,
                                         args=(result_queue, ),
                                         kwargs=affinity_dict)
        # start core_test
        worker.start()
        worker_list.append(worker)
        # track and log active child pids
        pid_list.append(worker.pid)

    # get, process queues
    process_rqueue(len(worker_list), result_queue)

    # cleanup core_test pids
    logging.info('* joining worker processes:')
    for idx, worker in enumerate(worker_list):
        # join worker processes
        worker_return = worker.join()
        time.sleep(.1)
        if worker_return is None:
            logging.info('  - PID %s joined parent', pid_list[idx])
        else:
            # can clean up in reset subroutine
            continue

    # update attribute for a 2nd-pass terminate
    self.__proc_list = worker_list
def main(): """Launch the script that computes frequencies. - Read the file (or all files within a directory) and put each line in queue - Spawn multiple processes (Worker) - Collect results from processes, merge them and write the results in a file. """ parser = argparse.ArgumentParser( description="Script to compute unigrams and/or bigrams frequencies.") parser.add_argument("-f", "--file", help="source file to be processed") parser.add_argument("-d", "--directory", help="directory containing a set " "of files to be processed") parser.add_argument("-t", "--type", help="whether computing 'unigrams' or " "'bigrams'", required=True) parser.add_argument("-o", "--output", help="output file with results", required=True) parser.add_argument("-v", "--verbose", action='store_true', help="print debugging information") args = parser.parse_args() # Adjust logger verbosity. if args.verbose is True: logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S') else: logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger() # Make sure that one parameter has been setted. if args.file is None and args.directory is None: logger.critical("No source specified.") return -1 if args.file is not None and args.directory is not None: logger.critical("Either specify a file or a directory.") return -1 # Validate the type of computation requested. if not (args.type == "unigrams" or args.type == "bigrams"): logger.critical("Wrong type: please specify 'unigrams' or 'bigrams'") return -1 # Create a list with valid files ready to be processed. if args.file is not None: if isfile(args.file): files = [args.file] else: logger.critical("Unable to find %s." % args.file) return -1 else: if isdir(args.directory): files = [f for f in listdir(args.directory) if isfile(join(args.directory, f))] if len(files) == 0: logger.critical("%s doesn't contain valid file(s)." % args.directory) return -1 else: logger.critical("%s is not a directory." % args.directory) return -1 begin_time = time.time() workers = [] # Limit queue size to 100k items (this is due to the fact that reading # can be way more fast than computing; RAM is filled up abnormally). queue = multiprocessing.JoinableQueue(100000) results_queue = multiprocessing.Queue() # Spawn a process for every CPU. for _ in range(multiprocessing.cpu_count()): w = Worker(queue, results_queue, args.type) w.start() workers.append(w) for idx, filename in enumerate(files): logger.debug("Begin read %s." % filename) directory = args.directory or "." with codecs.open(join(directory, filename), 'r', 'utf8') as f: for line in f: queue.put(line) logger.debug("File %s successfully read." % filename) logger.debug("All files successfully read.") # Join the queue with the words to be processed. This is a synchronous # call, so main() will wait for workers to complete their work. queue.join() logger.debug("Every file has been processed. Merging...") # Merge the counters with the '+=' operator. counter = Counter() for _ in workers: counter += results_queue.get() # Clean process table by joining workers. for w in workers: w.join() logger.debug("Computing finished. Writing results...") with codecs.open(args.output, 'w', 'utf8') as out: # Write the header. out.write("%d %d\n" % (len(counter.values()), sum(counter.values()))) # For each element, write the key and its value (space separated). 
for k, v in counter.most_common(): out.write("%s %d\n" % (k, v)) logger.debug("Done in %s seconds." % (time.time() - begin_time))
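The Worker class used by this script is defined elsewhere. Purely as an illustration, a worker compatible with the driver above (get a line, count it, task_done(), and flush the Counter once the queue stays empty) could look roughly like the following; the names and the timeout-based shutdown are assumptions, not the project's actual code:

import multiprocessing
from collections import Counter
from queue import Empty

class Worker(multiprocessing.Process):
    """Hypothetical worker: counts unigrams or bigrams from queued lines."""

    def __init__(self, queue, results_queue, mode):
        super().__init__()
        self.queue = queue                  # JoinableQueue of text lines
        self.results_queue = results_queue  # plain Queue, one Counter per worker
        self.mode = mode                    # 'unigrams' or 'bigrams'
        self.counter = Counter()

    def run(self):
        while True:
            try:
                # Assumes the reader never stalls longer than the timeout;
                # once main() has drained the queue and join() has returned,
                # the worker flushes its Counter and exits so w.join() succeeds.
                line = self.queue.get(timeout=10)
            except Empty:
                self.results_queue.put(self.counter)
                return
            tokens = line.split()
            if self.mode == 'unigrams':
                self.counter.update(tokens)
            else:
                self.counter.update(zip(tokens, tokens[1:]))
            self.queue.task_done()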
def prepare(args, logger): """Main script function. :param args: the ArgumentParser-derived namespace. :param logger: a logging instance :type logger: logging.Logger """ if hasattr(args.json_conf["reference"]["genome"], "close"): args.json_conf["reference"]["genome"].close() if hasattr(args.json_conf["reference"]["genome"], "filename"): args.json_conf["reference"]["genome"] = getattr( args.json_conf["reference"]["genome"], "filename") elif hasattr(args.json_conf["reference"]["genome"], "name"): args.json_conf["reference"]["genome"] = getattr( args.json_conf["reference"]["genome"], "name") else: logger.critical("Invalid FASTA file: %s", args.json_conf["reference"]["genome"]) raise AttributeError elif not isinstance(args.json_conf["reference"]["genome"], (str, bytes)): logger.critical("Invalid FASTA file: %s", args.json_conf["reference"]["genome"]) raise AttributeError if not os.path.exists(args.json_conf["reference"]["genome"]): logger.critical("Invalid FASTA file: %s", args.json_conf["reference"]["genome"]) raise AttributeError assert len(args.json_conf["prepare"]["files"]["gff"]) > 0 assert len(args.json_conf["prepare"]["files"]["gff"]) == len( args.json_conf["prepare"]["files"]["labels"]), ( args.json_conf["prepare"]["files"]["gff"], args.json_conf["prepare"]["files"]["labels"]) if args.json_conf["prepare"]["strand_specific"] is True: args.json_conf["prepare"]["files"]["strand_specific_assemblies"] = [ True ] * len(args.json_conf["prepare"]["files"]["gff"]) else: args.json_conf["prepare"]["files"]["strand_specific_assemblies"] = [ (member in args.json_conf["prepare"]["files"] ["strand_specific_assemblies"]) for member in args.json_conf["prepare"]["files"]["gff"] ] args.json_conf["prepare"]["files"]["reference"] = [ (member in args.json_conf["prepare"]["files"]["reference"] or label in args.json_conf["prepare"]["files"]["reference"]) for member, label in zip(args.json_conf["prepare"]["files"]["gff"], args.json_conf["prepare"]["files"]["labels"]) ] shelve_names = [ path_join(args.json_conf["prepare"]["files"]["output_dir"], "mikado_shelf_{}.db".format(str(_).zfill(5))) for _ in range(len(args.json_conf["prepare"]["files"]["gff"])) ] logger.propagate = False if args.json_conf["prepare"]["single"] is False and args.json_conf[ "threads"] > 1: multiprocessing.set_start_method( args.json_conf["multiprocessing_method"], force=True) args.logging_queue = multiprocessing.JoinableQueue(-1) log_queue_handler = logging.handlers.QueueHandler(args.logging_queue) log_queue_handler.setLevel(logging.DEBUG) # logger.addHandler(log_queue_handler) args.tempdir = tempfile.TemporaryDirectory( dir=args.json_conf["prepare"]["files"]["output_dir"]) args.listener = logging.handlers.QueueListener(args.logging_queue, logger) args.listener.propagate = False args.listener.start() args.json_conf["prepare"]["files"]["out_fasta"] = open( path_join(args.json_conf["prepare"]["files"]["output_dir"], args.json_conf["prepare"]["files"]["out_fasta"]), 'w') args.json_conf["prepare"]["files"]["out"] = open( path_join(args.json_conf["prepare"]["files"]["output_dir"], args.json_conf["prepare"]["files"]["out"]), 'w') logger.info("Output dir: %s. Output GTF: %s. 
Output Fasta: %s", args.json_conf["prepare"]["files"]["output_dir"], args.json_conf["prepare"]["files"]["out"].name, args.json_conf["prepare"]["files"]["out_fasta"].name) logger.info("Loading reference file") args.json_conf["reference"]["genome"] = pysam.FastaFile( args.json_conf["reference"]["genome"]) logger.info("Finished loading genome file") logger.info("Started loading exon lines") shelf_stacks = dict() try: load_exon_lines( args, shelve_names, logger, min_length=args.json_conf["prepare"]["minimum_cdna_length"], max_intron=args.json_conf["prepare"]["max_intron_length"], ) logger.info("Finished loading exon lines") # Prepare the sorted data structure sorter = functools.partial( store_transcripts, logger=logger, seed=args.json_conf["seed"], keep_redundant=args.json_conf["prepare"]["keep_redundant"]) shelve_source_scores = [] for label in args.json_conf["prepare"]["files"]["labels"]: shelve_source_scores.append( args.json_conf["prepare"]["files"]["source_score"].get( label, 0)) try: for shelf, score, is_reference in zip( shelve_names, shelve_source_scores, args.json_conf["prepare"]["files"]["reference"]): assert isinstance(is_reference, bool) conn = sqlite3.connect(shelf) shelf_stacks[shelf] = { "conn": conn, "cursor": conn.cursor(), "score": score, "is_reference": is_reference } # shelf_stacks = dict((_, shelve.open(_, flag="r")) for _ in shelve_names) except Exception as exc: raise TypeError((shelve_names, exc)) perform_check(sorter(shelf_stacks), shelf_stacks, args, logger) except Exception as exc: logger.exception(exc) __cleanup(args, shelve_names) logger.error("Mikado has encountered an error, exiting") # sys.exit(1) if args.json_conf["prepare"]["single"] is False and args.json_conf[ "threads"] > 1: args.tempdir.cleanup() args.listener.enqueue_sentinel() logger.setLevel(logging.INFO) __cleanup(args, shelve_names) logger.addHandler(logging.StreamHandler()) logger.info( """Mikado prepare has finished correctly. The output %s FASTA file can now be used for BLASTX \ and/or ORF calling before the next step in the pipeline, `mikado serialise`.""", args.json_conf["prepare"]["files"]["out_fasta"]) logging.shutdown()
def main(): #--see if a restart flag was passed try: if sys.argv[1].upper() == 'R': restart = True else: restart = False except: restart = False if restart: print 'Using existing dir and files' #--a dict of data types that are of interest - these become nested folders use_dtypes = {'GW':['PSI','WELL'],'SW':['BOARD','FLOW','GATE','RPM','STG'],\ 'RAIN':['RAIN'],'EVAP':['EVAP','ETP','ETPI']} #--create the directory structure if restart is False: for key, val in use_dtypes.iteritems(): if os.path.exists(key): shutil.rmtree(key) os.mkdir(key) for v in val: os.mkdir(key + '\\' + v) else: for key, val in use_dtypes.iteritems(): if not os.path.exists(key): os.mkdir(key) for v in val: os.mkdir(key + '\\' + v) #--the time series listing CSV from dbhydro fname = 'ts_listing.csv' f = open(fname, 'r') header = f.readline().strip().split(',') #--some column indices idx = {} idx['dbkey'] = 0 idx['station'] = 1 idx['dtype'] = 3 idx['freq'] = 4 idx['stat'] = 5 idx['sdate'] = 8 idx['edate'] = 9 idx['opnum'] = 12 idx['basin'] = 17 idx['struc'] = 18 #--get a list of file names and dbkeys to retrieve #--build queue_args = [[dbkey,sdate,edate,fname]] dbkeys = [] fnames = [] queue_args = [] for i, line in enumerate(f): raw = line.strip().split(',') dbkey = raw[idx['dbkey']].strip() station = raw[idx['station']].strip() freq = raw[idx['freq']].strip() stat = raw[idx['stat']].strip() sdate = raw[idx['sdate']].strip() edate = raw[idx['edate']].strip() dtype = raw[idx['dtype']].strip() opnum = raw[idx['opnum']].strip() struc = raw[idx['struc']].strip() #--fix the dbkey since excel is a giant turd and removes leading '0's if len(dbkey) < 5: dbkey = '%05d' % int(dbkey) #print dbkey #break #--check if this is some data we want dir1, dir2 = None, None for key, val in use_dtypes.iteritems(): if dtype in val: dir1 = key + '\\' dir2 = dtype + '\\' break #--if this isn't a dup and it is a data type we want and it has valid date ranges if dbkey not in dbkeys and dir1 != None and sdate != '' and edate != '': #--if opnum is null, make it 1 if opnum == '': opnum = '1' dbkeys.append(dbkey) #--convert sdate and edate to dbhydro format s = datetime.strptime(sdate, '%d-%b-%Y') sdate2 = s.strftime('%Y%m%d') e = datetime.strptime(edate, '%d-%b-%Y') edate2 = e.strftime('%Y%m%d') #--build the output file name station_mod = station.replace('.', '_') station_mod = station_mod.replace(' ', '_') fname = dir1+dir2+station_mod+'.'+freq+'.'+stat+'.'+opnum+'.'+\ sdate2+'.'+edate2+'.'+struc+'.dat' if restart: #--check that this fname doesn't exist if os.path.exists(fname) == False and fname not in fnames: queue_args.append([dbkey, sdate2, edate2, fname]) fnames.append(fname) #--if not restart else: if fname not in fnames: queue_args.append([dbkey, sdate2, edate2, fname]) fnames.append(fname) print 'number of records to retrieve:', len(fnames) #--multiprocessing #--number of process to spawn - do my bidding! 
num_procs = 20 #--create a queue for jobs and to track failed retrievals jobq = mp.JoinableQueue() failq = mp.Queue() #--create and start the process instances procs = [] for i in range(num_procs): #--pass the woker function both queues and a PID p = mp.Process(target=worker, args=(jobq, failq, i + 1)) p.daemon = True print 'starting process', p.name p.start() procs.append(p) #--add the args to the queue for qa in queue_args: jobq.put(qa) #break #--add the sentinels so processes know when to terminate for p in procs: jobq.put(None) #--block until all finish for p in procs: p.join() print p.name, 'Finished' #--process the failed retrievals failq.put_nowait(None) f_out = open('failed.dat', 'w') for args in iter(failq.get, None): f_out.write(args[0] + '\n') f_out.close()
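The worker() target passed to mp.Process above is not shown. A minimal sketch of a compatible worker that consumes jobs until the None sentinel and reports failures on failq follows; fetch_record() is a placeholder for the real dbhydro retrieval code, not part of the original script:

def fetch_record(dbkey, sdate, edate, fname):
    # Placeholder for the real dbhydro retrieval; assumed to raise on failure.
    with open(fname, 'w') as f:
        f.write('%s %s %s\n' % (dbkey, sdate, edate))

def worker(jobq, failq, pid):
    """Hypothetical worker matching the (jobq, failq, pid) signature used above."""
    for args in iter(jobq.get, None):   # stop on the None sentinel
        try:
            fetch_record(*args)
        except Exception:
            failq.put(args)             # record the failed retrieval
        finally:
            jobq.task_done()            # acknowledge the job
    jobq.task_done()                    # acknowledge the sentinel too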
def perform_check(keys, shelve_stacks, args, logger): """ This is the most important method. After preparing the data structure, this function creates the real transcript instances and checks that they are correct when looking at the underlying genome sequence. This is also the point at which we start using multithreading, if so requested. :param keys: sorted list of [tid, sequence] :param shelve_stacks: dictionary containing the name and the handles of the shelf DBs :param args: the namespace :param logger: logger :return: """ counter = 0 # FASTA extraction *has* to be done at the main process level, it's too slow # to create an index in each process. if args.json_conf["prepare"]["single"] is True or args.json_conf[ "threads"] == 1: # Use functools to pre-configure the function # with all necessary arguments aside for the lines partial_checker = functools.partial( create_transcript, canonical_splices=args.json_conf["prepare"]["canonical"], logger=logger, force_keep_cds=not args.json_conf["prepare"]["strip_cds"]) for tid, chrom, key in keys: tid, shelf_name = tid try: tobj = json.loads( next(shelve_stacks[shelf_name]["cursor"].execute( "SELECT features FROM dump WHERE tid = ?", (tid, )))[0]) except sqlite3.ProgrammingError as exc: raise sqlite3.ProgrammingError("{}. Tids: {}".format(exc, tid)) transcript_object = partial_checker( tobj, str(args.json_conf["reference"]["genome"].fetch( chrom, key[0] - 1, key[1])), key[0], key[1], lenient=args.json_conf["prepare"]["lenient"], is_reference=tobj["is_reference"], strand_specific=tobj["strand_specific"]) if transcript_object is None: continue counter += 1 if counter >= 10**4 and counter % (10**4) == 0: logger.info("Retrieved %d transcript positions", counter) elif counter >= 10**3 and counter % (10**3) == 0: logger.debug("Retrieved %d transcript positions", counter) print(transcript_object.format("gtf"), file=args.json_conf["prepare"]["files"]["out"]) print(transcript_object.fasta, file=args.json_conf["prepare"]["files"]["out_fasta"]) else: # pylint: disable=no-member submission_queue = multiprocessing.JoinableQueue(-1) working_processes = [ CheckingProcess( submission_queue, args.logging_queue, args.json_conf["reference"]["genome"].filename, _ + 1, os.path.basename( args.json_conf["prepare"]["files"]["out_fasta"].name), os.path.basename( args.json_conf["prepare"]["files"]["out"].name), args.tempdir.name, seed=args.json_conf["seed"], lenient=args.json_conf["prepare"]["lenient"], canonical_splices=args.json_conf["prepare"]["canonical"], force_keep_cds=not args.json_conf["prepare"]["strip_cds"], log_level=args.level) for _ in range(args.json_conf["threads"]) ] [_.start() for _ in working_processes] for counter, keys in enumerate(keys): tid, chrom, (pos) = keys tid, shelf_name = tid tobj = json.loads( next(shelve_stacks[shelf_name]["cursor"].execute( "SELECT features FROM dump WHERE tid = ?", (tid, )))[0]) submission_queue.put((tobj, pos[0], pos[1], counter + 1)) submission_queue.put(tuple(["EXIT"] * 4)) [_.join() for _ in working_processes] partial_gtf = [ os.path.join( args.tempdir.name, "{0}-{1}".format( os.path.basename( args.json_conf["prepare"]["files"]["out"].name), _ + 1)) for _ in range(args.json_conf["threads"]) ] merge_partial(partial_gtf, args.json_conf["prepare"]["files"]["out"]) partial_fasta = [ os.path.join( args.tempdir.name, "{0}-{1}".format( os.path.basename( args.json_conf["prepare"]["files"]["out_fasta"].name), _ + 1)) for _ in range(args.json_conf["threads"]) ] merge_partial(partial_fasta, 
args.json_conf["prepare"]["files"]["out_fasta"]) args.json_conf["prepare"]["files"]["out_fasta"].close() args.json_conf["prepare"]["files"]["out"].close() logger.setLevel(logging.INFO) # logger.info("Finished to analyse %d transcripts (%d retained)", # len(exon_lines), counter) logger.setLevel(args.level) return
def _load_exon_lines_multi(args, shelve_names, logger, min_length, strip_cds, threads, max_intron=3 * 10**5): logger.info("Starting to load lines from %d files (using %d processes)", len(args.json_conf["prepare"]["files"]["gff"]), threads) submission_queue = multiprocessing.JoinableQueue(-1) working_processes = [] # working_processes = [ for _ in range(threads)] for num in range(threads): proc = AnnotationParser(submission_queue, args.logging_queue, num + 1, log_level=args.level, min_length=min_length, max_intron=max_intron, strip_cds=strip_cds, seed=args.json_conf["seed"]) proc.start() working_processes.append(proc) # [_.start() for _ in working_processes] for new_shelf, label, strand_specific, is_reference, gff_name in zip( shelve_names, args.json_conf["prepare"]["files"]["labels"], args.json_conf["prepare"]["files"]["strand_specific_assemblies"], args.json_conf["prepare"]["files"]["reference"], args.json_conf["prepare"]["files"]["gff"]): submission_queue.put( (label, gff_name, strand_specific, is_reference, new_shelf)) submission_queue.put(("EXIT", "EXIT", "EXIT", "EXIT", "EXIT")) [_.join() for _ in working_processes] tid_counter = Counter() for shelf in shelve_names: conn = sqlite3.connect( "file:{}?mode=ro".format(shelf), uri=True, # Necessary to use the Read-only mode from file string isolation_level="DEFERRED", timeout=60, check_same_thread= False # Necessary for SQLite3 to function in multiprocessing ) cursor = conn.cursor() tid_counter.update( [_[0] for _ in cursor.execute("SELECT tid FROM dump")]) if tid_counter.most_common()[0][1] > 1: if set(args.json_conf["prepare"]["files"]["labels"]) == {""}: exception = exceptions.RedundantNames( """Found redundant names during multiprocessed file analysis.\ Please repeat using distinct labels for your input files. Aborting. Redundant names:\n\ {}""".format("\n".join(tid_counter.most_common()))) else: exception = exceptions.RedundantNames( """Found redundant names during multiprocessed file analysis, even if \ unique labels were provided. Please try to repeat with a different and more unique set of labels. Aborting. Redundant names:\n\ {}""".format("\n".join([_[0] for _ in tid_counter.most_common() if _[1] > 1]))) logger.exception(exception) raise exception del working_processes gc.collect()
import logging
import logging.handlers
import multiprocessing as mp  # needed for the JoinableQueue below
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Integer, Column, BLOB
from sqlalchemy.orm import sessionmaker
import tempfile
import os
from collections import defaultdict
import operator

__doc__ = """Script to try to translate the CDS from one coordinate system to another."""

transfer_base = declarative_base()

logging_queue = mp.JoinableQueue(-1)
log_queue_handler = logging.handlers.QueueHandler(logging_queue)
log_queue_handler.setLevel(logging.DEBUG)


class _Storer(transfer_base):

    __tablename__ = "storer"
    id = Column(Integer, primary_key=True)
    bed = Column(BLOB)
    gff3 = Column(BLOB)

    def __init__(self, id, bed, gff3):
        self.id, self.bed, self.gff3 = id, bed, gff3
            # (fragment of multiplica_linha_coluna: accumulate the dot product
            #  for cell (i, j), acknowledge the task and publish the result)
            for k in range(len(matrizB)):
                valor = valor + matrizA[i][k] * matrizB[k][j]
            queue.task_done()
            queue_resultados.put((i, j, valor))


if __name__ == '__main__':
    linhas, colunas = 400, 400

    print("{}: Gerando matrizes".format(time.strftime('%c')))       # generating matrices
    matrizA = cria_matriz(linhas, colunas)
    matrizB = cria_matriz(linhas, colunas)
    matrizC = numpy.zeros(shape=(linhas, colunas))

    print("{}: Multiplicando matrizes".format(time.strftime('%c')))  # multiplying matrices
    queue = multiprocessing.JoinableQueue()
    queue_resultados = multiprocessing.JoinableQueue()

    for i in range(2):
        worker = multiprocessing.Process(target=multiplica_linha_coluna,
                                         args=(queue, queue_resultados, matrizA, matrizB, ))
        worker.daemon = True
        worker.start()

    for i in range(len(matrizA)):
        for j in range(len(matrizA[0])):
            queue.put((i, j))
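The fragment above starts mid-function; it acknowledges a task with task_done() before the result has been queued, so a queue.join() in the parent can return while results are still in flight, and the daemonised workers receive no sentinel. A complete, self-contained sketch of the same (i, j) fan-out pattern, with the acknowledgement issued after the result and explicit sentinels (illustrative structure, not the original script):

import multiprocessing
import numpy

def multiply_cell(task_queue, result_queue, A, B):
    # Each task is an (i, j) index pair; None is the stop sentinel.
    for i, j in iter(task_queue.get, None):
        value = sum(A[i][k] * B[k][j] for k in range(len(B)))
        result_queue.put((i, j, value))   # publish the result first...
        task_queue.task_done()            # ...then acknowledge the task
    task_queue.task_done()                # acknowledge the sentinel

if __name__ == '__main__':
    n = 4
    A = numpy.random.rand(n, n)
    B = numpy.random.rand(n, n)
    C = numpy.zeros((n, n))
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=multiply_cell,
                                       args=(tasks, results, A, B))
               for _ in range(2)]
    for w in workers:
        w.start()
    for i in range(n):
        for j in range(n):
            tasks.put((i, j))
    for _ in workers:
        tasks.put(None)
    tasks.join()                          # all cells acknowledged
    for _ in range(n * n):
        i, j, value = results.get()
        C[i][j] = value
    for w in workers:
        w.join()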
def joinable_queue(res, w): def pcheck_joins2(q, resultq, weight_type='ROOK'): while True: work = q.get() if work == None: #print "Got the pill." q.task_done() break #Unpack the args from q potential_neighbors = work[0] shapes = work[1] polygon_ids = work[2] mdict = {} weight_type = weight_type.upper() #print "Process {} working on polygons {} - {}.".format(pid, polygon_ids[0], polygon_ids[-1]) if weight_type == 'QUEEN': # check for a shared vertex vertCache = {} for polyId in polygon_ids: iVerts = shapes[polyId].vertices nbrs = potential_neighbors[polyId] if polyId not in vertCache: vertCache[polyId] = set(iVerts) if polyId not in w: w[polyId] = set() for j in nbrs: join = False if j not in vertCache: vertCache[j] = set(shapes[j].vertices) common = vertCache[polyId].intersection(vertCache[j]) if len(common) > 0: join = True if join: w[polyId].add(j) if j not in w: w[j] = set() w[j].add(polyId) return w elif weight_type == 'ROOK': # check for a shared edge edgeCache = {} for polyId in polygon_ids: if polyId not in edgeCache: iEdges = {} iVerts = shapes[polyId].vertices nv = len(iVerts) ne = nv - 1 for i in range(ne): l = iVerts[i] r = iVerts[i + 1] iEdges[(l, r)] = [] iEdges[(r, l)] = [] edgeCache[polyId] = iEdges nbrs = potential_neighbors[polyId] if polyId not in mdict: mdict[polyId] = [] for j in nbrs: join = False if j not in edgeCache: jVerts = shapes[j].vertices jEdges = {} nv = len(jVerts) ne = nv - 1 for e in range(ne): l = jVerts[e] r = jVerts[e + 1] jEdges[(l, r)] = [] jEdges[(r, l)] = [] edgeCache[j] = jEdges for edge in edgeCache[j]: if edge in edgeCache[polyId]: join = True d = mdict[polyId] d.append(j) mdict[polyId] = d if j not in mdict: mdict[j] = [] k = mdict[j] k.append(polyId) mdict[j] = k break #Put the resultant dict back into the queue and alert that the work is done. resultq.put(mdict) q.task_done() return t6 = time.time() cores = mp.cpu_count() #print #print "Managed Queue" #cores = 2 #Create a joinable queue from which to draw cells and a solution queue to get results ta = time.time() q = mp.JoinableQueue() resultq = mp.Queue() tb = time.time() #print "Made queues {}.".format(tb-ta) #Start up a number of child workers equal to the number of cores #This is a great way to manage a web service. jobs = [ mp.Process(target=pcheck_joins2, args=(q, resultq)) for x in range(cores) ] for job in jobs: job.start() tc = time.time() #print "Spawned processes {}".format(tc-tb) n = len(res['shapes']) starts = range(0, n, n / cores) ends = starts[1:] ends.append(n) offsets = [range(z[0], z[1]) for z in zip(starts, ends)] td = time.time() #print "Computing offsets {} ".format(td-tc) #Load the work into the queue #As the jobs are loaded, they start, so we avoid some of the packing overhead. for i in offsets: args = [] args.append(res['potential_neighbors']) args.append(res['shapes']) args.append(i) #args.append(weight_type='Queen') q.put_nowait(args) te = time.time() #print "Putting work on queue: {}".format(te-td) #Load a poison pill into the queue to kill the children when work is done for i in range(cores): q.put_nowait(None) results = [] for i in range(len(offsets)): results.append(resultq.get()) t7 = time.time() #tf = time.time() #print "Getting work off queue, i.e. 
processing done {}".format(tf-te) ddict = defaultdict(set) for d in (results): for key, value in d.items(): for v in value: ddict[key].add(v) tg = time.time() for job in jobs: job.join() #print "Joining results {}".format(tg-tf) t8 = time.time() for job in jobs: job.join() print "Joinable Queue Time: {0}".format(t8 - t6) print "Are the results the same? {0}".format(ddict == w)
apg_other.add_argument('--threads', type=int, metavar='N',
                       help='number of threads (default: 2)', default=2)
apg_other.add_argument('--debug', type=int,
                       help='print debug information; 0 = off, 1 = info, 2 = debug, 3 = details (default: 0)',
                       default=0)

options = parser.parse_args()

# check for required argument
if options.bbox == None:
    parser.print_help()
    sys.exit()

print("Bounding Box: %s" % options.bbox)
print("Metasize: {}".format(options.metasize))
print("Zoom: {}-{}".format(options.zooms[0], options.zooms[1]))

# setup queue to be used as a transfer pipeline from the render processes to the writer
writerQueue = multiprocessing.JoinableQueue(options.metasize * options.metasize)

# setup a lock for parts that only one process can execute (e.g. access the same file, print to screen)
if MULTIPROCESSING:
    lock = multiprocessing.Lock()   # multiprocessing
else:
    lock = threading.Lock()         # threading

writer = WriterThread(options, writerQueue, lock)
if MULTIPROCESSING:
    writer_thread = multiprocessing.Process(target=writer.loop)   # multiprocessing
else:
    writer_thread = threading.Thread(target=writer.loop)          # threading
writer_thread.start()

render_tiles(options.bbox, options.zooms, mapfile, options.metasize, writerQueue, lock,
             num_threads=options.threads, scale=1.0, debug=options.debug)
        # (fragment of the inference loop: reshape the heatmaps, run the model,
        #  and hand the predictions to the server queue)
        horizontal_heatmap = np.reshape(horizontal_heatmap, (1, 32, 57, 28, 1))
        vertical_heatmap = np.reshape(vertical_heatmap, (1, 32, 37, 28, 1))
        model_input = ((horizontal_heatmap, vertical_heatmap), ())

        start = time.time()
        predictions = loaded_model.predict(model_input)
        end = time.time()
        print(end - start)

        nano_serv.send_data_queue.put(predictions)
        print('inference done')
        cross_process_signal.put('switch')


if __name__ == "__main__":
    cross_process_signal = mp.Queue()
    cross_process_data = mp.JoinableQueue()
    received_data_queue = Qthread.Queue(5)
    send_data_queue = Qthread.Queue(5)
    nano_serv = ns.Nano_Server(received_data_queue, send_data_queue)

    pid = os.fork()
    if pid:
        time.sleep(30)
        red_blue_buffer(nano_serv, cross_process_signal, cross_process_data)
    else:
        inference_machine(nano_serv, cross_process_signal, cross_process_data)
def render_tiles(bbox, zooms, mapfile, metasize, writer, lock, num_threads = NUM_THREADS, scale = 1, debug = 0): # setup queue to be used as a transfer pipeline to the render processes renderQueue = multiprocessing.JoinableQueue(32) print "Setting up maps. Please wait..." # Launch render processes renderers = {} for i in range(num_threads): renderer = RenderThread(writer, mapfile, renderQueue, lock, zooms[1]) if MULTIPROCESSING: render_thread = multiprocessing.Process(target = renderer.loop) else: render_thread = threading.Thread(target = renderer.loop) render_thread.start() renderers[i] = render_thread # setup projection shortcuts gprj = GoogleProjection(zooms[1] + 1) LLtoPx = gprj.fromLLtoPixel # our map window to render ll0 = (bbox[0], bbox[3]) ll1 = (bbox[2], bbox[1]) # dimensions of map area for each zoom level ((left, top), (right, bottom)) px = [[LLtoPx(ll0, z), LLtoPx(ll1, z)] for z in xrange(0, zooms[1] + 1)] # setup tile and metadata dictionarys (https://docs.python.org/2/tutorial/datastructures.html#dictionaries) tileData = {'sum': 0}; # holds information of all tiles metaData = {}; # holds information of all metatiles # iterate over all requested zoom levels for z in range(zooms[0], zooms[1] + 1): # setup nested dictionaries for this zoom level tileData[z] = {} metaData[z] = {} # compute how many tiles need to be rendered at current zoom level tileData[z]['cols'] = int(ceil((px[z][1][0] - px[z][0][0]) / TILE_SIZE)) tileData[z]['rows'] = int(ceil((px[z][1][1] - px[z][0][1]) / TILE_SIZE)) # number of tiles for this zoom level tileData[z]['sum'] = tileData[z]['cols'] * tileData[z]['rows'] # update number of tiles overall tileData['sum'] += tileData[z]['sum'] # determine optimal metatile size if tileData[z]['sum'] <= (metasize * metasize): # whole map at this zoom level fits into one metatile (does not need to be a square) metaData[z]['width'] = tileData[z]['cols'] metaData[z]['height'] = tileData[z]['rows'] else: if tileData[z]['cols'] <= tileData[z]['rows']: metaData[z]['width'] = min(metasize, tileData[z]['cols']) metaData[z]['height'] = int(floor(metasize * metasize / metaData[z]['width'])) else: metaData[z]['height'] = min(metasize, tileData[z]['rows']) metaData[z]['width'] = int(floor(metasize * metasize / metaData[z]['height'])) # amount of metatiles for this zoom level metaData[z]['sum'] = int(ceil(float(tileData[z]['sum']) / float(metaData[z]['width'] * metaData[z]['height']))) if debug >= 2: print "px at z=", z, ": ", px[z] print "tileData at z=", z, ": ", tileData[z] print "metaData at z=", z, ": ", metaData[z] print "" print "Tiles to render: ", tileData['sum'], "\n" # transfer tile count to writer thread item = (Command.sum, tileData['sum'], None, None, None) writer.put(item) # loop over tiles in every zoom level and render metatiles for z in range(zooms[0], zooms[1] + 1): # tiles are rendered from left to right beginning at the top left corner and ending at the bottom right corner for y in range(0, int(ceil(float(tileData[z]['rows']) / metaData[z]['height']))): # calculate height of current metatile (can be reduced at bottom/right border of map) # check if bottom edge of metatile exceeds overall number of tiles in this column if ((y + 1) * metaData[z]['height']) > tileData[z]['rows']: # yes, limit to max possible metaheight = min(metaData[z]['height'], max(0, tileData[z]['rows'] - y * metaData[z]['height'])) else: # no, use full metatile height metaheight = metaData[z]['height'] for x in range(0, int(ceil(float(tileData[z]['cols']) / metaData[z]['width']))): # 
calculate width of current metatile (can be reduced at bottom/right border of map) # check if right border of metatile exceeds overall tiles in this row if ((x + 1) * metaData[z]['width']) > tileData[z]['cols']: # yes, limit metatile dimensions to maximum possible metawidth = min(metaData[z]['width'], max(0, tileData[z]['cols'] - x * metaData[z]['width'])) else: # no, use full width of metatile metawidth = metaData[z]['width'] # calculate dimensions of current metatile in pixels left = TILESIZE * (int(px[z][0][0] / TILE_SIZE) + x * metaData[z]['width']) top = TILESIZE * (int(px[z][0][1] / TILE_SIZE) + y * metaData[z]['height']) right = left + TILESIZE * metawidth bottom = top + TILE_SIZE * metaheight # create set of current metatile for the render queue metatile = (z, scale, (left, bottom), (right, top), metawidth, metaheight, debug) if debug >= 3: print "x=", x, " y=", y, " metawidth=", metawidth, "metaheight=", metaheight, " metatile=", metatile # add metatile to rendering queue renderQueue.put(metatile) # Signal render threads to exit by sending empty request to queue for i in range(num_threads): renderQueue.put(None) # wait for pending rendering jobs to complete renderQueue.join() for i in range(num_threads): renderers[i].join()
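Stripped of the tiling arithmetic, the queue choreography used by render_tiles() (a bounded JoinableQueue, one None sentinel per renderer, join() the queue and then the worker processes) reduces to the following generic sketch, shown here purely for illustration:

import multiprocessing

def render_worker(queue):
    # Pull metatile descriptions until the None sentinel arrives.
    while True:
        metatile = queue.get()
        if metatile is None:
            queue.task_done()
            break
        # ... render the metatile here ...
        queue.task_done()

if __name__ == '__main__':
    num_threads = 4
    render_queue = multiprocessing.JoinableQueue(32)   # bounded: put() blocks when full
    renderers = [multiprocessing.Process(target=render_worker, args=(render_queue,))
                 for _ in range(num_threads)]
    for r in renderers:
        r.start()
    for z in range(0, 3):
        for x in range(2 ** z):
            for y in range(2 ** z):
                render_queue.put((z, x, y))
    for _ in range(num_threads):
        render_queue.put(None)   # empty request signals the workers to exit
    render_queue.join()          # wait for pending rendering jobs to complete
    for r in renderers:
        r.join()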
def create_photometric_flatfield( filelist=None, input_hdus=None, strict_ota=False, smoothing=None, debug=False, return_interpolator=False, parallel=True, n_processes=-1, ): logger = logging.getLogger("PhotFlat") if (n_processes == 0): n_processes = multiprocessing.cpu_count() elif (n_processes < 0): n_processes = sitesetup.number_cpus if (smoothing is None): smoothing = 120. smoothing_pixels = smoothing / 0.11 logger.info("Using PF smoothing length of %.1f arcsec" % (smoothing)) pf = PhotFlatHandler( filelist=filelist, input_hdus=input_hdus ) n_frames = len(filelist) if filelist is not None else 0 n_hdus = len(input_hdus) if input_hdus is not None else 0 logger.info("Computing photometric flatfield from %d disk-files and %d memory-files" % ( n_frames, n_hdus) ) # logger.info("Input files:\n-- %s" % ("\n-- ".join(filelist))) pf.read_catalogs() reference_pos = [4., -4.] # that's in arc-min relative to reference point from CRVAL1/2 reference_zp = pf.get_reference_zeropoint( ra=reference_pos[0], dec=reference_pos[1], radius=3, relative_coords=True, max_error=0.05) logger.debug("Using reference ZP: %s" % (reference_zp)) # reference_zp = {} # list_of_otas = [] # list_of_extnames = [] # for framename in pf.phot_frames: # logger.info("Adding photometric data from file %s" % (framename)) # frame = pf.phot_frames[framename] # # zps = frame.get_zeropoints(ra=reference_pos[0], # dec=reference_pos[1], # radius=3, # relative_coords=True, max_error=0.05) # print zps # reference_zp[framename] = numpy.median(zps) # # # also collect a list of all available OTAs # list_of_otas.extend(frame.get_ota_list()) # list_of_extnames.extend(frame.get_extname_list()) # # print reference_zp unique_otas = pf.get_ota_set() unique_extnames = pf.get_extname_set() # set(list_of_otas) # print list_of_otas # # unique_extnames = set(list_of_extnames) # print unique_extnames # # Now extract the relative ZP differences for each of the sectors in each ota # sampling = 512 otalist = [pyfits.PrimaryHDU()] running_sum = 0 all_photflat = [] all_photflat_err = [] all_extnames = [] if (parallel): logger.debug("Calculating photometric flatfield in parallel") # prepare jobs extname_queue = multiprocessing.JoinableQueue() for i, extname in enumerate(unique_extnames): extname_queue.put(extname) result_queue = multiprocessing.Queue() # start parallel execution in separate processes processes = [] for i in range(n_processes): # start the process p = multiprocessing.Process( target=parallel_create_photometric_flatfields_worker, kwargs=dict( input_queue=extname_queue, result_queue=result_queue, pf=pf, reference_zp=reference_zp, sampling=sampling, smoothing=smoothing_pixels, ) ) # p.daemon = True p.start() processes.append(p) # also add a termination command to the job queue extname_queue.put(None) # Gather all results for _ in unique_extnames: (imghdu, photflat, photflat_err) = result_queue.get() otalist.append(imghdu) all_photflat.append(photflat) all_photflat_err.append(photflat_err) all_extnames.append(imghdu.name) logger.info("Received %d phot-flat extensions from parallel workers" % (len(otalist)-1)) else: logger.debug("Using the serial approach towards the photometric flatfield") for i, extname in enumerate(unique_extnames): logger.info("Computing photometric flat-field for OTA %s (%2d of %2d)" % (extname, i+1, len(unique_otas))) imghdu, photflat, photflat_err = create_photometric_flatfield_single_ota( extname=extname, pf=pf, reference_zp=reference_zp, sampling=sampling, enlarge=enlarge, ) otalist.append(imghdu) 
all_photflat.append(photflat) all_photflat_err.append(photflat_err) all_extnames.append(imghdu.name) logger.debug("Total sum of reference values: %d" % (running_sum)) # break # # Calculate the mean and/or median level of the photflat across # the mean level # all_photflat = numpy.array(all_photflat) fluxcorr = numpy.power(10., 0.4*all_photflat) numpy.savetxt("photcorr", all_photflat.ravel()) numpy.savetxt("flatcorr", fluxcorr.ravel()) numpy.save("photcorr_npy", all_photflat) mean_level = numpy.nanmean(fluxcorr) mean_mag = numpy.nanmean(all_photflat) logger.info("Mean photometric flatfield level: %8.5f (delta-mag=%7.4f)" % (mean_level, mean_mag)) import pickle fluxcorr /= mean_level pickle.dump((fluxcorr, all_extnames), open("photflat.pickle", "wb")) logger.debug("Correcting photometric flatfield mean level") for ota in otalist[1:]: ota.data /= mean_level logger.debug("Done correcting photometric flatfield mean level") hdulist = pyfits.HDUList(otalist) if (return_interpolator): return hdulist, (fluxcorr, all_extnames) return hdulist
import os
import ast  # needed for ast.literal_eval() below
import req_proxy
from bs4 import BeautifulSoup
from urlparse import urlparse
import multiprocessing
import logging
import time
from threading import Thread
import sys
from lxml import html

logging.basicConfig(level=logging.DEBUG,
                    format='(%(threadName)-10s) %(message)s',)

num_fetch_threads = 100
enclosure_queue = multiprocessing.JoinableQueue()


class pl_to_info(object):

    def __init__(self, line):
        line = str(line).strip()
        self.line_list = ast.literal_eval(line)
        f = open("to_extract_downloads")
        self.direc = f.read().strip()
        f.close()
def prepare(mikado_config: MikadoConfiguration, logger): """Main script function. :param mikado_config: the ArgumentParser-derived namespace. :param logger: a logging instance :type logger: logging.Logger """ if not hasattr(mikado_config.reference, "genome"): raise InvalidConfiguration( "Invalid configuration; reference: {}".format(mikado_config)) if hasattr(mikado_config.reference.genome, "close"): mikado_config.reference.genome.close() if hasattr(mikado_config.reference.genome, "filename"): mikado_config.reference.genome = getattr( mikado_config.reference.genome, "filename") elif hasattr(mikado_config.reference.genome, "name"): mikado_config.reference.genome = getattr( mikado_config.reference.genome, "name") else: logger.critical("Invalid FASTA file: %s", mikado_config.reference.genome) raise AttributeError elif not isinstance(mikado_config.reference.genome, (str, bytes)): logger.critical("Invalid FASTA file: %s", mikado_config.reference.genome) raise AttributeError if not os.path.exists(mikado_config.reference.genome): error = "Invalid FASTA file: {}".format(mikado_config.reference.genome) logger.critical(error) raise AttributeError(error) assert len(mikado_config.prepare.files.gff) > 0 assert len(mikado_config.prepare.files.gff) == len( mikado_config.prepare.files.labels), ( mikado_config.prepare.files.gff, mikado_config.prepare.files.labels) if mikado_config.prepare.strand_specific is True: mikado_config.prepare.files.strand_specific_assemblies = mikado_config.prepare.files.gff[:] ref_len = len(mikado_config.prepare.files.reference) file_len = len(mikado_config.prepare.files.gff) if ref_len == 0: mikado_config.prepare.files.reference = ([False] * file_len) elif (ref_len != file_len) or (mikado_config.prepare.files.reference[0] not in (True, False)): ref_set = set(mikado_config.prepare.files.reference) mikado_config.prepare.files.reference = [ (_ in ref_set) for _ in mikado_config.prepare.files.gff ] if not mikado_config.prepare.files.exclude_redundant: mikado_config.prepare.files.exclude_redundant = ( [getattr(mikado_config, "exclude_redundant", False)] * len(mikado_config.prepare.files.gff)) shelve_names = [ path_join(mikado_config.prepare.files.output_dir, "mikado_shelf_{}.db".format(str(_).zfill(5))) for _ in range(len(mikado_config.prepare.files.gff)) ] logger.propagate = False if mikado_config.prepare.single is False and mikado_config.threads > 1: multiprocessing.set_start_method(mikado_config.multiprocessing_method, force=True) mikado_config.logging_queue = multiprocessing.JoinableQueue(-1) log_queue_handler = logging.handlers.QueueHandler( mikado_config.logging_queue) log_queue_handler.setLevel(logging.DEBUG) # logger.addHandler(log_queue_handler) mikado_config.tempdir = tempfile.TemporaryDirectory( dir=mikado_config.prepare.files.output_dir) mikado_config.listener = logging.handlers.QueueListener( mikado_config.logging_queue, logger) mikado_config.listener.propagate = False mikado_config.listener.start() mikado_config.prepare.files.out_fasta = open( path_join(mikado_config.prepare.files.output_dir, mikado_config.prepare.files.out_fasta), 'w') mikado_config.prepare.files.out = open( path_join(mikado_config.prepare.files.output_dir, mikado_config.prepare.files.out), 'w') logger.info("Output dir: %s. Output GTF: %s. 
Output Fasta: %s", mikado_config.prepare.files.output_dir, mikado_config.prepare.files.out.name, mikado_config.prepare.files.out_fasta.name) logger.info("Loading reference file") mikado_config.reference.genome = pysam.FastaFile( mikado_config.reference.genome) logger.info("Finished loading genome file") logger.info("Started loading exon lines") errored = False try: # chrom, start, end, strand, tid, write_start, write_length, shelf rows = load_exon_lines( mikado_config, shelve_names, logger, min_length=mikado_config.prepare.minimum_cdna_length, max_intron=mikado_config.prepare.max_intron_length, ) logger.info("Finished loading exon lines") shelve_source_scores = [] for label in mikado_config.prepare.files.labels: shelve_source_scores.append( mikado_config.prepare.files.source_score.get(label, 0)) shelve_table = [] for shelf, score, is_reference, exclude_redundant in zip( shelve_names, shelve_source_scores, mikado_config.prepare.files.reference, mikado_config.prepare.files.exclude_redundant): assert isinstance(is_reference, bool), \ (is_reference, mikado_config.prepare.files.reference) shelve_table.append( (shelf, score, is_reference, exclude_redundant)) shelve_table = pd.DataFrame( shelve_table, columns=["shelf", "score", "is_reference", "exclude_redundant"]) rows = rows.merge(shelve_table, on="shelf", how="left") random.seed(mikado_config.seed) shelves = dict((shelf_name, open(shelf_name, "rb")) for shelf_name in shelve_table["shelf"].unique()) def divide_by_chrom(): # chrom, start, end, strand, tid, write_start, write_length, shelf transcripts = rows.groupby(["chrom"]) columns = rows.columns[1:] for chrom in sorted(transcripts.groups.keys()): logger.debug("Starting with %s (%d positions)", chrom, transcripts.size()[chrom]) yield from _analyse_chrom(chrom, rows.loc[transcripts.groups[chrom], columns], shelves, logger=logger) perform_check(divide_by_chrom(), shelve_names, mikado_config, logger) except Exception as exc: # TODO: Consider using stderr to signal errors here too? logger.exception(exc) __cleanup(mikado_config, shelve_names) errored = True logger.error("Mikado has encountered an error, exiting") # sys.exit(1) if mikado_config.prepare.single is False and mikado_config.threads > 1: mikado_config.tempdir.cleanup() mikado_config.listener.enqueue_sentinel() logger.setLevel(logging.INFO) __cleanup(mikado_config, shelve_names) logger.addHandler(logging.StreamHandler()) if errored is False: logger.info( "Mikado prepare has finished correctly with seed %s. The output %s FASTA file can now be " "used for BLASTX and/or ORF calling before the next step in the pipeline, `mikado serialise`.", mikado_config.seed, mikado_config.prepare.files.out_fasta) logging.shutdown() else: logger.error("Mikado prepare has encountered a fatal error. Please check the logs and, if there is a bug,"\ "report it to https://github.com/EI-CoreBioinformatics/mikado/issues") logging.shutdown() exit(1)
def main(): parser = argparse.ArgumentParser() parser.add_argument('-wd', dest='wd', help='full path to working directory', default=-1) parser.add_argument('-d', dest='DIR', help='full path to prep_TF directory') parser.add_argument('-s', dest='samples', help='samples file') parser.add_argument('-i', dest='ID', help='unique id of this sample') parser.add_argument('-eb', dest='exeBWA', help='full path to bwa executable', default="bwa") parser.add_argument('-es', dest='exeSAM', help='full path to samtools executable', default="samtools") parser.add_argument('-l1', dest='level', help='level of hierarchy to guide initial search') parser.add_argument('-l2', dest='cLevel', help='level of hierarchy to cluster') parser.add_argument('-q', dest='qual', help='map quality threshold', type=int) parser.add_argument( '-exclude', dest='exclude', help='newline separated list of te families to exclude from analysis', default=-1) parser.add_argument('-sd', dest='stdev', help='insert size standard deviation override', type=int, default=-1) parser.add_argument('-cov', dest='cov', help='manual coverage override', type=int, default=-1) parser.add_argument("-t", dest="nProc", type=int, default=1, help="Specify number of processes") args = parser.parse_args() # identify current working directory if args.wd == -1: cwd = os.getcwd() else: cwd = os.path.abspath(args.wd) # import options prep_TF = os.path.abspath(args.DIR) prefix = os.path.abspath(args.DIR).split("/")[-1].replace(".prep_TF", "") exeSAM = args.exeSAM exeBWA = args.exeBWA level = args.level cLevel = args.cLevel qual = args.qual nProc = args.nProc # check dependencies for function check_dependency(exeSAM) check_dependency(exeBWA) # import hierarchy hierFILE = os.path.join(prep_TF, prefix + ".hier") hierarchy, label = {}, [] ct = 0 with open(hierFILE, 'r') as fIN: for line in fIN: if ct == 0: label = line.split()[1:] else: hierarchy[line.split()[0]] = line.split()[1:] ct += 1 bam, pre = "", "" with open(os.path.abspath(args.samples), "r") as fIN: for line in fIN: if line.split()[1] == args.ID: pre = line.split()[1] bam = line.split()[0] if pre == "" or bam == "": print "Warning: prefix in samples file different from path in options" sys.exit() # identify the group-name of all TEs for the specified level of the hierarchy groups = [] groupIndex = label.index(level) for ID in hierarchy: groups.append(hierarchy[ID][groupIndex]) groups = sorted(set(groups)) # import the TE annotation annotation = [] with open(os.path.join(prep_TF, prefix + ".te.pseudo.bed"), 'r') as fIN: for line in fIN: arr = line.split() annotation.append([arr[0], int(arr[1]), int(arr[2]), arr[3]]) # import the chromosome lengths chromosomes, lengths = [], [] genomeSizeFILE = os.path.join(prep_TF, prefix + ".genomeSize.txt") with open(genomeSizeFILE, 'r') as fIN: for line in fIN: arr = line.split() chromosomes.append(arr[0]) lengths.append(int(arr[2])) # run samtools stats statsOutFile = bam.replace(".bam", ".stats.txt") print "Calculating alignment statistics" cmd = "%s stats -t %s %s" % (exeSAM, genomeSizeFILE, bam) print "cmd:", cmd p = sp.Popen(shlex.split(cmd), stdout=open(statsOutFile, 'w'), stderr=sp.PIPE) perr = p.communicate()[1] if p.returncode != 0: print "samtools stats issued error: %s" % (perr) sys.exit(1) # calculate coverage covFILE = bam.replace(".bam", ".cov.txt") cmd = """%s depth -Q %s %s | awk '{sum+=$3; sumsq+=$3*$3} END {print "Average = ",sum/NR; print "Stdev = ",sqrt(sumsq/NR - (sum/NR)**2)}' > %s""" % ( exeSAM, str(qual), bam, covFILE) print "cmd:", cmd 
os.system(cmd) # read samtools stats file with open(statsOutFile, 'r') as fIN: for line in fIN: if 'average length' in line: readLen = int(float(line.split()[-1])) if 'insert size average' in line: insz = int(float(line.split()[-1])) if 'insert size standard deviation' in line: sd = int(float(line.split()[-1])) if args.stdev == -1: print "Insert size standard deviation estimated as %s. Use the override option if you suspect this is incorrect!" % ( sd) if sd > 100: print "!!! Warning: insert size standard deviation reported as", sd, "!!!" print "Please ensure this is correct and use the override option!" sys.exit() else: sd = args.stdev # read coverage file cov = args.cov with open(covFILE, "r") as fIN: for line in fIN: if line.startswith("Av"): cov = int(float(line.split()[-1])) if cov == -1: print "Warning: coverage could not be estimated, enter coverage manually" sys.exit() # read list of TE groups to exclude from analysis if args.exclude == -1: excludeList = [] else: excludeList = [] with open(args.exclude, "r") as fIN: for line in fIN: excludeList.append(line.split()[0]) # define and create subdirectories bedDir = os.path.join(cwd, pre + ".bed_files") samDir = os.path.join(cwd, pre + ".sam_files") posDir = os.path.join(cwd, pre + ".te_positions") suppDir = os.path.join(cwd, pre + ".supplemental_alignments") outDir = os.path.join(cwd, "countPos") mkdir_if_not_exist(bedDir, posDir, samDir, suppDir, outDir) groups = [group for group in groups if group not in excludeList] #groups= ["doc3"] #debug single family print "Groups to search:", groups print "\nwriting TE bed files..." for group in groups: #print "group:",group wb.write_bed_portal(hierarchy, label, group, level, bedDir) print "writing TE bed files completed!" # reduce search-space 1 print "reducing search space..." try: bedFILE = os.path.join(bedDir, "mega_complete.bed") bamFILE = os.path.join(samDir, "mega_complete.bam") cmd = "%s view -@ %s -L %s %s -b" % (exeSAM, str(nProc), bedFILE, bam) print "cmd:", cmd p = sp.Popen(shlex.split(cmd), stdout=open(bamFILE, 'w'), stderr=sp.PIPE) perr = p.communicate()[ 1] # communicate returns a tuple (stdout, stderr) #print perr if p.returncode != 0: print "Error running samtools: p.returncode =", p.returncode sys.exit(1) except OSError: print "Cannot run samtools" print "search space succesfully reduced..." print "new reduced bam file:", bamFILE # run multiprocess 2 print "clustering TE positions..." task_q = mp.JoinableQueue() params = [ annotation, bamFILE, chromosomes, exeSAM, hierarchy, insz, label, lengths, level, cLevel, qual, readLen, sd, cov, bedDir, samDir, posDir, suppDir ] create_proc2(nProc, task_q, params) assign_task(groups, task_q, nProc) try: task_q.join() except KeyboardInterrupt: print "KeyboardInterrupt" sys.exit(0) else: print "\nclustering TE positions completed!" # combine bed files from all groups with open(os.path.join(bedDir, "mega_clustered.bed"), "w") as fOUT: for group in groups: with open(os.path.join(bedDir, "%s_clustered.bed" % (group)), "r") as fIN: for line in fIN: fOUT.write(line) # reduce search-space 2 print "final reduction of search space..." 
try: bedFILE = os.path.join(bedDir, "mega_clustered.bed") bamFILE = os.path.join(samDir, "mega_clustered.bam") cmd = "%s view -@ %s -q %s -L %s %s -b" % (exeSAM, str(nProc), str(qual), bedFILE, bam) print "cmd:", cmd p = sp.Popen(shlex.split(cmd), stdout=open(bamFILE, 'w'), stderr=sp.PIPE) perr = p.communicate()[ 1] # communicate returns a tuple (stdout, stderr) #print perr if p.returncode != 0: print "Error running samtools: p.returncode =", p.returncode sys.exit(1) except OSError: print "Cannot run samtools" print "search space succesfully reduced..." print "new reduced bam file:", bamFILE # run multiprocess 3 print "estimating TE breakpoints..." bamFILE = os.path.join(samDir, "mega_clustered.bam") task_q = mp.JoinableQueue() params = [ annotation, bamFILE, chromosomes, exeSAM, hierarchy, insz, label, lengths, level, cLevel, qual, readLen, sd, cov, bedDir, samDir, posDir, suppDir ] create_proc3(nProc, task_q, params) assign_task(groups, task_q, nProc) try: task_q.join() except KeyboardInterrupt: print "KeyboardInterrupt" sys.exit(0) else: print "\nestimating TE breakpoints completed!" # concatonate position estimates catFile = os.path.join(outDir, pre + ".all_positions.txt") try: files = "" for file in glob.glob(os.path.join(posDir, "*.txt")): files += file + " " cmd = "cat %s" % (files) #print "cmd:", cmd #p = sp.Popen(shlex.split(cmd), stdout=open(catFile, 'w'), stderr=sp.PIPE) p = sp.Popen(shlex.split(cmd), stdout=open(catFile, 'w'), stderr=sp.PIPE) perr = p.communicate()[ 1] # communicate returns a tuple (stdout, stderr) #print perr if p.returncode != 0: print "error concatenating positions" sys.exit(1) except OSError: print "Cannot concatenate positions" sys.exit(1) # sort position estimates print "Sorting positions..." sortp.sort_portal(catFile) # remove temporary directories shutil.rmtree(bedDir) shutil.rmtree(samDir) shutil.rmtree(posDir) shutil.rmtree(suppDir) print "TEFLON DISCOVERY FINISHED!"
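The TEFLoN helpers create_proc2, create_proc3 and assign_task are not shown in this excerpt; for task_q.join() to return they must follow the usual JoinableQueue worker pattern. A rough sketch under that assumption (function names, signatures and the worker body are guesses for illustration only, not the project's actual code):

import multiprocessing as mp

def assign_task(groups, task_q, nProc):
    # enqueue one work item per TE group, then one poison pill per worker
    for group in groups:
        task_q.put(group)
    for _ in range(nProc):
        task_q.put(None)

def create_proc2(nProc, task_q, params):
    # spin up nProc daemon workers that share the task queue
    for _ in range(nProc):
        p = mp.Process(target=worker, args=(task_q, params))
        p.daemon = True
        p.start()

def worker(task_q, params):
    while True:
        group = task_q.get()
        try:
            if group is None:      # poison pill: stop this worker
                break
            pass  # ... cluster TE positions for this group using params ...
        finally:
            task_q.task_done()     # required for task_q.join() to return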
    FIXME: dynamically fetch & update the RIPE managed tree
    """

    def __init__(self, lookup_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.lookup_queue = lookup_queue
        self.result_queue = result_queue
        self.tree = radix.Radix()
        self.prefixes = []
        self.dbname = "RIPE-AUTH"
        self.ready_event = multiprocessing.Event()
        self.lookup = RIPELookupWorker(self.tree, self.prefixes,
                                       self.lookup_queue, self.result_queue)
        self.lookup.setDaemon(True)
        self.lookup.start()

    def run(self):
        print "INFO: loaded the RIPE managed tree"
        self.ready_event.set()  # yay


if __name__ == "__main__":
    lookup_queue = multiprocessing.JoinableQueue()
    result_queue = multiprocessing.JoinableQueue()
    a = RIPEWorker(lookup_queue, result_queue)
    a.start()
    a.ready_event.wait()
    lookup_queue.put(("is_covered", "194.33.96.0/24"))
    lookup_queue.join()
    print result_queue.get()
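The snippet above delegates the actual queue handling to RIPELookupWorker, which is not shown; lookup_queue.join() in the main block only returns once that worker has called task_done() for every request it pulled off the queue. A minimal sketch of what such a worker loop might look like, assuming a (command, prefix) tuple protocol and the py-radix package, purely as an illustration:

import threading

class RIPELookupWorker(threading.Thread):
    """Hypothetical sketch only: the real RIPELookupWorker is not part of
    the excerpt above. It illustrates the task_done()/join() handshake."""

    def __init__(self, tree, prefixes, lookup_queue, result_queue):
        threading.Thread.__init__(self)
        self.tree = tree                  # radix.Radix() shared with the parent
        self.prefixes = prefixes
        self.lookup_queue = lookup_queue
        self.result_queue = result_queue

    def run(self):
        while True:
            command, prefix = self.lookup_queue.get()   # blocks for work
            if command == "is_covered":
                # search_best() returns a node when some stored prefix covers it
                self.result_queue.put(self.tree.search_best(prefix) is not None)
            # every get() must be paired with task_done(), otherwise
            # lookup_queue.join() in the caller never returns
            self.lookup_queue.task_done()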
def __loadCache(self, file): mp = False nan = 0 processes = [] single = False cache_file = None try: temp = RopperService.CACHE_FOLDER cache_file = temp + os.path.sep + self.__getCacheFileName(file) if not os.path.exists(cache_file): if not os.path.exists(cache_file + '_%d' % 1): return else: if isWindows(): raise RopperError('Cache has to be cleared.') mp = True and multiprocessing.cpu_count() > 1 else: single = True if self.__callbacks and hasattr(self.__callbacks, '__message__'): self.__callbacks.__message__('Load gadgets from cache') if self.__callbacks and hasattr(self.__callbacks, '__gadgetSearchProgress__'): self.__callbacks.__gadgetSearchProgress__(None, [], 0) if not mp: all_gadgets = [] if single: with open(cache_file, 'rb') as f: data = f.read() all_gadgets.extend(eval(decode(data, 'zip'))) if self.__callbacks and hasattr( self.__callbacks, '__gadgetSearchProgress__'): self.__callbacks.__gadgetSearchProgress__( None, all_gadgets, 1.0) else: for i in range(1, RopperService.CACHE_FILE_COUNT + 1): if os.path.exists(cache_file + '_%d' % i): with open(cache_file + '_%d' % i, 'rb') as f: data = f.read() all_gadgets.extend(eval(decode(data, 'zip'))) if self.__callbacks and hasattr( self.__callbacks, '__gadgetSearchProgress__'): self.__callbacks.__gadgetSearchProgress__( None, all_gadgets, float(i) / RopperService.CACHE_FILE_COUNT) return all_gadgets else: count = min(multiprocessing.cpu_count(), RopperService.CACHE_FILE_COUNT) gqueue = multiprocessing.Queue() fqueue = multiprocessing.JoinableQueue() for i in range(1, RopperService.CACHE_FILE_COUNT + 1): fqueue.put(cache_file + '_%d' % i) all_gadgets = [] for i in range(count): p = multiprocessing.Process( target=self.__loadCachePerProcess, args=(fqueue, gqueue)) p.start() processes.append(p) for i in range(count): fqueue.put(None) for i in range(RopperService.CACHE_FILE_COUNT): gadgets = gqueue.get() all_gadgets.extend(gadgets) if self.__callbacks and hasattr( self.__callbacks, '__gadgetSearchProgress__'): self.__callbacks.__gadgetSearchProgress__( None, all_gadgets, float(i + 1) / RopperService.CACHE_FILE_COUNT) return sorted(all_gadgets, key=Gadget.simpleInstructionString) except KeyboardInterrupt: if mp: for p in processes: if p and p.is_alive(): p.terminate() except BaseException as e: if mp: for p in processes: if p and p.is_alive(): p.terminate() if cache_file: for i in range(1, RopperService.CACHE_FILE_COUNT + 1): if os.path.exists(cache_file + '_%d' % i): os.remove(cache_file + '_%d' % i)
import multiprocessing as mp


def washer(dishes, output):
    for dish in dishes:
        print('Washing', dish, 'dish')
        output.put(dish)


def dryer(input):
    while True:
        dish = input.get()
        print('Drying', dish, 'dish')
        input.task_done()


dish_queue = mp.JoinableQueue()
dryer_proc = mp.Process(target=dryer, args=(dish_queue,))
dryer_proc.daemon = True
dryer_proc.start()

dishes = ['salad', 'bread', 'entree', 'dessert']
washer(dishes, dish_queue)
dish_queue.join()
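In this example the dryer runs as a daemon process, so it is terminated abruptly when the main process exits after dish_queue.join(). A minimal alternative sketch, assuming an orderly shutdown is wanted, is to send an explicit sentinel and join() the worker process instead of relying on daemon teardown:

import multiprocessing as mp

def dryer(input):
    while True:
        dish = input.get()
        if dish is None:          # sentinel: no more dishes
            input.task_done()
            break
        print('Drying', dish, 'dish')
        input.task_done()

if __name__ == '__main__':
    dish_queue = mp.JoinableQueue()
    dryer_proc = mp.Process(target=dryer, args=(dish_queue,))
    dryer_proc.start()

    for dish in ['salad', 'bread', 'entree', 'dessert']:
        dish_queue.put(dish)
    dish_queue.put(None)          # tell the dryer to stop

    dish_queue.join()             # wait until every put() has been task_done()
    dryer_proc.join()             # worker exits on its own, no daemon kill needed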
def run_cmh(args): ''' run Cochran-Mantel-Hasenzle test ''' sz_utils.make_dirs_if_necessary(args.outp) allele_counts = {} pvals = {} tables = collections.defaultdict(list) ntests = 0 tables, ntables_per_snp = sz_utils._count2table(args.table_file) ColorText().info("[poolseq_tk]: %d tables prepared\n" %(len(tables)), "stderr") task_q = mp.JoinableQueue() result_q = mp.Queue() create_procs(args.nproc,task_q, result_q, ntables_per_snp, args.outp) sz_utils._assign_tables(tables, task_q, args.nproc) # waiting for all tasks to be finished try: task_q.join() except KeyboardInterrupt: ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n", "stderr") sys.exit() else: # merge results pvals, odds_ratios = {}, {} while args.nproc: file = result_q.get() with open(file, 'r') as fIN: for line in fIN: tmp_line = line.strip().split("\t") chr = tmp_line[0] pos = int(tmp_line[1]) pval = float(tmp_line[2]) odds_ratio = float(tmp_line[3]) if (chr, pos) not in pvals: pvals[chr, pos] = pval if (chr, pos) not in odds_ratios: odds_ratios[chr, pos] = odds_ratio os.remove(file) # pvals_split, odds_ratios_split = result_q.get() # pvals.update(pvals_split) # odds_ratios.update(odds_ratios_split) args.nproc -= 1 ColorText().info("[poolseq_tk]: Running CMH tests successfully\n", "stderr") # correcting raw p-values ColorText().info("[poolseq_tk]: multi-testing correction using %s method at %d%% level ..." %(args.adj_method, args.adj_cutoff*100), "stderr") raw_pvals = [pvals[chr, pos] for chr, pos in sorted(pvals.iterkeys())] raw_pvals_vector = robjects.FloatVector(raw_pvals) padjust = robjects.r['p.adjust'](raw_pvals_vector, method=args.adj_method) ColorText().info(" [done]\n", "stderr") pcutoff = sz_utils.getFDR_BH(pvals, args.adj_cutoff) ColorText().info("[poolseq_tk]: p-value cutoff using Benjamini.Hochberg procedure %.5e" %(pcutoff), "stderr") ColorText().info(" [done]\n", "stderr") # output p-values ColorText().info("[poolseq_tk]: output to files ...", "stderr") out_all = args.outp + ".cmh.all" out_fdr = args.outp + ".cmh.fdr%d" %(args.adj_cutoff*100) out_expect = args.outp + ".cmh.fdr%d.expect" %(args.adj_cutoff*100) sz_utils.make_dirs_if_necessary(out_all, out_fdr) with open(out_all, 'w') as fALL, \ open(out_fdr, 'w') as fFDR, \ open(out_expect, 'w') as fEXPECT: for i, k in enumerate(sorted(pvals.iterkeys())): chr = k[0] pos = k[1] raw_pval = pvals[chr, pos] log_pval = None if raw_pval == 0.0: log_pval = "Inf" elif raw_pval == "Nan": raw_pval = 1.0 log_pval = 0.0 else: log_pval = -1 * math.log10(raw_pval) odds_ratio = odds_ratios[k] if padjust[i] <= args.adj_cutoff: sz_utils._results_outputter(fFDR, pos, chr, "\t".join(tables[chr, pos][1:3]), tables[chr, pos][3:], raw_pval, log_pval, padjust[i], odds_ratio) if ((args.oddsr_direction == "greater" and odds_ratios[chr, pos] > 1) or (args.oddsr_direction == "less" and odds_ratios[chr, pos] < 1)): sz_utils._results_outputter(fEXPECT, pos, chr, "\t".join(tables[chr, pos][1:3]), tables[chr, pos][3:], raw_pval, log_pval, padjust[i], odds_ratio) sz_utils._results_outputter(fALL, pos, chr, "\t".join(tables[chr, pos][1:3]), tables[chr, pos][3:], raw_pval, log_pval, padjust[i], odds_ratio) ColorText().info(" [done]\n", "stderr") ColorText().info("[poolseq_tk]: Program finishes successfully\n", "stderr")
def appSearchMP(dbfilenameFullPath, searchType, search_space, options): (outputFile, maxCores) = (options.outputFile, options.maxCores) known_bad_data = None # Start timer t0 = time.time() # If possible use the available indexes if searchType == 'LITERAL' and options.searchLiteral[0][0] not in [ '=', '>', '<' ] and (search_space.lower() == 'filename' or search_space.lower() == 'filepath'): num_hits = namedtuple('hits', 'value') num_hits_suppressed = namedtuple('hits', 'value') (num_hits.value, num_hits_suppressed.value, results) = runIndexedSearch(dbfilenameFullPath, search_space, options) else: # Get total number of entries to search DB = appDB.DBClass(dbfilenameFullPath, True, settings.__version__) conn = DB.appConnectDB() entriesCount = DB.CountEntries() logger.debug("Total entries in search space: %d" % entriesCount) # Pre-load known_bad if required if searchType == 'KNOWNBAD': known_bad_data = LoadRegexBulkSearch(options.knownbad_file) # Establish communication queues tasks = multiprocessing.JoinableQueue() resultsProducers = multiprocessing.Queue() resultsConsumers = multiprocessing.Queue() hitHistogram_queue = multiprocessing.Queue() # Start producers/consumers num_consumers = 1 num_producers = max(1, maxCores - 1) # Prep lock for progress update Producers progProducers = multiprocessing.Value('i', 0) # Prep lock for progress update Consumers progConsumers = multiprocessing.Value('i', 0) # Prep Consumers return values num_hits = multiprocessing.Value('i', 0) num_hits_suppressed = multiprocessing.Value('i', 0) logger.debug( 'Using %d cores for searching / %d cores for dumping results' % (num_producers, num_consumers)) # Queue tasks for Producers # Limit rowsPerJob to constrain memory use and ensure reasonable progress updates rowsPerJob = min((entriesCount / 8), 5000) logger.debug("RowsPerJob: %d" % rowsPerJob) num_tasks = 0 for startingRowID in range(0, entriesCount - rowsPerJob, rowsPerJob): tasks.put(Task(startingRowID, rowsPerJob - 1)) logger.debug( "Creating search job %d: [%d - %d]" % (num_tasks, startingRowID, startingRowID + rowsPerJob - 1)) num_tasks += 1 logger.debug("Creating search job %d: [%d - %d]" % (num_tasks, num_tasks * (rowsPerJob), ((num_tasks * rowsPerJob) + (entriesCount - (num_tasks * (rowsPerJob) - 1))))) # Special consideration for the last one: tasks.put( Task(num_tasks * (rowsPerJob), (entriesCount - ((num_tasks * rowsPerJob) - 1)))) logger.debug("Number of tasks: %d" % num_tasks) # Add a poison pill for each producer for i in xrange(num_producers): tasks.put(None) # Start producer threads producers = [Producer(tasks, resultsProducers, dbfilenameFullPath, progProducers, num_consumers, \ searchType, search_space, options, num_hits, known_bad_data) for i in xrange(num_producers)] for producer in producers: producer.daemon = True # Remove for debugging producer.start() # Start consumer threads consumers = [Consumer(resultsProducers, resultsConsumers, progConsumers, num_producers, outputFile, \ dbfilenameFullPath, searchType, search_space, options, num_hits, \ num_hits_suppressed, hitHistogram_queue, known_bad_data) for i in xrange(num_consumers)] for consumer in consumers: consumer.daemon = True # Remove for debugging consumer.start() # Producer progress loop while (num_tasks > progProducers.value and progProducers.value >= 0): logger.debug("Producer num_tasks: %d - v.value: %d" % (num_tasks, progProducers.value)) update_progress( min(1, float(progProducers.value) / float(num_tasks)), "Searching [%d]" % (num_hits.value - num_hits_suppressed.value)) 
time.sleep(0.5) update_progress( 1, "Searching [%d]" % (num_hits.value - num_hits_suppressed.value)) # Wait for consumers dumping results to finish too while (num_hits.value > progConsumers.value and progConsumers.value >= 0): logger.debug("Consuming hit: %d / %d" % (progConsumers.value, num_hits.value)) update_progress( min(1, float(progConsumers.value) / float(num_hits.value)), "Dumping results to disk [%d]" % progConsumers.value) time.sleep(0.5) # Make sure we dumped as many hits as we found assert (num_hits.value == progConsumers.value) update_progress(1, "Dumping results to disk [%d]" % progConsumers.value) # Track Consumers deaths logger.debug("Waiting for consumer reverse-poison pills") while num_consumers > 0: tmp = resultsConsumers.get() # Check for reverse-poison pill if tmp is None: num_consumers -= 1 logger.debug("Consumer finished!") logger.debug("All consumers accounted for") # Wait for consumer threads to finish logger.debug("Waiting for consumer threads to finish") for consumer in consumers: consumer.join() logger.debug("Consumer threads finished") # Print hit histogram: results = [] results.append(('cyan', ("Hit histogram:", "", ""))) while not hitHistogram_queue.empty(): (name, regex, regex_hits) = hitHistogram_queue.get() results.append(('white', (name, regex, regex_hits))) if len(results) > 1: outputcolum(results) # Stop timer t1 = time.time() logger.info("Search hits: %d" % num_hits.value) logger.info("Suppresed duplicate hits: %d" % num_hits_suppressed.value) logger.info("Search time: %s" % (str(timedelta(seconds=(t1 - t0))))) if num_hits.value: logger.info("Head:") # Dump head of output file: num_lines = file_len(options.outputFile) from itertools import islice with open(options.outputFile) as myfile: head = list(islice(myfile, 5)) for line in head: logger.info(line.strip('\n\r')) logger.info("(%d lines suppressed)" % max(0, (num_lines - 5))) return (num_hits.value, num_hits_suppressed.value, results)
def export_interpolated_data(path, X, Y, Z, fesvar, vdim=1, complex=False, nproc=1, ncfile='data.nc', curl=False, return_mask=False): from netCDF4 import Dataset results = mp.JoinableQueue() workers = [None] * nproc for i in range(nproc): w = exporter_child(results, i, nproc, path, X, Y, Z, fesvar, vdim, complex, curl=curl) workers[i] = w time.sleep(0.1) for w in workers: w.daemon = True w.start() res = [results.get() for x in range(len(workers))] for x in range(len(workers)): results.task_done() size = len(X.flatten()) if complex: ans = np.zeros((vdim, size), dtype=np.complex) else: ans = np.zeros((vdim, size), dtype=np.float) mask = np.zeros(len(X.flatten()), dtype=int) - 1 for idx, mm, dd in res: if mm is None: print(dd) assert False, "Child Process Failed" else: if idx.size == 0: continue print("here", idx.shape, dd.shape) if vdim == 1: ans[idx] = dd else: ans[:, idx] = dd mask[idx] = mm ans = ans.reshape(-1, X.shape[0], X.shape[1], X.shape[2]) mask = mask.reshape(X.shape[0], X.shape[1], X.shape[2]) if ncfile != '': nc = Dataset(ncfile, "w", format='NETCDF4') nc.createDimension('vdim', vdim) nc.createDimension('dim_0', X.shape[0]) nc.createDimension('dim_1', X.shape[1]) nc.createDimension('dim_2', X.shape[2]) if complex: a_real = nc.createVariable(fesvar + '_real', np.dtype('double'), ('vdim', 'dim_0', 'dim_1', 'dim_2')) a_real[:] = ans.real a_imag = nc.createVariable(fesvar + '_imag', np.dtype('double'), ('vdim', 'dim_0', 'dim_1', 'dim_2')) a_imag[:] = ans.imag else: a_real = nc.createVariable(fesvar, np.dtype('double'), ('vdim', 'dim_0', 'dim_1', 'dim_2')) a_real[:] = ans xx = nc.createVariable('X', np.dtype('double'), ('dim_0', 'dim_1', 'dim_2')) yy = nc.createVariable('Y', np.dtype('double'), ('dim_0', 'dim_1', 'dim_2')) zz = nc.createVariable('Z', np.dtype('double'), ('dim_0', 'dim_1', 'dim_2')) rank = nc.createVariable('rank', np.dtype('double'), ('dim_0', 'dim_1', 'dim_2')) xx[:] = X yy[:] = Y zz[:] = Z rank[:] = mask nc.close() if return_mask: return ans, mask else: return ans
json_files = []
for file_name in sorted(os.listdir(INPUT_JSON_DIR)):
    file_path = os.path.abspath(os.path.join(INPUT_JSON_DIR, file_name))
    json_files.append(file_path)

if args.input_avg_mq_json is not None:
    json_avg_mq_files = [args.input_avg_mq_json]
else:
    json_avg_mq_files = []
    for file_name in sorted(os.listdir(INPUT_JSON_AVG_MQ_DIR)):
        file_path = os.path.abspath(
            os.path.join(INPUT_JSON_AVG_MQ_DIR, file_name))
        json_avg_mq_files.append(file_path)

multiprocessing.set_start_method('spawn')
queue1 = multiprocessing.JoinableQueue()
queue2 = multiprocessing.JoinableQueue()
num_files = len(newick_files)
cpus = set_num_cpus(num_files, args.processes)
# Set a timeout for get()s in the queue.
timeout = 0.05

for i, newick_file in enumerate(newick_files):
    json_file = json_files[i]
    json_avg_mq_file = json_avg_mq_files[i]
    queue1.put((newick_file, json_file, json_avg_mq_file))

# Complete the preprocess_tables task.
processes = [
    multiprocessing.Process(target=preprocess_tables, args=(
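The snippet sets timeout = 0.05 for the queue get() calls but breaks off before the preprocess_tables worker itself. A minimal sketch of how a worker typically drains a JoinableQueue with such a timeout (the function body and its argument list here are assumptions, not the original code):

import multiprocessing
import queue  # only needed for the Empty exception

def preprocess_tables(task_queue, timeout):
    # Hypothetical worker body: keep pulling (newick, json, avg_mq) trios
    # until the queue stays empty for `timeout` seconds, marking each item
    # done so that task_queue.join() in the parent can return.
    while True:
        try:
            newick_file, json_file, json_avg_mq_file = task_queue.get(
                block=True, timeout=timeout)
        except queue.Empty:
            break
        try:
            pass  # ... build the output table for this trio of files ...
        finally:
            task_queue.task_done()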
        self.a = a
        self.b = b

    def __call__(self):
        time.sleep(.1)  # pretend to take time to do the work
        return f'{self.a} * {self.b} = {self.a * self.b}'

    def __str__(self):
        return f'{self.a} * {self.b}'
    pass


if __name__ == "__main__":
    ### establishes communicate queues.
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    ### starts consumers
    num_consumers = multiprocessing.cpu_count() * 2
    logging.debug(f'creating {num_consumers} consumers')
    consumers = [Consumer(tasks, results) for _ in range(num_consumers)]
    for w in consumers:
        w.start()

    ### Enqueues jobs
    num_jobs = 10
    for i in range(num_jobs):
        tasks.put(Task(i, i))

    ### adds poison pill for each consumer
    for _ in range(num_consumers):
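The snippet above breaks off just before the poison pills are enqueued, and the Consumer class it references is not shown. The sketch below fills in the standard shape of that pattern under those assumptions; it is an illustration of the technique, not the original file's text:

import multiprocessing

class Consumer(multiprocessing.Process):
    """Sketch of the Consumer assumed above: pulls Task objects until it
    receives the None poison pill, calling task_done() for every get()."""

    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        while True:
            next_task = self.task_queue.get()
            if next_task is None:         # poison pill means shutdown
                self.task_queue.task_done()
                break
            answer = next_task()          # Task defines __call__
            self.task_queue.task_done()
            self.result_queue.put(answer)

# A likely continuation of the __main__ block above (an assumption, not the
# original text): one pill per consumer, wait for completion, drain results.
#     for _ in range(num_consumers):
#         tasks.put(None)
#     tasks.join()
#     while num_jobs:
#         print(results.get())
#         num_jobs -= 1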
def genCM(n_mc_numbers, processes=0):
    '''Uses multiple cores to try to generate [n_mc_numbers] Carmichael numbers
    using [processes] cores (processes = 0: auto detection of num cores).

    Basic idea for the multicore implementation from this site:
    http://www.doughellmann.com/PyMOTW/multiprocessing/communication.html#multiprocessing-queues
    '''
    # Establish communication queues
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start workers
    if processes == 0:
        if multiprocessing.cpu_count() == 1:  # single processor machine
            num_workers = 1
        else:
            num_workers = multiprocessing.cpu_count() - 1  # give one to the os...
    else:
        num_workers = processes

    print '\n---------------------------------------------------------------------'
    print 'Starting Carmichael Number generation'
    print '---------------------------------------------------------------------'
    print '\n ...creating %d workers\n' % num_workers
    workers = [Worker(tasks, results) for i in xrange(num_workers)]
    for w in workers:
        w.start()

    loop_cnt = 0
    taskcnt = 0
    aborted = False
    print 'Searching for', n_mc_numbers, 'Carmichael Numbers...'
    while results.qsize() < n_mc_numbers:  # check from time to time whether there's a result or empty queue
        if taskcnt > 20000:  # abort anyway after cnt checked numbers
            aborted = True  # implement handle for this later
            break
        if tasks.qsize() < num_workers * 2:  # fill up queue if it's running low
            for i in range(num_workers):
                tasks.put(Task_cm_check(taskcnt, taskcnt + 1))  # each task is checking a number, with offset 1..
                if taskcnt % 1000 == 0:
                    print ' ... checking range: ', taskcnt + 1, '-', taskcnt + 1000
                taskcnt += 1
        loop_cnt += 1
        # sleep(0.1)  # suspend loop to free up cpu time for workers; don't do this here, workers are waiting for jobs else...

    # Add a poison pill for each Worker
    print '\nsending kill signal to processes....'
    for i in xrange(num_workers):
        tasks.put(None)
    print ' ...done'

    # calm down (let some old tasks finish running)
    print 'cleaning up processes and queues...'
    while tasks.qsize() > 0:
        sleep(0.5)
    print ' ...done'

    # Wait for all of the tasks to finish
    print 'waiting for processes to shutdown....'
    tasks.join()
    print ' ...done'

    print 'extracting results...'
    r = []
    while results.qsize():
        r.append(results.get(True, 0.1))  # throw away additional solutions
    print ' ...done'

    print '\n\n------------------------------------------------'
    print ' Final Result: '
    print '------------------------------------------------'
    for i, ele in enumerate(r):
        print 'CM Number #:', i + 1, 'is:', ele
    print '------------------------------------------------'
    print 'stats:'
    print ' spawned tasks:', taskcnt
    print ' loop counter :', loop_cnt
    print '------------------------------------------------'
    return True
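This function drives its main loop off results.qsize() and tasks.qsize(). As a general point about the API (not something stated in the snippet), multiprocessing.Queue.qsize() is only approximate and raises NotImplementedError on platforms without sem_getvalue(), such as macOS, so a portable variant would count completed results explicitly. A small sketch under assumed names (results, n_mc_numbers):

# qsize()-free result collection: count what we actually received.
import queue

found = []
while len(found) < n_mc_numbers:
    try:
        found.append(results.get(timeout=0.5))   # block briefly for the next hit
    except queue.Empty:
        continue  # nothing yet; submit more work or check abort conditions here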