def _cluster_multi_process(p: multiprocessing.pool.Pool, data_normalized, start, end, st, dist_func, verbose):
    """Group and cluster the normalized time series across the worker pool, then collect the results."""
    # if len(data_normalized) < p._processes:  # group the time series first if # time series < # workers
    group_partition = __partition_and_group(data_normalized, p._processes, start, end, p)
    cluster_arg_partition = [(x, st, dist_func, verbose) for x in group_partition]
    # Linear cluster for debug purposes
    # cluster_partition = []
    # for arg in cluster_arg_partition:
    #     cluster_partition.append(_cluster_groups(*arg))
    cluster_partition = p.starmap(_cluster_groups, cluster_arg_partition)
    cluster_meta_dict = _cluster_to_meta_mp(cluster_partition, p)
    subsequences = flatten(p.map(get_second, flatten(group_partition)))
    return subsequences, cluster_partition, cluster_meta_dict
def runtime_parallel_np(pool: mp.pool.Pool, sample: np.ndarray, func: Callable, num_resamples: int, resample_size: int = None) -> float:
    """Time `num_resamples` bootstrap resamples of `sample` run across the worker pool."""
    assert isinstance(sample, np.ndarray), "Please convert the input into a numpy ndarray"
    sample_size = len(sample)
    if resample_size is None or resample_size > sample_size:
        resample_size = sample_size
    # Mirror the sample into a process-shared ctypes buffer, viewed as a numpy array
    sample_ = mp.Array(ctypes.c_double, sample_size)
    sample_shr = np.ctypeslib.as_array(sample_.get_obj())
    sample_shr[:] = sample
    t_start = time.perf_counter()
    pool.starmap_async(bootstrap_np, [(sample_shr, func, resample_size) for _ in range(num_resamples)]).get()
    t_end = time.perf_counter()
    return t_end - t_start
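# Illustrative sketch only: `bootstrap_np` is referenced above but defined elsewhere
# in this project. Assuming it draws a single resample with replacement from the
# shared sample and reduces it with the statistic `func`, a minimal version could
# look like the hypothetical helper below (name and signature are assumptions).
def _bootstrap_np_sketch(sample_shr: np.ndarray, func: Callable, resample_size: int):
    idx = np.random.randint(0, len(sample_shr), size=resample_size)  # indices drawn with replacement
    return func(sample_shr[idx])  # e.g. func=np.mean gives one bootstrap estimate of the mean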
def _query_mp(p: multiprocessing.pool.Pool, clusters, **kwargs):
    query_arg_partition = [[x] + list(kwargs.values()) for x in clusters]
    # Linear query for debug purposes
    # candidates = []
    # for qp in query_arg_partition:
    #     rtn = _query_partition(*qp)
    #     candidates.append(rtn)
    candidates = flatten(p.starmap(_query_partition, query_arg_partition))
    return candidates
def __partition_and_group(data, slice_num, start, end, p: multiprocessing.pool.Pool, shuffle=True):
    # Note: the partition count comes from the pool itself; `slice_num` and `shuffle`
    # are accepted but not used here.
    data_partition = _partitioner(data, p._processes)
    group_arg_partition = [(x, start, end) for x in data_partition]
    # Linear partitioning for debugging
    # group_partition = []
    # for arg in group_arg_partition:
    #     group_partition.append(_group_time_series(*arg))
    group_partition = p.starmap(_group_time_series, group_arg_partition)
    return group_partition
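# Illustrative sketch only: `_partitioner` is defined elsewhere in the project. The
# hypothetical helper below shows the behaviour assumed above -- splitting `data`
# into roughly equal contiguous slices, one per worker process.
def _partitioner_sketch(data, slice_num):
    chunk = -(-len(data) // slice_num)  # ceiling division: size of each slice
    return [data[i:i + chunk] for i in range(0, len(data), chunk)]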
def _query_bf_mp(query, p: multiprocessing.pool.Pool, subsequences: list, dt_index):
    # Brute-force query: compute the distance from the query to every subsequence in parallel
    dist_subsequences_arg = [(query, x, dt_index) for x in subsequences]
    dist_subsequences = p.starmap(get_dist_query, dist_subsequences_arg)
    return dist_subsequences
def _cluster_to_meta_mp(cluster_partition: list, p: multiprocessing.pool.Pool):
    clusters = flatten(cluster_partition)
    temp = p.map(_cluster_to_meta, clusters)
    return tuple(reduce_by_key(_cluster_reduce_func, temp))
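# Illustrative sketch only: `flatten` and `reduce_by_key` are project helpers used by
# the multiprocessing utilities above but not defined in this module. The hypothetical
# versions below show the behaviour the callers rely on: `flatten` removes one level of
# nesting, and `reduce_by_key` merges (key, value) pairs sharing a key with a binary
# reduce function. Local imports keep the sketches self-contained.
def _flatten_sketch(nested):
    from itertools import chain
    return list(chain.from_iterable(nested))  # collapse one level of nesting

def _reduce_by_key_sketch(reduce_func, pairs):
    from functools import reduce
    from itertools import groupby
    pairs = sorted(pairs, key=lambda kv: kv[0])  # groupby requires sorted keys
    return [(key, reduce(reduce_func, (value for _, value in group)))
            for key, group in groupby(pairs, key=lambda kv: kv[0])]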
def main():
    """EdiTyper"""
    # Setup EdiTyper
    # Parse arguments
    parser = arguments.make_argument_parser()  # type: argparse.ArgumentParser
    if not sys.argv[1:] or any(map(lambda a: a in sys.argv, ('-h', '--help'))):
        sys.exit(parser.print_help())
    args = {key: value for key, value in vars(parser.parse_args()).items() if value is not None}  # type: Dict[str, Any]
    # Make an output directory
    if os.path.exists(args['outdirectory']):
        args['outdirectory'] = args['outdirectory'] + time.strftime('_%Y-%m-%d_%H:%M')
    try:
        os.makedirs(args['outdirectory'])
    except OSError:
        pass
    finally:
        # Make a prefix for project-level output files
        output_prefix = os.path.join(args['outdirectory'], args['project'])  # type: str
    # Setup logger
    # Formatting values
    log_format = '%(asctime)s %(levelname)s:\t%(message)s'  # type: str
    date_format = '%Y-%m-%d %H:%M:%S'  # type: str
    # Formatters
    stripped_formatter = toolkit.StrippedFormatter(fmt=log_format, datefmt=date_format)  # type: toolkit.StrippedFormatter
    colored_formatter = toolkit.ColoredFormatter(fmt=log_format, datefmt=date_format)  # type: toolkit.ColoredFormatter
    # Open /dev/null (or whatever it is on Windows) to send basic stream information to
    devnull = open(os.devnull, 'w')  # type: file
    # Configure the logger
    verbosity = _set_verbosity(level=args['verbosity'])  # type: int
    logging.basicConfig(
        stream=devnull,
        level=verbosity,
    )
    # If we're being verbose, capture other warnings (mainly matplotlib and numpy)
    # Otherwise, ignore them
    if verbosity == logging.DEBUG:
        logging.captureWarnings(True)
    else:
        warnings.filterwarnings('ignore')
    # Setup a FileHandler for the log file
    # Use a StrippedFormatter to remove extra ANSI color codes
    logname = output_prefix + '.log'
    logfile = logging.FileHandler(filename=logname, mode='w')  # type: logging.FileHandler
    logfile.setFormatter(stripped_formatter)
    logging.getLogger().addHandler(logfile)
    # Setup the console handler
    # Use a ColoredFormatter because colors are cool
    console = logging.StreamHandler()  # type: logging.StreamHandler
    console.setFormatter(colored_formatter)
    logging.getLogger().addHandler(console)
    # Begin the program
    logging.info("Welcome to %s %s!", os.path.basename(sys.argv[0]), arguments.VERSION)
    program_start = time.time()  # type: float
    # Where are we putting our output directory?
    logging.warning("Using outdirectory \x1b[1m%s", args['outdirectory'])
    logging.warning("Full logfile can be found at %s", logname)
    # Check suppression values and other arguments
    if args['suppress_sam']:  # Suppressed SAM output?
        logging.warning("SAM output suppressed, not writing SAM file")
        args['bam'] = False
    elif args['bam']:
        # Check for SAMtools
        try:
            args['samtools_exec'] = toolkit.which('samtools')
        except ValueError:  # No SAMtools found
            logging.error("Cannot find SAMtools, outputting SAM instead of BAM")
            args['bam'] = False
    if args['suppress_events'] or args['suppress_tables']:  # Suppressed events table?
        logging.warning("Events output suppressed, not writing events table")
    if args['suppress_classification'] or args['suppress_tables']:  # Suppressed classification table?
        logging.warning("Read classification suppressed, not writing classification table")
    if args['suppress_plots']:  # Suppressed plots?
        logging.warning("Plots suppressed, not creating plots")
    else:
        # Search for Rscript
        try:
            args['Rscript'] = toolkit.which('Rscript')
        except ValueError:  # No Rscript found
            logging.error("Cannot find Rscript, not generating plots")
            args['suppress_plots'] = True
    if _check_suppressions(suppressions=args):  # All output suppressed? Error
        sys.exit(logging.critical("All output suppressed, not running"))
    # if args['xkcd']:
    #     plots._XKCD = True
    # Enable the profiler if desired
    if args['profile']:
        toolkit._DO_PROFILE = True
    # Read in reference and template sequences
    logging.info("Quality control...")
    # Get genomic chromosome and start position
    try:
        chrom, args['genomic_start'] = sam.get_genomic_location(bedfile=args['reference_bed'])  # type: str, int
    except KeyError:  # Not provided
        chrom, args['genomic_start'] = '', 0  # type: str, int
    qc_start = time.time()  # type: float
    reference = toolkit.load_seq(seq_file=args['reference'], chrom=chrom)  # type: toolkit.NamedSequence
    template = toolkit.load_seq(seq_file=args['template'])  # type: toolkit.NamedSequence
    # Align template and reference sequences to determine alignment direction
    al_ref_seq, al_temp_seq = quality_control.align_reference(  # type: str, str
        reference=reference.sequence,
        template=template.sequence,
        gap_penalty=args['gap_opening']
    )
    aligned_reference = toolkit.NamedSequence(name=reference.name, sequence=al_ref_seq)  # type: toolkit.NamedSequence
    aligned_template = toolkit.NamedSequence(name=template.name, sequence=al_temp_seq)  # type: toolkit.NamedSequence
    # QC the alignments
    logging.info("Validating reference/template alignment...")
    alignment_validation = time.time()  # type: float
    if '-' in set(aligned_reference.sequence):
        raise ValueError(logging.error("Cannot have insertions in the reference"))
    if '-' in set(toolkit.side_trimmer(seq=aligned_template.sequence)):
        raise ValueError(logging.error("Cannot have deletions in the template sequence"))
    template_reference_mismatch = toolkit.get_mismatch(seq_a=aligned_reference.sequence, seq_b=aligned_template.sequence)  # type: List
    if not template_reference_mismatch:
        logging.error("No mismatches found between the reference and template sequences, going into NHEJ-only mode")
        template_reference_mismatch = list(itertools.repeat((None, ('',)), times=len(args['analysis_mode'])))  # type: List[Tuple[None, Tuple[str]]]
    if len(template_reference_mismatch) != len(args['analysis_mode']):
        msg = "There can only be %(num)s mismatches in '%(mode)s' mode" % {  # type: str
            'num': len(args['analysis_mode']),
            'mode': '+'.join(args['analysis_mode'])
        }
        if len(args['analysis_mode']) == 1:
            msg = msg.replace('mismatches', 'mismatch')  # type: str
        raise ValueError(logging.error(msg))
    logging.debug("Reference/template alignment validation took %s seconds", round(time.time() - alignment_validation, 3))
    # Get SNP information
    # snp_info_raw = template_reference_mismatch.pop(args['analysis_mode'].index('SNP'))
    snp_index, reference_state, target_snp = quality_control.get_snp_states(  # type: int, str, str
        reference=aligned_reference.sequence,
        template=aligned_template.sequence,
        mismatch=template_reference_mismatch.pop(args['analysis_mode'].index('SNP'))
        # mismatch=snp_info_raw
    )
    snp = SNP(reference=reference_state, target=target_snp, position=snp_index)  # type: SNP
    logging.debug("Quality control took %s seconds", round(time.time() - qc_start, 3))
    # Collect FASTQ information
    if 'sample_list' in args:
        if not os.path.exists(args['sample_list']):
            raise ValueError(logging.critical("Cannot find sample list %s", args['sample_list']))
        with open(args['sample_list'], 'r') as listfile:
            fastq_list = tuple(line.strip() for line in listfile if not line.startswith('#'))  # type: Tuple[str]
    elif 'input_file' in args:
        fastq_list = tuple(args['input_file'])  # type: Tuple[str]
    elif 'fastq_directory' in args:
        fastq_list = toolkit.find_fastq(directory=args['fastq_directory'])  # type: Tuple[str]
    else:
        sys.exit(logging.critical("No inputs provided"))
    zipped_args = zip(  # type: Iterable[str, toolkit.NamedSequence, toolkit.NamedSequence, Dict[str, Any], SNP, str]
        fastq_list,
        itertools.repeat(reference),
        itertools.repeat(aligned_reference),
        itertools.repeat(args),
        itertools.repeat(snp),
        itertools.repeat(args['outdirectory'])
    )
    # Tell the pool to ignore SIGINT (^C)
    # by turning INTERRUPT signals into IGNORED signals
    sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)  # type: function
    # Setup our multiprocessing pool
    # Allow the user to specify the number of jobs to run at once
    # If not specified, let multiprocessing figure it out
    try:
        pool = Pool(processes=args['num_cores'])  # type: multiprocessing.Pool
    except KeyError:
        pool = Pool()  # type: multiprocessing.Pool
    # Re-enable the capturing of SIGINT, catch with KeyboardInterrupt
    # or ExitPool, depending on how the exit was initiated
    # Note: SystemExits are swallowed by Pool, no way to change that
    signal.signal(signal.SIGINT, sigint_handler)
    # If we have multiple FASTQ files AND multiple processes running
    # use pool.map_async; else use generic map to avoid timeout issues
    if all(map(lambda i: i > 1, (len(fastq_list), getattr(pool, '_processes')))):
        try:
            # Use map_async and get with a large timeout
            # to allow for KeyboardInterrupts to be caught
            # and handled with the try/except
            timeout = max((9999, 600 * len(fastq_list)))  # type: int
            logging.debug("Setting timeout to %s seconds", timeout)
            res = pool.map_async(crispr_analysis, zipped_args)  # type: multiprocessing.pool.MapResult
            pool.close()
            results = res.get(timeout)
        except (KeyboardInterrupt, ExitPool) as error:  # Handle ctrl+c or custom ExitPool
            pool.terminate()
            pool.join()
            if isinstance(error, KeyboardInterrupt):  # ctrl+c
                sys.exit('\nkilled')
            elif isinstance(error, ExitPool):  # My way of handling SystemExits
                sys.exit(error.msg)
            else:  # Shouldn't happen, but you know...
                raise
        else:
            pool.join()
    # Otherwise, don't bother with pool.map(), make life easy
    else:
        # Clean up the pool
        pool.close(); pool.terminate(); pool.join()
        # Use standard map (or itertools.imap if Python 2)
        results = map(crispr_analysis, zipped_args)  # type: Iterable[Tuple[Tuple[alignment.Alignment]], Tuple[Dict[str, Any]]]
    # Sort our alignments and summaries into separate collections
    try:
        alignments, summaries = zip(*results)  # type: Tuple[Tuple[alignment.Alignment]], Tuple[Dict[str, Any]]
    except ExitPool as error:  # Handle ExitPool calls for single-threaded map
        sys.exit(error.msg)
    # Unpack our alignments into a single tuple
    alignments = toolkit.unpack(collection=alignments)  # type: Tuple[alignment.Alignment]
    # Final batch summary plot and table
    if not args['suppress_plots']:
        if alignments:
            plots.quality_plot(
                alignments=alignments,
                thresholds={d['filename']: d['score_threshold'] for d in summaries},
                output_prefix=output_prefix
            )
        else:
            logging.error("No passing reads found in any file, not producing quality plot")
    if not (args['suppress_classification'] or args['suppress_events'] or args['suppress_tables']):
        summary_name = output_prefix + '.summary.txt'  # type: str
        summary_header = (
            '#FASTQ',
            'TOTAL_READS',
            'TOTAL_NON_DISC',
            'UNIQ_READS',
            'DISCARDED',
            'SNP_POS',
            'REF_STATE',
            'TEMP_SNP',
            'NO_EDIT',
            'PERC_NO_EDIT',
            'HDR',
            'PERC_HDR',
            'MIX',
            'PERC_MIX',
            'NHEJ',
            'PERC_NHEJ',
            'PERC_MIS_A',
            'PERC_MIS_T',
            'PERC_MIS_C',
            'PERC_MIS_G'
        )
        logging.info("Writing summary to %s", summary_name)
        summary_start = time.time()  # type: float
        with open(summary_name, 'w') as summfile:
            summfile.write('\t'.join(summary_header) + '\n')
            summfile.flush()
            for sum_dict in sorted(summaries, key=lambda d: d['filename']):  # type: Dict[str, Any]
                out = (  # type: Tuple[Any]
                    sum_dict['filename'],
                    sum_dict['total_reads'] + sum_dict['discarded'],
                    sum_dict['total_reads'],
                    sum_dict['unique_reads'],
                    sum_dict['discarded'],
                    snp.position + 1 if _is_snp(snp=snp) else analysis.NA,
                    snp.reference if _is_snp(snp=snp) else analysis.NA,
                    snp.target if _is_snp(snp=snp) else analysis.NA,
                    sum_dict['no_edit'],
                    sum_dict['no_edit_perc'],
                    sum_dict['hdr'],
                    sum_dict['hdr_perc'],
                    sum_dict['mix'],
                    sum_dict['mix_perc'],
                    sum_dict['nhej'],
                    sum_dict['nhej_perc'],
                    sum_dict['perc_a'],
                    sum_dict['perc_t'],
                    sum_dict['perc_c'],
                    sum_dict['perc_g']
                )
                out = map(str, out)  # type: Iterable[str]
                summfile.write('\t'.join(out))
                summfile.write('\n')
                summfile.flush()
        logging.debug("Writing summary took %s seconds", round(time.time() - summary_start, 3))
    # Close logfile
    logging.debug("Entire program took %s seconds to run", round(time.time() - program_start, 3))
    logging.info("Thank you for using %s", os.path.basename(sys.argv[0]))
    devnull.close()
    try:
        logfile.close()
    except NameError:
        pass
def runtime_parallel(resamples: np.ndarray, func: Callable, pool: mp.pool.Pool) -> float:
    t_start = time.perf_counter()
    pool.map(func, resamples, chunksize=500)  # running time should be O(mn)
    t_end = time.perf_counter()
    return t_end - t_start
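# Illustrative usage sketch (assumed setup, not part of the project): time a parallel
# bootstrap of the sample mean with a fresh worker pool. Assumes this module's
# `runtime_parallel_np` and its `bootstrap_np` worker are importable as written above.
if __name__ == '__main__':
    rng_sample = np.random.default_rng(0).normal(size=10_000)  # synthetic sample
    with mp.Pool(processes=4) as pool:
        elapsed = runtime_parallel_np(pool, rng_sample, np.mean, num_resamples=200)
    print('parallel bootstrap took %.3f s' % elapsed)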