def get_chromlist(ref_genome: str, debug: bool) -> List[str]:
    """Collect the sequence names declared in a reference genome FASTA file.

    Every header line (a line starting with '>') contributes one name: the
    first whitespace-delimited token of the header, stripped of the leading
    '>'. Names are returned in the order they appear in the file; all the
    other lines (sequence data, blank lines, text before the first header)
    are skipped.

    Parameters
    ----------
    ref_genome : str
        path to the reference genome FASTA file
    debug : bool
        forwarded to exception_handler when an error is reported

    Returns
    -------
    list
        sequence names found in the given reference genome FASTA file; the
        list is empty when the file contains no header line
    """

    if not os.path.isfile(ref_genome):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(ref_genome), debug)
    # kept from the original implementation: the SIGINT handler is replaced
    # with SIG_IGN and the previous handler is restored right away
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)
    chroms: List[str] = []
    try:
        # the context manager closes the stream, also when reading fails
        with open(ref_genome, mode='r') as ifstream:
            for line in ifstream:
                if line.startswith(">"):
                    # keep the first token of the header, skipping ">"
                    chroms.append(line.rstrip().split()[0][1:])
    except KeyboardInterrupt:
        sigint_handler()
    except (OSError, UnicodeError):
        errmsg = "A problem was encountered reading {}\n."
        exception_handler(FileReadError, errmsg.format(ref_genome), debug)
    return chroms
def get_kmers(
    queries: List[str],
    pool: mp.Pool,
    debug: bool,
    verbose: Optional[bool] = False,
) -> None:
    """Retrieve sequences from genome variation graph(s).

    Each query is handed to get_seqs through ``pool.map_async``, so the
    extraction runs in the worker processes of the given pool. When
    `verbose` is False a progress bar is refreshed once per second until
    all the queries have been processed; when it is True the elapsed time
    is printed at the end instead.

    ...

    Parameters
    ----------
    queries : list
        list of queries
    pool : multiprocessing.Pool
        pool of worker processes running the queries
    debug : bool
        trace the full error stack
    verbose : bool, optional
        print additional information
    """

    if not isinstance(queries, list):
        errmsg = "Expected list, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(queries).__name__), debug)
    start_re = time.time()
    try:
        res = pool.map_async(get_seqs, queries)
        if not verbose:
            # read the total before polling: the result may already be
            # ready at the first check (e.g. no queries), and the final
            # progress bar call needs a defined, non-zero total
            tot = max(res._number_left, 1)
            while not res.ready():
                remaining = res._number_left
                printProgressBar(
                    (tot - remaining), tot, prefix='Progress:', suffix='Complete', length=50
                )
                time.sleep(1)
            # last call, to display the completed bar
            printProgressBar(
                tot, tot, prefix='Progress:', suffix='Complete', length=50
            )
        # waiting with a timeout does not ignore signals; the returned
        # values are not used, errors raised by the workers propagate here
        res.get(60 * 60 * 60)
    except KeyboardInterrupt:
        pool.terminate()
        sigint_handler()
    else:
        pool.close()
        if verbose:
            end_re = time.time()
            print("Extracted sequences from all regions in %.2fs" % (end_re - start_re))
def get_chromlist(ref_genome: str) -> List[str]:
    """Scan the reference genome FASTA file to find the chromosomes for
    which a sequence is available.

    Every line starting with '>' is treated as a sequence header. When the
    header starts with '>chr' (e.g. '>chrX', '>chr1', etc.) the '>chr'
    prefix is removed, otherwise only the leading '>' is removed. Blank
    lines and sequence lines are skipped.

    Parameters
    ----------
    ref_genome : str
        path to the reference genome FASTA file

    Returns
    -------
    list
        chromosomes for which a sequence is available in the given
        reference genome FASTA file

    Raises
    ------
    OSError
        if the reference genome file cannot be opened or read
    """

    # kept from the original implementation: the SIGINT handler is replaced
    # with SIG_IGN and the previous handler is restored right away
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)
    chroms: List[str] = []
    print(
        "Reading the valid chromosome names from the given reference genome...\n"
    )
    try:
        # the context manager closes the stream; errors raised while
        # opening or reading the file propagate unchanged to the caller
        with open(ref_genome, mode='r') as infile:
            for line in infile:
                line = line.strip()
                if not line.startswith(">"):
                    continue  # blank line or sequence data
                if line.startswith(">chr"):
                    chroms.append(line[4:])  # remove the starting '>chr'
                else:
                    chroms.append(line[1:])  # remove the starting '>'
    except KeyboardInterrupt:
        sigint_handler()
    return chroms
def _first_given(options):
    """Return the label of the first (value, label) pair whose value is truthy, or None."""
    for given, label in options:
        if given:
            return label
    return None


def _validate_buildvg_args(parser, args) -> None:
    """Check the "grafimo buildvg" options and normalize them in place on args."""
    unexpected = _first_given([
        (args.graph_genome_dir, "-d, --genome-graph-dir"),
        (args.graph_genome, "-g, --genome-graph"),
        (args.bedfile, "-b, --bedfile"),
        (args.motif, "-m, --motif"),
        (args.bgfile != UNIF, "-k, --bgfile"),  # if default ignored
        (args.pseudo != 0.1, "-p, --pseudo"),  # if default ignored
        (args.threshold != 1e-4, "-t, --thresh"),  # if default ignored
        (args.no_qvalue, "-q, --no-qvalue"),
        (args.no_reverse, "-r, --no-reverse"),
        (args.text_only, "-f, --text-only"),
        (args.chroms_find, "--chroms-find"),
        (args.chroms_prefix_find, "--chroms-prefix-find"),
        (args.chroms_namemap_find != NOMAP, "--chroms-namemap-find"),  # if default ignored
        (args.qval_t, "--qvalueT"),
        (args.recomb, "--recomb"),
        (args.top_graphs != 0, "--top-graphs"),  # if default ignored
    ])
    if unexpected is not None:
        buildvg_err_msg = "Unexpected arguments for \"grafimo buildvg\": \"{}\""
        parser.error(buildvg_err_msg.format(unexpected))
        die(1)
    if not args.linear_genome:
        parser.error("No reference genome given")
        die(1)
    if not args.vcf:
        parser.error("No VCF file given")
        die(1)
    # reference genome
    if args.linear_genome.split('.')[-1] not in ('fa', 'fasta'):
        parser.error("The reference genome file must be in FASTA format")
        die(1)
    if not os.path.isfile(args.linear_genome):
        parser.error("Unable to find {}".format(args.linear_genome))
        die(1)
    if os.stat(args.linear_genome).st_size == 0:  # empty file
        parser.error("{} seems to be empty.".format(args.linear_genome))
        die(1)
    args.linear_genome = os.path.abspath(args.linear_genome)
    # VCF --> the VCF file must have been compressed with
    # bgzip (https://github.com/samtools/tabix)
    vcf_fields = args.vcf.split(".")
    # a name without any "." has a single field: report it as a wrong VCF
    # instead of failing with IndexError while looking at the second-last
    # field.
    # NOTE(review): as before, a name is rejected only when BOTH the last
    # field is not "gz" AND the second-last is not "vcf" (so "x.gz" is
    # accepted) -- confirm whether a strict ".vcf.gz" check was intended
    if vcf_fields[-1] != "gz" and (len(vcf_fields) < 2 or vcf_fields[-2] != "vcf"):
        parser.error(
            "Wrong VCF file given. The VCF file must have been "
            "compressed with bgzip (e.g. myvcf.vcf.gz)")
        die(1)
    if not os.path.isfile(args.vcf):
        parser.error('Unable to find {}'.format(args.vcf))
        die(1)
    if os.stat(args.vcf).st_size == 0:  # empty file
        parser.error("{} seems to be empty.".format(args.vcf))
        die(1)
    args.vcf = os.path.abspath(args.vcf)
    # chromosome to construct VG
    if len(args.chroms_build) == 0:
        args.chroms_build = [ALL_CHROMS]  # use all chromosome
    elif anydup(args.chroms_build):
        parser.error(
            "Duplicated chromosome names given to \"--chroms-build\""
        )
    # chromosome name-map
    if args.chroms_namemap_build != NOMAP:
        if not os.path.isfile(args.chroms_namemap_build):
            parser.error("Unable to locate {}".format(
                args.chroms_namemap_build))
    if args.chroms_prefix_build and args.chroms_namemap_build != NOMAP:
        parser.error(
            "\"--chroms-prefix-build\" and \"chroms-namemap-build\" "
            "cannot used together. Choose one of those options")
    # if no out directory is specified the VGs are stored in
    # the current directory
    if args.out == "":
        args.out = os.path.abspath("./")


def _validate_findmotif_args(parser, args) -> None:
    """Check the "grafimo findmotif" options and normalize them in place on args."""
    unexpected = _first_given([
        (args.linear_genome, "-l, --linear-genome"),
        (args.vcf, "-v, --vcf"),
        (args.chroms_build, "--chroms-build"),
        (args.chroms_prefix_build, "--chroms-prefix-build"),
        (args.chroms_namemap_build != NOMAP, "--chroms-namemap-build"),
        (args.reindex, "--reindex"),  # if default ignored
    ])
    if unexpected is not None:
        findmotif_err_msg = "Unexpected arguments for \"grafimo findmotif\": \"{}\""
        parser.error(findmotif_err_msg.format(unexpected))
        die(1)
    if not args.graph_genome_dir and not args.graph_genome:
        parser.error(
            "No arguments given for both \"--genome-graph\" and \"--genome-graph-dir\""
        )
        die(1)
    if not args.bedfile:
        parser.error("No BED file given")
        die(1)
    if not args.motif:
        parser.error("No motif PWM given")
        die(1)
    # only one between graph_genome and graph_genome_dir is allowed
    if args.graph_genome and args.graph_genome_dir:
        parser.error(
            "Only one argument between \"--genome-graph\" and \"--genome-graph-dir\""
            " can be used")
        die(1)
    # genome graph
    if args.graph_genome:
        if args.graph_genome.split('.')[-1] not in ("xg", "vg"):
            parser.error(
                "Unrecognized genome variation graph format. Only "
                "VG and XG format are allowed")
            die(1)
        elif not os.path.isfile(args.graph_genome):
            parser.error("Unable to locate {}".format(args.graph_genome))
            die(1)
        else:
            # using absolute path avoid potential problems
            args.graph_genome = os.path.abspath(args.graph_genome)
    # genome graphs directory
    if args.graph_genome_dir:
        if not os.path.isdir(args.graph_genome_dir):
            parser.error("Unable to locate {}".format(args.graph_genome_dir))
            die(1)
        if len(glob(os.path.join(args.graph_genome_dir, "*.xg"))) <= 0:
            parser.error(
                "No genome variation graph found in {}".format(
                    args.graph_genome_dir))
            die(1)
        else:
            # using absolute path avoid potential problems
            args.graph_genome_dir = os.path.abspath(args.graph_genome_dir)
    # BED file
    if not isbed(args.bedfile, args.debug):
        parser.error(
            "The genomic coordinates must be given in UCSC BED files"
        )
        die(1)
    elif not os.path.isfile(args.bedfile):
        parser.error("Unable to locate {}".format(args.bedfile))
    # motif pwm
    for m in args.motif:
        if not isMEME_ff(m, args.debug) and not isJaspar_ff(m, args.debug):
            parser.error(
                "Unrecognized motif PWM file format. "
                "{} does not follow the MEME or JASPAR format rules"
                .format(m))
            die(1)
        if not os.path.isfile(m):
            parser.error("Unable to locate {}".format(m))
    # background file
    if args.bgfile != UNIF:
        if not os.path.isfile(args.bgfile):
            parser.error("Unable to locate {}".format(args.bgfile))
    # pseudocount
    if args.pseudo <= 0:
        parser.error(
            "Pseudocount values must be > 0, got {}".format(args.pseudo))
        die(1)
    # statistical significance threshold
    if args.threshold <= 0 or args.threshold > 1:
        parser.error(
            "Motif statistical significance threshold must be between 0 and 1"
        )
        die(1)
    # q-value flag
    if not isinstance(args.no_qvalue, bool):
        parser.error("\"--qvalue\" accepts only True or False values")
        die(1)
    # no reverse flag
    if not isinstance(args.no_reverse, bool):
        parser.error("\"--no-reverse\" accepts only True or False values")
        die(1)
    # text only flag
    if not isinstance(args.text_only, bool):
        parser.error("\"--text-only\" accepts only True or False values")
        die(1)
    # chromosome to consider during VG scan
    if len(args.chroms_find) == 0:
        args.chroms_find = [ALL_CHROMS]  # use all chromosome
    elif anydup(args.chroms_find):
        parser.error(
            "Duplicated chromosome names given to \"--chroms-find\""
        )
    # chromosome name-map
    if args.chroms_namemap_find != NOMAP:
        if not os.path.isfile(args.chroms_namemap_find):
            parser.error("Unable to locate {}".format(
                args.chroms_namemap_find))
    if args.chroms_prefix_find and args.chroms_namemap_find != NOMAP:
        parser.error(
            "\"--chroms-prefix-find\" and \"chroms-namemap-find\" "
            "cannot used together. Choose one of those options")
    # recomb flag
    if not isinstance(args.recomb, bool):
        parser.error("\"--recomb\" accepts only True or False values")
        die(1)
    # out directory
    if args.out == "":  # default option
        args.out = DEFAULT_OUTDIR
    # threshold on q-value flag
    if not isinstance(args.qval_t, bool):
        parser.error("\"--qvalueT\" accepts only True or False values")
        die(1)
    elif args.no_qvalue and args.qval_t:
        parser.error(
            "Unable to apply statistical significance threshold on"
            " q-values if you don't want them")
        die(1)
    # number of graph regions to store as PNG images
    if args.top_graphs < 0:
        parser.error("Negative number of regions to display")


def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """Entry point of GRAFIMO.

    The command-line arguments are parsed and checked for consistency, the
    external dependencies are checked, and then the requested workflow
    (buildvg or findmotif) is run. A KeyboardInterrupt raised at any point
    is handed to sigint_handler.

    Parameters
    ----------
    cmdLineargs : list, optional
        command-line arguments (without the program name); when None they
        are read from sys.argv
    """

    try:
        # starting point of the execution time
        start = time.time()
        # read the command-line arguments
        parser = get_parser()
        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # get input args
        # no arguments given --> print help
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(2)
        # the second argument must be buildvg or findmotif
        if cmdLineargs[0] not in ("-h", "--help", "--version", "buildvg", "findmotif"):
            parser.error(
                "The second argument must be one between 'buildvg' and 'findmotif'"
            )
            die(1)
        args = parser.parse_args(cmdLineargs)
        start_args_parse = time.time()
        if args.verbose:
            print("Parsing arguments...")
        #--------------------------------------------------------------#
        # check commandline arguments consistency
        #
        #---------------------- general options -----------------------#
        # workflow type
        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Unexpected workflow given. Available options:\n"
                         "\tbuildvg: construct VG from user data.\n"
                         "\tfindmotif: scan VG for DNA motif(s) occurrences")
            die(1)
        # cpu cores
        if args.cores < 0:
            parser.error("Negative number of CPU cores given")
        elif args.cores == 0 and args.graph_genome:
            # when whole genome variation graph is given, it is safer to
            # use 1 CPU core by default. This because of the space needed
            # to load the whole VG on RAM.
            #
            # CAVEAT: before requiring more CPU cores to be used, be sure
            # your system has enough memory
            args.cores = 1
        elif args.cores == 0:
            # default option -> use all available CPU cores
            args.cores = mp.cpu_count()
        elif args.cores > mp.cpu_count():
            parser.error("Too many CPU cores to use ({})".format(args.cores))
        # verbosity
        if not isinstance(args.verbose, bool):
            parser.error(
                '\"--verbose\" does not accept any positional argument')
        # debugging
        if not isinstance(args.debug, bool):
            parser.error("\"--debug\" does not accept any positional argument")
        #------------------ workflow-specific options -----------------#
        # stays None only if the workflow checks above did not stop the
        # execution; the dispatch below then reports it as a TypeError
        workflow = None
        if args.workflow == "buildvg":
            _validate_buildvg_args(parser, args)
            workflow = BuildVG(args)
        elif args.workflow == "findmotif":
            _validate_findmotif_args(parser, args)
            workflow = Findmotif(args)
        if args.verbose and workflow is not None:
            end_args_parse = time.time()
            print("Arguments parsed in %.2fs." %
                  (end_args_parse - start_args_parse))
        # check that external dependencies are satisfied
        start_deps = time.time()
        if args.verbose:
            sys.stderr.write(
                "Checking GRAFIMO external dependencies {}\n".format(EXT_DEPS))
        satisfied, deps_lack = check_deps()
        if not satisfied and len(deps_lack) > 0:
            errmsg = "Some dependencies are not satisfied: {}.\nPlease solve them before running GRAFIMO.\n"
            exception_handler(DependencyError, errmsg.format(deps_lack), args.debug)
        elif not satisfied:
            errmsg = "Dependencies satisfied, but unable to recover them.\n Be sure they are in system PATH.\n"
            exception_handler(DependencyError, errmsg, args.debug)
        if args.verbose and satisfied:
            end_deps = time.time()
            print("Dependencies satisfied.")
            print("Dependencies checked in %.2fs." % (end_deps - start_deps))
        #---------------------------------------------------------------
        # dependency check was ok, so we go to workflow selection:
        #   * construction of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG
        if isinstance(workflow, BuildVG):
            buildvg(workflow, args.debug)
        elif isinstance(workflow, Findmotif):
            findmotif(workflow, args.debug)
        else:
            errmsg = "Expected BuildVG or Findmotif, got {}.\n"
            exception_handler(TypeError, errmsg.format(type(workflow).__name__), args.debug)
        end = time.time()  # GRAFIMO execution finishes here
        print("Elapsed time %.2fs." % (end - start))
    except KeyboardInterrupt:
        sigint_handler()
def compute_results(motif: Motif,
                    sequence_loc: str,
                    args_obj: Optional[Findmotif] = None,
                    testmode: Optional[bool] = False
) -> pd.DataFrame:
    """Score all the sequences extracted from the genome variation graph
    in the regions defined in the input BED file.

    The '*.tsv' files found in `sequence_loc` are split in chunks, and each
    chunk is scored by score_seqs in a separate process. The partial
    results are then merged, the q-values are computed (unless disabled)
    and everything is summarized by build_df. Unless `testmode` is set, the
    `sequence_loc` directory is removed once the scoring is done.

    Parameters
    ----------
    motif : Motif
        motif data to score sequences
    sequence_loc : str
        path to the intermediate files containing the sequences extracted
        from the genome variation graph
    args_obj : Findmotif, optional
        container for the arguments needed during the scoring step;
        required when testmode is False
    testmode : bool, optional
        flag value manually set used for test purposes; when True fixed
        default settings are used and `sequence_loc` is not removed

    Returns
    -------
    pandas.DataFrame
        scoring results

    Raises
    ------
    FileNotFoundError
        if sequence_loc is not a string
    ValueError
        if motif is not a Motif, or args_obj is not a Findmotif (when
        testmode is False)
    SubprocessError
        if the removal of sequence_loc fails
    """

    if not isinstance(sequence_loc, str):
        # format() accepts any object; str.join() would raise TypeError on
        # the very value this check is meant to report
        errmsg = "\n\nERROR: unable to locate extracted sequences in {}".format(
            sequence_loc)
        raise FileNotFoundError(errmsg)
    if not isinstance(motif, Motif):
        errmsg = "\n\nERROR: the given motif is not an instance of Motif"
        raise ValueError(errmsg)
    if testmode:
        cores = 1
        threshold = 1
        recomb = True
        no_qvalue = False
        qval_t = False
        no_reverse = False
        verbose = False
    else:
        if not isinstance(args_obj, Findmotif):
            errmsg = "\n\nERROR: unrecognized argument object type"
            raise ValueError(errmsg)
        cores = args_obj.get_cores()
        threshold = args_obj.get_threshold()
        no_qvalue = args_obj.get_no_qvalue()
        qval_t = args_obj.get_qvalueT()
        no_reverse = args_obj.get_no_reverse()
        recomb = args_obj.get_recomb()
        verbose = args_obj.get_verbose()
    assert threshold > 0
    assert threshold <= 1
    assert cores >= 1
    print_scoring_msg(no_reverse, motif)
    cwd: str = os.getcwd()
    os.chdir(sequence_loc)
    manager = mp.Manager()
    return_dict = manager.dict()  # results
    scanned_nucs_dict = manager.dict()  # scanned nucleotides
    scanned_seqs_dict = manager.dict()  # scanned sequences
    # get all tmp files containing sequences
    sequences: List[str] = glob.glob('*.tsv')
    if len(sequences) < cores:
        # NOTE(review): with no '*.tsv' file cores becomes 0 and
        # np.array_split raises ValueError -- confirm the callers always
        # provide at least one sequence file
        cores = len(sequences)
    # split the sequence set in no. cores chunks
    sequences_split = np.array_split(sequences, cores)
    jobs = list()  # jobs list
    proc_finished: int = 0
    # kept from the original implementation: the SIGINT handler is replaced
    # with SIG_IGN and the previous handler is restored right away
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)
    start_s: float = time.time()
    try:
        # compute results in parallel
        for i in range(cores):
            p = mp.Process(
                target=score_seqs,
                args=(
                    sequences_split[i],
                    motif,
                    no_reverse,
                    return_dict,
                    scanned_seqs_dict,
                    scanned_nucs_dict,
                    i
                )
            )
            jobs.append(p)
            p.start()
        # to print 0%, otherwise the bar would start from the percentage
        # of the first completed chunk
        printProgressBar(proc_finished, cores, prefix='Progress:',
                         suffix='Complete', length=50)
        for job in jobs:
            job.join()  # sync point
            proc_finished += 1
            printProgressBar(proc_finished, cores, prefix='Progress:',
                             suffix='Complete', length=50)
    except KeyboardInterrupt:
        sigint_handler()
        sys.exit(2)
    else:
        if verbose:
            end_s: float = time.time()
            print(
                "Scored all sequences in %.2fs" % (end_s - start_s)
            )
    os.chdir(cwd)
    if not testmode:
        cmd: str = "rm -rf {0}".format(sequence_loc)
        # the path is passed as a single argument, without a shell, so
        # spaces or shell metacharacters in it cannot alter the command
        code: int = subprocess.call(["rm", "-rf", sequence_loc])
        if code != 0:
            errmsg = "\n\nERROR: an error occurred while running %s" % cmd
            raise SubprocessError(errmsg)
    start_df: float = time.time()
    # recover all analysis results and summarize them in a single
    # data structure
    seqnames: List[str] = list()
    seqs: List[str] = list()
    starts: List[int] = list()
    stops: List[int] = list()
    strands: List[str] = list()
    scores: List[np.double] = list()
    pvalues: List[np.double] = list()
    frequencies: List[int] = list()
    references: List[str] = list()
    seqs_scanned: int = 0
    nucs_scanned: int = 0
    for key in return_dict.keys():
        # each access to the managed dict goes through the manager
        # process, so the partial result is fetched only once per key
        partial = return_dict[key]
        assert isinstance(partial, ResultTmp)
        seqnames += partial.get_seqnames()
        seqs += partial.get_seqs()
        starts += partial.get_starts()
        stops += partial.get_stops()
        strands += partial.get_strands()
        scores += partial.get_scores()
        pvalues += partial.get_pvalues()
        frequencies += partial.get_frequencies()
        references += partial.get_references()
        # compute the total number of scanned sequences and nucleotides
        # (the keys are the same as return_dict)
        seqs_scanned += scanned_seqs_dict[key]
        nucs_scanned += scanned_nucs_dict[key]
    # compute the q-values
    qvalues: List[np.double]
    if no_qvalue:
        qvalues = list()  # empty list -> not computed
    else:
        qvalues = compute_qvalues(pvalues)
    print("Scanned sequences:", seqs_scanned)
    print("Scanned nucleotides:", nucs_scanned)
    # summarize results in a pandas DF
    finaldf: pd.DataFrame = build_df(motif, seqnames, starts, stops, strands,
                                     scores, pvalues, qvalues, seqs,
                                     frequencies, references, threshold,
                                     qval_t, no_qvalue, recomb)
    if verbose:
        end_df: float = time.time()
        print("\nResults summary built in %.2fs" % (end_df - start_df))
    return finaldf
def compute_results(motif, sequence_loc, args_obj):
    """
        Score all the sequences extracted from regions defined in the
        input BED file.
        To score sequences is used the processed input motif.
        The '*.tsv' files found in sequence_loc are split in #cores chunks
        and each chunk is scored by score_seqs in a separate process; the
        sequence_loc directory is removed once the scoring is done.
        The results are then stored in a pandas DataFrame
        ----
        Parameters:
            motif (Motif) : processed motif, used to score sequences
            sequence_loc (str) : path to temporary files storing sequences
                                    extracted during the previous step
            args_obj (Findmotif) : arguments used during the sequence
                                    scoring step
        ----
        Returns:
            finaldf (pd.DataFrame) : pandas DataFrame containing the
                                        results of the GRAFIMO analysis
        ----
        Raises:
            FileNotFoundError : sequence_loc is not a string
            ValueError : motif is not a Motif or args_obj is not a Findmotif
            SubprocessError : the removal of sequence_loc failed
    """

    if not isinstance(sequence_loc, str):
        # str.join() takes a single iterable and only strings: format()
        # builds the intended message for any value of sequence_loc
        errmsg = "\n\nERROR: unable to locate extracted sequences in {}. Exiting".format(
            sequence_loc)
        raise FileNotFoundError(errmsg)
    if not isinstance(motif, Motif):
        raise ValueError(
            "\n\nERROR: the given motif is not an instance of Motif")
    if not isinstance(args_obj, Findmotif):
        raise ValueError("\n\nERROR: unrecognized argument object type")
    # reading arguments
    cores = args_obj.get_cores()
    threshold = args_obj.get_threshold()
    no_qvalue = args_obj.get_no_qvalue()
    qval_t = args_obj.get_qvalueT()
    no_reverse = args_obj.get_no_reverse()
    verbose = args_obj.get_verbose()
    assert threshold > 0
    assert threshold <= 1
    assert cores >= 1
    print_scoring_msg(no_reverse, motif)
    cwd = os.getcwd()
    os.chdir(sequence_loc)  # go to sequence location
    manager = mp.Manager()
    return_dict = manager.dict()  # results
    scanned_nucs_dict = manager.dict()  # nucleotides scanned
    scanned_seqs_dict = manager.dict()  # sequences scanned
    sequences = glob.glob('*.tsv')  # get all tmp files containing sequences
    # split the sequence set in #cores chunks
    sequences_split = np.array_split(sequences, cores)
    jobs = []  # jobs list
    proc_finished = 0  # number of jobs done
    # kept from the original implementation: the SIGINT handler is replaced
    # with SIG_IGN and the previous handler is restored right away
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)
    start_s = time.time()
    try:
        # compute results in parallel
        for i in range(cores):
            p = mp.Process(target=score_seqs,
                           args=(sequences_split[i], motif, no_reverse,
                                 return_dict, scanned_seqs_dict,
                                 scanned_nucs_dict, i))
            jobs.append(p)
            p.start()  # start the process
        # to print 0%, otherwise the bar would start from the percentage
        # of the first completed chunk
        printProgressBar(proc_finished, cores, prefix='Progress:',
                         suffix='Complete', length=50)
        for job in jobs:
            job.join()  # sync point
            proc_finished += 1
            printProgressBar(proc_finished, cores, prefix='Progress:',
                             suffix='Complete', length=50)
    except KeyboardInterrupt:
        sigint_handler()
        sys.exit(2)
    else:
        if verbose:
            end_s = time.time()
            msg = ''.join(
                ["\nScored all sequences in ", str(end_s - start_s), "s"])
            print(msg)
    os.chdir(cwd)  # get back to starting point
    # remove temporary sequence files; the path is passed as a single
    # argument, without a shell, so spaces or shell metacharacters in it
    # cannot alter the command
    cmd = "rm -rf {0}".format(sequence_loc)
    code = subprocess.call(["rm", "-rf", sequence_loc])
    if code != 0:
        msg = ' '.join(["\n\nERROR: an error occurred while running", cmd])
        raise SubprocessError(msg)
    start_df = time.time()
    # recover all analysis results and summarize them in a single
    # data-structure
    seqnames = []
    seqs = []
    starts = []
    stops = []
    strands = []
    scores = []
    pvalues = []
    references = []
    seqs_scanned = 0
    nucs_scanned = 0
    for key in return_dict.keys():
        # each access to the managed dict goes through the manager
        # process, so the partial result is fetched only once per key
        partial = return_dict[key]
        assert isinstance(partial, ResultTmp)
        seqnames += partial.get_seqnames()
        seqs += partial.get_seqs()
        starts += partial.get_starts()
        stops += partial.get_stops()
        strands += partial.get_strands()
        scores += partial.get_scores()
        pvalues += partial.get_pvalues()
        references += partial.get_references()
        # compute the total number of scanned sequences and nucleotides
        # (the keys are the same as return_dict)
        seqs_scanned += scanned_seqs_dict[key]
        nucs_scanned += scanned_nucs_dict[key]
    # compute the q-values
    if no_qvalue:
        qvalues = []  # empty list -> not computed
    else:
        qvalues = compute_qvalues(pvalues)
    print("Scanned sequences:", seqs_scanned)
    print("Scanned nucleotides:", nucs_scanned)
    # summarize results in a pandas DF
    finaldf = build_df(motif, seqnames, starts, stops, strands, scores,
                       pvalues, qvalues, seqs, references, threshold, qval_t,
                       no_qvalue)
    if verbose:
        end_df = time.time()
        msg = ''.join(
            ["\nBuilt result summary in ", str(end_df - start_df), "s"])
        print(msg)  # the message used to be built but never displayed
    return finaldf
def build_motif_MEME(motif_file: str,
                     bg_file: str,
                     pseudocount: float,
                     no_reverse: bool,
                     cores: int,
                     verbose: bool,
                     debug: bool) -> List[Motif]:
    """Read motif PWMs in MEME format.

    The motifs are read with read_MEME_motif, then each of them is
    processed by process_motif_for_logodds. When the file contains at
    least `cores` motifs the processing is distributed over a pool of
    `cores` processes (showing a progress bar), otherwise the motifs are
    processed one after the other.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    bg_file
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand selection flag, forwarded to read_MEME_motif.
        NOTE(review): the previous docstring stated that False restricts
        the scan to the forward strand, which looks inverted with respect
        to the parameter name -- confirm against read_MEME_motif
    cores : int
        number of CPU cores (used when MEME file has more than one PWM)
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    list
        processed Motif objects, one for each motif read from motif_file
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__), debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isMEME_ff(motif_file, debug):
        errmsg = "Required MEME motif PWM parsing, but {} is not in MEME format.\n"
        exception_handler(MotifFileFormatError, errmsg.format(motif_file), debug)
    start_rm_all: float = time.time()
    motif_lst: List[Motif] = read_MEME_motif(motif_file, bg_file, pseudocount,
                                             no_reverse, verbose, debug)
    motif_num: int = len(motif_lst)
    if verbose:
        end_rm_all: float = time.time()
        print("Read all motifs in %s in %.2fs." %
              (motif_file, (end_rm_all - start_rm_all)))
    print("\nRead {} motifs in {}".format(motif_num, motif_file))
    print("\nProcessing motifs\n")
    complete_motifs = list()  # fully processed motifs
    start_mp_all: float = time.time()
    if motif_num >= cores:  # worth to use multiprocessing
        # SIGINT is ignored while the pool is created and the previous
        # handler is restored right after, to exit gracefully
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = mp.Pool(processes=cores)
        signal.signal(signal.SIGINT, original_sigint_handler)
        try:
            args = [(motif, debug) for motif in motif_lst]
            res = pool.starmap_async(process_motif_for_logodds, args)
            # ---- progress bar
            # read the total before polling: the result may already be
            # ready at the first check, and the final progress bar call
            # needs a defined, non-zero total
            tot = max(res._number_left, 1)
            while not res.ready():
                remaining = res._number_left
                printProgressBar((tot - remaining), tot, prefix='Progress:',
                                 suffix='Complete', length=50)
                time.sleep(1)
            # when finished call for the last time printProgressBar()
            printProgressBar(tot, tot, prefix='Progress:',
                             suffix='Complete', length=50)
            # waiting with a timeout does not ignore signals
            complete_motifs += res.get(60 * 60 * 60)
        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()
        else:
            pool.close()
    else:
        for m in motif_lst:  # process each found motif
            complete_motifs.append(process_motif_for_logodds(m, debug))
    if verbose:
        end_mp_all: float = time.time()
        print("Processed motif(s) in %s in %.2fs" %
              (motif_file, (end_mp_all - start_mp_all)))
    return complete_motifs
def main(cmdLineargs=None):
    """Main function of GRAFIMO.

    The command-line arguments are checked for consistency, the external
    dependencies are verified and the requested workflow is then run:
    ``buildvg`` (genome variation graph construction) or ``findmotif``
    (scan of precomputed genome variation graphs).

    ----
    Parameters:
        cmdLineargs (list of str, optional) : command-line arguments; when
            None, ``sys.argv[1:]`` is used

    ----
    Returns:
        None
    """
    try:
        # starting point of the execution time
        start = time.time()

        # read the command-line arguments
        parser = get_AP()

        def abort(message):
            # report the error through the parser, then stop the execution
            parser.error(message)
            die(1)

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # take input args

        # the second argument must be buildvg or findmotif; an empty
        # argument list is rejected here too instead of raising IndexError
        allowed_first_args = ("-h", "--help", "--version", "buildvg", "findmotif")
        if not cmdLineargs or cmdLineargs[0] not in allowed_first_args:
            abort("The second argument must be one between 'buildvg' and 'findmotif'")

        args = parser.parse_args(cmdLineargs)  # parse args

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse = time.time()

        #####################################################################
        # check arguments consistency
        #####################################################################
        if args.workflow not in ("buildvg", "findmotif"):
            abort("Do not know what to do. Available options: create VGs with 'grafimo buildvg' or scan a "
                  "precomputed genome variation graph with 'grafimo findmotif'")

        # cores (shared by the two workflows)
        if args.cores < 0:
            parser.error("The number of cores cannot be negative")
        elif args.cores == 0:
            # to query a whole genome graph is loaded into RAM, since usually
            # are very heavy in terms of bytes is safer to use 1 thread by
            # default, otherwise it would be loaded #cores times. If you want
            # use more cores, be sure your system can handle the resulting
            # amount of data. Otherwise take all the available CPUs.
            args.cores = 1 if args.graph_genome else mp.cpu_count()

        # check verbose flag (a bool is necessarily True or False)
        if not isinstance(args.verbose, bool):
            parser.error('The --verbose parameter accepts only True or False values')

        # chromosomes check (shared by the two workflows)
        for c in args.chroms:
            if c not in CHROMS_LIST:
                parser.error("Invalid chromosome")
        args.chroms = initialize_chroms_list(args.chroms)

        # checks for buildvg workflow
        if args.workflow == "buildvg":
            buildvg_err_msg = "Invalid arguments for grafimo buildvg"
            # findmotif-only options given, or findmotif defaults overridden
            findmotif_options_given = (
                args.graph_genome_dir
                or args.graph_genome
                or args.bedfile
                or args.motif
                or args.bgfile != 'UNIF'
                or args.pseudo != 0.1
                or args.threshold != 1e-4
                or args.no_qvalue
                or args.no_reverse
                or args.text_only
                or args.qval_t
                or args.top_graphs != 0
            )
            if findmotif_options_given:
                abort(buildvg_err_msg)
            elif not args.linear_genome:
                abort("No reference genome given")
            elif not args.vcf:
                abort("No VCF file given")
            else:
                # check linear genome
                if args.linear_genome.split('.')[-1] not in ('fa', 'fasta'):
                    abort('The linear genome must be in FASTA format (FASTA and FA extensions allowed)')
                else:
                    if len(glob.glob(args.linear_genome)) != 1:
                        abort('Cannot find the given reference genome file')
                    args.linear_genome = os.path.abspath(args.linear_genome)

                # check VCF: allow only compressed VCF files; the length
                # guard avoids an IndexError on names without any '.'
                vcf_fields = args.vcf.split('.')
                if (len(vcf_fields) < 2
                        or vcf_fields[-1] not in ('gz', 'zip')
                        or vcf_fields[-2] != 'vcf'):
                    abort('Incorrect VCF file given: the VCF must be compressed (e.g. myvcf.vcf.gz)')
                else:
                    if len(glob.glob(args.vcf)) <= 0:
                        abort('Cannot find the given VCF file')
                    args.vcf = os.path.abspath(args.vcf)

                # by default the built VGs will be stored in the current
                # directory
                if args.out == "grafimo_out":  # general default value
                    args.out = os.path.abspath("./")

                workflow = BuildVG(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print(''.join(["Arguments parsed in ",
                                   str(end_args_parse - start_args_parse), "s"]))

        # checks for findmotif workflow
        if args.workflow == "findmotif":
            findmotif_err_msg = "Invalid arguments for grafimo findmotif"
            if args.linear_genome:
                abort(findmotif_err_msg)
            elif args.vcf:
                # the message used to name the buildvg workflow by mistake
                abort(findmotif_err_msg)
            elif not args.graph_genome_dir and not args.graph_genome:
                abort("No genome variation graph or directory containing them given")
            elif not args.bedfile:
                abort("No BED file given")
            elif not args.motif:
                abort("No motif file (MEME of JASPAR format) given")
            else:
                # only one between graph_genome and graph_genome_dir allowed
                if args.graph_genome and args.graph_genome_dir:
                    # the message used to name the buildvg workflow by mistake
                    abort(findmotif_err_msg)

                # check graph_genome
                if args.graph_genome:
                    if args.graph_genome.split('.')[-1] not in ('xg', 'vg'):
                        abort("Cannot use the given genome variation graph (only VG or XG format allowed)")
                    elif not os.path.isfile(args.graph_genome):
                        abort("Unable to find the given variation genome graph")
                    else:
                        # safer to use absolute path
                        args.graph_genome = os.path.abspath(args.graph_genome)

                # check graph_genome_dir
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        abort("Cannot find the given directory containing the genome variation graphs")

                    if args.graph_genome_dir[-1] == '/':
                        graph_genome_dir = args.graph_genome_dir
                    else:
                        graph_genome_dir = ''.join([args.graph_genome_dir, '/'])

                    if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
                        abort(' '.join(['No XG genome variation graph found in', graph_genome_dir]))
                    else:
                        args.graph_genome_dir = os.path.abspath(graph_genome_dir)

                # check BED file
                if args.bedfile:
                    if args.bedfile.split('.')[-1] != 'bed':
                        abort('Incorrect BED file given')
                    else:
                        if len(glob.glob(args.bedfile)) <= 0:
                            parser.error('Cannot find the given BED file')
                else:
                    parser.error('No BED file given')

                # check motif file
                if not args.motif:
                    parser.error('No motif given')
                else:
                    for m in args.motif:
                        # existence is checked first, so that a missing file
                        # is reported as such before the format checks run
                        if len(glob.glob(m)) <= 0:
                            abort('Cannot find motif file: ' + m)
                        if not isMEME_ff(m) and not isJaspar_ff(m):
                            abort("Unrecognized motif file format (only MEME or JASPAR allowed)")

                # check background file ('UNIF' means no file was given)
                if args.bgfile != 'UNIF':
                    if len(glob.glob(args.bgfile)) <= 0:
                        abort('Cannot find the given background file')

                # check pseudocount
                if args.pseudo <= 0:
                    abort('The pseudocount cannot be less than or equal 0')

                # check threshold
                if args.threshold <= 0 or args.threshold > 1:
                    abort('The pvalue threshold must be between 0 and 1')

                # check q-value, no reverse and text only flags
                bool_flag_checks = (
                    (args.no_qvalue,
                     'The --qvalue parameter accepts only True or False as values'),
                    (args.no_reverse,
                     'The --no-reverse parameter accepts only True or False as values'),
                    (args.text_only,
                     'The --text-only parameter accepts only True or False values'),
                )
                for flag_value, flag_errmsg in bool_flag_checks:
                    if not isinstance(flag_value, bool):
                        abort(flag_errmsg)

                # out directory
                if args.out == 'grafimo_out':  # default option
                    # to make unique the output directory we add the PID
                    # to the name.
                    #
                    # This is useful when calling grafimo in different runs
                    # on the same machine.
                    args.out = ''.join([args.out, '_', str(os.getpid())])

                # check threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    abort("The --qvalueT parameter accepts only True or False as values")
                elif args.no_qvalue and args.qval_t:
                    abort("Cannot apply the threshold on q-values if you don't want them")

                # check the number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("The number of region graphs to show must be positive")

                workflow = Findmotif(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print(''.join(["Arguments parsed in ",
                                   str(end_args_parse - start_args_parse), "s"]))

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps = time.time()

        satisfied, deps_lack = check_deps()

        if not satisfied and len(deps_lack) > 0:
            raise DependencyError("\n\nERROR: The following dependencies are not sastisfied: " +
                                  str(deps_lack) +
                                  "\nPlease, solve them before running GRAFIMO")
        elif not satisfied and len(deps_lack) <= 0:
            raise DependencyError("Some dependencies were found, but was not possible to track them."
                                  "\nBe sure they are available in system PATH")

        if args.verbose and satisfied:
            end_deps = time.time()
            print("Dependencies correctly satisfied")
            print(''.join(["Dependencies checked in ", str(end_deps - start_deps), "s"]))

        #####################################################################
        # dependency check was ok, so we go to workflow selection:
        #   - creation of the genome variation graph for each chromosome or
        #     a user defined subset of them
        #   - scan of a precomputed VG or a set of precomputed VG
        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset of them
            buildvg(workflow)
        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)
        else:
            raise ValueError("Unknown arguments object type")

        end = time.time()  # GRAFIMO execution finishes here
        print(''.join(["\nElapsed time: ", str(end - start), "s"]))

    except KeyboardInterrupt:
        sigint_handler()
def build_motif_MEME(
    motif_file: str,
    bg_file: str,
    pseudocount: float,
    no_reverse: bool,
    cores: int,
    verbose: bool,
) -> List[Motif]:
    """Read the motif PWMs stored in a MEME file and process each of them.

    The motifs are read with ``read_MEME_motif`` and every motif is then
    passed to ``process_motif_for_logodds`` (the data read are used to build
    the scoring matrix for the motif, the P-value matrix, etc.).

    When the file contains at least ``cores`` motifs, the processing is
    distributed over a pool of ``cores`` worker processes and a progress bar
    is printed; otherwise the motifs are processed one after the other.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in MEME format
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html)
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand flag forwarded to ``read_MEME_motif``; presumably ``True``
        restricts the analysis to the forward strand -- TODO confirm
    cores : int
        number of cores to use while building the Motif objects
    verbose : bool
        print additional information

    Returns
    -------
    List[Motif]
        the fully processed motifs found in ``motif_file``. ``None`` is
        returned implicitly when the run is interrupted and
        ``sigint_handler`` returns.

    Raises
    ------
    FileNotFoundError
        if ``motif_file`` is empty or None
    NotValidFFException
        if ``motif_file`` is not in MEME format
    """
    errmsg: str
    if not motif_file:
        errmsg = "\n\nERROR: the motif file is missing"
        raise FileNotFoundError(errmsg)
    if not isMEME_ff(motif_file):
        errmsg = "\n\nERROR: the given motif file is not in MEME format"
        raise NotValidFFException(errmsg)

    if verbose:
        start_rm_all: float = time.time()
    motif_lst: List[Motif] = read_MEME_motif(
        motif_file, bg_file, pseudocount, no_reverse, verbose
    )
    motif_num: int = len(motif_lst)
    if verbose:
        end_rm_all: float = time.time()
        msg: str = ''.join([
            "\nRead all motif contained in ", motif_file, " in ",
            str(end_rm_all - start_rm_all), "s"
        ])
        print(msg)

    print("\nRead", motif_num, "motifs in", motif_file)
    print("\nProcessing motifs\n")

    # list of the fully processed motifs
    complete_motifs = list()

    if verbose:
        start_mp_all: float = time.time()

    # process each found motif
    if motif_num >= cores:  # worth to use multiprocessing
        # SIGINT is ignored while the pool is created and the previous
        # handler is restored right after, to exit gracefully on Ctrl+C
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool: mp.Pool = mp.Pool(processes=cores)  # use #cores processes
        signal.signal(signal.SIGINT, original_sigint_handler)
        try:
            res = pool.map_async(process_motif_for_logodds, motif_lst)
            # Take the total BEFORE polling: the result may already be ready
            # at the first check, and the final progress-bar call needs it.
            tot = res._number_left
            while not res.ready():
                remaining = res._number_left
                printProgressBar(
                    (tot - remaining), tot, prefix='Progress:',
                    suffix='Complete', length=50
                )
                time.sleep(2)
            # when finished call printProgressBar() for the last time
            printProgressBar(
                tot, tot, prefix='Progress:', suffix='Complete', length=50
            )
            # does not ignore signals
            complete_motifs += res.get(60 * 60 * 60)
        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()
        else:
            pool.close()
            if verbose:
                end_mp_all: float = time.time()
                print(
                    "Processed motif(s) in %s in %.2fs"
                    % (motif_file, (end_mp_all - start_mp_all))
                )
            return complete_motifs
    else:
        # few motifs: process each found motif sequentially
        for m in motif_lst:
            complete_motifs.append(process_motif_for_logodds(m))
        if verbose:
            end_mp_all: float = time.time()
            print(
                "Processed motif(s) in %s in %.2fs"
                % (motif_file, (end_mp_all - start_mp_all))
            )
        return complete_motifs
def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """Main function of GRAFIMO.

    The command-line arguments are checked for consistency, the external
    dependencies are verified and the requested workflow is then run:
    ``buildvg`` (genome variation graph construction) or ``findmotif``
    (scan of precomputed genome variation graphs).

    Parameters
    ----------
    cmdLineargs : list of str, optional
        command-line arguments; when None, ``sys.argv[1:]`` is used

    Raises
    ------
    DependencyError
        if ``check_deps`` reports unsatisfied external dependencies
    ValueError
        if the workflow arguments object has an unknown type
    """
    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()

        def abort(message: str) -> None:
            # report the error through the parser, then stop the execution
            parser.error(message)
            die(1)

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # take input args

        # no argument given
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(1)

        # the second argument must be buildvg or findmotif
        allowed_first_args = ("-h", "--help", "--version", "buildvg", "findmotif")
        if cmdLineargs[0] not in allowed_first_args:
            abort("The second argument must be one between 'buildvg' and 'findmotif'")

        args: argparse.Namespace = parser.parse_args(cmdLineargs)

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        ################################################################
        # check arguments consistency
        ################################################################
        if args.workflow not in ("buildvg", "findmotif"):
            abort("Do not know what to do. Available options: create VGs "
                  "with 'grafimo buildvg' or scan a precomputed genome "
                  "variation graph with 'grafimo findmotif'")

        # cores (shared by the two workflows)
        if args.cores < 0:
            parser.error("The number of cores cannot be negative")
        elif args.cores == 0:
            # to query a whole genome graph is loaded into RAM, since
            # usually they are very heavy in terms of bytes is safer to
            # use 1 thread by default, otherwise it would be loaded
            # #cores times. If you want use more cores, be sure your
            # system can handle the resulting amount of data.
            # Otherwise, by default take all the available CPUs.
            args.cores = 1 if args.graph_genome else mp.cpu_count()

        # check verbose flag (a bool is necessarily True or False)
        if not isinstance(args.verbose, bool):
            parser.error(
                'The --verbose parameter accepts only True or False values')

        # chromosomes check (shared by the two workflows)
        if len(args.chroms) == 0:
            args.chroms = ['ALL_CHROMS']

        buildvg_err_msg = "Invalid arguments for grafimo buildvg"

        # checks for buildvg workflow
        if args.workflow == "buildvg":
            # findmotif-only options given, or findmotif defaults overridden
            findmotif_options_given = (
                args.graph_genome_dir
                or args.graph_genome
                or args.bedfile
                or args.motif
                or args.bgfile != 'UNIF'
                or args.pseudo != 0.1
                or args.threshold != 1e-4
                or args.no_qvalue
                or args.no_reverse
                or args.text_only
                or args.qval_t
                or args.recomb
                or args.top_graphs != 0
            )
            if findmotif_options_given:
                abort(buildvg_err_msg)
            elif not args.linear_genome:
                abort("No reference genome given")
            elif not args.vcf:
                abort("No VCF file given")
            else:
                # check linear genome
                if args.linear_genome.split('.')[-1] not in ('fa', 'fasta'):
                    abort("The linear genome must be in FASTA format (FASTA and "
                          "FA extensions allowed)")
                else:
                    if len(glob.glob(args.linear_genome)) != 1:
                        abort('Cannot find the given reference genome file')
                    args.linear_genome = os.path.abspath(args.linear_genome)

                # check VCF --> the VCF must have been compressed with
                # bgzip (https://github.com/samtools/tabix); the length
                # guard avoids an IndexError on names without any '.'
                vcf_fields = args.vcf.split('.')
                if (len(vcf_fields) < 2
                        or vcf_fields[-1] not in ('gz', 'zip')
                        or vcf_fields[-2] != 'vcf'):
                    abort("Incorrect VCF file given: the VCF must be compressed "
                          "(e.g. myvcf.vcf.gz)")
                else:
                    if len(glob.glob(args.vcf)) <= 0:
                        abort('Cannot find the given VCF file')
                    args.vcf = os.path.abspath(args.vcf)

                # by default the built VGs will be stored in the current
                # directory
                if args.out == "":  # general default value
                    args.out = os.path.abspath("./")

                workflow: BuildVG = BuildVG(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs" %
                          (end_args_parse - start_args_parse))

        findmotif_err_msg = "Invalid arguments for grafimo findmotif"

        # checks for findmotif workflow
        if args.workflow == "findmotif":
            if args.linear_genome:
                abort(findmotif_err_msg)
            elif args.vcf:
                abort(findmotif_err_msg)
            elif args.reindex:  # if default value is ignored
                abort(findmotif_err_msg)
            elif not args.graph_genome_dir and not args.graph_genome:
                abort("No genome variation graph or directory containing them given")
            elif not args.bedfile:
                abort("No BED file given")
            elif not args.motif:
                abort("No motif file (MEME of JASPAR format) given")
            else:
                # only one between graph_genome and graph_genome_dir
                # are allowed
                if args.graph_genome and args.graph_genome_dir:
                    # the message used to name the buildvg workflow by mistake
                    abort(findmotif_err_msg)

                # check graph_genome
                if args.graph_genome:
                    if args.graph_genome.split('.')[-1] not in ('xg', 'vg'):
                        abort("Cannot use the given genome variation graph (only "
                              "VG or XG format allowed)")
                    elif not os.path.isfile(args.graph_genome):
                        abort("Unable to find the given variation genome graph")
                    else:
                        # it is safer to use absolute path to avoid bugs
                        args.graph_genome = os.path.abspath(args.graph_genome)

                # check graph_genome_dir
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        abort("Cannot find the given directory containing the "
                              "genome variation graphs")

                    if args.graph_genome_dir[-1] == '/':
                        graph_genome_dir = args.graph_genome_dir
                    else:
                        graph_genome_dir = ''.join([args.graph_genome_dir, '/'])

                    if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
                        abort(' '.join(['No XG genome variation graph found in',
                                        graph_genome_dir]))
                    else:
                        args.graph_genome_dir = os.path.abspath(graph_genome_dir)

                # check BED file
                if args.bedfile:
                    if args.bedfile.split('.')[-1] != 'bed':
                        abort('Incorrect BED file given')
                    else:
                        if len(glob.glob(args.bedfile)) <= 0:
                            parser.error('Cannot find the given BED file')
                else:
                    parser.error('No BED file given')

                # check motif file
                if not args.motif:
                    parser.error('No motif given')
                else:
                    for m in args.motif:
                        # existence is checked first, so that a missing file
                        # is reported as such before the format checks run
                        if len(glob.glob(m)) <= 0:
                            abort('Cannot find motif file: ' + m)
                        if not isMEME_ff(m) and not isJaspar_ff(m):
                            abort("Unrecognized motif file format (only MEME or JASPAR allowed)")

                # check background file ('UNIF' means no file was given)
                if args.bgfile != 'UNIF':
                    if len(glob.glob(args.bgfile)) <= 0:
                        abort('Cannot find the given background file')

                # check pseudocount
                if args.pseudo <= 0:
                    abort('The pseudocount cannot be less than or equal 0')

                # check threshold
                if args.threshold <= 0 or args.threshold > 1:
                    abort('The pvalue threshold must be between 0 and 1')

                # check q-value, no reverse, text only and recombinant flags
                bool_flag_checks = (
                    (args.no_qvalue,
                     "The --qvalue parameter accepts only True or False as "
                     "values"),
                    (args.no_reverse,
                     "The --no-reverse parameter accepts only True or False "
                     "as values"),
                    (args.text_only,
                     "The --text-only parameter accepts only True or False "
                     "values"),
                    (args.recomb,
                     "The --recomb parameter accepts only True or False values"),
                )
                for flag_value, flag_errmsg in bool_flag_checks:
                    if not isinstance(flag_value, bool):
                        abort(flag_errmsg)

                # out directory
                if args.out == '':  # default option
                    args.out = DEFAULT_OUTDIR

                # check threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    abort("The --qvalueT parameter accepts only True or False as values")
                elif args.no_qvalue and args.qval_t:
                    abort("Cannot apply the threshold on q-values if you don't "
                          "want them")

                # check the number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error(
                        "The number of region graphs to show must be positive")

                workflow: Findmotif = Findmotif(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs" %
                          (end_args_parse - start_args_parse))

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps: float = time.time()

        satisfied: bool
        deps_lack: List[str]
        satisfied, deps_lack = check_deps()

        if not satisfied and len(deps_lack) > 0:
            raise DependencyError("\n\nERROR: The following dependencies are not"
                                  " sastisfied: " +
                                  str(deps_lack) +
                                  "\nPlease, solve them before running GRAFIMO")
        elif not satisfied and len(deps_lack) <= 0:
            raise DependencyError("Some dependencies were found, but was not "
                                  "possible to track them.\n"
                                  "Be sure they are available in system PATH")

        if args.verbose and satisfied:
            end_deps: float = time.time()
            print("Dependencies correctly satisfied")
            print("Dependencies checked in %.2fs" % (end_deps - start_deps))

        ################################################################
        # dependency check was ok, so we go to workflow selection:
        #   * creation of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG
        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset
            # of them
            buildvg(workflow)
        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)
        else:
            raise ValueError("Unknown arguments object type")

        end: float = time.time()  # GRAFIMO execution finishes here
        print("Elapsed time %.2fs" % (end - start))

    except KeyboardInterrupt:
        sigint_handler()
def get_kmers(queries: List[str],
              pool: mp.Pool,
              verbose: Optional[bool] = False) -> None:
    """Extract the genomic sequences (both from reverse and forward strands)
    in the queried regions from the VG.

    The sequence extraction is performed in parallel: every query is handed
    to ``get_seqs`` through ``pool.map_async``. When ``verbose`` is False a
    progress bar is printed while waiting for the workers.

    Parameters
    ----------
    queries : list
        set of queries to perform on the graph to extract the motif
        occurrence candidates
    pool : multiprocessing.Pool
        pool of parallel processes to run; it is closed on success and
        terminated on KeyboardInterrupt
    verbose : bool, optional
        flag used to define if additional information has to be printed

    Raises
    ------
    TypeError
        if ``queries`` is not a list
    """
    if not isinstance(queries, list):
        # TypeError is a subclass of the bare Exception raised previously,
        # so callers catching Exception keep working
        raise TypeError(
            "Expected list, got {}.".format(type(queries).__name__)
        )

    if verbose:
        start_re: float = time.time()

    # extract regions
    try:
        # query the VGs
        res = pool.map_async(get_seqs, queries)

        if not verbose:
            # Take the total BEFORE polling: the result may already be ready
            # at the first check, and the final progress-bar call needs it.
            tot = res._number_left
            while not res.ready():
                remaining = res._number_left
                printProgressBar(
                    (tot - remaining), tot, prefix='Progress:',
                    suffix='Complete', length=50
                )
                time.sleep(2)
            # when finished call printProgressBar() for the last time
            printProgressBar(
                tot, tot, prefix='Progress:', suffix='Complete', length=50
            )

        # wait for the workers; the returned values are not used here
        res.get(60 * 60 * 60)  # does not ignore signals

    except KeyboardInterrupt:
        pool.terminate()
        sigint_handler()
    else:
        pool.close()
        if verbose:
            end_re: float = time.time()
            print(
                "Extracted sequences from all regions in %.2fs"
                % (end_re - start_re)
            )
def compute_results(
    motif: Motif,
    sequence_loc: str,
    debug: bool,
    args_obj: Optional[Findmotif] = None,
    testmode: Optional[bool] = False,
) -> pd.DataFrame:
    """Score the sequences extracted from the genome variation graph.

    The potential motif occurrences are scored using the scaled scoring
    matrix. The scaled values are then used to retrieve the corresponding
    P-value.

    The ``*.tsv`` files found in ``sequence_loc`` are split in ``cores``
    chunks and each chunk is scored by a separate process running
    ``score_seqs``. The partial results are then merged, the q-values are
    computed (unless disabled) and everything is summarized in a DataFrame.
    Outside test mode ``sequence_loc`` is removed once scoring is done.

    Parameters
    ----------
    motif : Motif
        motif object
    sequence_loc : str
        path to the directory storing the extracted sequences
    debug : bool
        trace the full error stack
    args_obj : Findmotif, optional
        commandline arguments container (required when ``testmode`` is False)
    testmode : bool, optional
        test (manually set); fixed default parameters are used instead of
        those stored in ``args_obj``

    Returns
    -------
    pandas.DataFrame
        results
    """
    if not isinstance(motif, Motif):
        errmsg = "Expected Motif, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif).__name__), debug)
    if not isinstance(sequence_loc, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(
            TypeError, errmsg.format(type(sequence_loc).__name__), debug
        )
    if not os.path.isdir(sequence_loc):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(sequence_loc), debug)

    if not testmode:
        if not isinstance(args_obj, Findmotif):
            errmsg = "Expected Findmotif, got {}.\n"
            exception_handler(
                TypeError, errmsg.format(type(args_obj).__name__), debug
            )
        cores: int = args_obj.cores
        threshold: float = args_obj.threshold
        no_qvalue: bool = args_obj.noqvalue
        qval_t: bool = args_obj.qvalueT
        no_reverse: bool = args_obj.noreverse
        recomb: bool = args_obj.recomb
        verbose: bool = args_obj.verbose
    else:  # pytest - during normal execution we should never go here
        cores = 1
        threshold = float(1)
        recomb = True
        no_qvalue = False
        qval_t = False
        no_reverse = False
        verbose = False
    assert threshold > 0 and threshold <= 1
    assert cores >= 1

    no_results_errmsg = (
        "No result retrieved. Unable to proceed. Are you using the correct "
        "VGs and searching on the right chromosomes?\n"
    )

    print_scoring_msg(motif, no_reverse, debug)
    cwd: str = os.getcwd()
    os.chdir(sequence_loc)
    sequences: List[str] = glob.glob('*.tsv')  # sequences
    if not sequences:
        # nothing to score: report it explicitly rather than failing later
        # in np.array_split(), which rejects a split in 0 sections
        os.chdir(cwd)
        exception_handler(ValueError, no_results_errmsg, debug)
    manager: SyncManager = mp.Manager()
    return_dict: DictProxy = manager.dict()  # results
    scanned_nucs_dict: DictProxy = manager.dict()  # scanned nucleotides
    scanned_seqs_dict: DictProxy = manager.dict()  # scanned sequences
    if len(sequences) < cores:
        cores = len(sequences)
    # split the sequence set in no. cores chunks
    sequences_split = np.array_split(sequences, cores)
    jobs = list()  # jobs list
    proc_finished: int = 0
    # overwrite the default SIGINT handler to exit gracefully
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
    # NOTE(review): the previous handler is restored before the worker
    # processes below are started -- confirm this is the intended order
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGINT, original_sigint_handler)
    if verbose:
        start_s: float = time.time()
    try:
        for i in range(cores):
            p = mp.Process(
                target=score_seqs,
                args=(
                    sequences_split[i], motif, no_reverse, return_dict,
                    scanned_seqs_dict, scanned_nucs_dict, i, debug
                )
            )
            jobs.append(p)
            p.start()
        # to print 0%, otherwise start from % as first chunk id already
        # completed
        printProgressBar(
            proc_finished, cores, prefix='Progress:', suffix='Complete',
            length=50
        )
        for job in jobs:
            job.join()  # sync point
            proc_finished += 1
            printProgressBar(
                proc_finished, cores, prefix='Progress:', suffix='Complete',
                length=50
            )
    except KeyboardInterrupt:
        sigint_handler()
        die(2)
    else:
        if verbose:
            end_s: float = time.time()
            print("Scored all sequences in %.2fs" % (end_s - start_s))
    os.chdir(cwd)

    if not testmode:
        # remove the extracted sequences; the command is run as an argument
        # list (no shell), so paths containing spaces or shell
        # metacharacters are handled safely
        cmd: str = "rm -rf {}".format(sequence_loc)  # used for reporting only
        code: int = subprocess.call(["rm", "-rf", sequence_loc])
        if code != 0:
            errmsg = "An error occurred while executing {}.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)

    if verbose:
        start_df: float = time.time()
    # recover all analysis results and summarize them in a single
    # data structure
    seqs_scanned: int = 0
    nucs_scanned: int = 0
    summary = ResultTmp()
    for key in return_dict.keys():
        partialres = return_dict[key]
        # the first ten fields of each partial result feed the summary
        summary.append_list(*partialres[:10])
        seqs_scanned += scanned_seqs_dict[key]
        nucs_scanned += scanned_nucs_dict[key]
    if summary.isempty():
        exception_handler(ValueError, no_results_errmsg, debug)

    # compute the q-values
    if not no_qvalue:
        if verbose:
            start_q = time.time()
        qvalues = compute_qvalues(summary.pvalues, debug)
        summary.add_qvalues(qvalues)
        if verbose:
            end_q = time.time()
            print("Q-values computed in %.2fs." % (end_q - start_q))
    print("Scanned sequences:\t{}".format(seqs_scanned))
    print("Scanned nucleotides:\t{}".format(nucs_scanned))

    # summarize results in a pandas DataFrame
    finaldf = summary.to_df(
        motif, threshold, qval_t, recomb, ignore_qvals=no_qvalue
    )
    if verbose:
        end_df: float = time.time()
        print("\nResults summary built in %.2fs" % (end_df - start_df))
    return finaldf
def build_motif_MEME(motif_file, bg_file, pseudocount, no_reverse, cores, verbose):
    """
    Read the motifs stored in a MEME file and process each of them with
    process_motif_for_logodds().

    When the number of motifs read is at least `cores`, the processing is
    distributed over a pool of `cores` worker processes and a progress bar
    is printed while waiting; otherwise the motifs are processed one after
    the other in the current process.
    ----
    Parameters:
        motif_file (str) : path to the motif file
        bg_file (str) : path to the background file
        pseudocount (float) : value to add to the motif counts
        no_reverse (bool) : if set to True, only data related to
                            forward strand will be used
        cores (int) : number of cores to use, during motif processing
        verbose (bool) : if True, print how long reading and processing
                         the motifs took
    ----
    Returns:
        complete_motifs (list) : values returned by
                                 process_motif_for_logodds(), one for each
                                 motif read from motif_file (presumably the
                                 fully processed Motif objects)
    ----
    Raises:
        FileNotFoundError : motif_file is empty or None
        NotValidFFException : motif_file is not in MEME format
    """

    if not motif_file:
        raise FileNotFoundError("\n\nERROR: the motif file is missing")

    # inputs in any other format should never reach this function
    if not isMEME_ff(motif_file):
        raise NotValidFFException(
            "\n\nERROR: the given motif file is not in MEME format")

    if verbose:
        start_rm_all = time.time()

    # read the motif file
    motif_lst = read_MEME_motif(motif_file, bg_file, pseudocount, no_reverse,
                                verbose)
    motif_num = len(motif_lst)

    if verbose:
        end_rm_all = time.time()
        msg = ''.join([
            "\nRead all motif contained in ", motif_file, " in ",
            str(end_rm_all - start_rm_all), "s"
        ])
        print(msg)

    print("\nRead", motif_num, "motifs in", motif_file)
    print("\nProcessing motifs\n")

    if verbose:
        start_mp_all = time.time()

    if motif_num >= cores:  # worth to use multiprocessing
        # SIGINT is ignored while the pool is created and the previous
        # handler is restored right after, to exit gracefully on Ctrl+C
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = mp.Pool(processes=cores)  # use #cores processes
        signal.signal(signal.SIGINT, original_sigint_handler)

        try:
            res = pool.map_async(process_motif_for_logodds, motif_lst)

            # The total is read once, before polling, so that it is always
            # defined: the previous code assigned it inside the loop and
            # crashed when the pool had already finished at the first check.
            # NOTE: _number_left is a private attribute of the async result
            # (number of task chunks still pending); it is clamped to 1 so
            # the progress bar never receives a zero total.
            tot = max(res._number_left, 1)
            while not res.ready():
                remaining = res._number_left
                printProgressBar(max(tot - remaining, 0), tot,
                                 prefix='Progress:', suffix='Complete',
                                 length=50)
                # wait at most 2s, but wake up as soon as the work is done
                res.wait(2)

            # when finished call for the last time printProgressBar()
            printProgressBar(tot, tot, prefix='Progress:', suffix='Complete',
                             length=50)

            complete_motifs = res.get(60 * 60 * 60)  # does not ignore signals

        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()
            # presumably sigint_handler() ends the program; should it ever
            # return, no result is available (same as the implicit None
            # returned previously)
            return None

        except Exception:
            # do not leave worker processes behind when a task fails
            pool.terminate()
            raise

        else:
            pool.close()
            pool.join()

    else:
        # the sequential execution is fine
        complete_motifs = [process_motif_for_logodds(m) for m in motif_lst]

    if verbose:
        end_mp_all = time.time()
        msg = ''.join([
            "Processed all motifs contained in ", motif_file, " in ",
            str(end_mp_all - start_mp_all), "s"
        ])
        print(msg)

    return complete_motifs
def get_regions(motif, args_obj):
    """
    Compute all sequences of length L (L is the motif width) from the VG(s).
    The sequences are extracted from the regions defined in the input BED
    file.

    For each BED region lying on one of the requested chromosomes a
    `vg find` command is built; the commands are then executed in parallel
    by get_seqs() from inside a newly created temporary directory, where
    each command redirects its output to a '.tsv' file named after the
    region. The original working directory is restored before returning,
    also when an error occurs.
    ----
    Parameters:
        motif (Motif) : motif to search on the VG
        args_obj (Findmotif) : object storing the arguments required to
                               extract the regions defined in the BED
                               file, from the VG(s)
    ----
    Return:
        sequence_loc (str) : location of the tmp files, containing the
                             extracted sequences
    ----
    Raises:
        ValueError : motif is not a Motif instance
        VGException : the genome variation graph is missing or it is not
                      in XG format
        FileNotFoundError : an XG file required by a query does not exist
    """

    # the verbosity flag is published as a module-level global
    global verbose

    # check the input arguments
    if not isinstance(motif, Motif):
        errmsg = "\n\nERROR: unknown motif object type"
        raise ValueError(errmsg)

    # The graph mode is decided once here; the extraction below used to
    # re-test the two conditions in the opposite order and ended in an
    # unreachable branch whose raise statement was itself broken
    # ('"...".Exiting' is an attribute access on a str).
    if args_obj.has_graph_genome():
        vg = args_obj.get_graph_genome()
        if not isGraph_genome_xg(vg):
            errmsg = "\n\nERROR: the genome variation graph is not in XG format"
            raise VGException(errmsg)
        graph_dir = False  # vg -> a single XG file
    elif args_obj.has_graph_genome_dir():
        vg = args_obj.get_graph_genome_dir()
        # vg -> directory containing a set of VGs (one '<chrom>.xg' each)
        if not vg.endswith("/"):
            vg = ''.join([vg, "/"])
        graph_dir = True
    else:
        raise VGException("\n\nERROR: the genome variation graph is missing")

    bedfile = args_obj.get_bedfile()
    motif_width = motif.getWidth()
    chroms = args_obj.get_chroms()
    cores = args_obj.get_cores()
    verbose = args_obj.get_verbose()

    print("\nExtracting regions defined in", bedfile, "\n")

    # read the regions where search the motif occurrences from the given
    # BED file
    regions = getBEDregions(bedfile)

    if verbose:
        print("\nFound", len(regions), "regions in", bedfile)

    # user defined subset of the chromosomes, or all of them; kept in a set
    # since it is only used for membership tests
    chr_names = chroms if chroms else CHROMS_LIST
    chr_set = {''.join(['chr', c]) for c in chr_names}

    # mkdtemp() creates a new, uniquely named directory: there is no need to
    # remove it and create it again through the shell, which also discarded
    # the owner-only permissions set by mkdtemp()
    tmpwd = tempfile.mkdtemp(prefix='grafimo_')

    cwd = os.getcwd()

    # enter the tmp dir where store the extracted sequences
    os.chdir(tmpwd)

    try:
        # the extracted sequences are stored in the (new) cwd
        sequence_loc = os.getcwd()

        if verbose:
            start_re = time.time()

        # Build and validate all the queries before starting the workers, so
        # that a missing XG file does not leave a pool behind.
        # NOTE(review): each query is a shell command line built from
        # user-provided paths without quoting; presumably get_seqs() runs it
        # through a shell (it relies on '>' redirection) -- confirm the
        # paths are trusted.
        queries = []  # set of queries
        for region in regions:
            chrom = region['chr']
            if chrom not in chr_set:
                continue  # the chromosome is not among the ones to query

            start = region['start']
            stop = region['stop']

            region_index = ''.join([chrom, ':', str(start), '-', str(stop)])
            region_name = ''.join([chrom, '_', str(start), '-', str(stop)])
            seqs = correct_path('./', region_name, '.tsv')

            xg = ''.join([vg, chrom, '.xg']) if graph_dir else vg
            if not os.path.exists(xg):
                errmsg = ''.join(
                    ["\n\nERROR: unable to use ", xg, ". Exiting"])
                raise FileNotFoundError(errmsg)

            query = 'vg find -x {0} -E -p {1} -K {2} > {3}'.format(
                xg, region_index, motif_width, seqs)
            queries.append(query)

        # SIGINT is ignored while the pool is created and the previous
        # handler is restored right after, to exit gracefully on Ctrl+C
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = mp.Pool(processes=cores)  # use #cores processes
        signal.signal(signal.SIGINT, original_sigint_handler)

        # extract regions
        try:
            # query the VGs
            res = pool.map_async(get_seqs, queries)

            if not verbose:
                # The total is read once, before polling, so that it is
                # always defined: the previous code assigned it inside the
                # loop and crashed when the work was already done at the
                # first check (e.g. when no query was generated).
                # NOTE: _number_left is a private attribute of the async
                # result (number of task chunks still pending); it is
                # clamped to 1 so the progress bar never gets a zero total.
                tot = max(res._number_left, 1)
                while not res.ready():
                    remaining = res._number_left
                    printProgressBar(max(tot - remaining, 0), tot,
                                     prefix='Progress:', suffix='Complete',
                                     length=50)
                    # wait at most 2s, but wake up as soon as work is done
                    res.wait(2)

                # when finished call for the last time printProgressBar()
                printProgressBar(tot, tot, prefix='Progress:',
                                 suffix='Complete', length=50)

            # re-raises any error occurred in the workers; the returned
            # values are not used
            res.get(60 * 60 * 60)  # does not ignore signals

        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()

        except Exception:
            # do not leave worker processes behind when a query fails
            pool.terminate()
            raise

        else:
            pool.close()
            pool.join()

            if verbose:
                end_re = time.time()
                msg = ''.join([
                    "Extracted all regions from VGs stored in ", vg, ", in ",
                    str(end_re - start_re), "s"
                ])
                print(msg)

    finally:
        os.chdir(cwd)  # get back to the origin, even on failure

    return sequence_loc