def get_motif_pwm(motif_file, args_obj, cores):
    """
    Build the Motif object(s) described by a PWM file.

    The file format is detected with ``isJaspar_ff`` / ``isMEME_ff`` and the
    matching builder is called. JASPAR detection is tried first, so a file
    recognised by both sniffers is handled by the JASPAR builder.
    ----
    Parameters:
        motif_file (str) : motif file to process
        args_obj (Findmotif): data-structure containing the parameters
                              to scan a given VG or a set of VGs; the
                              background, pseudocount, no-reverse and
                              verbose settings are read through its getters
        cores (int) : number of cores, forwarded to the MEME builder only
    ----
    Returns:
        motif (list) : builder result, wrapped in a list when the builder
                       did not already return one
    ----
    Raises:
        FileNotFoundError : motif_file is empty or None
        NotValidFFException : the file is neither MEME nor JASPAR
    """

    # get arguments required to process the motif
    bgs = args_obj.get_bgfile()
    pseudo = args_obj.get_pseudo()
    no_reverse = args_obj.get_no_reverse()
    verbose = args_obj.get_verbose()

    if not motif_file:
        raise FileNotFoundError("\n\nERROR: the motif file is missing")

    # Single dispatch: each sniffer runs at most once and the fall-through
    # branch is the only place the "unsupported format" error is raised.
    if isJaspar_ff(motif_file):
        motif = build_motif_JASPAR(motif_file, bgs, pseudo, no_reverse,
                                   verbose)
    elif isMEME_ff(motif_file):
        motif = build_motif_MEME(motif_file, bgs, pseudo, no_reverse, cores,
                                 verbose)
    else:
        raise NotValidFFException(
            "\n\nERROR: the motif file must be in MEME or JASPAR format")
    # end if

    # callers expect a list even when a single motif was built
    if not isinstance(motif, list):
        motif = [motif]

    return motif
def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """Entry point: validate the command line and run the chosen workflow.

    The arguments are parsed, checked for consistency (general options
    first, then the options specific to ``buildvg`` or ``findmotif``), the
    external dependencies are checked with ``check_deps`` and finally
    ``buildvg`` or ``findmotif`` is called on the arguments container.

    Parameters
    ----------
    cmdLineargs : Optional[List[str]]
        command-line arguments; when None ``sys.argv[1:]`` is used

    Returns
    -------
    None

    Notes
    -----
    Invalid arguments are reported through ``parser.error``; each call is
    followed by ``die`` in case the parser's ``error`` returns.
    KeyboardInterrupt is routed to ``sigint_handler``.
    """

    try:
        # starting point of the execution time
        start: float = time.time()
        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()
        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # get input args
        # no arguments given --> print help
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(2)
        # the second argument must be buildvg or findmotif
        if ((cmdLineargs[0] != "-h" and cmdLineargs[0] != "--help")
                and cmdLineargs[0] != "--version"
                and (cmdLineargs[0] != "buildvg"
                     and cmdLineargs[0] != "findmotif")):
            parser.error(
                "The second argument must be one between 'buildvg' and 'findmotif'"
            )
            die(1)
        args: argparse.Namespace = parser.parse_args(cmdLineargs)
        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        #--------------------------------------------------------------#
        # check commandline arguments consistency
        #
        #---------------------- general options -----------------------#
        # workflow type
        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Unexpected workflow given. Available options:\n"
                         "\tbuildvg: construct VG from user data.\n"
                         "\tfindmotif: scan VG for DNA motif(s) occurrences")
            die(1)
        # cpu cores
        if args.cores < 0:
            parser.error("Negative number of CPU cores given")
        elif args.cores == 0 and args.graph_genome:
            # when whole genome variation graph is given, it is safer to
            # use 1 CPU core by default. This because of the space needed
            # to load the whole VG on RAM.
            #
            # CAVEAT: before requiring more CPU cores to be used, be sure
            # your system has enough memory
            args.cores = 1
        elif args.cores == 0:
            # default option -> use all available CPU cores
            args.cores = mp.cpu_count()
        else:  # args.cores > 0
            if args.cores > mp.cpu_count():
                parser.error("Too many CPU cores to use ({})".format(
                    args.cores))
        # verbosity
        if (not isinstance(args.verbose, bool)
                or (args.verbose != False and args.verbose != True)):
            parser.error(
                '\"--verbose\" does not accept any positional argument')
        # debugging
        if (not isinstance(args.debug, bool)
                or (args.debug != False and args.debug != True)):
            parser.error("\"--debug\" does not accept any positional argument")

        #---------------------- buildvg options -----------------------#
        buildvg_err_msg: str = "Unexpected arguments for \"grafimo buildvg\": \"{}\""
        if args.workflow == "buildvg":
            if args.graph_genome_dir:
                parser.error(buildvg_err_msg.format("-d, --genome-graph-dir"))
                die(1)
            elif args.graph_genome:
                parser.error(buildvg_err_msg.format("-g, --genome-graph"))
                die(1)
            elif args.bedfile:
                parser.error(buildvg_err_msg.format("-b, --bedfile"))
                die(1)
            elif args.motif:
                parser.error(buildvg_err_msg.format("-m, --motif"))
                die(1)
            elif args.bgfile != UNIF:  # if default ignored
                parser.error(buildvg_err_msg.format("-k, --bgfile"))
                die(1)
            elif args.pseudo != 0.1:  # if default ignored
                parser.error(buildvg_err_msg.format("-p, --pseudo"))
                die(1)
            elif args.threshold != 1e-4:  # if default ignored
                parser.error(buildvg_err_msg.format("-t, --thresh"))
                die(1)
            elif args.no_qvalue:
                parser.error(buildvg_err_msg.format("-q, --no-qvalue"))
                die(1)
            elif args.no_reverse:
                parser.error(buildvg_err_msg.format("-r, --no-reverse"))
                die(1)
            elif args.text_only:
                parser.error(buildvg_err_msg.format("-f, --text-only"))
                die(1)
            elif args.chroms_find:
                parser.error(buildvg_err_msg.format("--chroms-find"))
                die(1)
            elif args.chroms_prefix_find:
                parser.error(buildvg_err_msg.format("--chroms-prefix-find"))
                die(1)
            elif args.chroms_namemap_find != NOMAP:  # if default ignored
                parser.error(buildvg_err_msg.format("--chroms-namemap-find"))
                die(1)
            elif args.qval_t:
                parser.error(buildvg_err_msg.format("--qvalueT"))
                die(1)
            elif args.recomb:
                parser.error(buildvg_err_msg.format("--recomb"))
                die(1)
            elif args.top_graphs != 0:  # if default ignored
                parser.error(buildvg_err_msg.format("--top-graphs"))
                die(1)
            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)
            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)
            else:
                # arguments for buildvg are correct
                # reference genome
                if (args.linear_genome.split('.')[-1] != 'fa'
                        and args.linear_genome.split('.')[-1] != 'fasta'):
                    parser.error(
                        "The reference genome file must be in FASTA format")
                    die(1)
                else:
                    if not os.path.isfile(args.linear_genome):
                        parser.error("Unable to find {}".format(
                            args.linear_genome))
                        die(1)
                    if os.stat(args.linear_genome).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(
                            args.linear_genome))
                        die(1)
                    args.linear_genome = os.path.abspath(args.linear_genome)
                # VCF --> the VCF file must have been compressed with
                # bgzip (https://github.com/samtools/tabix)
                #
                # Both suffixes are required. The previous test joined the
                # two extension checks with "and", which accepted names such
                # as "x.txt.gz" / "x.vcf.txt" and raised IndexError on names
                # without any dot.
                if not args.vcf.endswith(".vcf.gz"):
                    parser.error(
                        "Wrong VCF file given. The VCF file must have been "
                        "compressed with bgzip (e.g. myvcf.vcf.gz)")
                    die(1)
                else:
                    if not os.path.isfile(args.vcf):
                        parser.error('Unable to find {}'.format(args.vcf))
                        die(1)
                    if os.stat(args.vcf).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(args.vcf))
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)
                # chromosome to construct VG
                if len(args.chroms_build) == 0:
                    args.chroms_build = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_build):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-build\""
                        )
                # chromosome name-map
                if args.chroms_namemap_build != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_build):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_build))
                if (args.chroms_prefix_build
                        and args.chroms_namemap_build != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-build\" and \"chroms-namemap-build\" "
                        "cannot used together. Choose one of those options")
                # if no out directory is specified the VGs are stored in
                # the current directory
                if args.out == "":
                    args.out = os.path.abspath("./")
                workflow: BuildVG = BuildVG(args)
                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        #---------------------- findmotif options -----------------------#
        findmotif_err_msg: str = "Unexpected arguments for \"grafimo findmotif\": \"{}\""
        if args.workflow == "findmotif":
            if args.linear_genome:
                parser.error(findmotif_err_msg.format("-l, --linear-genome"))
                die(1)
            elif args.vcf:
                parser.error(findmotif_err_msg.format("-v, --vcf"))
                die(1)
            elif args.chroms_build:
                parser.error(findmotif_err_msg.format("--chroms-build"))
            elif args.chroms_prefix_build:
                parser.error(findmotif_err_msg.format("--chroms-prefix-build"))
            elif args.chroms_namemap_build != NOMAP:
                parser.error(
                    findmotif_err_msg.format("--chroms-namemap-build"))
            elif args.reindex:  # if default ignored
                parser.error(findmotif_err_msg.format("--reindex"))
                die(1)
            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error(
                    "No arguments given for both \"--genome-graph\" and \"--genome-graph-dir\""
                )
                die(1)
            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)
            elif not args.motif:
                parser.error("No motif PWM given")
                die(1)
            else:
                # only one between graph_genome and graph_genome_dir is allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error(
                        "Only one argument between \"--genome-graph\" and \"--genome-graph-dir\""
                        " can be used")
                    die(1)
                # genome graph
                if args.graph_genome:
                    if (args.graph_genome.split('.')[-1] != "xg"
                            and args.graph_genome.split('.')[-1] != "vg"):
                        # trailing space added: the two adjacent literals
                        # used to be rendered as "OnlyVG and XG ..."
                        parser.error(
                            "Unrecognized genome variation graph format. Only "
                            "VG and XG format are allowed")
                        die(1)
                    elif not os.path.isfile(args.graph_genome):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome = os.path.abspath(args.graph_genome)
                # genome graphs directory
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome_dir))
                        die(1)
                    if len(glob(os.path.join(args.graph_genome_dir,
                                             "*.xg"))) <= 0:
                        parser.error(
                            "No genome variation graph found in {}".format(
                                args.graph_genome_dir))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome_dir = os.path.abspath(
                            args.graph_genome_dir)
                # BED file
                if args.bedfile:
                    if not isbed(args.bedfile, args.debug):
                        parser.error(
                            "The genomic coordinates must be given in UCSC BED files"
                        )
                        die(1)
                    else:
                        if not os.path.isfile(args.bedfile):
                            parser.error("Unable to locate {}".format(
                                args.bedfile))
                else:
                    parser.error("No BED file given")
                # motif pwm
                if not args.motif:
                    parser.error("No motif PWM given")
                else:
                    motifs: List[str] = args.motif
                    for m in motifs:
                        if not isMEME_ff(m, args.debug) and not isJaspar_ff(
                                m, args.debug):
                            parser.error(
                                "Unrecognized motif PWM file format. "
                                "{} does not follow the MEME or JASPAR format rules"
                                .format(m))
                            die(1)
                        if not os.path.isfile(m):
                            parser.error("Unable to locate {}".format(m))
                # background file
                if args.bgfile != UNIF:
                    if not os.path.isfile(args.bgfile):
                        parser.error("Unable to locate {}".format(args.bgfile))
                # pseudocount
                if args.pseudo <= 0:
                    parser.error(
                        "Pseudocount values must be > 0, got {}".format(
                            args.pseudo))
                    die(1)
                # statistical significance threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error(
                        "Motif statistical significance threshold must be between 0 and 1"
                    )
                    die(1)
                # q-value flag
                if (not isinstance(args.no_qvalue, bool) or
                        (args.no_qvalue != False and args.no_qvalue != True)):
                    parser.error(
                        "\"--qvalue\" accepts only True or False values")
                    die(1)
                # no reverse flag
                if (not isinstance(args.no_reverse, bool) or
                        (args.no_reverse != False and args.no_reverse != True)):
                    parser.error(
                        "\"--no-reverse\" accepts only True or False values")
                    die(1)
                # text only flag
                if (not isinstance(args.text_only, bool) or
                        (args.text_only != False and args.text_only != True)):
                    parser.error(
                        "\"--text-only\" accepts only True or False values")
                    die(1)
                # chromosome to consider during VG scan
                if len(args.chroms_find) == 0:
                    args.chroms_find = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_find):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-find\""
                        )
                # chromosome name-map
                if args.chroms_namemap_find != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_find):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_find))
                if (args.chroms_prefix_find
                        and args.chroms_namemap_find != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-find\" and \"chroms-namemap-find\" "
                        "cannot used together. Choose one of those options")
                # recomb flag
                if (not isinstance(args.recomb, bool)
                        or (args.recomb != False and args.recomb != True)):
                    parser.error(
                        "\"--recomb\" accepts only True or False values")
                    die(1)
                # out directory
                if args.out == "":  # default option
                    args.out = DEFAULT_OUTDIR
                # NOTE(review): looks like leftover debug output -- confirm
                # before removing, the value is printed on every run
                print(args.out)
                # threshold on q-value flag
                if (not isinstance(args.qval_t, bool)
                        or (args.qval_t != False and args.qval_t != True)):
                    # closing quote around the option name restored
                    parser.error(
                        "\"--qvalueT\" accepts only True or False values")
                    die(1)
                elif args.no_qvalue == True and args.qval_t == True:
                    parser.error(
                        "Unable to apply statistical significance threshold on"
                        " q-values if you don't want them")
                    die(1)
                # number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("Negative number of regions to display")
                workflow: Findmotif = Findmotif(args)
                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            sys.stderr.write(
                "Checking GRAFIMO external dependencies {}\n".format(EXT_DEPS))
            start_deps: float = time.time()
        satisfied: bool
        deps_lack: List[str]
        satisfied, deps_lack = check_deps()
        if not satisfied and len(deps_lack) > 0:
            errmsg = "Some dependencies are not satisfied: {}.\nPlease solve them before running GRAFIMO.\n"
            exception_handler(DependencyError, errmsg.format(deps_lack),
                              args.debug)
        elif not satisfied and len(deps_lack) <= 0:
            errmsg = "Dependencies satisfied, but unable to recover them.\n Be sure they are in system PATH.\n"
            exception_handler(DependencyError, errmsg, args.debug)
        if args.verbose and satisfied:
            end_deps: float = time.time()
            print("Dependencies satisfied.")
            print("Dependencies checked in %.2fs." % (end_deps - start_deps))

        #---------------------------------------------------------------
        # dependency check was ok, so we go to workflow selection:
        #   * construction of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG
        if isinstance(workflow, BuildVG):
            buildvg(workflow, args.debug)
        elif isinstance(workflow, Findmotif):
            findmotif(workflow, args.debug)
        else:
            errmsg = "Expected BuildVG or Findmotif, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(workflow).__name__),
                              args.debug)
        end: float = time.time()  # GRAFIMO execution finishes here
        print("Elapsed time %.2fs." % (end - start))
    except KeyboardInterrupt:
        sigint_handler()
def get_motif_pwm(motif_file: str, args_obj: Findmotif, cores: int,
                  debug: bool) -> List[Motif]:
    """Construction of Motif object(s) from a PWM file.

    The file format is detected with ``isJaspar_ff`` / ``isMEME_ff`` and the
    matching builder (``build_motif_JASPAR`` or ``build_motif_MEME``) is
    called. JASPAR detection is tried first. According to the original
    documentation the builders compute the motif scoring matrix and the
    corresponding P-value matrix and store them in Motif objects.

    ...

    Parameters
    ----------
    motif_file : str
        path to motif PWM file (MEME or JASPAR format)
    args_obj : Findmotif
        arguments container; its ``bgfile``, ``pseudo``, ``noreverse`` and
        ``verbose`` attributes are read
    cores : int
        CPU cores, forwarded to the MEME builder only
    debug : bool
        trace the full error stack (forwarded to ``exception_handler`` and
        to the helpers)

    Returns
    -------
    List[Motif]
        builder result, wrapped in a list when the builder did not already
        return one

    Notes
    -----
    Errors are reported through ``exception_handler``, which is presumably
    expected to raise or exit -- confirm; the code after each call relies
    on it not returning.
    """

    bgs: dict = args_obj.bgfile
    pseudo: float = args_obj.pseudo
    no_reverse: bool = args_obj.noreverse
    verbose: bool = args_obj.verbose
    errmsg: str

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)

    # choose motif PWM parsing method; the fall-through branch is the single
    # place where an unsupported format is reported, so each sniffer runs at
    # most once per call
    if isJaspar_ff(motif_file, debug):
        motif = build_motif_JASPAR(motif_file, bgs, pseudo, no_reverse,
                                   verbose, debug)
    elif isMEME_ff(motif_file, debug):
        motif = build_motif_MEME(motif_file, bgs, pseudo, no_reverse, cores,
                                 verbose, debug)
    else:
        errmsg = "Motif PWM must be in MEME or JASPAR format.\n"
        exception_handler(MotifFileFormatError, errmsg, debug)

    # list instance required to proceed
    if not isinstance(motif, list):
        motif = [motif]
    return motif
def build_motif_MEME(motif_file: str, bg_file: str, pseudocount: float,
                     no_reverse: bool, cores: int, verbose: bool,
                     debug: bool) -> List[Motif]:
    """Read motif PWMs in MEME format and process each of them.

    The motifs are read with ``read_MEME_motif`` and every motif is then
    passed to ``process_motif_for_logodds``. When the file holds at least
    ``cores`` motifs the processing is distributed over a process pool and a
    progress bar is printed while waiting; otherwise motifs are processed
    one after the other in the current process.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    bg_file
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand-handling flag, forwarded unchanged to ``read_MEME_motif``
    cores : int
        number of CPU cores (size of the process pool)
    verbose : bool
        print additional (timing) information
    debug : bool
        trace the full error stack

    Returns
    -------
    List[Motif]
        results of ``process_motif_for_logodds``, one per motif read.
        If the run is interrupted and ``sigint_handler`` returns, the
        function falls through and implicitly returns None.
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isMEME_ff(motif_file, debug):
        errmsg = "Required MEME motif PWM parsing, but {} is not in MEME format.\n"
        exception_handler(MotifFileFormatError, errmsg.format(motif_file),
                          debug)

    if verbose:
        start_rm_all: float = time.time()
    motif_lst: List[Motif] = read_MEME_motif(motif_file, bg_file, pseudocount,
                                             no_reverse, verbose, debug)
    motif_num: int = len(motif_lst)
    if verbose:
        end_rm_all: float = time.time()
        print("Read all motifs in %s in %.2fs." %
              (motif_file, (end_rm_all - start_rm_all)))
    print("\nRead {} motifs in {}".format(motif_num, motif_file))
    print("\nProcessing motifs\n")

    complete_motifs = list()  # fully processed motifs
    if verbose:
        start_mp_all: float = time.time()

    if motif_num >= cores:  # worth to use multiprocessing
        # Workers are forked while SIGINT is ignored, then the parent gets
        # its handler back, so that only the parent reacts to Ctrl+C.
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool: mp.Pool = mp.Pool(processes=cores)
        signal.signal(signal.SIGINT, original_sigint_handler)
        try:
            args = [(motif, debug) for motif in motif_lst]
            res = pool.starmap_async(process_motif_for_logodds, args)
            # Total is captured before polling: the previous loop assigned
            # it only after a first "not ready" poll, so a result that was
            # already complete hit an unbound name. Clamped to 1 in case
            # every chunk finished before it could be read.
            tot = max(res._number_left, 1)
            # ---- progress bar
            while not res.ready():
                remaining = res._number_left
                printProgressBar((tot - remaining), tot, prefix='Progress:',
                                 suffix='Complete', length=50)
                time.sleep(1)
            # when finished call for the last time printProgressBar()
            printProgressBar(tot, tot, prefix='Progress:', suffix='Complete',
                             length=50)
            complete_motifs += res.get(60 * 60 * 60)  # does not ignore signals
        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()
        except Exception:
            # do not leave worker processes behind when a worker fails or
            # the result times out; the error itself is propagated as before
            pool.terminate()
            raise
        else:
            pool.close()
            pool.join()
            if verbose:
                end_mp_all: float = time.time()
                print("Processed motif(s) in %s in %.2fs" %
                      (motif_file, (end_mp_all - start_mp_all)))
            return complete_motifs
    else:
        for m in motif_lst:  # process each found motif
            complete_motifs.append(process_motif_for_logodds(m, debug))
        if verbose:
            end_mp_all: float = time.time()
            print("Processed motif(s) in %s in %.2fs" %
                  (motif_file, (end_mp_all - start_mp_all)))
        return complete_motifs
def main(cmdLineargs=None):
    """
    Main function of GRAFIMO.

    The arguments given in input are checked for consistency (options
    shared by the workflows first, then the ones specific to buildvg or
    findmotif), the external dependencies are checked with check_deps()
    and finally buildvg() or findmotif() is called.
    ----
    Parameters:
        cmdLineargs (list) : command-line arguments; when None
                             sys.argv[1:] is used
    ----
    Returns:
        None
    ----
    Raises:
        DependencyError : check_deps() reports unsatisfied dependencies
        ValueError : the arguments container is neither BuildVG nor
                     Findmotif
    """

    try:
        # starting point of the execution time
        start = time.time()

        # read the command-line arguments
        parser = get_AP()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # take input args

        # the second argument must be buildvg or findmotif
        # (an empty argument list is reported the same way instead of
        # failing with IndexError on cmdLineargs[0])
        if not cmdLineargs or (
                (cmdLineargs[0] != "-h" and cmdLineargs[0] != "--help")
                and cmdLineargs[0] != "--version"
                and (cmdLineargs[0] != "buildvg"
                     and cmdLineargs[0] != "findmotif")):
            parser.error("The second argument must be one between 'buildvg' and 'findmotif'")
            die(1)

        args = parser.parse_args(cmdLineargs)  # parse args

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse = time.time()

        #####################################################################
        # check arguments consistency
        #####################################################################

        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Do not know what to do. Available options: create VGs with 'grafimo buildvg' or scan a "
                         "precomputed genome variation graph with 'grafimo findmotif'")
            die(1)

        # cores (shared by the two workflows)
        if args.cores < 0:
            parser.error("The number of cores cannot be negative")
        elif args.cores == 0 and args.graph_genome:
            # to query a whole genome graph is loaded into RAM, since usually are
            # very heavy in terms of bytes is safer to use 1 thread by default, otherwise
            # it would be loaded #cores times. If you want use more cores, be sure
            # your system can handle the resulting amount of data
            args.cores = 1
        elif args.cores == 0:
            args.cores = mp.cpu_count()  # by default take all the available CPUs
        # end if

        # check verbose flag
        if (not isinstance(args.verbose, bool) or
                (args.verbose != False and args.verbose != True)):
            parser.error('The --verbose parameter accepts only True or False values')

        # chromosomes check (shared by the two workflows)
        for c in args.chroms:
            if c not in CHROMS_LIST:
                parser.error("Invalid chromosome")
        args.chroms = initialize_chroms_list(args.chroms)

        # checks for buildvg workflow
        if args.workflow == "buildvg":
            if args.graph_genome_dir:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.graph_genome:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.bedfile:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.motif:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.bgfile != 'UNIF':  # if default ignored
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.pseudo != 0.1:  # if default ignored
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.threshold != 1e-4:  # if default ignored
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.no_qvalue:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.no_reverse:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.text_only:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.qval_t:
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif args.top_graphs != 0:  # if default ignored
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)
            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)
            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)
            else:
                # check linear genome
                if (args.linear_genome.split('.')[-1] != 'fa' and
                        args.linear_genome.split('.')[-1] != 'fasta'):
                    parser.error('The linear genome must be in FASTA format (FASTA and FA extensions allowed)')
                    die(1)
                else:
                    if len(glob.glob(args.linear_genome)) != 1:
                        parser.error('Cannot find the given reference genome file')
                        die(1)
                    args.linear_genome = os.path.abspath(args.linear_genome)
                # end if

                # check VCF: allow only compressed VCF files. The length
                # guard turns a name with no "vcf" component (e.g. "gz")
                # into the regular error instead of an IndexError.
                vcf_parts = args.vcf.split('.')
                if (len(vcf_parts) < 2 or
                        (vcf_parts[-1] != 'gz' and vcf_parts[-1] != 'zip') or
                        vcf_parts[-2] != 'vcf'):
                    parser.error('Incorrect VCF file given: the VCF must be compressed (e.g. myvcf.vcf.gz)')
                    die(1)
                else:
                    if len(glob.glob(args.vcf)) <= 0:
                        parser.error('Cannot find the given VCF file')
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # by default the built VGs will be stored in the current directory
                if args.out == "grafimo_out":  # general default value
                    args.out = os.path.abspath("./")

                workflow = BuildVG(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print(''.join(["Arguments parsed in ", str(end_args_parse - start_args_parse), "s"]))
            # end if
        # end if

        # checks for findmotif workflow
        if args.workflow == "findmotif":
            if args.linear_genome:
                parser.error("Invalid arguments for grafimo findmotif")
                die(1)
            elif args.vcf:
                # message used to name the wrong workflow ("buildvg")
                parser.error("Invalid arguments for grafimo findmotif")
                die(1)
            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error("No genome variation graph or directory containing them given")
                die(1)
            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)
            elif not args.motif:
                parser.error("No motif file (MEME of JASPAR format) given")
                die(1)
            else:
                # only one between graph_genome and graph_genome_dir allowed
                if args.graph_genome and args.graph_genome_dir:
                    # message used to name the wrong workflow ("buildvg")
                    parser.error("Invalid arguments for grafimo findmotif")
                    die(1)

                # check graph_genome
                if args.graph_genome:
                    if (args.graph_genome.split('.')[-1] != 'xg' and
                            args.graph_genome.split('.')[-1] != 'vg'):
                        parser.error("Cannot use the given genome variation graph (only VG or XG format allowed)")
                        die(1)
                    elif not os.path.isfile(args.graph_genome):
                        parser.error("Unable to find the given variation genome graph")
                        die(1)
                    else:
                        graph_genome = os.path.abspath(args.graph_genome)  # safer to use absolute path
                        args.graph_genome = graph_genome
                    # end if
                # end if

                # check graph_genome_dir
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error("Cannot find the given directory containing the genome variation graphs")
                        die(1)

                    if args.graph_genome_dir[-1] == '/':
                        graph_genome_dir = args.graph_genome_dir
                    else:
                        graph_genome_dir = ''.join([args.graph_genome_dir, '/'])
                    # end if

                    if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
                        parser.error(' '.join(['No XG genome variation graph found in', graph_genome_dir]))
                        die(1)
                    else:
                        graph_genome_dir = os.path.abspath(graph_genome_dir)
                        args.graph_genome_dir = graph_genome_dir
                    # end if
                # end if

                # check BED file
                if args.bedfile:
                    if args.bedfile.split('.')[-1] != 'bed':
                        parser.error('Incorrect BED file given')
                        die(1)
                    else:
                        bedfile = args.bedfile
                        if len(glob.glob(bedfile)) <= 0:
                            parser.error('Cannot find the given BED file')
                    # end if
                else:
                    parser.error('No BED file given')
                # end if

                # check motif file
                if not args.motif:
                    parser.error('No motif given')
                else:
                    motifs = args.motif

                    # check if the given motifs exist
                    for m in motifs:
                        if not isMEME_ff(m) and not isJaspar_ff(m):
                            parser.error("Unrecognized motif file format (only MEME or JASPAR allowed)")
                            die(1)

                        if len(glob.glob(m)) <= 0:
                            parser.error('Cannot find motif file: ' + m)
                            die(1)
                    # end for
                # end if

                # check background file
                if args.bgfile != 'UNIF':
                    bgfile = args.bgfile  # we have a path to a bg file
                    if len(glob.glob(bgfile)) <= 0:
                        parser.error('Cannot find the given background file')
                        die(1)
                # end if

                # check pseudocount
                if args.pseudo <= 0:
                    parser.error('The pseudocount cannot be less than or equal 0')
                    die(1)

                # check threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error('The pvalue threshold must be between 0 and 1')
                    die(1)

                # check q-value flag
                if (not isinstance(args.no_qvalue, bool) or
                        (args.no_qvalue != False and args.no_qvalue != True)):
                    parser.error('The --qvalue parameter accepts only True or False as values')
                    die(1)

                # check no reverse flag
                if (not isinstance(args.no_reverse, bool) or
                        (args.no_reverse != False and args.no_reverse != True)):
                    parser.error('The --no-reverse parameter accepts only True or False as values')
                    die(1)

                # check text only flag
                if (not isinstance(args.text_only, bool) or
                        (args.text_only != False and args.text_only != True)):
                    parser.error('The --text-only parameter accepts only True or False values')
                    die(1)

                # out directory
                if args.out == 'grafimo_out':  # default option
                    # to make unique the output directory we add the PID
                    # to the name.
                    #
                    # This is useful when calling grafimo in different runs on the
                    # same machine.
                    args.out = ''.join([args.out, '_', str(os.getpid())])

                # check threshold on q-value flag
                if (not isinstance(args.qval_t, bool) or
                        (args.qval_t != False and args.qval_t != True)):
                    parser.error("The --qvalueT parameter accepts only True or False as values")
                    die(1)
                elif args.no_qvalue == True and args.qval_t == True:
                    parser.error("Cannot apply the threshold on q-values if you don't want them")
                    die(1)

                # check the number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("The number of region graphs to show must be positive")

                workflow = Findmotif(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print(''.join(["Arguments parsed in ", str(end_args_parse - start_args_parse), "s"]))
            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps = time.time()

        satisfied, deps_lack = check_deps()

        # (the die(1) calls that followed each raise could never run)
        if not satisfied and len(deps_lack) > 0:
            raise DependencyError("\n\nERROR: The following dependencies are not sastisfied: " +
                                  str(deps_lack) +
                                  "\nPlease, solve them before running GRAFIMO")
        elif not satisfied and len(deps_lack) <= 0:
            raise DependencyError("Some dependencies were found, but was not possible to track them."
                                  "\nBe sure they are available in system PATH")
        # end if

        if args.verbose and satisfied:
            end_deps = time.time()
            print("Dependencies correctly satisfied")
            print(''.join(["Dependencies checked in ", str(end_deps - start_deps), "s"]))

        #####################################################################
        # dependency check was ok, so we go to workflow selection:
        #   - creation of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   - scan of a precomputed VG or a set of precomputed VG
        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset of them
            buildvg(workflow)
        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)
        else:
            raise ValueError("Unknown arguments object type")
        # end if

        end = time.time()  # GRAFIMO execution finishes here

        print(''.join(["\nElapsed time: ", str(end - start), "s"]))

    except KeyboardInterrupt:
        sigint_handler()
def get_motif_pwm(motif_file: str, args_obj: Findmotif,
                  cores: int) -> List[Motif]:
    """Starting point for the construction of a Motif object.

    The motif PWM is read according to the file format, detected with
    ``isJaspar_ff`` / ``isMEME_ff`` (JASPAR is tried first), by calling
    ``build_motif_JASPAR`` or ``build_motif_MEME``. According to the
    original documentation the builders compute the motif scoring matrix
    and the corresponding P-value matrix and store them in Motif objects.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    args_obj : Findmotif
        container for the arguments needed by the builders; background,
        pseudocount, no-reverse and verbose settings are read through its
        getters
    cores : int
        number of cores to use during the computation (forwarded to the
        MEME builder only)

    Returns
    -------
    List[Motif]
        builder result, wrapped in a list when the builder did not already
        return one

    Raises
    ------
    FileNotFoundError
        ``motif_file`` is empty or None
    NotValidFFException
        the file is neither in MEME nor in JASPAR format
    """

    bgs: dict
    pseudo: float
    no_reverse: bool
    verbose: bool
    errmsg: str

    # get arguments required to process the motif
    bgs = args_obj.get_bgfile()
    pseudo = args_obj.get_pseudo()
    no_reverse = args_obj.get_no_reverse()
    verbose = args_obj.get_verbose()

    if not motif_file:
        errmsg = "\n\nERROR: the motif file is missing"
        raise FileNotFoundError(errmsg)

    # Single dispatch: each sniffer runs at most once and the fall-through
    # branch is the only place the "unsupported format" error is raised.
    if isJaspar_ff(motif_file):
        motif = build_motif_JASPAR(motif_file, bgs, pseudo, no_reverse,
                                   verbose)
    elif isMEME_ff(motif_file):
        motif = build_motif_MEME(motif_file, bgs, pseudo, no_reverse, cores,
                                 verbose)
    else:
        errmsg = "\n\nERROR: the motif file must be in MEME or JASPAR format"
        raise NotValidFFException(errmsg)
    # end if

    # callers expect a list even when a single motif was built
    if not isinstance(motif, list):
        motif = [motif]

    return motif
def build_motif_MEME(motif_file: str,
                     bg_file: str,
                     pseudocount: float,
                     no_reverse: bool,
                     cores: int,
                     verbose: bool
) -> List[Motif]:
    """Read the motifs stored in a MEME file and process each of them.

    The motifs are read with read_MEME_motif() and every one is passed to
    process_motif_for_logodds(). When the file holds at least `cores`
    motifs the processing is spread over a pool of `cores` worker
    processes, otherwise the motifs are processed one after the other.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in MEME format
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html)
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        forwarded to read_MEME_motif(); presumably True restricts the
        analysis to the forward strand -- confirm against read_MEME_motif()
    cores : int
        number of worker processes to use while processing the motifs
    verbose : bool
        print additional timing information

    Returns
    -------
    List[Motif]
        the processed motifs, in the order they were read

    Raises
    ------
    FileNotFoundError
        if `motif_file` is empty or None
    NotValidFFException
        if `motif_file` is not recognized as a MEME file
    """

    errmsg: str
    if not motif_file:
        errmsg = "\n\nERROR: the motif file is missing"
        raise FileNotFoundError(errmsg)

    if not isMEME_ff(motif_file):
        errmsg = "\n\nERROR: the given motif file is not in MEME format"
        raise NotValidFFException(errmsg)

    if verbose:
        start_rm_all: float = time.time()

    motif_lst: List[Motif] = read_MEME_motif(motif_file, bg_file,
                                             pseudocount, no_reverse,
                                             verbose)
    motif_num: int = len(motif_lst)

    if verbose:
        end_rm_all: float = time.time()
        msg: str = ''.join([
            "\nRead all motif contained in ", motif_file, " in ",
            str(end_rm_all - start_rm_all), "s"
        ])
        print(msg)
    # end if

    print("\nRead", motif_num, "motifs in", motif_file)
    print("\nProcessing motifs\n")

    # list of the fully processed motifs
    complete_motifs: List[Motif] = []

    if verbose:
        start_mp_all: float = time.time()

    if motif_num >= cores:  # worth to use multiprocessing
        # Ignore SIGINT while the pool is created and restore the previous
        # handler right after, so that Ctrl+C can be handled gracefully:
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = mp.Pool(processes=cores)  # use #cores processes
        signal.signal(signal.SIGINT, original_sigint_handler)

        try:
            res = pool.map_async(process_motif_for_logodds, motif_lst)

            # The total is captured before polling starts, so the final
            # progress update never reads an unset variable when the work
            # completes before the first poll. It is clamped to 1 to keep
            # the progress ratio well defined.
            tot: int = max(res._number_left, 1)

            while not res.ready():
                remaining: int = res._number_left
                printProgressBar((tot - remaining), tot, prefix='Progress:',
                                 suffix='Complete', length=50)
                time.sleep(2)
            # end while

            # when finished call printProgressBar() for the last time
            printProgressBar(tot, tot, prefix='Progress:',
                             suffix='Complete', length=50)

            # get() is given a timeout (in seconds) so that the wait does
            # not ignore signals
            complete_motifs += res.get(60 * 60 * 60)

        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()  # presumably terminates the program

        else:
            # all the results were collected: release the worker processes
            pool.close()
            pool.join()

            if verbose:
                end_mp_all: float = time.time()
                print("Processed motif(s) in %s in %.2fs" %
                      (motif_file, (end_mp_all - start_mp_all)))
            # end if

            return complete_motifs
        # end try

    else:
        # few motifs: the sequential execution is fine
        for m in motif_lst:
            complete_motifs.append(process_motif_for_logodds(m))

        if verbose:
            end_mp_all: float = time.time()
            print("Processed motif(s) in %s in %.2fs" %
                  (motif_file, (end_mp_all - start_mp_all)))
        # end if

        return complete_motifs
def _argument_error(parser: "GRAFIMOArgumentParser", message: str) -> None:
    """Report an invalid command line through the parser and stop."""
    parser.error(message)
    # parser.error() presumably exits on its own; die() guarantees that the
    # validation never goes on after an error has been reported
    die(1)


def _check_buildvg_args(parser: "GRAFIMOArgumentParser",
                        args: "argparse.Namespace"
) -> None:
    """Validate and normalize the arguments of the buildvg workflow."""

    # options that only make sense for findmotif must keep their defaults
    findmotif_option_given = (
        args.graph_genome_dir
        or args.graph_genome
        or args.bedfile
        or args.motif
        or args.bgfile != 'UNIF'
        or args.pseudo != 0.1
        or args.threshold != 1e-4
        or args.no_qvalue
        or args.no_reverse
        or args.text_only
        or args.qval_t
        or args.recomb
        or args.top_graphs != 0
    )
    if findmotif_option_given:
        _argument_error(parser, "Invalid arguments for grafimo buildvg")

    if not args.linear_genome:
        _argument_error(parser, "No reference genome given")

    if not args.vcf:
        _argument_error(parser, "No VCF file given")

    # check linear genome
    if args.linear_genome.split('.')[-1] not in ('fa', 'fasta'):
        _argument_error(
            parser,
            "The linear genome must be in FASTA format (FASTA and "
            "FA extensions allowed)")

    if len(glob.glob(args.linear_genome)) != 1:
        _argument_error(parser, 'Cannot find the given reference genome file')

    args.linear_genome = os.path.abspath(args.linear_genome)

    # check VCF --> the VCF must have been compressed with
    # bgzip (https://github.com/samtools/tabix).
    # The length test keeps a name without a '.vcf' component (e.g. "gz")
    # from raising IndexError instead of the error message below.
    vcf_parts = args.vcf.split('.')
    if (len(vcf_parts) < 2 or
            vcf_parts[-1] not in ('gz', 'zip') or
            vcf_parts[-2] != 'vcf'):
        _argument_error(
            parser,
            "Incorrect VCF file given: the VCF must be compressed "
            "(e.g. myvcf.vcf.gz)")

    if len(glob.glob(args.vcf)) <= 0:
        _argument_error(parser, 'Cannot find the given VCF file')

    args.vcf = os.path.abspath(args.vcf)

    # by default the built VGs will be stored in the current directory
    if args.out == "":  # general default value
        args.out = os.path.abspath("./")


def _check_findmotif_args(parser: "GRAFIMOArgumentParser",
                          args: "argparse.Namespace"
) -> None:
    """Validate and normalize the arguments of the findmotif workflow."""

    findmotif_err_msg = "Invalid arguments for grafimo findmotif"

    # options that only make sense for buildvg must keep their defaults
    if args.linear_genome or args.vcf or args.reindex:
        _argument_error(parser, findmotif_err_msg)

    if not args.graph_genome_dir and not args.graph_genome:
        _argument_error(
            parser,
            "No genome variation graph or directory containing them given")

    if not args.bedfile:
        _argument_error(parser, "No BED file given")

    if not args.motif:
        _argument_error(parser, "No motif file (MEME of JASPAR format) given")

    # only one between graph_genome and graph_genome_dir is allowed
    if args.graph_genome and args.graph_genome_dir:
        _argument_error(parser, findmotif_err_msg)

    # check graph_genome
    if args.graph_genome:
        if args.graph_genome.split('.')[-1] not in ('xg', 'vg'):
            _argument_error(
                parser,
                "Cannot use the given genome variation graph (only "
                "VG or XG format allowed)")

        if not os.path.isfile(args.graph_genome):
            _argument_error(
                parser, "Unable to find the given variation genome graph")

        # it is safer to use absolute path to avoid bugs
        args.graph_genome = os.path.abspath(args.graph_genome)
    # end if

    # check graph_genome_dir
    if args.graph_genome_dir:
        if not os.path.isdir(args.graph_genome_dir):
            _argument_error(
                parser,
                "Cannot find the given directory containing the "
                "genome variation graphs")

        if args.graph_genome_dir[-1] == '/':
            graph_genome_dir = args.graph_genome_dir
        else:
            graph_genome_dir = ''.join([args.graph_genome_dir, '/'])

        if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
            _argument_error(
                parser,
                ' '.join(['No XG genome variation graph found in',
                          graph_genome_dir]))

        args.graph_genome_dir = os.path.abspath(graph_genome_dir)
    # end if

    # check BED file
    if args.bedfile.split('.')[-1] != 'bed':
        _argument_error(parser, 'Incorrect BED file given')

    if len(glob.glob(args.bedfile)) <= 0:
        _argument_error(parser, 'Cannot find the given BED file')

    # check that every given motif has a known format and exists
    for m in args.motif:
        if not isMEME_ff(m) and not isJaspar_ff(m):
            _argument_error(
                parser,
                "Unrecognized motif file format (only MEME or JASPAR allowed)")

        if len(glob.glob(m)) <= 0:
            _argument_error(parser, 'Cannot find motif file: ' + m)
    # end for

    # check background file
    if args.bgfile != 'UNIF' and len(glob.glob(args.bgfile)) <= 0:
        _argument_error(parser, 'Cannot find the given background file')

    # check pseudocount
    if args.pseudo <= 0:
        _argument_error(
            parser, 'The pseudocount cannot be less than or equal 0')

    # check threshold
    if args.threshold <= 0 or args.threshold > 1:
        _argument_error(
            parser, 'The pvalue threshold must be between 0 and 1')

    # check q-value flag
    if not isinstance(args.no_qvalue, bool):
        _argument_error(
            parser,
            "The --qvalue parameter accepts only True or False as "
            "values")

    # check no reverse flag
    if not isinstance(args.no_reverse, bool):
        _argument_error(
            parser,
            "The --no-reverse parameter accepts only True or False "
            "as values")

    # check text only flag
    if not isinstance(args.text_only, bool):
        _argument_error(
            parser,
            "The --text-only parameter accepts only True or False "
            "values")

    # check recombinant flag
    if not isinstance(args.recomb, bool):
        _argument_error(
            parser,
            "The --recomb parameter accepts only True or False values")

    # out directory
    if args.out == '':  # default option
        args.out = DEFAULT_OUTDIR

    # check threshold on q-value flag
    if not isinstance(args.qval_t, bool):
        _argument_error(
            parser,
            "The --qvalueT parameter accepts only True or False as values")

    if args.no_qvalue and args.qval_t:
        _argument_error(
            parser,
            "Cannot apply the threshold on q-values if you don't "
            "want them")

    # check the number of graph regions to store as PNG images
    if args.top_graphs < 0:
        _argument_error(
            parser, "The number of region graphs to show must be positive")


def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """Entry point of GRAFIMO.

    Parses the command line, checks the consistency of the arguments for
    the chosen workflow, verifies the external dependencies with
    check_deps() and finally runs buildvg() or findmotif().

    Parameters
    ----------
    cmdLineargs : Optional[List[str]]
        command-line arguments; when None, sys.argv[1:] is used

    Raises
    ------
    DependencyError
        if check_deps() reports unsatisfied external dependencies
    ValueError
        if the workflow object is neither a BuildVG nor a Findmotif
    """

    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # take input args

        # no argument given
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(1)

        # the second argument must be buildvg or findmotif
        if cmdLineargs[0] not in ("-h", "--help", "--version",
                                  "buildvg", "findmotif"):
            _argument_error(
                parser,
                "The second argument must be one between 'buildvg' and 'findmotif'")

        args: argparse.Namespace = parser.parse_args(cmdLineargs)

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        ################################################################
        # check arguments consistency
        ################################################################

        if args.workflow not in ("buildvg", "findmotif"):
            _argument_error(
                parser,
                "Do not know what to do. Available options: create VGs "
                "with 'grafimo buildvg' or scan a precomputed genome "
                "variation graph with 'grafimo findmotif'")

        # cores (shared by the two workflows)
        if args.cores < 0:
            _argument_error(parser, "The number of cores cannot be negative")
        elif args.cores == 0 and args.graph_genome:
            # To query a whole genome graph, it is loaded into RAM. Such
            # graphs are usually very heavy, so it is safer to use 1 core
            # by default: otherwise the graph would be loaded #cores times.
            # If you want to use more cores, be sure your system can handle
            # the resulting amount of data.
            args.cores = 1
        elif args.cores == 0:
            # by default take all the available CPUs
            args.cores = mp.cpu_count()
        # end if

        # check verbose flag
        if not isinstance(args.verbose, bool):
            _argument_error(
                parser,
                'The --verbose parameter accepts only True or False values')

        # chromosomes check (shared by the two workflows)
        if len(args.chroms) == 0:
            args.chroms = ['ALL_CHROMS']

        # workflow-specific checks: args.workflow is one of the two values
        if args.workflow == "buildvg":
            _check_buildvg_args(parser, args)
            workflow = BuildVG(args)
        else:
            _check_findmotif_args(parser, args)
            workflow = Findmotif(args)
        # end if

        if args.verbose:
            end_args_parse: float = time.time()
            print("Arguments parsed in %.2fs" %
                  (end_args_parse - start_args_parse))
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps: float = time.time()

        satisfied: bool
        deps_lack: List[str]
        satisfied, deps_lack = check_deps()

        if not satisfied:
            if len(deps_lack) > 0:
                raise DependencyError(
                    "\n\nERROR: The following dependencies are not"
                    " sastisfied: " + str(deps_lack) +
                    "\nPlease, solve them before running GRAFIMO")

            raise DependencyError(
                "Some dependencies were found, but was not "
                "possible to track them.\n"
                "Be sure they are available in system PATH")
        # end if

        if args.verbose:
            end_deps: float = time.time()
            print("Dependencies correctly satisfied")
            print("Dependencies checked in %.2fs" % (end_deps - start_deps))

        ################################################################
        # dependency check was ok, so we go to workflow selection:
        #   * creation of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG

        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset
            # of them
            buildvg(workflow)
        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)
        else:
            raise ValueError("Unknown arguments object type")
        # end if

        end: float = time.time()  # GRAFIMO execution finishes here

        print("Elapsed time %.2fs" % (end - start))

    except KeyboardInterrupt:
        sigint_handler()
def build_motif_MEME(motif_file, bg_file, pseudocount, no_reverse, cores,
                     verbose):
    """
        Build the Motif objects starting from the data stored in a
        given MEME file.

        The motifs are read with read_MEME_motif() and each of them is
        passed to process_motif_for_logodds(). When the file contains at
        least `cores` motifs, the processing is spread over a pool of
        `cores` processes; otherwise the motifs are processed one after
        the other.
        ----
        Parameters:
            motif_file (str) : path to the motif file
            bg_file (str) : path to the background file
            pseudocount (float) : value to add to the motif counts
            no_reverse (bool) : if set to True, only data related to
                                forward strand will be used
            cores (int) : number of cores to use, during motif processing
            verbose (bool) : print additional timing information
        ----
        Returns:
            motif (list) : processed motifs, in the order they were read
        ----
        Raises:
            FileNotFoundError : if motif_file is empty or None
            NotValidFFException : if motif_file is not recognized as a
                                  MEME file
    """

    if not motif_file:
        raise FileNotFoundError("\n\nERROR: the motif file is missing")

    # check if the input is in MEME format
    if not isMEME_ff(motif_file):
        # if in other format we should not be here
        raise NotValidFFException(
            "\n\nERROR: the given motif file is not in MEME format")

    if verbose:
        start_rm_all = time.time()

    # read the motif file
    motif_lst = read_MEME_motif(motif_file, bg_file, pseudocount,
                                no_reverse, verbose)
    motif_num = len(motif_lst)

    if verbose:
        end_rm_all = time.time()
        msg = ''.join([
            "\nRead all motif contained in ", motif_file, " in ",
            str(end_rm_all - start_rm_all), "s"
        ])
        print(msg)
    # end if

    print("\nRead", motif_num, "motifs in", motif_file)
    print("\nProcessing motifs\n")

    # list of the fully processed motifs
    complete_motifs = []

    if verbose:
        start_mp_all = time.time()

    if motif_num >= cores:  # worth to use multiprocessing
        # Ignore SIGINT while the pool is created and restore the previous
        # handler right after, so that Ctrl+C can be handled gracefully:
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = mp.Pool(processes=cores)  # use #cores processes
        signal.signal(signal.SIGINT, original_sigint_handler)

        try:
            res = pool.map_async(process_motif_for_logodds, motif_lst)

            # The total is captured before polling starts, so the final
            # progress update never reads an unset variable when the work
            # completes before the first poll. It is clamped to 1 to keep
            # the progress ratio well defined.
            tot = max(res._number_left, 1)

            while not res.ready():
                remaining = res._number_left
                printProgressBar((tot - remaining), tot, prefix='Progress:',
                                 suffix='Complete', length=50)
                time.sleep(2)
            # end while

            # when finished call printProgressBar() for the last time
            printProgressBar(tot, tot, prefix='Progress:',
                             suffix='Complete', length=50)

            # get() is given a timeout (in seconds) so that the wait does
            # not ignore signals
            complete_motifs += res.get(60 * 60 * 60)

        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()  # presumably terminates the program

        else:
            # all the results were collected: release the worker processes
            pool.close()
            pool.join()

            if verbose:
                end_mp_all = time.time()
                msg = ''.join([
                    "Processed all motifs contained in ", motif_file, " in ",
                    str(end_mp_all - start_mp_all), "s"
                ])
                print(msg)
            # end if

            return complete_motifs
        # end try

    else:  # the sequential execution is fine
        # process each found motif
        for m in motif_lst:
            complete_motifs.append(process_motif_for_logodds(m))

        if verbose:
            end_mp_all = time.time()
            msg = ''.join([
                "Processed all motifs contained in ", motif_file, " in ",
                str(end_mp_all - start_mp_all), "s"
            ])
            print(msg)
        # end if

        return complete_motifs