Пример #1
0
def get_motif_pwm(motif_file, args_obj, cores):
    """
        Build the Motif object(s) described by a PWM file
        ----
        Parameters:
            motif_file (str) : motif file to process (MEME or JASPAR
                                format)
            args_obj (Findmotif): data-structure containing the
                                    parameters to scan a given
                                    VG or a set of VGs; background,
                                    pseudocount, no-reverse and verbose
                                    settings are read from it
            cores (int) : number of cores to use during motif
                            processing (forwarded to the MEME builder
                            only)
        ----
        Returns:
            motif (list) : list of processed motifs as Motif objects;
                            a non-list builder result is wrapped in a
                            one-element list
        ----
        Raises:
            FileNotFoundError : if motif_file is empty
            NotValidFFException : if motif_file is neither in MEME
                                    nor in JASPAR format
    """

    # get arguments required to process the motif
    bgs = args_obj.get_bgfile()
    pseudo = args_obj.get_pseudo()
    no_reverse = args_obj.get_no_reverse()
    verbose = args_obj.get_verbose()

    if not motif_file:
        raise FileNotFoundError("\n\nERROR: the motif file is missing")

    # Each format predicate is evaluated at most once (they inspect the
    # file); JASPAR keeps precedence over MEME when choosing the builder.
    if isJaspar_ff(motif_file):
        motif = build_motif_JASPAR(motif_file, bgs, pseudo, no_reverse,
                                   verbose)

    elif isMEME_ff(motif_file):
        motif = build_motif_MEME(motif_file, bgs, pseudo, no_reverse, cores,
                                 verbose)

    else:
        raise NotValidFFException(
            "\n\nERROR: the motif file must be in MEME or JASPAR format")
    # end if

    # callers always expect a list of motifs
    if not isinstance(motif, list):
        motif = [motif]

    return motif
Пример #2
0
def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """Command-line entry point.

    The command-line arguments are parsed and checked for consistency
    against the requested workflow ("buildvg" or "findmotif"), the external
    dependencies are verified, and finally the selected workflow is run.
    The elapsed time is printed at the end.

    ...

    Parameters
    ----------
    cmdLineargs : Optional[List[str]]
        arguments to parse; when None, ``sys.argv[1:]`` is used

    Returns
    -------
    None
    """

    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # get input args

        # no arguments given --> print help
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(2)

        # the second argument must be buildvg or findmotif
        if cmdLineargs[0] not in ("-h", "--help", "--version", "buildvg",
                                  "findmotif"):
            parser.error(
                "The second argument must be one between 'buildvg' and 'findmotif'"
            )
            die(1)

        args: argparse.Namespace = parser.parse_args(cmdLineargs)

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        #--------------------------------------------------------------#
        # check commandline arguments consistency
        #

        #---------------------- general options -----------------------#

        # workflow type
        if args.workflow not in ("buildvg", "findmotif"):
            parser.error("Unexpected workflow given. Available options:\n"
                         "\tbuildvg: construct VG from user data.\n"
                         "\tfindmotif: scan VG for DNA motif(s) occurrences")
            die(1)

        # cpu cores
        if args.cores < 0:
            parser.error("Negative number of CPU cores given")
        elif args.cores == 0 and args.graph_genome:
            # when whole genome variation graph is given, it is safer to
            # use 1 CPU core by default. This because of the space needed
            # to load the whole VG on RAM.
            #
            # CAVEAT: before requiring more CPU cores to be used, be sure
            # your system has enough memory
            args.cores = 1
        elif args.cores == 0:
            # default option -> use all available CPU cores
            args.cores = mp.cpu_count()
        else:  # args.cores > 0
            if args.cores > mp.cpu_count():
                parser.error("Too many CPU cores to use ({})".format(
                    args.cores))

        # verbosity
        if not isinstance(args.verbose, bool):
            parser.error(
                '\"--verbose\" does not accept any positional argument')

        # debugging
        if not isinstance(args.debug, bool):
            parser.error("\"--debug\" does not accept any positional argument")

        #---------------------- buildvg options -----------------------#

        buildvg_err_msg: str = "Unexpected arguments for \"grafimo buildvg\": \"{}\""

        if args.workflow == "buildvg":

            # options belonging to findmotif must not be given to buildvg
            if args.graph_genome_dir:
                parser.error(buildvg_err_msg.format("-d, --genome-graph-dir"))
                die(1)
            elif args.graph_genome:
                parser.error(buildvg_err_msg.format("-g, --genome-graph"))
                die(1)
            elif args.bedfile:
                parser.error(buildvg_err_msg.format("-b, --bedfile"))
                die(1)
            elif args.motif:
                parser.error(buildvg_err_msg.format("-m, --motif"))
                die(1)
            elif args.bgfile != UNIF:  # if default ignored
                parser.error(buildvg_err_msg.format("-k, --bgfile"))
                die(1)
            elif args.pseudo != 0.1:  # if default ignored
                parser.error(buildvg_err_msg.format("-p, --pseudo"))
                die(1)
            elif args.threshold != 1e-4:  # if default ignored
                parser.error(buildvg_err_msg.format("-t, --thresh"))
                die(1)
            elif args.no_qvalue:
                parser.error(buildvg_err_msg.format("-q, --no-qvalue"))
                die(1)
            elif args.no_reverse:
                parser.error(buildvg_err_msg.format("-r, --no-reverse"))
                die(1)
            elif args.text_only:
                parser.error(buildvg_err_msg.format("-f, --text-only"))
                die(1)
            elif args.chroms_find:
                parser.error(buildvg_err_msg.format("--chroms-find"))
                die(1)
            elif args.chroms_prefix_find:
                parser.error(buildvg_err_msg.format("--chroms-prefix-find"))
                die(1)
            elif args.chroms_namemap_find != NOMAP:  # if default ignored
                parser.error(buildvg_err_msg.format("--chroms-namemap-find"))
                die(1)
            elif args.qval_t:
                parser.error(buildvg_err_msg.format("--qvalueT"))
                die(1)
            elif args.recomb:
                parser.error(buildvg_err_msg.format("--recomb"))
                die(1)
            elif args.top_graphs != 0:  # if default ignored
                parser.error(buildvg_err_msg.format("--top-graphs"))
                die(1)
            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)
            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)
            else:  # arguments for buildvg are correct
                # reference genome
                if args.linear_genome.split('.')[-1] not in ('fa', 'fasta'):
                    parser.error(
                        "The reference genome file must be in FASTA format")
                    die(1)
                else:
                    if not os.path.isfile(args.linear_genome):
                        parser.error("Unable to find {}".format(
                            args.linear_genome))
                        die(1)
                    if os.stat(args.linear_genome).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(
                            args.linear_genome))
                        die(1)
                    args.linear_genome = os.path.abspath(args.linear_genome)
                # VCF --> the VCF file must have been compressed with
                # bgzip (https://github.com/samtools/tabix)
                #
                # Both suffixes are required: the name must end in ".vcf.gz".
                # Testing the full suffix also avoids indexing a dot-less
                # file name out of range.
                if not args.vcf.endswith(".vcf.gz"):
                    parser.error(
                        "Wrong VCF file given. The VCF file must have been "
                        "compressed with bgzip (e.g. myvcf.vcf.gz)")
                    die(1)
                else:
                    if not os.path.isfile(args.vcf):
                        parser.error('Unable to find {}'.format(args.vcf))
                        die(1)
                    if os.stat(args.vcf).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(args.vcf))
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # chromosome to construct VG
                if len(args.chroms_build) == 0:
                    args.chroms_build = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_build):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-build\""
                        )

                # chromosome name-map
                if args.chroms_namemap_build != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_build):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_build))
                if (args.chroms_prefix_build
                        and args.chroms_namemap_build != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-build\" and \"chroms-namemap-build\" "
                        "cannot used together. Choose one of those options")

                # if no out directory is specified the VGs are stored in
                # the current directory
                if args.out == "":
                    args.out = os.path.abspath("./")

                workflow = BuildVG(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        #---------------------- findmotif options -----------------------#

        findmotif_err_msg: str = "Unexpected arguments for \"grafimo findmotif\": \"{}\""

        if args.workflow == "findmotif":
            # options belonging to buildvg must not be given to findmotif
            if args.linear_genome:
                parser.error(findmotif_err_msg.format("-l, --linear-genome"))
                die(1)
            elif args.vcf:
                parser.error(findmotif_err_msg.format("-v, --vcf"))
                die(1)
            elif args.chroms_build:
                parser.error(findmotif_err_msg.format("--chroms-build"))
            elif args.chroms_prefix_build:
                parser.error(findmotif_err_msg.format("--chroms-prefix-build"))
            elif args.chroms_namemap_build != NOMAP:
                parser.error(
                    findmotif_err_msg.format("--chroms-namemap-build"))
            elif args.reindex:  # if default ignored
                parser.error(findmotif_err_msg.format("--reindex"))
                die(1)
            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error(
                    "No arguments given for both \"--genome-graph\" and \"--genome-graph-dir\""
                )
                die(1)
            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)
            elif not args.motif:
                parser.error("No motif PWM given")
                die(1)
            else:
                # only one between graph_genome and graph_genome_dir is allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error(
                        "Only one argument between \"--genome-graph\" and \"--genome-graph-dir\""
                        " can be used")
                    die(1)

                # genome graph
                if args.graph_genome:
                    if args.graph_genome.split('.')[-1] not in ("xg", "vg"):
                        parser.error(
                            "Unrecognized genome variation graph format. Only "
                            "VG and XG format are allowed")
                        die(1)
                    elif not os.path.isfile(args.graph_genome):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome = os.path.abspath(args.graph_genome)

                # genome graphs directory
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome_dir))
                        die(1)
                    if not glob(os.path.join(args.graph_genome_dir, "*.xg")):
                        parser.error(
                            "No genome variation graph found in {}".format(
                                args.graph_genome_dir))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome_dir = os.path.abspath(
                            args.graph_genome_dir)

                # BED file
                if args.bedfile:
                    if not isbed(args.bedfile, args.debug):
                        parser.error(
                            "The genomic coordinates must be given in UCSC BED files"
                        )
                        die(1)
                    else:
                        if not os.path.isfile(args.bedfile):
                            parser.error("Unable to locate {}".format(
                                args.bedfile))
                else:
                    parser.error("No BED file given")

                # motif pwm
                if not args.motif:
                    parser.error("No motif PWM given")

                else:
                    motifs: List[str] = args.motif
                    for m in motifs:
                        if not isMEME_ff(m, args.debug) and not isJaspar_ff(
                                m, args.debug):
                            parser.error(
                                "Unrecognized motif PWM file format. "
                                "{} does not follow the MEME or JASPAR format rules"
                                .format(m))
                            die(1)
                        if not os.path.isfile(m):
                            parser.error("Unable to locate {}".format(m))

                # background file
                if args.bgfile != UNIF:
                    if not os.path.isfile(args.bgfile):
                        parser.error("Unable to locate {}".format(args.bgfile))

                # pseudocount
                if args.pseudo <= 0:
                    parser.error(
                        "Pseudocount values must be > 0, got {}".format(
                            args.pseudo))
                    die(1)

                # statistical significance threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error(
                        "Motif statistical significance threshold must be between 0 and 1"
                    )
                    die(1)

                # q-value flag
                if not isinstance(args.no_qvalue, bool):
                    parser.error(
                        "\"--qvalue\" accepts only True or False values")
                    die(1)

                # no reverse flag
                if not isinstance(args.no_reverse, bool):
                    parser.error(
                        "\"--no-reverse\" accepts only True or False values")
                    die(1)

                # text only flag
                if not isinstance(args.text_only, bool):
                    parser.error(
                        "\"--text-only\" accepts only True or False values")
                    die(1)

                # chromosome to consider during VG scan
                if len(args.chroms_find) == 0:
                    args.chroms_find = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_find):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-find\""
                        )

                # chromosome name-map
                if args.chroms_namemap_find != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_find):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_find))
                if (args.chroms_prefix_find
                        and args.chroms_namemap_find != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-find\" and \"chroms-namemap-find\" "
                        "cannot used together. Choose one of those options")

                # recomb flag
                if not isinstance(args.recomb, bool):
                    parser.error(
                        "\"--recomb\" accepts only True or False values")
                    die(1)

                # out directory
                if args.out == "":  # default option
                    args.out = DEFAULT_OUTDIR
                    # NOTE(review): looks like a leftover debug print; kept
                    # so the program output is unchanged -- confirm
                    print(args.out)

                # threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    parser.error(
                        "\"--qvalueT\" accepts only True or False values")
                    die(1)
                elif args.no_qvalue == True and args.qval_t == True:
                    parser.error(
                        "Unable to apply statistical significance threshold on"
                        " q-values if you don't want them")
                    die(1)

                # number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("Negative number of regions to display")

                workflow = Findmotif(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            sys.stderr.write(
                "Checking GRAFIMO external dependencies {}\n".format(EXT_DEPS))
            start_deps: float = time.time()
        satisfied: bool
        deps_lack: List[str]
        satisfied, deps_lack = check_deps()
        if not satisfied and len(deps_lack) > 0:
            errmsg = "Some dependencies are not satisfied: {}.\nPlease solve them before running GRAFIMO.\n"
            exception_handler(DependencyError, errmsg.format(deps_lack),
                              args.debug)
        elif not satisfied and len(deps_lack) <= 0:
            errmsg = "Dependencies satisfied, but unable to recover them.\n Be sure they are in system PATH.\n"
            exception_handler(DependencyError, errmsg, args.debug)

        if args.verbose and satisfied:
            end_deps: float = time.time()
            print("Dependencies satisfied.")
            print("Dependencies checked in %.2fs." % (end_deps - start_deps))

        #---------------------------------------------------------------
        # dependency check was ok, so we go to workflow selection:
        #   * construction of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG
        if isinstance(workflow, BuildVG):
            buildvg(workflow, args.debug)
        elif isinstance(workflow, Findmotif):
            findmotif(workflow, args.debug)
        else:
            errmsg = "Expected BuildVG or Findmotif, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(workflow).__name__),
                              args.debug)

        end: float = time.time()  # GRAFIMO execution finishes here
        print("Elapsed time %.2fs." % (end - start))

    except KeyboardInterrupt:
        sigint_handler()
Пример #3
0
def get_motif_pwm(motif_file: str, args_obj: Findmotif, cores: int,
                  debug: bool) -> List[Motif]:
    """Build the Motif object(s) stored in a motif PWM file.

    The file is validated (type, existence, MEME/JASPAR format) and then
    handed to the parser matching its format. Whatever the parser returns
    is delivered to the caller as a list.

    ...

    Parameters
    ----------
    motif_file : str
        path to motif PWM file (MEME or JASPAR format)
    args_obj : Findmotif
        arguments container; its bgfile, pseudo, noreverse and verbose
        attributes are read
    cores : int
        CPU cores, forwarded to the MEME parser only
    debug : bool
        forwarded to the format checks, the parsers and the error handler

    Returns
    -------
    List[Motif]
        Motif objects
    """

    # processing options carried by the arguments container
    background = args_obj.bgfile
    pseudocount = args_obj.pseudo
    skip_reverse = args_obj.noreverse
    be_verbose = args_obj.verbose

    # ---- input sanity checks
    if not isinstance(motif_file, str):
        exception_handler(
            TypeError,
            "Expected str, got {}.\n".format(type(motif_file).__name__),
            debug)
    if not os.path.isfile(motif_file):
        exception_handler(FileNotFoundError,
                          "Unable to locate {}.\n".format(motif_file), debug)
    if not (isMEME_ff(motif_file, debug) or isJaspar_ff(motif_file, debug)):
        exception_handler(MotifFileFormatError,
                          "Motif PWM must be in MEME or JASPAR format.\n",
                          debug)

    # ---- dispatch on the file format (JASPAR is tested first)
    if isJaspar_ff(motif_file, debug):
        parsed = build_motif_JASPAR(motif_file, background, pseudocount,
                                    skip_reverse, be_verbose, debug)
    elif isMEME_ff(motif_file, debug):
        parsed = build_motif_MEME(motif_file, background, pseudocount,
                                  skip_reverse, cores, be_verbose, debug)
    else:
        exception_handler(MotifFileFormatError,
                          "Motif PWM must be in MEME or JASPAR format.\n",
                          debug)

    # downstream code works on lists only: wrap a lone result
    motifs = parsed if isinstance(parsed, list) else [parsed]
    assert isinstance(motifs, list)
    return motifs
Пример #4
0
def build_motif_MEME(motif_file: str, bg_file: str, pseudocount: float,
                     no_reverse: bool, cores: int, verbose: bool,
                     debug: bool) -> List[Motif]:
    """Read and process the motif PWMs stored in a MEME file.

    The motifs are read with read_MEME_motif() and each of them is then
    passed to process_motif_for_logodds(). When the file holds at least
    `cores` motifs the processing is distributed over a pool of `cores`
    worker processes and a progress bar is printed; otherwise the motifs
    are processed sequentially.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        forwarded to read_MEME_motif()
        NOTE(review): presumably True restricts the analysis to the
        forward strand -- confirm against read_MEME_motif()
    cores : int
        number of CPU cores (used when MEME file has more than one PWM)
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    List[Motif]
        processed Motif objects, one per PWM found in motif_file
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isMEME_ff(motif_file, debug):
        errmsg = "Required MEME motif PWM parsing, but {} is not in MEME format.\n"
        exception_handler(MotifFileFormatError, errmsg.format(motif_file),
                          debug)

    if verbose:
        start_rm_all: float = time.time()
    motif_lst: List[Motif] = read_MEME_motif(motif_file, bg_file, pseudocount,
                                             no_reverse, verbose, debug)
    motif_num: int = len(motif_lst)
    if verbose:
        end_rm_all: float = time.time()
        print("Read all motifs in %s in %.2fs." %
              (motif_file, (end_rm_all - start_rm_all)))
    print("\nRead {} motifs in {}".format(motif_num, motif_file))
    print("\nProcessing motifs\n")

    complete_motifs: List[Motif] = []  # fully processed motifs
    if verbose:
        start_mp_all: float = time.time()
    if motif_num >= cores:  # worth to use multiprocessing
        # Ignore SIGINT while the workers are spawned (so they inherit the
        # ignoring handler), then restore the previous handler in the parent
        # to exit gracefully on Ctrl+C.
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = mp.Pool(processes=cores)
        signal.signal(signal.SIGINT, original_sigint_handler)

        try:
            args = [(motif, debug) for motif in motif_lst]
            res = pool.starmap_async(process_motif_for_logodds, args)
            # The total must be known BEFORE the first readiness check:
            # fast jobs may already be done at the first poll. Clamp to 1
            # so the progress bar never receives a zero total.
            tot: int = max(res._number_left, 1)
            # ---- progress bar
            while not res.ready():
                remaining = res._number_left
                printProgressBar((tot - remaining),
                                 tot,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                time.sleep(1)
            # when finished call for the last time printProgressBar()
            printProgressBar(tot,
                             tot,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)
            complete_motifs += res.get(60 * 60 * 60)  # does not ignore signals
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            sigint_handler()
        except Exception:
            # do not leave worker processes behind when processing fails
            pool.terminate()
            pool.join()
            raise
        else:
            pool.close()
            pool.join()
            if verbose:
                end_mp_all: float = time.time()
                print("Processed motif(s) in %s in %.2fs" %
                      (motif_file, (end_mp_all - start_mp_all)))
            return complete_motifs
    else:
        for m in motif_lst:  # process each found motif
            complete_motifs.append(process_motif_for_logodds(m, debug))
        if verbose:
            end_mp_all: float = time.time()
            print("Processed motif(s) in %s in %.2fs" %
                  (motif_file, (end_mp_all - start_mp_all)))
        return complete_motifs
Пример #5
0
def main(cmdLineargs=None):
    """
        Main function of GRAFIMO.

        The command-line arguments are parsed and checked for
        consistency, the external dependencies are verified and then
        the requested workflow (buildvg or findmotif) is executed.

        ----
        Parameters:
            cmdLineargs (list) : command-line tokens to parse (program
                                    name excluded); when None,
                                    sys.argv[1:] is used
        ----
        Returns:
            None
        ----
        Raises:
            DependencyError : if check_deps() reports that the external
                                dependencies are not satisfied
            ValueError : if no known workflow object was built

    """

    try:
        # starting point of the execution time
        start = time.time()

        # read the command-line arguments
        parser = get_AP()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # take input args

        # the first token must be a workflow name or one of the global flags;
        # an empty argument list is rejected here as well, instead of failing
        # with an IndexError on cmdLineargs[0]
        allowed_first = ("-h", "--help", "--version", "buildvg", "findmotif")
        if not cmdLineargs or cmdLineargs[0] not in allowed_first:
            parser.error("The second argument must be one between 'buildvg' and 'findmotif'")
            die(1)

        args = parser.parse_args(cmdLineargs)  # parse args

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse = time.time()

        #####################################################################
        # check arguments consistency
        #####################################################################

        if args.workflow not in ("buildvg", "findmotif"):
            parser.error("Do not know what to do. Available options: create VGs with 'grafimo buildvg' or scan a "
                         "precomputed genome variation graph with 'grafimo findmotif'")
            die(1)

        # cores (shared by the two workflows)
        if args.cores < 0:
            parser.error("The number of cores cannot be negative")

        elif args.cores == 0 and args.graph_genome:
            # to query a whole genome graph it is loaded into RAM; since such
            # graphs are usually very heavy it is safer to use 1 core by
            # default, otherwise the graph would be loaded #cores times.
            # If you want to use more cores, be sure your system can handle
            # the resulting amount of data
            args.cores = 1

        elif args.cores == 0:
            args.cores = mp.cpu_count()  # by default take all the available CPUs
        # end if

        # check verbose flag (a bool is always either True or False, so the
        # isinstance test alone is sufficient)
        if not isinstance(args.verbose, bool):
            parser.error('The --verbose parameter accepts only True or False values')

        # chromosomes check (shared by the two workflows)
        for c in args.chroms:
            if c not in CHROMS_LIST:
                parser.error("Invalid chromosome")

        args.chroms = initialize_chroms_list(args.chroms)

        # workflow-specific checks; each validator returns the object that
        # carries the arguments of the corresponding workflow
        workflow = None
        if args.workflow == "buildvg":
            workflow = _check_buildvg_args(parser, args)

        elif args.workflow == "findmotif":
            workflow = _check_findmotif_args(parser, args)
        # end if

        if args.verbose:
            end_args_parse = time.time()
            print("Arguments parsed in " + str(end_args_parse - start_args_parse) + "s")

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps = time.time()

        satisfied, deps_lack = check_deps()

        if not satisfied:
            if len(deps_lack) > 0:
                raise DependencyError("\n\nERROR: The following dependencies are not sastisfied: " +
                                      str(deps_lack) +
                                      "\nPlease, solve them before running GRAFIMO")

            raise DependencyError("Some dependencies were found, but was not possible to track them."
                                  "\nBe sure they are available in system PATH")
        # end if

        if args.verbose:
            end_deps = time.time()
            print("Dependencies correctly satisfied")
            print("Dependencies checked in " + str(end_deps - start_deps) + "s")

        #####################################################################

        # dependency check was ok, so we go to workflow selection:
        #   - creation of the genome variation graph for each chromosome
        #     or a user defined subset of them
        #   - scan of a precomputed VG or a set of precomputed VGs

        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset of them
            buildvg(workflow)

        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)

        else:
            raise ValueError("Unknown arguments object type")
        # end if

        end = time.time()  # GRAFIMO execution finishes here

        print("\nElapsed time: " + str(end - start) + "s")

    except KeyboardInterrupt:
        sigint_handler()


def _check_buildvg_args(parser, args):
    """
        Validate the arguments of the buildvg workflow and wrap them
        in a BuildVG object.

        NOTE(review): the guard clauses below assume that
        parser.error()/die() terminate the process, as the original
        flow of main() already relied on -- confirm against get_AP()
        and die().
        ----
        Parameters:
            parser : parser returned by get_AP(), used to report errors
            args : parsed arguments; linear_genome, vcf and out are
                    normalized in place
        ----
        Returns:
            BuildVG object built from the validated arguments
    """

    # options belonging to findmotif must not be given (or must keep their
    # default value) when building the VGs
    findmotif_only_given = any([
        args.graph_genome_dir,
        args.graph_genome,
        args.bedfile,
        args.motif,
        args.bgfile != 'UNIF',
        args.pseudo != 0.1,
        args.threshold != 1e-4,
        args.no_qvalue,
        args.no_reverse,
        args.text_only,
        args.qval_t,
        args.top_graphs != 0,
    ])

    if findmotif_only_given:
        parser.error("Invalid arguments for grafimo buildvg")
        die(1)

    if not args.linear_genome:
        parser.error("No reference genome given")
        die(1)

    if not args.vcf:
        parser.error("No VCF file given")
        die(1)

    # check linear genome
    if args.linear_genome.split('.')[-1] not in ('fa', 'fasta'):
        parser.error('The linear genome must be in FASTA format (FASTA and FA extensions allowed)')
        die(1)

    if len(glob.glob(args.linear_genome)) != 1:
        parser.error('Cannot find the given reference genome file')
        die(1)

    args.linear_genome = os.path.abspath(args.linear_genome)

    # check VCF: only compressed VCF files are allowed (*.vcf.gz, *.vcf.zip);
    # the length test avoids an IndexError on names without any dot
    vcf_parts = args.vcf.split('.')
    if (len(vcf_parts) < 2 or vcf_parts[-1] not in ('gz', 'zip')
            or vcf_parts[-2] != 'vcf'):
        parser.error('Incorrect VCF file given: the VCF must be compressed (e.g. myvcf.vcf.gz)')
        die(1)

    if len(glob.glob(args.vcf)) <= 0:
        parser.error('Cannot find the given VCF file')
        die(1)

    args.vcf = os.path.abspath(args.vcf)

    # by default the built VGs will be stored in the current directory
    if args.out == "grafimo_out":  # general default value
        args.out = os.path.abspath("./")

    return BuildVG(args)


def _check_findmotif_args(parser, args):
    """
        Validate the arguments of the findmotif workflow and wrap them
        in a Findmotif object.

        NOTE(review): the guard clauses below assume that
        parser.error()/die() terminate the process, as the original
        flow of main() already relied on -- confirm against get_AP()
        and die().
        ----
        Parameters:
            parser : parser returned by get_AP(), used to report errors
            args : parsed arguments; graph_genome, graph_genome_dir and
                    out are normalized in place
        ----
        Returns:
            Findmotif object built from the validated arguments
    """

    findmotif_err_msg = "Invalid arguments for grafimo findmotif"

    # options belonging to buildvg must not be given
    if args.linear_genome or args.vcf:
        parser.error(findmotif_err_msg)
        die(1)

    if not args.graph_genome_dir and not args.graph_genome:
        parser.error("No genome variation graph or directory containing them given")
        die(1)

    if not args.bedfile:
        parser.error("No BED file given")
        die(1)

    if not args.motif:
        parser.error("No motif file (MEME of JASPAR format) given")
        die(1)

    # only one between graph_genome and graph_genome_dir allowed
    if args.graph_genome and args.graph_genome_dir:
        parser.error(findmotif_err_msg)
        die(1)

    # check graph_genome
    if args.graph_genome:
        if args.graph_genome.split('.')[-1] not in ('xg', 'vg'):
            parser.error("Cannot use the given genome variation graph (only VG or XG format allowed)")
            die(1)

        elif not os.path.isfile(args.graph_genome):
            parser.error("Unable to find the given variation genome graph")
            die(1)

        else:
            # safer to use absolute path
            args.graph_genome = os.path.abspath(args.graph_genome)
        # end if
    # end if

    # check graph_genome_dir
    if args.graph_genome_dir:
        if not os.path.isdir(args.graph_genome_dir):
            parser.error("Cannot find the given directory containing the genome variation graphs")
            die(1)

        # make sure the directory name ends with '/' before globbing in it
        if args.graph_genome_dir[-1] == '/':
            graph_genome_dir = args.graph_genome_dir
        else:
            graph_genome_dir = ''.join([args.graph_genome_dir, '/'])
        # end if

        if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
            parser.error(' '.join(['No XG genome variation graph found in', graph_genome_dir]))
            die(1)

        else:
            args.graph_genome_dir = os.path.abspath(graph_genome_dir)
        # end if
    # end if

    # check BED file (its presence was already verified above)
    if args.bedfile.split('.')[-1] != 'bed':
        parser.error('Incorrect BED file given')
        die(1)

    elif len(glob.glob(args.bedfile)) <= 0:
        parser.error('Cannot find the given BED file')
    # end if

    # check the motif files (their presence was already verified above)
    for m in args.motif:
        if not isMEME_ff(m) and not isJaspar_ff(m):
            parser.error("Unrecognized motif file format (only MEME or JASPAR allowed)")
            die(1)

        if len(glob.glob(m)) <= 0:
            parser.error('Cannot find motif file: ' + m)
            die(1)
    # end for

    # check background file ('UNIF' is the default, any other value is a path)
    if args.bgfile != 'UNIF':
        if len(glob.glob(args.bgfile)) <= 0:
            parser.error('Cannot find the given background file')
            die(1)
    # end if

    # check pseudocount
    if args.pseudo <= 0:
        parser.error('The pseudocount cannot be less than or equal 0')
        die(1)

    # check threshold
    if args.threshold <= 0 or args.threshold > 1:
        parser.error('The pvalue threshold must be between 0 and 1')
        die(1)

    # check q-value flag
    if not isinstance(args.no_qvalue, bool):
        parser.error('The --qvalue parameter accepts only True or False as values')
        die(1)

    # check no reverse flag
    if not isinstance(args.no_reverse, bool):
        parser.error('The --no-reverse parameter accepts only True or False as values')
        die(1)

    # check text only flag
    if not isinstance(args.text_only, bool):
        parser.error('The --text-only parameter accepts only True or False values')
        die(1)

    # out directory
    if args.out == 'grafimo_out':  # default option
        # to make the output directory unique we add the PID to the name.
        #
        # This is useful when calling grafimo in different runs on the
        # same machine.
        args.out = ''.join([args.out, '_', str(os.getpid())])

    # check threshold on q-value flag
    if not isinstance(args.qval_t, bool):
        parser.error("The --qvalueT parameter accepts only True or False as values")
        die(1)

    elif args.no_qvalue and args.qval_t:
        parser.error("Cannot apply the threshold on q-values if you don't want them")
        die(1)

    # check the number of graph regions to store as PNG images
    if args.top_graphs < 0:
        parser.error("The number of region graphs to show must be positive")

    return Findmotif(args)
Пример #6
0
def get_motif_pwm(motif_file: str, args_obj: Findmotif,
                  cores: int) -> List[Motif]:
    """Build the Motif object(s) described by a motif PWM file.

    The file format is detected with isJaspar_ff() / isMEME_ff() and
    the construction is delegated to build_motif_JASPAR() or
    build_motif_MEME() respectively. Whatever the builder returns is
    handed back as a list.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    args_obj : Findmotif
        container from which the background, the pseudocount, the
        no-reverse flag and the verbose flag are read
    cores : int
        number of cores to use during the computation (forwarded only
        to the MEME builder)

    Returns
    -------
    List[Motif]
        the processed motif(s); a single result is wrapped in a list

    Raises
    ------
    FileNotFoundError
        if motif_file is empty
    NotValidFFException
        if the file is neither in MEME nor in JASPAR format
    """

    # collect from the container everything the builders need
    background = args_obj.get_bgfile()
    pseudocount = args_obj.get_pseudo()
    skip_reverse = args_obj.get_no_reverse()
    be_verbose = args_obj.get_verbose()

    if not motif_file:
        raise FileNotFoundError("\n\nERROR: the motif file is missing")

    if not (isMEME_ff(motif_file) or isJaspar_ff(motif_file)):
        raise NotValidFFException(
            "\n\nERROR: the motif file must be in MEME or JASPAR format")

    # dispatch on the detected file format
    if isJaspar_ff(motif_file):
        built = build_motif_JASPAR(motif_file, background, pseudocount,
                                   skip_reverse, be_verbose)

    elif isMEME_ff(motif_file):
        built = build_motif_MEME(motif_file, background, pseudocount,
                                 skip_reverse, cores, be_verbose)

    else:
        raise NotValidFFException(' '.join(
            ["\n\nERROR: do not know what to do with file", motif_file]))
    # end if

    # callers always receive a list, even for a single motif
    return built if isinstance(built, list) else [built]
Пример #7
0
def build_motif_MEME(motif_file: str, bg_file: str, pseudocount: float,
                     no_reverse: bool, cores: int,
                     verbose: bool) -> List[Motif]:
    """Read the motifs stored in a MEME file and process each of them.

    The motifs are read with read_MEME_motif() and every one of them is
    passed to process_motif_for_logodds(). When the number of motifs is
    at least `cores`, the processing is distributed over a pool of
    `cores` worker processes and a progress bar is printed; otherwise
    the motifs are processed sequentially.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in MEME format
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        forwarded unchanged to read_MEME_motif()
        NOTE(review): the name suggests that True restricts the scan to
        the forward strand -- confirm against read_MEME_motif()
    cores : int
        number of worker processes to use while processing the motifs
    verbose : bool
        print additional information (timings)

    Returns
    -------
    List[Motif]
        the processed motifs read from motif_file

    Raises
    ------
    FileNotFoundError
        if motif_file is empty
    NotValidFFException
        if motif_file is not in MEME format
    """

    if not motif_file:
        raise FileNotFoundError("\n\nERROR: the motif file is missing")

    if not isMEME_ff(motif_file):
        raise NotValidFFException(
            "\n\nERROR: the given motif file is not in MEME format")

    if verbose:
        start_rm_all: float = time.time()

    motif_lst: List[Motif] = read_MEME_motif(motif_file, bg_file, pseudocount,
                                             no_reverse, verbose)
    motif_num: int = len(motif_lst)

    if verbose:
        end_rm_all: float = time.time()
        msg: str = ''.join([
            "\nRead all motif contained in ", motif_file, " in ",
            str(end_rm_all - start_rm_all), "s"
        ])
        print(msg)
    # end if

    print("\nRead", motif_num, "motifs in", motif_file)
    print("\nProcessing motifs\n")

    # list of the fully processed motifs
    complete_motifs: List[Motif] = []

    if verbose:
        start_mp_all: float = time.time()

    # process each found motif
    if motif_num >= cores:  # worth to use multiprocessing

        # SIGINT is ignored while the pool is created and the previous
        # handler is restored right after, so that Ctrl+C is handled by the
        # parent process only
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = mp.Pool(processes=cores)  # use #cores processes
        signal.signal(signal.SIGINT, original_sigint_handler)

        try:
            res = pool.map_async(process_motif_for_logodds, motif_lst)

            # number of pending chunks, used as the progress bar total.
            # It is read before polling so that it is always defined, even
            # when the job completes before the first check (it was left
            # unbound in that case); it is clamped to 1 so the total handed
            # to printProgressBar() is never 0.
            # NOTE: _number_left is a private attribute of the async result
            tot = max(res._number_left, 1)

            while not res.ready():
                remaining = res._number_left
                printProgressBar((tot - remaining),
                                 tot,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                time.sleep(2)
            # end while

            # when finished call printProgressBar() for the last time
            printProgressBar(tot,
                             tot,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

            # get() with a timeout does not ignore signals
            complete_motifs += res.get(60 * 60 * 60)

        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()

        else:
            pool.close()

            if verbose:
                end_mp_all: float = time.time()
                print("Processed motif(s) in %s in %.2fs" %
                      (motif_file, (end_mp_all - start_mp_all)))
            # end if

            return complete_motifs
        # end try

    else:

        # too few motifs to be worth a pool: process them one by one
        for m in motif_lst:
            complete_motifs.append(process_motif_for_logodds(m))

        if verbose:
            end_mp_all: float = time.time()
            print("Processed motif(s) in %s in %.2fs" %
                  (motif_file, (end_mp_all - start_mp_all)))
        # end if

        return complete_motifs
Пример #8
0
def main(cmdLineargs: Optional[List[str]] = None) -> None :

    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()

        if cmdLineargs is None:
            cmdLineargs: List[str] = sys.argv[1:]  # take input args

        # no argument given
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(1)

        # the second argument must be buildvg or findmotif
        if ((cmdLineargs[0] != "-h" and cmdLineargs[0] != "--help") and
                cmdLineargs[0] != "--version" and
                (cmdLineargs[0] != "buildvg" and cmdLineargs[0] != "findmotif")):
            parser.error(
                "The second argument must be one between 'buildvg' and 'findmotif'")
            die(1)

        args: argparse.Namespace = parser.parse_args(cmdLineargs) 

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        ################################################################
        # check arguments consistency
        ################################################################

        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Do not know what to do. Available options: create VGs "
                         "with 'grafimo buildvg' or scan a precomputed genome "
                         "variation graph with 'grafimo findmotif'")
            die(1)

        # cores (shared by the two workflows)
        if args.cores < 0:
            parser.error("The number of cores cannot be negative")

        elif args.cores == 0 and args.graph_genome:
            # to query a whole genome graph is loaded into RAM, since 
            # usually they are very heavy in terms of bytes is safer to 
            # use 1 thread by default, otherwise it would be loaded 
            # #cores times. If you want use more cores, be sure your 
            # system can handle the resulting amount of data
            args.cores = 1  

        elif args.cores == 0:
            # by default take all the available CPUs
            args.cores = mp.cpu_count() 
        # end if

        # check verbose flag
        if (not isinstance(args.verbose, bool) or
                (args.verbose != False and args.verbose != True)):
            parser.error(
                'The --verbose parameter accepts only True or False values')

        # chromosomes check (shared by the two workflows)
        if len(args.chroms) == 0:
            args.chroms = ['ALL_CHROMS']

        buildvg_err_msg = "Invalid arguments for grafimo buildvg"

        # checks for buildvg workflow
        if args.workflow == "buildvg":

            if args.graph_genome_dir:
                parser.error(buildvg_err_msg)
                die(1)

            elif args.graph_genome:
                parser.error(buildvg_err_msg)
                die(1)

            elif args.bedfile:
                parser.error(buildvg_err_msg)
                die(1)

            elif args.motif:
                parser.error(buildvg_err_msg)
                die(1)

            elif args.bgfile != 'UNIF':  # if default ignored
                parser.error(buildvg_err_msg)
                die(1)

            elif args.pseudo != 0.1:  # if default ignored
                parser.error(buildvg_err_msg)
                die(1)

            elif args.threshold != 1e-4:  # if default ignored"
                parser.error(buildvg_err_msg)
                die(1)

            elif args.no_qvalue:
                parser.error(buildvg_err_msg)
                die(1)

            elif args.no_reverse:
                parser.error(buildvg_err_msg)
                die(1)

            elif args.text_only:
                parser.error(buildvg_err_msg)
                die(1)

            elif args.qval_t:
                parser.error(buildvg_err_msg)
                die(1)

            elif args.recomb:
                parser.error(buildvg_err_msg)
                die(1)

            elif args.top_graphs != 0:  # if default ignored
                parser.error(buildvg_err_msg)
                die(1)

            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)

            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)

            else:
                # check linear genome
                if (args.linear_genome.split('.')[-1] != 'fa' and
                        args.linear_genome.split('.')[-1] != 'fasta'):
                    parser.error(
                        "The linear genome must be in FASTA format (FASTA and "
                        "FA extensions allowed)")
                    die(1)

                else:
                    if len(glob.glob(args.linear_genome)) != 1:
                        parser.error(
                            'Cannot find the given reference genome file')
                        die(1)

                    args.linear_genome = os.path.abspath(args.linear_genome)
                # end if

                # check VCF --> the VCF must have been compressed with
                # bgzip (https://github.com/samtools/tabix)
                if ((args.vcf.split('.')[-1] != 'gz' 
                        and args.vcf.split('.')[-1] != 'zip')
                        or args.vcf.split('.')[-2] != 'vcf'):  
                    parser.error(
                        "Incorrect VCF file given: the VCF must be compressed "
                        "(e.g. myvcf.vcf.gz)")
                    die(1)

                else:
                    if len(glob.glob(args.vcf)) <= 0:
                        parser.error('Cannot find the given VCF file')
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # by deafult the built VGs will be stored in the current 
                # directory
                if args.out == "":  # general default value
                    args.out = os.path.abspath("./")

                workflow: BuildVG = BuildVG(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs" % 
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        findmotif_err_msg = "Invalid arguments for grafimo findmotif"
        
        # checks for findmotif workflow
        if args.workflow == "findmotif":
            if args.linear_genome:
                parser.error(findmotif_err_msg)
                die(1)

            elif args.vcf:
                parser.error(findmotif_err_msg)
                die(1)

            elif args.reindex:  # if default value is ignored
                parser.error(findmotif_err_msg)
                die(1)

            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error(
                    "No genome variation graph or directory containing them given")
                die(1)

            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)

            elif not args.motif:
                parser.error("No motif file (MEME of JASPAR format) given")
                die(1)

            else:

                # only one between graph_genome and graph_genome_dir 
                # are allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error("Invalid arguments for grafimo buildvg")
                    die(1)

                # check graph_genome
                if args.graph_genome:
                    if (args.graph_genome.split('.')[-1] != 'xg' and
                            args.graph_genome.split('.')[-1] != 'vg'):
                        parser.error(
                            "Cannot use the given genome variation graph (only "
                            "VG or XG format allowed)")
                        die(1)

                    elif not os.path.isfile(args.graph_genome):
                        parser.error(
                            "Unable to find the given variation genome graph")
                        die(1)

                    else:
                        # it is safer to use absolute path to avoid bugs
                        graph_genome: str = os.path.abspath(args.graph_genome)  
                        args.graph_genome = graph_genome
                    # end if
                # end if

                # check graph_genome_dir
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error(
                            "Cannot find the given directory containing the "
                            "genome variation graphs")
                        die(1)

                    if args.graph_genome_dir[-1] == '/':
                        graph_genome_dir = args.graph_genome_dir

                    else:
                        graph_genome_dir = ''.join([args.graph_genome_dir, '/'])
                    # end if

                    if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
                        parser.error(
                            ' '.join(['No XG genome variation graph found in', 
                                      graph_genome_dir]))
                        die(1)

                    else:
                        graph_genome_dir: str = os.path.abspath(graph_genome_dir)
                        args.graph_genome_dir = graph_genome_dir
                    # end if
                # end if

                # check BED file
                if args.bedfile:
                    if args.bedfile.split('.')[-1] != 'bed':
                        parser.error('Incorrect BED file given')
                        die(1)

                    else:
                        bedfile: str = args.bedfile

                        if len(glob.glob(bedfile)) <= 0:
                            parser.error('Cannot find the given BED file')
                    # end if

                else:
                    parser.error('No BED file given')
                # end if

                # check motif file
                if not args.motif:
                    parser.error('No motif given')

                else:
                    motifs: List[str] = args.motif

                    # check if the given motifs exist
                    for m in motifs:
                        if not isMEME_ff(m) and not isJaspar_ff(m):
                            parser.error("Unrecognized motif file format (only MEME or JASPAR allowed)")
                            die(1)

                        if len(glob.glob(m)) <= 0:
                            parser.error('Cannot find motif file: ' + m)
                            die(1)
                    # end for
                # end if

                # check background file
                if args.bgfile != 'UNIF':
                    bgfile: str = args.bgfile

                    if len(glob.glob(bgfile)) <= 0:
                        parser.error('Cannot find the given background file')
                        die(1)
                # end if

                # check pseudocount
                if args.pseudo <= 0:
                    parser.error(
                        'The pseudocount cannot be less than or equal 0')
                    die(1)

                # check threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error('The pvalue threshold must be between 0 and 1')
                    die(1)

                # check q-value flag
                if (not isinstance(args.no_qvalue, bool) or
                        (args.no_qvalue != False and args.no_qvalue != True)):
                    parser.error(
                        "The --qvalue parameter accepts only True or False as "
                        "values")
                    die(1)

                # check no reverse flag
                if (not isinstance(args.no_reverse, bool) or
                        (args.no_reverse != False and args.no_reverse != True)):
                    parser.error(
                        "The --no-reverse parameter accepts only True or False "
                        "as values")
                    die(1)

                # check text only flag
                if (not isinstance(args.text_only, bool) or
                        (args.text_only != False and args.text_only != True)):
                    parser.error(
                        "The --text-only parameter accepts only True or False "
                        "values")
                    die(1)

                # check recombinant flag
                if (not isinstance(args.recomb, bool) or
                        (args.recomb != False and args.recomb != True)):
                    parser.error(
                        "The --recomb parameter accepts only True or False values")
                    die(1)

                # out directory
                if args.out == '':  # default option
                    args.out = DEFAULT_OUTDIR 
                    
                # check threshold on q-value flag
                if (not isinstance(args.qval_t, bool) or
                        (args.qval_t != False and args.qval_t != True)):
                    parser.error
                    ("The --qvalueT parameter accepts only True or False as values")
                    die(1)

                elif args.no_qvalue == True and args.qval_t == True:
                    parser.error(
                        "Cannot apply the threshold on q-values if you don't "
                        "want them")
                    die(1)

                # check the number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error(
                        "The number of region graphs to show must be positive")

                workflow: Findmotif = Findmotif(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs" % 
                          (end_args_parse - start_args_parse))

            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps: float = time.time()

        satisfied: bool 
        deps_lack: List[str] 
        
        satisfied, deps_lack = check_deps()

        if not satisfied and len(deps_lack) > 0:
            raise DependencyError("\n\nERROR: The following dependencies are not" 
                                  " sastisfied: " + str(deps_lack) +
                                  "\nPlease, solve them before running GRAFIMO")

        elif not satisfied and len(deps_lack) <= 0:
            raise DependencyError("Some dependencies were found, but was not "
                                  "possible to track them.\n" 
                                  "Be sure they are available in system PATH")
        # end if

        if args.verbose and satisfied:
            end_deps: float = time.time()
            print("Dependencies correctly satisfied")
            print("Dependencies checked in %.2fs" % (end_deps - start_deps))

        ################################################################

        # dependency check was ok, so we go to workflow selection:
        #   * creation of the genome variation graph for 
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG

        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset 
            # of them
            buildvg(workflow)

        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)

        else:
            raise ValueError("Unknown arguments object type")
        # end if

        end: float = time.time()  # GRAFIMO execution finishes here

        print("Elapsed time %.2fs" % (end - start))

    except KeyboardInterrupt:
        sigint_handler()

    finally:
        pass
Пример #9
0
def build_motif_MEME(motif_file, bg_file, pseudocount, no_reverse, cores,
                     verbose):
    """
        Build the Motif objects starting from the data
        stored in a given MEME file.

        The motifs are read with read_MEME_motif() and each of them
        is then handed to process_motif_for_logodds(). When the file
        holds at least as many motifs as the requested cores, the
        processing is spread over a pool of worker processes,
        otherwise the motifs are processed sequentially.
        ----
        Parameters:
            motif_file (str) : path to the motif file
            bg_file (str) : path to the background file
            pseudocount (float) : value to add to the motif counts
            no_reverse (bool) : if set to True, only data related to
                                forward strand will be used
            cores (int) : number of cores to use, during motif processing
            verbose (bool) : if set to True, print how long reading and
                             processing the motifs took
        ----
        Returns:
            complete_motifs (list) : values returned by
                                     process_motif_for_logodds(), one
                                     for each motif read from motif_file
        ----
        Raises:
            FileNotFoundError : if motif_file is empty or None
            NotValidFFException : if motif_file is not in MEME format
    """

    if not motif_file:
        raise FileNotFoundError("\n\nERROR: the motif file is missing")

    # check if the input is in MEME format
    if not isMEME_ff(motif_file):
        # if in other format we should not be here
        raise NotValidFFException(
            "\n\nERROR: the given motif file is not in MEME format")

    if verbose:
        start_rm_all = time.time()

    # read the motif file
    motif_lst = read_MEME_motif(motif_file, bg_file, pseudocount, no_reverse,
                                verbose)
    motif_num = len(motif_lst)

    if verbose:
        end_rm_all = time.time()
        msg = ''.join([
            "\nRead all motif contained in ", motif_file, " in ",
            str(end_rm_all - start_rm_all), "s"
        ])
        print(msg)
    # end if

    print("\nRead", motif_num, "motifs in", motif_file)
    print("\nProcessing motifs\n")

    # list of the fully processed motifs
    complete_motifs = []

    if verbose:
        start_mp_all = time.time()

    # process each found motif
    if motif_num >= cores:  # worth to use multiprocessing

        # SIGINT is ignored while the pool is created and the previous
        # handler is put back right after, so that Ctrl+C is handled by
        # this process only
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            pool = mp.Pool(processes=cores)  # use #cores processes
        finally:
            # restore the handler even if the pool could not be created
            signal.signal(signal.SIGINT, original_sigint_handler)

        try:
            res = pool.map_async(process_motif_for_logodds, motif_lst)

            # total amount of work, sampled before polling starts so
            # that it is always defined, even when the pool is already
            # done at the first check; _number_left is a private
            # attribute of the map result (pending chunks in CPython),
            # clamped to 1 to keep the progress bar total non-zero
            tot = max(res._number_left, 1)

            while not res.ready():
                printProgressBar((tot - res._number_left),
                                 tot,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                # wait at most 2s, but resume as soon as the work is done
                res.wait(2)
            # end while

            # when finished call for the last time printProgressBar()
            printProgressBar(tot,
                             tot,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

            complete_motifs += res.get(60 * 60 * 60)  # does not ignore signals

        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            # NOTE(review): sigint_handler() presumably ends the
            # program; if it returns, this function returns None
            sigint_handler()

        except Exception:
            # do not leave worker processes behind on failure
            pool.terminate()
            pool.join()
            raise

        else:
            pool.close()
            pool.join()  # wait for the workers to exit

            if verbose:
                end_mp_all = time.time()
                msg = ''.join([
                    "Processed all motifs contained in ", motif_file, " in ",
                    str(end_mp_all - start_mp_all), "s"
                ])
                print(msg)
            # end if

            return complete_motifs
        # end try

    else:  # the sequential execution is fine

        # process each found motif
        complete_motifs = [process_motif_for_logodds(m) for m in motif_lst]

        if verbose:
            end_mp_all = time.time()
            msg = ''.join([
                "Processed all motifs contained in ", motif_file, " in ",
                str(end_mp_all - start_mp_all), "s"
            ])
            print(msg)
        # end if

        return complete_motifs