예제 #1
0
def get_chromlist(ref_genome: str, debug: bool) -> List[str]:
    """Collect the sequence names declared in a reference genome FASTA file.

    Every line starting with '>' is treated as a sequence header. The
    sequence name is the first whitespace-delimited token of the header
    with the leading '>' removed (e.g. '>chr1 some description' yields
    'chr1'). All other lines are ignored.

    Parameters
    ----------
    ref_genome : str
        path to the reference genome FASTA file
    debug : bool
        forwarded to ``exception_handler`` when the file cannot be read

    Returns
    -------
    list
        sequence names in the order they appear in the file; an empty
        list when the file contains no header line
    """

    # kept as an assertion so that a missing file is reported exactly as
    # before (note: assertions are stripped when running with -O)
    assert os.path.isfile(ref_genome)
    chroms: List[str] = []

    try:
        # the context manager closes the stream on every exit path, so no
        # explicit close() is needed (and none can fail on an unbound name
        # when open() itself raises)
        with open(ref_genome, mode='r') as ifstream:
            for line in ifstream:
                if line.startswith(">"):
                    # first token of the header, without the leading ">"
                    chroms.append(line.rstrip().split()[0][1:])
    except KeyboardInterrupt:
        sigint_handler()
    except (OSError, UnicodeError):
        # I/O or decoding failure while reading the FASTA file
        errmsg = "A problem was encountered reading {}\n."
        exception_handler(FileReadError, errmsg.format(ref_genome), debug)

    return chroms
예제 #2
0
def get_kmers(
    queries: List[str], 
    pool: mp.Pool, 
    debug: bool,
    verbose: Optional[bool] = False,
) -> None:
    """Run ``get_seqs`` on every query in parallel through the given pool.

    The queries are submitted with ``pool.map_async``. When ``verbose`` is
    false a progress bar is refreshed once per second until the
    asynchronous result is ready; when it is true the elapsed time is
    printed at the end instead.

    Parameters
    ----------
    queries : list
        list of queries
    pool : multiprocessing.Pool
        pool used to run the queries; it is closed on success and
        terminated on keyboard interrupt
    debug : bool
        trace the full error stack
    verbose : bool, optional
        print additional information
    """

    if not isinstance(queries, list):
        errmsg = "Expected list, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(queries).__name__), debug)

    if verbose:
        start_re: float = time.time()
    try:
        res: mp.pool.MapResult = pool.map_async(get_seqs, queries)
        if not verbose:
            # Take the total before polling starts, so it is always bound
            # even when the result is already ready at the first check.
            # NOTE(review): _number_left is a private attribute of
            # MapResult; it counts the chunks still to be processed.
            # Clamped to 1 so an empty job list never yields a total of 0
            # (assumes printProgressBar divides by the total - TODO confirm).
            tot: int = max(res._number_left, 1)
            while not res.ready():
                remaining: int = res._number_left
                printProgressBar(
                    (tot - remaining), tot, prefix='Progress:', suffix='Complete', length=50
                )
                time.sleep(1)
            printProgressBar(
                tot, tot, prefix='Progress:', suffix='Complete', length=50
            )
        # get() with a timeout does not ignore signals; it also re-raises
        # any exception raised in a worker
        res.get(60 * 60 * 60)
    except KeyboardInterrupt:
        pool.terminate()
        sigint_handler()
    else:
        pool.close()
        if verbose:
            end_re: float = time.time()
            print("Extracted sequences from all regions in %.2fs" % (end_re - start_re))
예제 #3
0
def get_chromlist(ref_genome: str) -> List[str]:
    """Scan the reference genome FASTA file and list its sequence names.

    Every line starting with '>' (after stripping surrounding whitespace)
    is treated as a header. A leading '>chr' is removed when present,
    otherwise only the leading '>' is removed; the rest of the header
    line is kept as is (e.g. '>chrX' yields 'X', '>scaffold_1' yields
    'scaffold_1'). Blank lines and sequence lines are skipped.

    Parameters
    ----------
    ref_genome : str
        path to the reference genome FASTA file

    Returns
    -------
    list
        chromosome names for which a sequence is available in the given
        reference genome FASTA file, in file order
    """

    print(
        "Reading the valid chromosome names from the given reference genome...\n"
    )

    chroms: List[str] = []

    try:
        # the context manager closes the stream on every exit path; errors
        # raised by open() or while reading propagate to the caller
        with open(ref_genome, mode='r') as infile:
            for line in infile:
                line = line.strip()
                if not line.startswith(">"):
                    continue  # sequence data or blank line
                if line.startswith(">chr"):
                    chroms.append(line[4:])  # remove the starting '>chr'
                else:
                    chroms.append(line[1:])  # remove the starting '>'
    except KeyboardInterrupt:
        sigint_handler()

    return chroms
예제 #4
0
def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """Command-line entry point: validate the arguments and run a workflow.

    The arguments are parsed with the parser returned by ``get_parser``
    and checked for consistency with the selected workflow ("buildvg" or
    "findmotif"). Paths that pass validation are replaced by their
    absolute form. After the external dependencies have been checked with
    ``check_deps``, the selected workflow is executed and the total
    elapsed time is printed.

    Every validation failure is reported through ``parser.error``. The
    ``die`` calls that follow are kept as a fallback.
    NOTE(review): presumably ``parser.error`` already terminates the
    process, as ``argparse.ArgumentParser.error`` does - confirm against
    GRAFIMOArgumentParser.

    Parameters
    ----------
    cmdLineargs : list, optional
        command-line arguments; ``sys.argv[1:]`` is used when omitted
    """

    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # get input args

        # no arguments given --> print help
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(2)

        # the first token must be a help/version flag or a workflow name
        if cmdLineargs[0] not in ("-h", "--help", "--version", "buildvg",
                                  "findmotif"):
            parser.error(
                "The second argument must be one between 'buildvg' and 'findmotif'"
            )
            die(1)

        args: argparse.Namespace = parser.parse_args(cmdLineargs)

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        #--------------------------------------------------------------#
        # check commandline arguments consistency
        #

        #---------------------- general options -----------------------#

        # workflow type
        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Unexpected workflow given. Available options:\n"
                         "\tbuildvg: construct VG from user data.\n"
                         "\tfindmotif: scan VG for DNA motif(s) occurrences")
            die(1)

        # cpu cores
        if args.cores < 0:
            parser.error("Negative number of CPU cores given")
        elif args.cores == 0 and args.graph_genome:
            # when whole genome variation graph is given, it is safer to
            # use 1 CPU core by default. This because of the space needed
            # to load the whole VG on RAM.
            #
            # CAVEAT: before requiring more CPU cores to be used, be sure
            # your system has enough memory
            args.cores = 1
        elif args.cores == 0:
            # default option -> use all available CPU cores
            args.cores = mp.cpu_count()
        else:  # args.cores > 0
            if args.cores > mp.cpu_count():
                parser.error("Too many CPU cores to use ({})".format(
                    args.cores))

        # verbosity
        if not isinstance(args.verbose, bool):
            parser.error(
                '\"--verbose\" does not accept any positional argument')

        # debugging
        if not isinstance(args.debug, bool):
            parser.error("\"--debug\" does not accept any positional argument")

        #---------------------- buildvg options -----------------------#

        buildvg_err_msg: str = "Unexpected arguments for \"grafimo buildvg\": \"{}\""

        if args.workflow == "buildvg":

            # options that belong to findmotif must be left at their
            # default value
            if args.graph_genome_dir:
                parser.error(buildvg_err_msg.format("-d, --genome-graph-dir"))
                die(1)
            elif args.graph_genome:
                parser.error(buildvg_err_msg.format("-g, --genome-graph"))
                die(1)
            elif args.bedfile:
                parser.error(buildvg_err_msg.format("-b, --bedfile"))
                die(1)
            elif args.motif:
                parser.error(buildvg_err_msg.format("-m, --motif"))
                die(1)
            elif args.bgfile != UNIF:  # if default ignored
                parser.error(buildvg_err_msg.format("-k, --bgfile"))
                die(1)
            elif args.pseudo != 0.1:  # if default ignored
                parser.error(buildvg_err_msg.format("-p, --pseudo"))
                die(1)
            elif args.threshold != 1e-4:  # if default ignored
                parser.error(buildvg_err_msg.format("-t, --thresh"))
                die(1)
            elif args.no_qvalue:
                parser.error(buildvg_err_msg.format("-q, --no-qvalue"))
                die(1)
            elif args.no_reverse:
                parser.error(buildvg_err_msg.format("-r, --no-reverse"))
                die(1)
            elif args.text_only:
                parser.error(buildvg_err_msg.format("-f, --text-only"))
                die(1)
            elif args.chroms_find:
                parser.error(buildvg_err_msg.format("--chroms-find"))
                die(1)
            elif args.chroms_prefix_find:
                parser.error(buildvg_err_msg.format("--chroms-prefix-find"))
                die(1)
            elif args.chroms_namemap_find != NOMAP:  # if default ignored
                parser.error(buildvg_err_msg.format("--chroms-namemap-find"))
                die(1)
            elif args.qval_t:
                parser.error(buildvg_err_msg.format("--qvalueT"))
                die(1)
            elif args.recomb:
                parser.error(buildvg_err_msg.format("--recomb"))
                die(1)
            elif args.top_graphs != 0:  # if default ignored
                parser.error(buildvg_err_msg.format("--top-graphs"))
                die(1)
            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)
            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)
            else:  # arguments for buildvg are correct
                # reference genome
                if args.linear_genome.split('.')[-1] not in ('fa', 'fasta'):
                    parser.error(
                        "The reference genome file must be in FASTA format")
                    die(1)
                else:
                    if not os.path.isfile(args.linear_genome):
                        parser.error("Unable to find {}".format(
                            args.linear_genome))
                        die(1)
                    if os.stat(args.linear_genome).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(
                            args.linear_genome))
                        die(1)
                    args.linear_genome = os.path.abspath(args.linear_genome)
                # VCF --> the VCF file must have been compressed with
                # bgzip (https://github.com/samtools/tabix).
                # Both suffixes are required: the name must end with
                # ".vcf.gz" (a name without any dot is rejected instead
                # of raising an IndexError).
                if not args.vcf.endswith(".vcf.gz"):
                    parser.error(
                        "Wrong VCF file given. The VCF file must have been "
                        "compressed with bgzip (e.g. myvcf.vcf.gz)")
                    die(1)
                else:
                    if not os.path.isfile(args.vcf):
                        parser.error('Unable to find {}'.format(args.vcf))
                        die(1)
                    if os.stat(args.vcf).st_size == 0:  # empty file
                        parser.error("{} seems to be empty.".format(args.vcf))
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # chromosome to construct VG
                if len(args.chroms_build) == 0:
                    args.chroms_build = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_build):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-build\""
                        )

                # chromosome name-map
                if args.chroms_namemap_build != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_build):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_build))
                if (args.chroms_prefix_build
                        and args.chroms_namemap_build != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-build\" and \"chroms-namemap-build\" "
                        "cannot used together. Choose one of those options")

                # if no out directory is specified the VGs are stored in
                # the current directory
                if args.out == "":
                    args.out = os.path.abspath("./")

                workflow: BuildVG = BuildVG(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        #---------------------- findmotif options -----------------------#

        findmotif_err_msg: str = "Unexpected arguments for \"grafimo findmotif\": \"{}\""

        if args.workflow == "findmotif":
            # options that belong to buildvg must be left at their
            # default value
            if args.linear_genome:
                parser.error(findmotif_err_msg.format("-l, --linear-genome"))
                die(1)
            elif args.vcf:
                parser.error(findmotif_err_msg.format("-v, --vcf"))
                die(1)
            elif args.chroms_build:
                parser.error(findmotif_err_msg.format("--chroms-build"))
            elif args.chroms_prefix_build:
                parser.error(findmotif_err_msg.format("--chroms-prefix-build"))
            elif args.chroms_namemap_build != NOMAP:
                parser.error(
                    findmotif_err_msg.format("--chroms-namemap-build"))
            elif args.reindex:  # if default ignored
                parser.error(findmotif_err_msg.format("--reindex"))
                die(1)
            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error(
                    "No arguments given for both \"--genome-graph\" and \"--genome-graph-dir\""
                )
                die(1)
            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)
            elif not args.motif:
                parser.error("No motif PWM given")
                die(1)
            else:
                # only one between graph_genome and graph_genome_dir is allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error(
                        "Only one argument between \"--genome-graph\" and \"--genome-graph-dir\""
                        " can be used")
                    die(1)

                # genome graph
                if args.graph_genome:
                    if args.graph_genome.split('.')[-1] not in ("xg", "vg"):
                        parser.error(
                            "Unrecognized genome variation graph format. Only "
                            "VG and XG format are allowed")
                        die(1)
                    elif not os.path.isfile(args.graph_genome):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome = os.path.abspath(args.graph_genome)

                # genome graphs directory
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error("Unable to locate {}".format(
                            args.graph_genome_dir))
                        die(1)
                    if len(glob(os.path.join(args.graph_genome_dir,
                                             "*.xg"))) <= 0:
                        parser.error(
                            "No genome variation graph found in {}".format(
                                args.graph_genome_dir))
                        die(1)
                    else:
                        # using absolute path avoid potential problems
                        args.graph_genome_dir = os.path.abspath(
                            args.graph_genome_dir)

                # BED file
                if args.bedfile:
                    if not isbed(args.bedfile, args.debug):
                        parser.error(
                            "The genomic coordinates must be given in UCSC BED files"
                        )
                        die(1)
                    else:
                        if not os.path.isfile(args.bedfile):
                            parser.error("Unable to locate {}".format(
                                args.bedfile))
                else:
                    parser.error("No BED file given")

                # motif pwm
                if not args.motif:
                    parser.error("No motif PWM given")

                else:
                    motifs: List[str] = args.motif
                    for m in motifs:
                        if not isMEME_ff(m, args.debug) and not isJaspar_ff(
                                m, args.debug):
                            parser.error(
                                "Unrecognized motif PWM file format. "
                                "{} does not follow the MEME or JASPAR format rules"
                                .format(m))
                            die(1)
                        if not os.path.isfile(m):
                            parser.error("Unable to locate {}".format(m))

                # background file
                if args.bgfile != UNIF:
                    if not os.path.isfile(args.bgfile):
                        parser.error("Unable to locate {}".format(args.bgfile))

                # pseudocount
                if args.pseudo <= 0:
                    parser.error(
                        "Pseudocount values must be > 0, got {}".format(
                            args.pseudo))
                    die(1)

                # statistical significance threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error(
                        "Motif statistical significance threshold must be between 0 and 1"
                    )
                    die(1)

                # q-value flag
                if not isinstance(args.no_qvalue, bool):
                    parser.error(
                        "\"--qvalue\" accepts only True or False values")
                    die(1)

                # no reverse flag
                if not isinstance(args.no_reverse, bool):
                    parser.error(
                        "\"--no-reverse\" accepts only True or False values")
                    die(1)

                # text only flag
                if not isinstance(args.text_only, bool):
                    parser.error(
                        "\"--text-only\" accepts only True or False values")
                    die(1)

                # chromosome to consider during VG scan
                if len(args.chroms_find) == 0:
                    args.chroms_find = [ALL_CHROMS]  # use all chromosome
                else:
                    if anydup(args.chroms_find):
                        parser.error(
                            "Duplicated chromosome names given to \"--chroms-find\""
                        )

                # chromosome name-map
                if args.chroms_namemap_find != NOMAP:
                    if not os.path.isfile(args.chroms_namemap_find):
                        parser.error("Unable to locate {}".format(
                            args.chroms_namemap_find))
                if (args.chroms_prefix_find
                        and args.chroms_namemap_find != NOMAP):
                    parser.error(
                        "\"--chroms-prefix-find\" and \"chroms-namemap-find\" "
                        "cannot used together. Choose one of those options")

                # recomb flag
                if not isinstance(args.recomb, bool):
                    parser.error(
                        "\"--recomb\" accepts only True or False values")
                    die(1)

                # out directory
                if args.out == "":  # default option
                    args.out = DEFAULT_OUTDIR
                    print(args.out)

                # threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    parser.error(
                        "\"--qvalueT\" accepts only True or False values")
                    die(1)
                elif args.no_qvalue == True and args.qval_t == True:
                    parser.error(
                        "Unable to apply statistical significance threshold on"
                        " q-values if you don't want them")
                    die(1)

                # number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("Negative number of regions to display")

                workflow: Findmotif = Findmotif(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs." %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            sys.stderr.write(
                "Checking GRAFIMO external dependencies {}\n".format(EXT_DEPS))
            start_deps: float = time.time()
        satisfied: bool
        deps_lack: List[str]
        satisfied, deps_lack = check_deps()
        if not satisfied and len(deps_lack) > 0:
            errmsg = "Some dependencies are not satisfied: {}.\nPlease solve them before running GRAFIMO.\n"
            exception_handler(DependencyError, errmsg.format(deps_lack),
                              args.debug)
        elif not satisfied and len(deps_lack) <= 0:
            errmsg = "Dependencies satisfied, but unable to recover them.\n Be sure they are in system PATH.\n"
            exception_handler(DependencyError, errmsg, args.debug)

        if args.verbose and satisfied:
            end_deps: float = time.time()
            print("Dependencies satisfied.")
            print("Dependencies checked in %.2fs." % (end_deps - start_deps))

        #---------------------------------------------------------------
        # dependency check was ok, so we go to workflow selection:
        #   * construction of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG
        if isinstance(workflow, BuildVG):
            buildvg(workflow, args.debug)
        elif isinstance(workflow, Findmotif):
            findmotif(workflow, args.debug)
        else:
            errmsg = "Expected BuildVG or Findmotif, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(workflow).__name__),
                              args.debug)

        end: float = time.time()  # GRAFIMO execution finishes here
        print("Elapsed time %.2fs." % (end - start))

    except KeyboardInterrupt:
        sigint_handler()
예제 #5
0
def compute_results(motif: Motif,
                    sequence_loc: str,
                    args_obj: Optional[Findmotif] = None,
                    testmode: Optional[bool] = False
) -> pd.DataFrame:
    """Score all the sequences extracted from the genome variation graph
    in the regions defined in the input BED file.

    The '*.tsv' files found in ``sequence_loc`` are split in as many
    chunks as the number of cores and each chunk is scored by a separate
    process running ``score_seqs``. The partial results are then merged,
    q-values are computed (unless disabled) and everything is summarized
    with ``build_df``.

    Outside test mode the directory ``sequence_loc`` is removed once the
    scoring is finished.

    Parameters
    ----------
    motif : Motif
        motif data to score sequences
    sequence_loc : str
        path to the intermediate files containing the sequences 
        extracted from the genome variation graph
    args_obj : Findmotif, optional
        container for the arguments needed during the scoring step;
        required unless ``testmode`` is set
    testmode : bool, optional
        flag value manually set used for test purposes; when set, fixed
        settings are used (1 core, threshold 1, q-values computed,
        both strands, recomb enabled, not verbose) and ``sequence_loc``
        is not removed

    Returns
    -------
    pandas.DataFrame
        scoring results

    Raises
    ------
    FileNotFoundError
        if ``sequence_loc`` is not a string
    ValueError
        if ``motif`` is not a Motif or, outside test mode, ``args_obj``
        is not a Findmotif
    SubprocessError
        if the removal of ``sequence_loc`` fails
    """

    errmsg: str

    if not isinstance(sequence_loc, str):
        # format() accepts any type, so the intended error is raised even
        # though sequence_loc is not a string here
        errmsg = "\n\nERROR: unable to locate extracted sequences in {}".format(
            sequence_loc)
        raise FileNotFoundError(errmsg)

    if not isinstance(motif, Motif):
        errmsg = "\n\nERROR: the given motif is not an instance of Motif"
        raise ValueError(errmsg)

    if testmode:
        cores: int = 1
        threshold: float = 1
        recomb: bool = True
        no_qvalue: bool = False
        qval_t: bool = False
        no_reverse: bool = False
        verbose: bool = False
    else:
        if not isinstance(args_obj, Findmotif):
            errmsg = "\n\nERROR: unrecognized argument object type"
            raise ValueError(errmsg)
        cores = args_obj.get_cores()
        threshold = args_obj.get_threshold()
        no_qvalue = args_obj.get_no_qvalue()
        qval_t = args_obj.get_qvalueT()
        no_reverse = args_obj.get_no_reverse()
        recomb = args_obj.get_recomb()
        verbose = args_obj.get_verbose()

    assert threshold > 0
    assert threshold <= 1
    assert cores >= 1

    print_scoring_msg(no_reverse, motif)

    # the sequence files are located with a relative glob, so work from
    # inside sequence_loc and come back afterwards
    cwd: str = os.getcwd()
    os.chdir(sequence_loc)

    manager: SyncManager = mp.Manager()
    # results
    return_dict: DictProxy = manager.dict()
    # scanned nucleotides
    scanned_nucs_dict: DictProxy = manager.dict()
    # scanned sequences
    scanned_seqs_dict: DictProxy = manager.dict()

    # get all tmp files containing sequences
    sequences: List[str] = glob.glob('*.tsv')
    if len(sequences) < cores:
        cores = len(sequences)
    # split the sequence set in no. cores chunks
    # NOTE(review): when no '*.tsv' file is found cores is 0 here and
    # numpy.array_split is expected to reject it - confirm whether an
    # empty result should be returned instead
    sequences_split: List[str] = np.array_split(sequences, cores)

    jobs = list()  # jobs list
    proc_finished: int = 0

    if verbose:
        start_s: float = time.time()

    try:

        # compute results in parallel
        for i in range(cores):
            p = mp.Process(
                target=score_seqs, args=(
                    sequences_split[i], motif, no_reverse, return_dict, 
                    scanned_seqs_dict, scanned_nucs_dict, i
                    )
                )
            jobs.append(p)
            p.start()
        # end for

        # print 0% first, otherwise the bar would start from the
        # completion of the first chunk
        printProgressBar(proc_finished, cores, prefix='Progress:',
                         suffix='Complete', length=50)
        for job in jobs:
            job.join()  # sync point
            proc_finished += 1
            printProgressBar(proc_finished, cores, prefix='Progress:',
                             suffix='Complete', length=50)
        # end for

    except KeyboardInterrupt:
        sigint_handler()
        sys.exit(2)

    else:
        if verbose:
            end_s: float = time.time()
            print(
                "Scored all sequences in %.2fs" % (end_s - start_s)
            )

    # end try

    os.chdir(cwd)

    if not testmode:
        # cmd is only used for the error message; the command itself is
        # run as an argument list (no shell), so paths containing spaces
        # or shell metacharacters are handled literally
        cmd: str = "rm -rf {0}".format(sequence_loc)
        code: int = subprocess.call(["rm", "-rf", "--", sequence_loc])

        if code != 0:
            errmsg = "\n\nERROR: an error occurred while running %s" % cmd
            raise SubprocessError(errmsg)

    if verbose:
        start_df: float = time.time()

    # recover all analysis results and summarize them in a single 
    # data structure
    seqnames: List[str] = list()
    seqs: List[str] = list()
    starts: List[int] = list()
    stops: List[int] = list()
    strands: List[str] = list()
    scores: List[np.double] = list()
    pvalues: List[np.double] = list()
    frequencies: List[int] = list()
    references: List[str] = list()

    seqs_scanned: int = 0
    nucs_scanned: int = 0

    for key in return_dict.keys():

        # fetch the partial result through the manager proxy only once
        partial = return_dict[key]
        assert isinstance(partial, ResultTmp)

        seqnames += partial.get_seqnames()
        seqs += partial.get_seqs()
        starts += partial.get_starts()
        stops += partial.get_stops()
        strands += partial.get_strands()
        scores += partial.get_scores()
        pvalues += partial.get_pvalues()
        frequencies += partial.get_frequencies()
        references += partial.get_references()

        # compute the total number of scanned sequences and nucleotides
        seqs_scanned += scanned_seqs_dict[key]  # the keys are the same as return_dict
        nucs_scanned += scanned_nucs_dict[key]  # the keys are the same as return_dict
    # end for

    qvalues: List[np.double]
    # compute the q-values
    if no_qvalue:
        qvalues = list()  # empty list -> not computed
    else:
        qvalues = compute_qvalues(pvalues)
    # end if

    print("Scanned sequences:", seqs_scanned)
    print("Scanned nucleotides:", nucs_scanned)

    # summarize results in a pandas DF
    finaldf: pd.DataFrame = build_df(motif, seqnames, starts, stops, strands, 
                                     scores, pvalues, qvalues, seqs, frequencies, 
                                     references, threshold, qval_t, no_qvalue, 
                                     recomb)

    if verbose:
        end_df: float = time.time()
        print("\nResults summary built in %.2fs" % (end_df - start_df))

    return finaldf
예제 #6
0
def compute_results(motif, sequence_loc, args_obj):
    """
        Score all the sequences extracted from the regions defined in the
        input BED file, using the processed input motif.

        The temporary '*.tsv' files found in sequence_loc are split in
        #cores chunks, each chunk is scored by a separate process running
        score_seqs(), and the partial results are merged and summarized
        through build_df(). Once the scoring step is over the directory
        sequence_loc is removed.
        ----
        Parameters:
            motif (Motif) : processed motif, used to score sequences
            sequence_loc (str) : path to the directory storing the temporary
                                    files with the sequences extracted during
                                    the previous step
            args_obj (Findmotif) : arguments used during the sequence
                                    scoring step
        ----
        Returns:
            finaldf : value returned by build_df(); according to the
                        original documentation a pandas DataFrame
                        containing the results of the GRAFIMO analysis
        ----
        Raises:
            FileNotFoundError : if sequence_loc is not a string
            ValueError : if motif is not a Motif instance or args_obj is
                            not a Findmotif instance
            SubprocessError : if the temporary directory cannot be removed
    """

    if not isinstance(sequence_loc, str):
        # str.join() takes a single iterable, and sequence_loc is not a
        # string on this path: convert it explicitly
        errmsg = ''.join([
            "\n\nERROR: unable to locate extracted sequences in ",
            str(sequence_loc), ". Exiting"
        ])
        raise FileNotFoundError(errmsg)

    if not isinstance(motif, Motif):
        raise ValueError(
            "\n\nERROR: the given motif is not an instance of Motif")

    if not isinstance(args_obj, Findmotif):
        raise ValueError("\n\nERROR: unrecognized argument object type")

    # reading arguments
    cores = args_obj.get_cores()
    threshold = args_obj.get_threshold()
    no_qvalue = args_obj.get_no_qvalue()
    qval_t = args_obj.get_qvalueT()
    no_reverse = args_obj.get_no_reverse()
    verbose = args_obj.get_verbose()

    assert threshold > 0
    assert threshold <= 1
    assert cores >= 1

    print_scoring_msg(no_reverse, motif)

    cwd = os.getcwd()
    os.chdir(sequence_loc)  # go to sequence location

    try:
        manager = mp.Manager()
        return_dict = manager.dict()  # results
        scanned_nucs_dict = manager.dict()  # nucleotides scanned
        scanned_seqs_dict = manager.dict()  # sequences scanned

        # get all tmp files containing sequences and split them in
        # #cores chunks
        sequences = glob.glob('*.tsv')
        sequences_split = np.array_split(sequences, cores)

        jobs = []  # jobs list
        proc_finished = 0  # number of jobs done

        # NOTE(review): the SIGINT handler is set to SIG_IGN and restored
        # right away, before any child process is started, so the two
        # calls have no net effect on the children. Kept as in the
        # original; see
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        signal.signal(signal.SIGINT, original_sigint_handler)

        if verbose:
            start_s = time.time()

        try:
            # compute results in parallel
            for i in range(cores):
                p = mp.Process(target=score_seqs,
                               args=(sequences_split[i], motif, no_reverse,
                                     return_dict, scanned_seqs_dict,
                                     scanned_nucs_dict, i))
                jobs.append(p)
                p.start()  # start the process
            # end for

            # print the bar at 0% before waiting for the first process
            printProgressBar(proc_finished,
                             cores,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)
            for job in jobs:
                job.join()  # sync point
                proc_finished += 1
                printProgressBar(proc_finished,
                                 cores,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
            # end for
        except KeyboardInterrupt:
            sigint_handler()
            sys.exit(2)
        else:
            if verbose:
                end_s = time.time()
                msg = ''.join(
                    ["\nScored all sequences in ",
                     str(end_s - start_s), "s"])
                print(msg)
            # end if
        # end try
    finally:
        # get back to the starting point even if the scoring step failed
        os.chdir(cwd)
    # end try

    # remove temporary sequence files; the command is run without a shell
    # so that paths containing spaces or shell metacharacters are safe
    cmd = "rm -rf {0}".format(sequence_loc)  # only used in the error message
    code = subprocess.call(["rm", "-rf", "--", sequence_loc])

    if code != 0:
        msg = ' '.join(["\n\nERROR: an error occurred while running", cmd])
        raise SubprocessError(msg)
    # end if

    if verbose:
        start_df = time.time()

    # recover all analysis results and summarize them in a single
    # data-structure
    seqnames = []
    seqs = []
    starts = []
    stops = []
    strands = []
    scores = []
    pvalues = []
    references = []

    seqs_scanned = 0
    nucs_scanned = 0

    for key in return_dict.keys():

        # fetch the entry once: every access to a manager dict goes
        # through the proxy
        partial = return_dict[key]
        assert isinstance(partial, ResultTmp)

        seqnames += partial.get_seqnames()
        seqs += partial.get_seqs()
        starts += partial.get_starts()
        stops += partial.get_stops()
        strands += partial.get_strands()
        scores += partial.get_scores()
        pvalues += partial.get_pvalues()
        references += partial.get_references()

        # compute the total number of scanned sequences and nucleotides
        # (the keys are the same as return_dict)
        seqs_scanned += scanned_seqs_dict[key]
        nucs_scanned += scanned_nucs_dict[key]
    # end for

    # compute the q-values
    if no_qvalue:
        qvalues = []  # empty list -> not computed
    else:
        qvalues = compute_qvalues(pvalues)
    # end if

    print("Scanned sequences:", seqs_scanned)
    print("Scanned nucleotides:", nucs_scanned)

    # summarize results in a pandas DF
    finaldf = build_df(motif, seqnames, starts, stops, strands, scores,
                       pvalues, qvalues, seqs, references, threshold, qval_t,
                       no_qvalue)

    if verbose:
        end_df = time.time()
        msg = ''.join(
            ["\nBuilt result summary in ",
             str(end_df - start_df), "s"])
        print(msg)  # the message was built but never shown

    return finaldf
예제 #7
0
def build_motif_MEME(motif_file: str, bg_file: str, pseudocount: float,
                     no_reverse: bool, cores: int, verbose: bool,
                     debug: bool) -> List[Motif]:
    """Read the motif PWMs stored in a MEME file and process them.

    The motifs are read with read_MEME_motif() and each of them is then
    passed to process_motif_for_logodds(). When the number of motifs is
    at least `cores` the processing is spread over a pool of `cores`
    worker processes, otherwise the motifs are processed one after the
    other in the current process.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM file (MEME format)
    bg_file
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand flag forwarded to read_MEME_motif()
        (NOTE(review): the original documentation stated "if False only
        the forward strand will be considered", which looks inverted
        with respect to the parameter name -- confirm)
    cores : int
        number of CPU cores (used when MEME file has more than one PWM)
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    List[Motif]
        fully processed motifs found in motif_file
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isMEME_ff(motif_file, debug):
        errmsg = "Required MEME motif PWM parsing, but {} is not in MEME format.\n"
        exception_handler(MotifFileFormatError, errmsg.format(motif_file),
                          debug)

    if verbose: start_rm_all: float = time.time()
    motif_lst: List[Motif] = read_MEME_motif(motif_file, bg_file, pseudocount,
                                             no_reverse, verbose, debug)
    motif_num: int = len(motif_lst)
    if verbose:
        end_rm_all: float = time.time()
        print("Read all motifs in %s in %.2fs." %
              (motif_file, (end_rm_all - start_rm_all)))
    print("\nRead {} motifs in {}".format(motif_num, motif_file))
    print("\nProcessing motifs\n")

    complete_motifs = list()  # fully processed motifs
    if verbose: start_mp_all: float = time.time()
    if motif_num >= cores:  # worth to use multiprocessing
        # SIGINT is ignored while the pool is created and the previous
        # handler is restored right after, so that the parent can exit
        # gracefully
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool: mp.Pool = mp.Pool(processes=cores)
        signal.signal(signal.SIGINT, original_sigint_handler)

        try:
            args = [(motif, debug) for motif in motif_lst]
            res = pool.starmap_async(process_motif_for_logodds, args)
            # ---- progress bar
            # the total is read before polling, so it is always bound
            # (the original bound it only after a first failed ready()
            # check); it is kept >= 1 in case everything has already
            # completed
            tot = max(res._number_left, 1)
            while not res.ready():
                remaining = res._number_left
                printProgressBar((tot - remaining),
                                 tot,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                time.sleep(1)
            # when finished call for the last time printProgressBar()
            printProgressBar(tot,
                             tot,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)
            complete_motifs += res.get(60 * 60 * 60)  # does not ignore signals
        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()
        else:
            pool.close()
            if verbose:
                end_mp_all: float = time.time()
                print("Processed motif(s) in %s in %.2fs" %
                      (motif_file, (end_mp_all - start_mp_all)))
            return complete_motifs
    else:
        for m in motif_lst:  # process each found motif
            complete_motifs.append(process_motif_for_logodds(m, debug))
        if verbose:
            end_mp_all: float = time.time()
            print("Processed motif(s) in %s in %.2fs" %
                  (motif_file, (end_mp_all - start_mp_all)))
        return complete_motifs
예제 #8
0
def main(cmdLineargs=None):
    """
        Main function of GRAFIMO.

        The arguments given in input are parsed and checked for
        consistency, the external dependencies are checked with
        check_deps() and then the requested workflow is run: buildvg()
        for 'buildvg' or findmotif() for 'findmotif'.

        Every parser.error() call is followed by die(1): parser.error()
        is expected to terminate the program (as
        argparse.ArgumentParser.error() does), die(1) is a safety net in
        case the parser returned by get_AP() behaves differently.

        ----
        Parameters:
            cmdLineargs (list) : command-line arguments; when None they
                                    are taken from sys.argv[1:]
        ----
        Returns:
            None
        ----
        Raises:
            DependencyError : if check_deps() reports that the external
                                dependencies are not satisfied
            ValueError : if the arguments object is neither a BuildVG
                            nor a Findmotif instance
    """

    try:
        # starting point of the execution time
        start = time.time()

        # read the command-line arguments
        parser = get_AP()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # take input args

        # cmdLineargs[0] is inspected below: report a usage error instead
        # of failing with an IndexError when no argument is given
        if not cmdLineargs:
            parser.error("No arguments given. Available options: 'grafimo buildvg' or 'grafimo findmotif'")
            die(1)

        # the second argument must be buildvg or findmotif
        if cmdLineargs[0] not in ("-h", "--help", "--version", "buildvg", "findmotif"):
            parser.error("The second argument must be one between 'buildvg' and 'findmotif'")
            die(1)

        args = parser.parse_args(cmdLineargs)  # parse args

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse = time.time()

        #####################################################################
        # check arguments consistency
        #####################################################################

        if args.workflow not in ("buildvg", "findmotif"):
            parser.error("Do not know what to do. Available options: create VGs with 'grafimo buildvg' or scan a "
                         "precomputed genome variation graph with 'grafimo findmotif'")
            die(1)

        # cores (shared by the two workflows)
        if args.cores < 0:
            parser.error("The number of cores cannot be negative")
            die(1)

        elif args.cores == 0 and args.graph_genome:
            # to query a whole genome graph is loaded into RAM, since
            # usually they are very heavy in terms of bytes is safer to use
            # 1 thread by default, otherwise it would be loaded #cores
            # times. If you want use more cores, be sure your system can
            # handle the resulting amount of data
            args.cores = 1

        elif args.cores == 0:
            args.cores = mp.cpu_count()  # by default take all the available CPUs
        # end if

        # check verbose flag
        if not isinstance(args.verbose, bool):
            parser.error('The --verbose parameter accepts only True or False values')
            die(1)

        # chromosomes check (shared by the two workflows)
        for c in args.chroms:
            if c not in CHROMS_LIST:
                parser.error("Invalid chromosome")
                die(1)

        args.chroms = initialize_chroms_list(args.chroms)

        # checks for buildvg workflow
        if args.workflow == "buildvg":

            # findmotif options that were given, or whose default value
            # was overridden, are not accepted by buildvg
            findmotif_only = (
                args.graph_genome_dir,
                args.graph_genome,
                args.bedfile,
                args.motif,
                args.bgfile != 'UNIF',
                args.pseudo != 0.1,
                args.threshold != 1e-4,
                args.no_qvalue,
                args.no_reverse,
                args.text_only,
                args.qval_t,
                args.top_graphs != 0,
            )

            if any(findmotif_only):
                parser.error("Invalid arguments for grafimo buildvg")
                die(1)

            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)

            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)

            else:
                # check linear genome
                if args.linear_genome.split('.')[-1] not in ('fa', 'fasta'):
                    parser.error('The linear genome must be in FASTA format (FASTA and FA extensions allowed)')
                    die(1)

                else:
                    if len(glob.glob(args.linear_genome)) != 1:
                        parser.error('Cannot find the given reference genome file')
                        die(1)

                    args.linear_genome = os.path.abspath(args.linear_genome)
                # end if

                # check VCF: allow only compressed VCF files. The number
                # of dot-separated fields is checked first, so that a
                # name without '.vcf' before the extension is reported
                # as an error instead of raising IndexError
                vcf_fields = args.vcf.split('.')
                if (len(vcf_fields) < 2 or
                        vcf_fields[-1] not in ('gz', 'zip') or
                        vcf_fields[-2] != 'vcf'):
                    parser.error('Incorrect VCF file given: the VCF must be compressed (e.g. myvcf.vcf.gz)')
                    die(1)

                else:
                    if not glob.glob(args.vcf):
                        parser.error('Cannot find the given VCF file')
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # by default the built VGs will be stored in the current directory
                if args.out == "grafimo_out":  # general default value
                    args.out = os.path.abspath("./")

                workflow = BuildVG(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print(''.join(["Arguments parsed in ", str(end_args_parse - start_args_parse), "s"]))
            # end if
        # end if

        # checks for findmotif workflow
        if args.workflow == "findmotif":
            if args.linear_genome:
                parser.error("Invalid arguments for grafimo findmotif")
                die(1)

            elif args.vcf:
                # the original message wrongly mentioned buildvg
                parser.error("Invalid arguments for grafimo findmotif")
                die(1)

            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error("No genome variation graph or directory containing them given")
                die(1)

            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)

            elif not args.motif:
                parser.error("No motif file (MEME of JASPAR format) given")
                die(1)

            else:

                # only one between graph_genome and graph_genome_dir allowed
                # (the original message wrongly mentioned buildvg)
                if args.graph_genome and args.graph_genome_dir:
                    parser.error("Invalid arguments for grafimo findmotif")
                    die(1)

                # check graph_genome
                if args.graph_genome:
                    if args.graph_genome.split('.')[-1] not in ('xg', 'vg'):
                        parser.error("Cannot use the given genome variation graph (only VG or XG format allowed)")
                        die(1)

                    elif not os.path.isfile(args.graph_genome):
                        parser.error("Unable to find the given variation genome graph")
                        die(1)

                    else:
                        # safer to use absolute path
                        args.graph_genome = os.path.abspath(args.graph_genome)
                    # end if
                # end if

                # check graph_genome_dir
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error("Cannot find the given directory containing the genome variation graphs")
                        die(1)

                    if args.graph_genome_dir.endswith('/'):
                        graph_genome_dir = args.graph_genome_dir

                    else:
                        graph_genome_dir = ''.join([args.graph_genome_dir, '/'])
                    # end if

                    if not glob.glob(graph_genome_dir + '*.xg'):
                        parser.error(' '.join(['No XG genome variation graph found in', graph_genome_dir]))
                        die(1)

                    else:
                        args.graph_genome_dir = os.path.abspath(graph_genome_dir)
                    # end if
                # end if

                # check BED file (it is known to be given at this point)
                if args.bedfile.split('.')[-1] != 'bed':
                    parser.error('Incorrect BED file given')
                    die(1)

                elif not glob.glob(args.bedfile):
                    parser.error('Cannot find the given BED file')
                    die(1)
                # end if

                # check motif files (at least one is known to be given at
                # this point); existence is checked before the format, so
                # that a missing file is reported as such instead of being
                # handed to the format checkers
                for m in args.motif:
                    if not glob.glob(m):
                        parser.error('Cannot find motif file: ' + m)
                        die(1)

                    if not isMEME_ff(m) and not isJaspar_ff(m):
                        parser.error("Unrecognized motif file format (only MEME or JASPAR allowed)")
                        die(1)
                # end for

                # check background file
                if args.bgfile != 'UNIF':
                    # we have a path to a bg file
                    if not glob.glob(args.bgfile):
                        parser.error('Cannot find the given background file')
                        die(1)
                # end if

                # check pseudocount
                if args.pseudo <= 0:
                    parser.error('The pseudocount cannot be less than or equal 0')
                    die(1)

                # check threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error('The pvalue threshold must be between 0 and 1')
                    die(1)

                # check q-value flag
                if not isinstance(args.no_qvalue, bool):
                    parser.error('The --qvalue parameter accepts only True or False as values')
                    die(1)

                # check no reverse flag
                if not isinstance(args.no_reverse, bool):
                    parser.error('The --no-reverse parameter accepts only True or False as values')
                    die(1)

                # check text only flag
                if not isinstance(args.text_only, bool):
                    parser.error('The --text-only parameter accepts only True or False values')
                    die(1)

                # out directory
                if args.out == 'grafimo_out':  # default option
                    # to make unique the output directory we add the PID
                    # to the name.
                    #
                    # This is useful when calling grafimo in different runs on the
                    # same machine.
                    args.out = ''.join([args.out, '_', str(os.getpid())])

                # check threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    parser.error("The --qvalueT parameter accepts only True or False as values")
                    die(1)

                elif args.no_qvalue and args.qval_t:
                    parser.error("Cannot apply the threshold on q-values if you don't want them")
                    die(1)

                # check the number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error("The number of region graphs to show must be positive")
                    die(1)

                workflow = Findmotif(args)

                if args.verbose:
                    end_args_parse = time.time()
                    print(''.join(["Arguments parsed in ", str(end_args_parse - start_args_parse), "s"]))

            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps = time.time()

        satisfied, deps_lack = check_deps()

        if not satisfied:
            if len(deps_lack) > 0:
                raise DependencyError("\n\nERROR: The following dependencies are not sastisfied: " +
                                      str(deps_lack) +
                                      "\nPlease, solve them before running GRAFIMO")

            raise DependencyError("Some dependencies were found, but was not possible to track them."
                                  "\nBe sure they are available in system PATH")
        # end if

        if args.verbose:
            end_deps = time.time()
            print("Dependencies correctly satisfied")
            print(''.join(["Dependencies checked in ", str(end_deps - start_deps), "s"]))

        #####################################################################

        # dependency check was ok, so we go to workflow selection:
        #    - creation of the genome variation graph for
        #        each chromosome or a user defined subset of them
        #    - scan of a precomputed VG or a set of precomputed VG

        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset of them
            buildvg(workflow)

        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)

        else:
            raise ValueError("Unknown arguments object type")
        # end if

        end = time.time()  # GRAFIMO execution finishes here

        print(''.join(["\nElapsed time: ", str(end - start), "s"]))

    except KeyboardInterrupt:
        sigint_handler()
예제 #9
0
def build_motif_MEME(motif_file: str, bg_file: str, pseudocount: float,
                     no_reverse: bool, cores: int,
                     verbose: bool) -> List[Motif]:
    """Read the motif PWMs stored in a MEME file and process them.

    The motifs are read with read_MEME_motif() and each of them is then
    passed to process_motif_for_logodds(). When the number of motifs is
    at least `cores` the processing is spread over a pool of `cores`
    worker processes, otherwise the motifs are processed one after the
    other in the current process.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in MEME format
    bg_file
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        strand flag forwarded to read_MEME_motif()
        (NOTE(review): the original documentation stated "if False only
        the forward strand will be considered", which looks inverted
        with respect to the parameter name -- confirm)
    cores : int
        Number of cores to use while building the Motif objects
    verbose : bool
        print additional information

    Returns
    -------
    List[Motif]
        fully processed motifs found in motif_file

    Raises
    ------
    FileNotFoundError
        if motif_file is empty or None
    NotValidFFException
        if isMEME_ff() does not recognize motif_file as a MEME file
    """

    errmsg: str
    if not motif_file:
        errmsg = "\n\nERROR: the motif file is missing"
        raise FileNotFoundError(errmsg)

    if not isMEME_ff(motif_file):
        errmsg = "\n\nERROR: the given motif file is not in MEME format"
        raise NotValidFFException(errmsg)

    if verbose:
        start_rm_all: float = time.time()

    motif_lst: List[Motif]
    motif_lst = read_MEME_motif(motif_file, bg_file, pseudocount, no_reverse,
                                verbose)
    motif_num: int = len(motif_lst)

    if verbose:
        end_rm_all: float = time.time()
        msg: str = ''.join([
            "\nRead all motif contained in ", motif_file, " in ",
            str(end_rm_all - start_rm_all), "s"
        ])
        print(msg)
    # end if

    print("\nRead", motif_num, "motifs in", motif_file)
    print("\nProcessing motifs\n")

    # list of the fully processed motifs
    complete_motifs = list()

    if verbose:
        start_mp_all: float = time.time()

    # process each found motif
    if motif_num >= cores:  # worth to use multiprocessing

        # SIGINT is ignored while the pool is created and the previous
        # handler is restored right after, so that the parent can exit
        # gracefully
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool: mp.Pool = mp.Pool(processes=cores)  # use #cores processes
        signal.signal(signal.SIGINT, original_sigint_handler)

        try:
            res = pool.map_async(process_motif_for_logodds, motif_lst)

            # the total is read before polling, so it is always bound
            # (the original bound it only after a first failed ready()
            # check); it is kept >= 1 in case everything has already
            # completed
            tot = max(res._number_left, 1)
            while not res.ready():
                remaining = res._number_left
                printProgressBar((tot - remaining),
                                 tot,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                time.sleep(2)
            # end while

            # when finished call for the last time printProgressBar()
            printProgressBar(tot,
                             tot,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

            # does not ignore signals
            complete_motifs += res.get(60 * 60 * 60)

        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()

        else:
            pool.close()

            if verbose:
                end_mp_all: float = time.time()
                print("Processed motif(s) in %s in %.2fs" %
                      (motif_file, (end_mp_all - start_mp_all)))
            # end if

            return complete_motifs
        # end try

    else:

        # process each found motif
        for m in motif_lst:
            complete_motifs.append(process_motif_for_logodds(m))

        if verbose:
            end_mp_all: float = time.time()
            print("Processed motif(s) in %s in %.2fs" %
                  (motif_file, (end_mp_all - start_mp_all)))
        # end if

        return complete_motifs
예제 #10
0
def main(cmdLineargs: Optional[List[str]] = None) -> None:
    """GRAFIMO command-line entry point.

    Parse and validate the command-line arguments, wrap them in the
    container of the requested workflow (``BuildVG`` or ``Findmotif``),
    check the external dependencies and run the workflow.

    Every ``parser.error()`` call is followed by ``die(1)`` so that the
    validation stops at the first invalid argument even if ``error()``
    returns. A ``KeyboardInterrupt`` raised at any point is forwarded
    to ``sigint_handler()``.

    Parameters
    ----------
    cmdLineargs : list, optional
        command-line arguments (program name excluded); when None
        ``sys.argv[1:]`` is used

    Raises
    ------
    DependencyError
        if ``check_deps()`` reports unsatisfied external dependencies
    ValueError
        if the arguments container is neither ``BuildVG`` nor
        ``Findmotif``
    """

    try:
        # starting point of the execution time
        start: float = time.time()

        # read the command-line arguments
        parser: GRAFIMOArgumentParser = get_parser()

        if cmdLineargs is None:
            cmdLineargs = sys.argv[1:]  # take input args

        # no argument given
        if len(cmdLineargs) == 0:
            parser.error_noargs()
            die(1)

        # the first given argument must be a help/version flag or the
        # name of one of the two workflows
        if cmdLineargs[0] not in ("-h", "--help", "--version", "buildvg",
                                  "findmotif"):
            parser.error(
                "The second argument must be one between 'buildvg' and 'findmotif'")
            die(1)

        args: argparse.Namespace = parser.parse_args(cmdLineargs)

        if args.verbose:
            print("Parsing arguments...")
            start_args_parse: float = time.time()

        ################################################################
        # check arguments consistency
        ################################################################

        if args.workflow != "buildvg" and args.workflow != "findmotif":
            parser.error("Do not know what to do. Available options: create VGs "
                         "with 'grafimo buildvg' or scan a precomputed genome "
                         "variation graph with 'grafimo findmotif'")
            die(1)

        # cores (shared by the two workflows)
        if args.cores < 0:
            parser.error("The number of cores cannot be negative")
            die(1)

        elif args.cores == 0 and args.graph_genome:
            # to query a whole genome graph is loaded into RAM, since
            # usually they are very heavy in terms of bytes is safer to
            # use 1 thread by default, otherwise it would be loaded
            # #cores times. If you want use more cores, be sure your
            # system can handle the resulting amount of data
            args.cores = 1

        elif args.cores == 0:
            # by default take all the available CPUs
            args.cores = mp.cpu_count()
        # end if

        # check verbose flag
        if not isinstance(args.verbose, bool):
            parser.error(
                'The --verbose parameter accepts only True or False values')
            die(1)

        # chromosomes check (shared by the two workflows)
        if len(args.chroms) == 0:
            args.chroms = ['ALL_CHROMS']

        buildvg_err_msg = "Invalid arguments for grafimo buildvg"

        # checks for buildvg workflow
        if args.workflow == "buildvg":

            # none of the findmotif-only options can be given: flags
            # must be unset and valued options must hold their default.
            # The conditions are evaluated lazily, in this order.
            if (args.graph_genome_dir or args.graph_genome
                    or args.bedfile or args.motif
                    or args.bgfile != 'UNIF'
                    or args.pseudo != 0.1
                    or args.threshold != 1e-4
                    or args.no_qvalue or args.no_reverse
                    or args.text_only or args.qval_t or args.recomb
                    or args.top_graphs != 0):
                parser.error(buildvg_err_msg)
                die(1)

            elif not args.linear_genome:
                parser.error("No reference genome given")
                die(1)

            elif not args.vcf:
                parser.error("No VCF file given")
                die(1)

            else:
                # check linear genome
                genome_ext: str = args.linear_genome.split('.')[-1]
                if genome_ext != 'fa' and genome_ext != 'fasta':
                    parser.error(
                        "The linear genome must be in FASTA format (FASTA and "
                        "FA extensions allowed)")
                    die(1)

                else:
                    # the given path must match exactly one file
                    if len(glob.glob(args.linear_genome)) != 1:
                        parser.error(
                            'Cannot find the given reference genome file')
                        die(1)

                    args.linear_genome = os.path.abspath(args.linear_genome)
                # end if

                # check VCF --> the VCF must have been compressed with
                # bgzip (https://github.com/samtools/tabix)
                vcf_fields: List[str] = args.vcf.split('.')
                if ((vcf_fields[-1] != 'gz' and vcf_fields[-1] != 'zip')
                        or vcf_fields[-2] != 'vcf'):
                    parser.error(
                        "Incorrect VCF file given: the VCF must be compressed "
                        "(e.g. myvcf.vcf.gz)")
                    die(1)

                else:
                    if len(glob.glob(args.vcf)) <= 0:
                        parser.error('Cannot find the given VCF file')
                        die(1)
                    args.vcf = os.path.abspath(args.vcf)

                # by default the built VGs will be stored in the current
                # directory
                if args.out == "":  # general default value
                    args.out = os.path.abspath("./")

                workflow: BuildVG = BuildVG(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs" %
                          (end_args_parse - start_args_parse))
            # end if
        # end if

        findmotif_err_msg = "Invalid arguments for grafimo findmotif"

        # checks for findmotif workflow
        if args.workflow == "findmotif":
            # none of the buildvg-only options can be given
            if args.linear_genome or args.vcf or args.reindex:
                parser.error(findmotif_err_msg)
                die(1)

            elif not args.graph_genome_dir and not args.graph_genome:
                parser.error(
                    "No genome variation graph or directory containing them given")
                die(1)

            elif not args.bedfile:
                parser.error("No BED file given")
                die(1)

            elif not args.motif:
                parser.error("No motif file (MEME of JASPAR format) given")
                die(1)

            else:

                # only one between graph_genome and graph_genome_dir
                # is allowed
                if args.graph_genome and args.graph_genome_dir:
                    parser.error(findmotif_err_msg)
                    die(1)

                # check graph_genome
                if args.graph_genome:
                    graph_ext: str = args.graph_genome.split('.')[-1]
                    if graph_ext != 'xg' and graph_ext != 'vg':
                        parser.error(
                            "Cannot use the given genome variation graph (only "
                            "VG or XG format allowed)")
                        die(1)

                    elif not os.path.isfile(args.graph_genome):
                        parser.error(
                            "Unable to find the given variation genome graph")
                        die(1)

                    else:
                        # it is safer to use absolute path to avoid bugs
                        args.graph_genome = os.path.abspath(args.graph_genome)
                    # end if
                # end if

                # check graph_genome_dir
                if args.graph_genome_dir:
                    if not os.path.isdir(args.graph_genome_dir):
                        parser.error(
                            "Cannot find the given directory containing the "
                            "genome variation graphs")
                        die(1)

                    # make sure the directory path ends with '/' before
                    # building the glob pattern
                    if args.graph_genome_dir[-1] == '/':
                        graph_genome_dir = args.graph_genome_dir

                    else:
                        graph_genome_dir = ''.join([args.graph_genome_dir, '/'])
                    # end if

                    if len(glob.glob(graph_genome_dir + '*.xg')) <= 0:
                        parser.error(
                            ' '.join(['No XG genome variation graph found in',
                                      graph_genome_dir]))
                        die(1)

                    else:
                        args.graph_genome_dir = os.path.abspath(
                            graph_genome_dir)
                    # end if
                # end if

                # check BED file
                if args.bedfile:
                    if args.bedfile.split('.')[-1] != 'bed':
                        parser.error('Incorrect BED file given')
                        die(1)

                    elif len(glob.glob(args.bedfile)) <= 0:
                        parser.error('Cannot find the given BED file')
                        die(1)
                    # end if

                else:
                    parser.error('No BED file given')
                    die(1)
                # end if

                # check motif file
                if not args.motif:
                    parser.error('No motif given')
                    die(1)

                else:
                    # check format and existence of each given motif
                    for m in args.motif:
                        if not isMEME_ff(m) and not isJaspar_ff(m):
                            parser.error("Unrecognized motif file format (only MEME or JASPAR allowed)")
                            die(1)

                        if len(glob.glob(m)) <= 0:
                            parser.error('Cannot find motif file: ' + m)
                            die(1)
                    # end for
                # end if

                # check background file
                if args.bgfile != 'UNIF':
                    if len(glob.glob(args.bgfile)) <= 0:
                        parser.error('Cannot find the given background file')
                        die(1)
                # end if

                # check pseudocount
                if args.pseudo <= 0:
                    parser.error(
                        'The pseudocount cannot be less than or equal 0')
                    die(1)

                # check threshold
                if args.threshold <= 0 or args.threshold > 1:
                    parser.error('The pvalue threshold must be between 0 and 1')
                    die(1)

                # check the boolean flags (q-value, no reverse, text
                # only, recombinant), in this order
                bool_flags = (
                    (args.no_qvalue,
                     "The --qvalue parameter accepts only True or False as "
                     "values"),
                    (args.no_reverse,
                     "The --no-reverse parameter accepts only True or False "
                     "as values"),
                    (args.text_only,
                     "The --text-only parameter accepts only True or False "
                     "values"),
                    (args.recomb,
                     "The --recomb parameter accepts only True or False values"),
                )
                for flag, flag_errmsg in bool_flags:
                    if not isinstance(flag, bool):
                        parser.error(flag_errmsg)
                        die(1)
                # end for

                # out directory
                if args.out == '':  # default option
                    args.out = DEFAULT_OUTDIR

                # check threshold on q-value flag
                if not isinstance(args.qval_t, bool):
                    parser.error(
                        "The --qvalueT parameter accepts only True or False as values")
                    die(1)

                elif args.no_qvalue and args.qval_t:
                    parser.error(
                        "Cannot apply the threshold on q-values if you don't "
                        "want them")
                    die(1)

                # check the number of graph regions to store as PNG images
                if args.top_graphs < 0:
                    parser.error(
                        "The number of region graphs to show must be positive")
                    die(1)

                workflow: Findmotif = Findmotif(args)

                if args.verbose:
                    end_args_parse: float = time.time()
                    print("Arguments parsed in %.2fs" %
                          (end_args_parse - start_args_parse))

            # end if
        # end if

        # check that external dependencies are satisfied
        if args.verbose:
            print("Checking GRAFIMO external dependencies " + str(EXT_DEPS))
            start_deps: float = time.time()

        satisfied: bool
        deps_lack: List[str]

        satisfied, deps_lack = check_deps()

        if not satisfied:
            if len(deps_lack) > 0:
                raise DependencyError("\n\nERROR: The following dependencies are not"
                                      " sastisfied: " + str(deps_lack) +
                                      "\nPlease, solve them before running GRAFIMO")

            # unsatisfied, but no missing dependency was reported
            raise DependencyError("Some dependencies were found, but was not "
                                  "possible to track them.\n"
                                  "Be sure they are available in system PATH")
        # end if

        if args.verbose:
            end_deps: float = time.time()
            print("Dependencies correctly satisfied")
            print("Dependencies checked in %.2fs" % (end_deps - start_deps))

        ################################################################

        # dependency check was ok, so we go to workflow selection:
        #   * creation of the genome variation graph for
        #     each chromosome or a user defined subset of them
        #   * scan of a precomputed VG or a set of precomputed VG

        if isinstance(workflow, BuildVG):
            # build the VG for each chromosome or a user defined subset
            # of them
            buildvg(workflow)

        elif isinstance(workflow, Findmotif):
            # scan a precomputed VG or a set of VGs
            findmotif(workflow)

        else:
            raise ValueError("Unknown arguments object type")
        # end if

        end: float = time.time()  # GRAFIMO execution finishes here

        print("Elapsed time %.2fs" % (end - start))

    except KeyboardInterrupt:
        sigint_handler()
예제 #11
0
def get_kmers(queries: List[str],
              pool: mp.Pool,
              verbose: Optional[bool] = False) -> None:
    """Extract the genomic sequences (both from reverse and forward
    strands) in the queried regions from the VG.

    ``get_seqs`` is mapped asynchronously over the queries on the given
    pool. When ``verbose`` is False a progress bar is refreshed every
    two seconds until all the queries have been processed. On success
    the pool is closed; on Ctrl+C the pool is terminated and
    ``sigint_handler()`` is called.

    Parameters
    ----------
    queries : list
        set of queries to perform on the graph to extract the motif
        occurrence candidates
    pool : multiprocessing.Pool
        pool of parallel processes to run
    verbose : bool, optional
        flag used to define if additional information has to printed

    Raises
    ------
    TypeError
        if ``queries`` is not a list
    """

    if not isinstance(queries, list):
        raise TypeError(
            "Expected list, got {}".format(type(queries).__name__))

    if verbose:
        start_re: float = time.time()

    # extract regions
    try:
        # query the VGs
        res: mp.pool.MapResult = pool.map_async(get_seqs, queries)

        if not verbose:
            # total amount of work, read before polling so that it is
            # always defined, even when the job is already finished
            # (e.g. empty query list); at least 1 to keep the progress
            # ratio well defined
            tot: int = max(res._number_left, 1)

            while not res.ready():
                printProgressBar((tot - res._number_left),
                                 tot,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                time.sleep(2)
            # end while

            # when finished call for the last time printProgressBar()
            printProgressBar(tot,
                             tot,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)
        # end if

        # wait for the results; worker exceptions are re-raised here
        # (called with a timeout: does not ignore signals)
        res.get(60 * 60 * 60)

    except KeyboardInterrupt:
        pool.terminate()
        sigint_handler()

    else:
        pool.close()

        if verbose:
            end_re: float = time.time()
            print("Extracted sequences from all regions in %.2fs" %
                  (end_re - start_re))
예제 #12
0
def compute_results(
    motif: Motif,
    sequence_loc: str,
    debug: bool,
    args_obj: Optional[Findmotif] = None,
    testmode: Optional[bool] = False,
) -> pd.DataFrame:
    """Score the sequences extracted from the genome variation graph.

    The ``*.tsv`` files found in ``sequence_loc`` are split in as many
    chunks as the available cores and each chunk is scored by a
    dedicated process running ``score_seqs``. The partial results are
    then merged, the q-values are computed (unless disabled) and the
    final report is returned as a pandas DataFrame.

    Unless ``testmode`` is set, ``sequence_loc`` is removed once the
    scoring step is over. The current working directory is always
    restored before returning or propagating an error.

    Parameters
    ----------
    motif : Motif
        motif object
    sequence_loc : str
        path to the directory containing the extracted sequences
    debug : bool
        trace the full error stack
    args_obj : Findmotif, optional
        commandline arguments container (required when ``testmode`` is
        False)
    testmode : bool, optional
        test (manually set)

    Returns
    -------
    pandas.DataFrame
        results
    """

    if not isinstance(motif, Motif):
        errmsg = "Expected Motif, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif).__name__),
                          debug)
    if not isinstance(sequence_loc, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError,
                          errmsg.format(type(sequence_loc).__name__), debug)
    if not os.path.isdir(sequence_loc):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(sequence_loc),
                          debug)

    if not testmode:
        if not isinstance(args_obj, Findmotif):
            errmsg = "Expected Findmotif, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(args_obj).__name__), debug)
        cores: int = args_obj.cores
        threshold: float = args_obj.threshold
        no_qvalue: bool = args_obj.noqvalue
        qval_t: bool = args_obj.qvalueT
        no_reverse: bool = args_obj.noreverse
        recomb: bool = args_obj.recomb
        verbose: bool = args_obj.verbose
    else:  # pytest - during normal execution we should never go here
        cores = 1
        threshold = float(1)
        recomb = True
        no_qvalue = False
        qval_t = False
        no_reverse = False
        verbose = False
    assert threshold > 0 and threshold <= 1
    assert cores >= 1

    print_scoring_msg(motif, no_reverse, debug)
    cwd: str = os.getcwd()
    # the sequence files are searched (and handed to the workers) with
    # paths relative to sequence_loc
    os.chdir(sequence_loc)
    try:
        manager: SyncManager = mp.Manager()
        return_dict: DictProxy = manager.dict()  # results
        scanned_nucs_dict: DictProxy = manager.dict()  # scanned nucleotides
        scanned_seqs_dict: DictProxy = manager.dict()  # scanned sequences
        sequences: List[str] = glob.glob('*.tsv')  # sequences
        if len(sequences) < cores:
            cores = len(sequences)
        jobs = list()  # jobs list
        proc_finished: int = 0
        # NOTE(review): SIGINT is set to SIG_IGN and restored right
        # away, before any process is started; kept as in the original
        # code
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        signal.signal(signal.SIGINT, original_sigint_handler)
        # with no sequence file there is nothing to split or score
        # (np.array_split() rejects 0 sections): skip the scoring step
        # and let the empty summary be reported below
        if sequences:
            # split the sequence set in no. cores chunks
            sequences_split: List[str] = np.array_split(sequences, cores)
            if verbose:
                start_s: float = time.time()
            try:
                for i in range(cores):
                    p = mp.Process(target=score_seqs,
                                   args=(sequences_split[i], motif,
                                         no_reverse, return_dict,
                                         scanned_seqs_dict,
                                         scanned_nucs_dict, i, debug))
                    jobs.append(p)
                    p.start()
                # print 0% before waiting for the first chunk
                printProgressBar(proc_finished,
                                 cores,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                for job in jobs:
                    job.join()  # sync point
                    proc_finished += 1
                    printProgressBar(proc_finished,
                                     cores,
                                     prefix='Progress:',
                                     suffix='Complete',
                                     length=50)
            except KeyboardInterrupt:
                sigint_handler()
                die(2)
            else:
                if verbose:
                    end_s: float = time.time()
                    print("Scored all sequences in %.2fs" % (end_s - start_s))
    finally:
        # go back to the starting directory, also when scoring fails
        os.chdir(cwd)

    if not testmode:
        # remove the directory storing the extracted sequences; the
        # command is run without a shell so that any character in the
        # path is taken literally
        cmd: str = "rm -rf {}".format(sequence_loc)
        code: int = subprocess.call(["rm", "-rf", sequence_loc])
        if code != 0:
            errmsg = "An error occurred while executing {}.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)
    if verbose:
        start_df: float = time.time()
    # recover all analysis results and summarize them in a single
    # data structure
    seqs_scanned: int = 0
    nucs_scanned: int = 0
    summary = ResultTmp()
    for key in return_dict.keys():
        partialres = return_dict[key]
        # each partial result provides (at least) ten fields
        summary.append_list(*(partialres[i] for i in range(10)))
        seqs_scanned += scanned_seqs_dict[key]
        nucs_scanned += scanned_nucs_dict[key]
    if summary.isempty():
        errmsg = "No result retrieved. Unable to proceed. Are you using the correct VGs and searching on the right chromosomes?\n"
        exception_handler(ValueError, errmsg, debug)
    # compute the q-values
    if not no_qvalue:
        if verbose:
            start_q = time.time()
        qvalues = compute_qvalues(summary.pvalues, debug)
        summary.add_qvalues(qvalues)
        if verbose:
            end_q = time.time()
            print("Q-values computed in %.2fs." % (end_q - start_q))
    print("Scanned sequences:\t{}".format(seqs_scanned))
    print("Scanned nucleotides:\t{}".format(nucs_scanned))
    # summarize results in a pandas DataFrame
    finaldf = summary.to_df(motif,
                            threshold,
                            qval_t,
                            recomb,
                            ignore_qvals=no_qvalue)
    if verbose:
        end_df: float = time.time()
        print("\nResults summary built in %.2fs" % (end_df - start_df))

    return finaldf
예제 #13
0
def build_motif_MEME(motif_file, bg_file, pseudocount, no_reverse, cores,
                     verbose):
    """
        Build the Motif objects starting from the data
        stored in a given MEME file.

        The motifs read from the file are processed with
        process_motif_for_logodds(): in parallel on a pool of
        processes when there are at least as many motifs as cores,
        sequentially otherwise.
        ----
        Parameters:
            motif_file (str) : path to the motif file
            bg_file (str) : path to the background file
            pseudocount (float) : value to add to the motif counts
            no_reverse (bool) : if set to True, only data related to
                                forward strand will be used
            cores (int) : number of cores to use, during motif processing
            verbose (bool) : if set to True, print timing information
        ----
        Returns:
            complete_motifs (list) : processed motifs built from the
                                     data contained in motif_file
        ----
        Raises:
            FileNotFoundError : if motif_file is empty or None
            NotValidFFException : if motif_file is not in MEME format
    """

    if not motif_file:
        raise FileNotFoundError("\n\nERROR: the motif file is missing")

    # check if the input is in MEME format
    if not isMEME_ff(motif_file):
        # if in other format we should not be here
        raise NotValidFFException(
            "\n\nERROR: the given motif file is not in MEME format")

    if verbose:
        start_rm_all = time.time()

    # read the motif file
    motif_lst = read_MEME_motif(motif_file, bg_file, pseudocount, no_reverse,
                                verbose)
    motif_num = len(motif_lst)

    if verbose:
        end_rm_all = time.time()
        msg = ''.join([
            "\nRead all motif contained in ", motif_file, " in ",
            str(end_rm_all - start_rm_all), "s"
        ])
        print(msg)
    # end if

    print("\nRead", motif_num, "motifs in", motif_file)
    print("\nProcessing motifs\n")

    # list of the fully processed motifs
    complete_motifs = []

    if verbose:
        start_mp_all = time.time()

    def report_processing_time():
        # print the time spent processing the motifs (verbose only)
        end_mp_all = time.time()
        print(''.join([
            "Processed all motifs contained in ", motif_file, " in ",
            str(end_mp_all - start_mp_all), "s"
        ]))

    # process each found motif
    if motif_num >= cores:  # worth to use multiprocessing

        # the pool is created while SIGINT is ignored and the original
        # handler is restored right after, to exit gracefully on Ctrl+C
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = mp.Pool(processes=cores)  # use #cores processes
        signal.signal(signal.SIGINT, original_sigint_handler)

        try:
            res = pool.map_async(process_motif_for_logodds, motif_lst)

            # total amount of work, read before polling so that it is
            # always defined, even when the job is already finished at
            # the first check; at least 1 to keep the progress ratio
            # well defined
            tot = max(res._number_left, 1)

            while not res.ready():
                printProgressBar((tot - res._number_left),
                                 tot,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                time.sleep(2)
            # end while

            # when finished call for the last time printProgressBar()
            printProgressBar(tot,
                             tot,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

            complete_motifs += res.get(60 * 60 * 60)  # does not ignore signals

        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()

        else:
            pool.close()

            if verbose:
                report_processing_time()
            # end if

            return complete_motifs
        # end try

    else:  # the sequential execution is fine

        # process each found motif
        for m in motif_lst:
            complete_motifs.append(process_motif_for_logodds(m))

        if verbose:
            report_processing_time()
        # end if

        return complete_motifs
예제 #14
0
def _build_region_queries(regions, chr_list, vg, per_chrom, motif_width):
    """
        Build the `vg find` shell commands that extract the sequences
        of every region lying on one of the requested chromosomes.
        ----
        Parameters:
            regions (iterable) : regions read from the BED file; each
                                    item is indexed with the keys
                                    'chr', 'start' and 'stop'
            chr_list (iterable) : names of the chromosomes to query
            vg (str) : path of the XG file or, when per_chrom is True,
                        prefix (ending with '/') to which
                        '<chrom>.xg' is appended
            per_chrom (bool) : True to query one XG per chromosome,
                                False to query the single XG in vg
            motif_width : value given to the -K option of `vg find`
        ----
        Return:
            queries (list) : one shell command per selected region
        ----
        Raises:
            FileNotFoundError : an XG needed by a selected region
                                does not exist
    """

    selected = set(chr_list)
    checked = set()  # XG paths already verified, avoids repeated stat calls
    queries = []

    for region in regions:
        chrom = region['chr']
        start = region['start']
        stop = region['stop']

        if chrom not in selected:
            continue  # chromosome not among the ones to query

        region_index = '{0}:{1}-{2}'.format(chrom, start, stop)
        region_name = '{0}_{1}-{2}'.format(chrom, start, stop)
        seqs = correct_path('./', region_name, '.tsv')

        xg = ''.join([vg, chrom, '.xg']) if per_chrom else vg

        if xg not in checked:
            if not os.path.exists(xg):
                errmsg = ''.join(
                    ["\n\nERROR: unable to use ", xg, ". Exiting"])
                raise FileNotFoundError(errmsg)
            checked.add(xg)

        # NOTE(review): the command is a plain string with an output
        # redirection, so it is presumably run through a shell by get_seqs;
        # paths or BED fields containing shell metacharacters are not quoted
        queries.append('vg find -x {0} -E -p {1} -K {2} > {3}'.format(
            xg, region_index, motif_width, seqs))

    return queries


def _run_region_queries(queries, cores, show_progress):
    """
        Run the given queries with get_seqs on a pool of worker
        processes, optionally drawing a progress bar while waiting.
        ----
        Parameters:
            queries (list) : shell commands to execute
            cores (int) : number of worker processes
            show_progress (bool) : if True print a progress bar until
                                    all the queries are done
        ----
        Return:
            completed (bool) : True if all the queries terminated,
                                False if the run was interrupted by
                                the user and sigint_handler() returned
    """

    # the workers are created while SIGINT is ignored, so that only the
    # parent process receives Ctrl+C and can terminate the pool gracefully
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    pool = mp.Pool(processes=cores)  # use #cores processes
    signal.signal(signal.SIGINT, original_sigint_handler)

    try:
        res = pool.map_async(get_seqs, queries)

        if show_progress:
            # read the total before polling, so it is always defined, even
            # when the jobs are already done; clamp it to 1 to never hand a
            # zero total to printProgressBar (e.g. empty set of queries)
            tot = max(res._number_left, 1)

            while not res.ready():
                printProgressBar((tot - res._number_left),
                                 tot,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                res.wait(2)  # returns as soon as the jobs are done
            # end while

            # when finished call for the last time printProgressBar()
            printProgressBar(tot,
                             tot,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)
        # end if

        res.get(60 * 60 * 60)  # does not ignore signals

    except KeyboardInterrupt:
        pool.terminate()
        sigint_handler()
        return False

    except Exception:
        # do not leave worker processes behind when a query fails
        pool.terminate()
        raise

    else:
        pool.close()
        return True
    # end try


def get_regions(motif, args_obj):
    """
        Compute all sequences of length L (L is the
        motif width) from the VG(s).
        The sequences are extracted from the regions defined
        in the input BED file, and written in a newly created
        temporary directory, one file for each region.
        ----
        Parameters:
            motif (Motif) : motif to search on the VG
            args_obj (Findmotif) : object storing the arguments
                                    required to extract the
                                    regions defined in the BED
                                    file, from the VG(s)
        ----
        Return:
            sequence_loc (str) : location of the tmp files,
                                    containing the extracted
                                    sequences
        ----
        Raises:
            ValueError : motif is not a Motif instance
            VGException : the genome variation graph is missing or
                            the given graph is not in XG format
            FileNotFoundError : an XG file to query does not exist
    """

    # check the input arguments
    if not isinstance(motif, Motif):
        errmsg = "\n\nERROR: unknown motif object type"
        raise ValueError(errmsg)

    if args_obj.has_graph_genome():
        vg = args_obj.get_graph_genome()

        if not isGraph_genome_xg(vg):
            errmsg = "\n\nERROR: the genome variation graph is not in XG format"
            raise VGException(errmsg)
        # end if

    elif args_obj.has_graph_genome_dir():
        vg = args_obj.get_graph_genome_dir()

    else:
        raise VGException("\n\nERROR: the genome variation graph is missing")
    # end if

    bedfile = args_obj.get_bedfile()
    motif_width = motif.getWidth()
    chroms = args_obj.get_chroms()
    cores = args_obj.get_cores()

    global verbose
    verbose = args_obj.get_verbose()

    print("\nExtracting regions defined in", bedfile, "\n")

    # read the regions where search the motif occurrences from the given BED file
    regions = getBEDregions(bedfile)

    if verbose:
        print("\nFound", len(regions), "regions in", bedfile)

    # user defined subset of the chromosomes, or all of them if none is given
    chr_list = [
        ''.join(['chr', c]) for c in (chroms if chroms else CHROMS_LIST)
    ]

    # create a tmp working directory; mkdtemp() creates a new, uniquely
    # named directory, so there is no need to remove or re-create it
    tmpwd = tempfile.mkdtemp(prefix='grafimo_')

    cwd = os.getcwd()

    # enter the tmp dir where store the extracted sequences
    # NOTE(review): from here on relative graph paths are resolved against
    # the tmp dir -- presumably args_obj stores absolute paths; confirm
    os.chdir(tmpwd)

    try:
        if verbose:
            start_re = time.time()

        # NOTE(review): the directory check comes first here, while the
        # single graph check comes first above; this only matters if
        # args_obj reports both -- confirm that they are mutually exclusive
        if args_obj.has_graph_genome_dir():
            # vg -> directory containing a set of VGs
            per_chrom = True
            if not vg.endswith("/"):
                vg = ''.join([vg, "/"])

        elif args_obj.has_graph_genome():
            per_chrom = False

        else:
            raise Exception("\n\nERROR: do not know how to proceed. Exiting")
        # end if

        # the queries are built before starting the pool, so that a missing
        # XG does not leave worker processes behind
        queries = _build_region_queries(regions, chr_list, vg, per_chrom,
                                        motif_width)

        # query the VGs; the progress bar is shown only when not verbose
        completed = _run_region_queries(queries, cores, not verbose)

        if completed and verbose:
            end_re = time.time()
            msg = ''.join([
                "Extracted all regions from VGs stored in ", vg, ", in ",
                str(end_re - start_re), "s"
            ])
            print(msg)
        # end if

        # the extracted sequences are store in the cwd
        sequence_loc = os.getcwd()

    finally:
        os.chdir(cwd)  # get back to the origin, also in case of errors
    # end try

    return sequence_loc