def compute_qvalues(pvalues: List[np.double], debug: bool) -> List[np.double]:
    """Adjust P-values for multiple testing (Benjamini-Hochberg FDR).

    Parameters
    ----------
    pvalues : list
        P-values to correct
    debug : bool
        trace the full error stack

    Returns
    -------
    list
        corrected P-values (q-values)
    """
    if not isinstance(pvalues, list):
        exception_handler(
            TypeError,
            "Expected list, got {}.\n".format(type(pvalues).__name__),
            debug,
        )
    print("\nComputing q-values...\n")
    # the second item of the multipletests result holds the corrected P-values
    _, corrected, *_ = multipletests(pvalues, method="fdr_bh")
    return list(corrected)
def print_results(results: pd.DataFrame, debug: bool):
    """Print the GRAFIMO results table to stdout without row truncation.

    Parameters
    ----------
    results : pandas.DataFrame
        analysis results
    debug : bool
        trace the full error stack
    """
    if not isinstance(results, pd.DataFrame):
        errmsg = "Expected pandas.DataFrame, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(results).__name__), debug)
    # By default pandas elides most rows of a long frame. Raise the row limit
    # only for this print; the context manager restores whatever value was
    # active before (even if printing fails) instead of forcing the default.
    with pd.option_context("display.max_rows", len(results)):
        print()  # newline
        print(results)
def average_bg_with_rc(bgs: Dict, debug: bool):
    """Average each background probability with that of its reverse
    complement.

    Parameters
    ----------
    bgs : dict
        background probability distribution
    debug : bool
        trace the full error stack

    Returns
    -------
    dict
        background probability distribution where each nucleotide and its
        reverse complement share the mean of their two input frequencies
    """
    if not isinstance(bgs, dict):
        exception_handler(
            TypeError,
            "Expected dict, got {}.\n".format(type(bgs).__name__),
            debug,
        )
    averaged: Dict = {}
    for base in bgs:
        partner: str = REV_COMPL[base.upper()]
        # each complementary pair is processed once, starting from the member
        # with the smaller code point
        if REV_COMPL[partner] != base or ord(base) >= ord(partner):
            continue
        mean_freq = np.double((bgs[base] + bgs[partner]) / np.double(2))
        averaged[base] = mean_freq
        averaged[partner] = mean_freq
    return averaged
def print_scoring_msg(motif: Motif, noreverse: bool, debug: bool):
    """Announce on stdout that motif scoring is starting.

    One line is printed for the forward motif ID ("+" prefix). Unless
    `noreverse` is set, a second line for the reverse motif ID ("-" prefix)
    follows, terminated by a blank line.

    Parameters
    ----------
    motif : Motif
        motif object
    noreverse : bool
        skip reverse strand sequences
    debug : bool
        trace the full error stack
    """
    if not isinstance(motif, Motif):
        exception_handler(
            TypeError,
            "Expected Motif, got {}.\n".format(type(motif).__name__),
            debug,
        )
    if not isinstance(noreverse, bool):
        exception_handler(
            TypeError,
            "Expected bool, got {}.\n".format(type(noreverse).__name__),
            debug,
        )
    template = "Scoring hits for motif {}."
    forward_id: str = "+" + motif.motifID
    if noreverse:
        print(template.format(forward_id))
        return
    reverse_id: str = "-" + motif.motifID
    print(template.format(forward_id))
    print(template.format(reverse_id), end="\n\n")
def get_regions_bed(bedfile: str, debug: bool) -> Tuple[Dict, int]:
    """Read a BED file and group its genomic regions by chromosome.

    Only lines starting with "chr" are treated as data; every other line is
    ignored. Files whose name ends with the "gz" extension are read through
    gzip.

    Parameters
    ----------
    bedfile : str
        path to BED file
    debug : bool
        trace the full error stack

    Returns
    -------
    dict
        genomic regions grouped by chromosome name; each region is a
        (start, stop) tuple of the strings read from the file
    int
        number of genomic regions
    """
    if not isinstance(bedfile, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(bedfile).__name__), debug)
    if not os.path.isfile(bedfile):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(bedfile), debug)
    if not isbed(bedfile, debug):
        errmsg = "{} is not a UCSC BED file.\n"
        exception_handler(FileFormatError, errmsg.format(bedfile), debug)
    if os.stat(bedfile).st_size == 0:
        errmsg = "{} is empty.\n"
        exception_handler(FileReadError, errmsg.format(bedfile), debug)
    regions: Dict = {}
    region_num: int = 0
    gzipped = bedfile.split(".")[-1] == "gz"  # file is compressed
    try:
        # the context manager closes the stream on every path and, unlike a
        # `finally: ifstream.close()`, is safe when opening itself fails
        with (
            gzip.open(bedfile, mode="rt") if gzipped else open(bedfile, mode="r")
        ) as ifstream:
            for line in ifstream:
                if not line.startswith("chr"):
                    continue  # not a data line
                chrom, start, stop = line.strip().split()[:3]
                regions.setdefault(chrom, []).append((start, stop))
                region_num += 1
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not reported as read errors
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(FileReadError, errmsg.format(bedfile), debug)
    return regions, region_num
def indexVG(vg: str, vcf: str, threads: int, verbose: bool, debug: bool) -> int:
    """Build the XG and GBWT indexes of a genome variation graph by running
    `vg index`.

    The index file names are derived from the graph name: the second-to-last
    dot-separated field of `vg`, with ".xg" and ".gbwt" appended.

    Parameters
    ----------
    vg : str
        path to the genome variation graph (VG format)
    vcf : str
        path to the phased VCF file used to build the corresponding VG
    threads : int
        number of threads to use during indexing
    verbose : bool
        print information about graph indexing (adds `-p` to the command)
    debug : bool
        trace the full error stack

    Returns
    -------
    int
        status of VG indexing (0 = all ok; 1 = an error occurred)
    """
    if not isinstance(vg, str):
        errmsg = "Expected str instance, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(vg).__name__), debug)
    if not os.path.exists(vg):
        errmsg = "Unable to find {}.\n"
        # the message was previously passed unformatted ("Unable to find {}.")
        exception_handler(FileNotFoundError, errmsg.format(vg), debug)
    # take chromosome name and add it the XG / GBWT extensions
    graph_name: str = vg.split('.')[-2]
    xg: str = graph_name + ".xg"
    gbwt: str = graph_name + ".gbwt"
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    vg_index: List[str] = [
        "vg", "index", "-t", str(threads), "-G", gbwt, "-v", vcf, "-x", xg, vg
    ]
    if verbose:  # print information about indexing
        vg_index.append("-p")
    try:
        code: int = subprocess.call(vg_index)
    except OSError:
        # e.g. the vg executable is missing; the shell used to report this
        # through a non-zero exit status, so keep the same contract
        return 1
    return 0 if code == 0 else 1
def get_chromlist(ref_genome: str, debug: bool) -> List[str]:
    """Collect the sequence names found in a reference genome FASTA file.

    Every line starting with ">" is a header; the sequence name is the first
    whitespace-delimited token of the header with the leading ">" removed.
    Lines preceding the first header are ignored.

    Parameters
    ----------
    ref_genome : str
        path to the reference genome FASTA file
    debug : bool
        trace the full error stack

    Returns
    -------
    list
        sequence names in file order; empty when the file has no header
        line (the previous implementation returned None in that case)
    """
    # explicit check instead of `assert`, which is stripped under -O
    if not os.path.isfile(ref_genome):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(ref_genome), debug)
    chroms: List[str] = []
    try:
        with open(ref_genome, mode='r') as ifstream:
            for line in ifstream:
                if line.startswith(">"):
                    chroms.append(line.rstrip().split()[0][1:])  # skip ">"
    except KeyboardInterrupt:
        sigint_handler()
    except Exception:
        errmsg = "A problem was encountered reading {}.\n"
        exception_handler(FileReadError, errmsg.format(ref_genome), debug)
    return chroms
def get_kmers(
    queries: List[str],
    pool: mp.Pool,
    debug: bool,
    verbose: Optional[bool] = False,
) -> None:
    """Run `get_seqs` on every query through the given process pool.

    In non-verbose mode a progress bar is refreshed once per second while
    the asynchronous map is running. In verbose mode the total elapsed time
    is printed instead.

    Parameters
    ----------
    queries : list
        list of queries
    pool : multiprocessing.Pool
        pool of worker processes; closed on success, terminated on Ctrl-C
    debug : bool
        trace the full error stack
    verbose : bool, optional
        print additional information
    """
    if not isinstance(queries, list):
        errmsg = "Expected list, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(queries).__name__), debug)
    if verbose:
        start_re: float = time.time()
    try:
        res: mp.pool.MapResult = pool.map_async(get_seqs, queries)
        if not verbose:
            # NOTE(review): `_number_left` is a private attribute of the async
            # result, presumably the number of task chunks still pending.
            # Snapshot it before polling so `tot` is always bound; it used to
            # be assigned only inside the loop, which failed when the result
            # was already ready on the first check (e.g. empty `queries`).
            tot = res._number_left
            while not res.ready():
                remaining = res._number_left
                printProgressBar(
                    (tot - remaining), tot, prefix='Progress:',
                    suffix='Complete', length=50
                )
                time.sleep(1)
            if tot > 0:  # nothing to report when there was no work at all
                printProgressBar(
                    tot, tot, prefix='Progress:', suffix='Complete', length=50
                )
        # the timeout keeps the wait interruptible by signals
        res.get(60 * 60 * 60)
    except KeyboardInterrupt:
        pool.terminate()
        sigint_handler()
    else:
        pool.close()
        if verbose:
            end_re: float = time.time()
            print("Extracted sequences from all regions in %.2fs" % (end_re - start_re))
def norm_motif(motif_probs: pd.DataFrame, motif_width: int, alphabet: List[str], debug: bool) -> pd.DataFrame:
    """Rescale every motif position so that its probabilities sum to one.

    A position (column) is rescaled only when its sum differs from 1 by
    more than the tolerance. The input frame is modified in place and also
    returned.

    Parameters
    ----------
    motif_probs : pandas.DataFrame
        motif probability matrix (PFM), indexed by nucleotide (rows) and
        position (columns)
    motif_width : int
        motif width
    alphabet : list
        DNA motif alphabet
    debug : bool
        trace the full error stack

    Returns
    -------
    pandas.DataFrame
        normalized motif probability matrix (nPFM)
    """
    if not isinstance(motif_probs, pd.DataFrame):
        exception_handler(
            TypeError,
            "Expected pandas.DataFrame, got {}.\n".format(type(motif_probs).__name__),
            debug,
        )
    if not isinstance(motif_width, int):
        exception_handler(
            TypeError,
            "Expected int, got {}.\n".format(type(motif_width).__name__),
            debug,
        )
    if motif_width <= 0:
        exception_handler(ValueError, "Forbidden motif width.\n", debug)
    if not isinstance(alphabet, list):
        exception_handler(
            TypeError,
            "Expected list, got {}.\n".format(type(alphabet).__name__),
            debug,
        )
    if not all(nuc in DNA_ALPHABET for nuc in alphabet):
        exception_handler(
            ValueError, "The motif is not built on DNA alphabet.\n", debug
        )
    # tolerance in the difference between the position probability and 1
    tolerance: float = 0.00001
    for pos in range(motif_width):
        col_sum = np.double(0)
        for nuc in alphabet:
            col_sum += motif_probs.loc[nuc, pos]
        assert col_sum != 0
        if almost_equal(1, col_sum, tolerance):
            continue  # already normalized
        for nuc in alphabet:
            motif_probs.loc[nuc, pos] = np.double(motif_probs.loc[nuc, pos] / col_sum)
    return motif_probs
def buildvg(args_obj: BuildVG, debug: bool) -> None:
    """Entry point of the VG construction: print a summary of the run and
    call `construct_vg`.

    Parameters
    ----------
    args_obj : BuildVG
        container of the arguments needed to build a genome variation graph
    debug : bool
        trace the full error stack
    """
    if not isinstance(args_obj, BuildVG):
        exception_handler(
            TypeError,
            "Expected BuildVG object, got {}.\n".format(type(args_obj).__name__),
            debug,
        )
    printWelcomeMsg()
    verbose = args_obj.verbose  # if True print a lot of info
    print("\n\nBuilding the VG for chromosome:")
    # chromosome names on one line, each followed by a single space
    print("".join(str(c) + " " for c in args_obj.chroms), end="")
    print("\n")  # newline
    if verbose:
        print("Buildvg user parameters:")
        # labels ending with a space yield the original double-space layout
        spaced_params = [
            ("\t- Reference genome: ", args_obj.reference_genome),
            ("\t- VCF file: ", args_obj.vcf),
            ("\t- Reindex: ", args_obj.reindex),
            ("\t- Chromosomes: ", args_obj.chroms),
            ("\t- Chromosome prefix: ", args_obj.chroms_prefix),
            ("\t- Name-map: ", args_obj.namemap),
            ("\t- Cores: ", args_obj.cores),
            ("\t- Output directory: ", args_obj.outdir),
        ]
        for label, value in spaced_params:
            print(label, value)
        print("\t- Debug:", debug)
        print("\t- Verbose: ", verbose)
        print("\t- Test mode: ", args_obj.get_test())
        print("\nBeginning VGs construction\n")
    # the VGs will be stored in the defined output directory
    construct_vg(args_obj, debug)
def process_motif_for_logodds(motif: Motif, debug: bool) -> Motif:
    """Fill a motif with its scaled log-odds score matrix and its P-value
    matrix.

    The log-odds matrix is computed from the motif count matrix and stored,
    then scaled with `scale_pwm` (scaled matrix, min/max values, scale and
    offset are stored on the motif), and finally the P-value matrix is
    computed with `comp_pval_mat`.

    Parameters
    ----------
    motif : Motif
        DNA motif
    debug : bool
        trace the full error stack

    Returns
    -------
    Motif
        the same motif object, updated in place
    """
    if not isinstance(motif, Motif):
        exception_handler(
            TypeError,
            "Expected Motif, got {}.\n".format(type(motif).__name__),
            debug,
        )
    # step 1: log-odds matrix
    log_odds = compute_log_odds(
        motif.countMatrix, motif.width, motif.bg, motif.alphabet,
        motif.nucsmap, debug
    )
    motif.set_motifScoreMatrix(log_odds)
    # step 2: scale the log-odds matrix and record the scaling parameters
    scaling = scale_pwm(
        motif.scoreMatrix, motif.alphabet, motif.width, motif.nucsmap, debug
    )
    scaled, lowest, highest, factor, shift = scaling
    motif.set_motifScoreMatrix(scaled)
    motif.set_isScaled()
    motif.set_scale(factor)
    motif.set_minVal(lowest)
    motif.set_maxVal(highest)
    motif.set_offset(shift)
    # step 3: P-value matrix
    motif.set_motifPvalMatrix(comp_pval_mat(motif, debug))
    return motif
def __read_counts_meme(motif_file: str, ifstream, width: int, debug: bool) -> List[List[np.double]]:
    """Read motif letter probabilities from an open MEME file.

    Rows are consumed from the stream until the first line that does not
    hold exactly four whitespace-separated fields, or until the stream ends.
    Fewer than `width` rows is reported as an error in both cases;
    previously a motif truncated by the end of the stream went unnoticed.

    Parameters
    ----------
    motif_file : str
        path to motif PWM (not used while parsing)
    ifstream : _io.TextIOWrapper
        input stream, positioned on the first row of the matrix
    width : int
        motif width
    debug : bool
        trace the full error stack

    Returns
    -------
    list
        four lists (the 1st, 2nd, 3rd and 4th field of every row, read as
        the A, C, G and T probabilities), one value per motif position
    """
    rows: List[List[np.double]] = []
    for line in ifstream:
        freqs = line.split()
        if len(freqs) != 4:
            break  # motif stop
        rows.append([np.double(freq) for freq in freqs])
    if len(rows) < width:
        errmsg = "Unexpected end of motif found.\n"
        exception_handler(EOFError, errmsg, debug)
    # transpose rows (positions) into per-nucleotide vectors; all vectors
    # have the same length by construction
    if not rows:
        return [[], [], [], []]
    return [list(column) for column in zip(*rows)]
def pseudo_bg(bgs: Dict, no_reverse: bool, debug: bool) -> Dict:
    """Prepare the background probabilities used to score the motif.

    Unless `no_reverse` is set, the probabilities are first averaged with
    those of the reverse complement strand (`average_bg_with_rc`). The
    result is then normalized with `norm_bg`.

    Parameters
    ----------
    bgs : dict
        background probability distribution
    no_reverse : bool
        if True the averaging over both DNA strands is skipped (only the
        forward strand is considered)
    debug : bool
        trace the full error stack

    Returns
    -------
    dict
        normalized background probability distribution
    """
    if not isinstance(bgs, dict):
        exception_handler(
            TypeError,
            "Expected dict, got {}.\n".format(type(bgs).__name__),
            debug,
        )
    if not isinstance(no_reverse, bool):
        exception_handler(
            TypeError,
            "Expected bool, got {}.\n".format(type(no_reverse).__name__),
            debug,
        )
    # forward strand only -> keep as is; both strands -> average first
    strand_bgs = bgs if no_reverse else average_bg_with_rc(bgs, debug)
    return norm_bg(strand_bgs, debug)
def get_reference_genome_from_ucsc(debug) -> str:
    """Download the hg38 reference genome from UCSC into the current working
    directory, uncompress it and return the path to the FASTA file.

    The work is done by the external `wget`, `gunzip` and `rm` commands.
    This function has been written only for test purposes.

    Parameters
    ----------
    debug : bool
        trace the full error stack

    Returns
    -------
    str
        absolute path to the downloaded FASTA file (in .fa format)
    """
    errmsg: str

    def run_or_fail(cmd: str) -> None:
        # run a shell command and report a non-zero exit status
        code: int = subprocess.call(cmd, shell=True)
        if code != 0:
            errmsg = "An error occurred while executing \"{}\". Exiting.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)

    # download genome (it will be stored in the current directory)
    address = "ftp://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
    run_or_fail("wget -c {}".format(address))
    # uncompress genome
    print("Uncompressing the genome...")
    genome_comp: str = './hg38.fa.gz'
    if not os.path.exists(genome_comp):
        errmsg = "Unable to find {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(genome_comp), debug)
    run_or_fail('gunzip {0}'.format(genome_comp))
    # remove FASTA.GZ file if still present
    if os.path.exists(genome_comp):
        run_or_fail('rm {0}'.format(genome_comp))
    # get the path to the genome file (should be in the current dir)
    genome_uncomp: str = "./hg38.fa"
    # explicit check rather than `assert`, which is stripped under -O
    if not os.path.exists(genome_uncomp):
        errmsg = "Unable to find {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(genome_uncomp), debug)
    return os.path.abspath(genome_uncomp)
def __read_alphabet_meme(motif_file: str, ifstream, debug: bool) -> List[str]:
    """Read the motif alphabet from an open MEME file.

    The stream is consumed up to and including the first line starting
    with "ALPHABET". Only the alphabet "ACGT" is accepted.

    Parameters
    ----------
    motif_file : str
        path to motif PWM (used in error messages)
    ifstream : _io.TextIOWrapper
        input stream
    debug : bool
        trace the full error stack

    Returns
    -------
    list
        sorted alphabet symbols
    """
    marker = "ALPHABET"
    for line in ifstream:
        if line.startswith(marker):
            break
    else:
        # stream exhausted without meeting the alphabet line
        exception_handler(
            EOFError,
            "Unexpected EOF reached, unable to parse {}.\n".format(motif_file),
            debug,
        )
    if not line.startswith(marker):
        exception_handler(
            ValueError,
            "No line stores alphabet in {}.\n".format(motif_file),
            debug,
        )
    symbols = line.strip().replace("ALPHABET= ", "")
    if symbols != "ACGT":
        exception_handler(
            ValueError, "The motif is not built on DNA alphabet.\n", debug
        )
    else:
        alphabet = sorted(symbols)
    assert isListEqual(alphabet, DNA_ALPHABET)
    return alphabet
def isVGindexed(vg: str, debug: bool) -> bool:
    """Tell from the file extension whether a genome variation graph is
    indexed (XG format).

    Parameters
    ----------
    vg : str
        path to genome variation graph
    debug : bool
        trace the full error stack

    Returns
    -------
    bool
        True for a ".xg" file, False for a ".vg" file; any other extension
        is reported as a VGError
    """
    if not isinstance(vg, str):
        errmsg = "Expected str, got {}.\n"  # a space was missing before "{}"
        exception_handler(TypeError, errmsg.format(type(vg).__name__), debug)
    if not os.path.isfile(vg):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(vg), debug)
    ff = vg.split(".")[-1]
    if ff == "xg":
        return True
    if ff == "vg":
        return False
    # unknown genome variation graph format
    errmsg = "Unknown genome variation graph format (VG or XG allowed).\n"
    exception_handler(VGError, errmsg, debug)
def get_1000GProject_vcf(debug) -> str:
    """Download a WGS VCF file (SNVs and indels) of the 1000 Genomes Project
    into the current working directory with `wget`.

    The file is used for VG construction and graph indexing test purposes.
    Per the original author's note, its variants are not phased, so it
    cannot be used to build the GBWT index unless it is phased first.

    Parameters
    ----------
    debug : bool
        trace the full error stack

    Returns
    -------
    str
        absolute path to the downloaded VCF file (compressed)
    """
    errmsg: str
    vcf_name = 'ALL.wgs.shapeit2_integrated_snvindels_v2a.GRCh38.27022019.sites.vcf.gz'
    # download the VCF
    address: str = ''.join([
        'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/',
        '1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/',
        vcf_name,
    ])
    cmd: str = 'wget -c {0}'.format(address)
    code: int = subprocess.call(cmd, shell=True)
    if code != 0:
        errmsg = "An error occurred while executing \"{}\". Exiting.\n"
        exception_handler(SubprocessError, errmsg.format(cmd), debug)
    # vcf should be in the current dir
    vcf_file: str = './' + vcf_name
    # explicit check rather than `assert`, which is stripped under -O
    if not os.path.exists(vcf_file):
        errmsg = "Unable to find {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(vcf_file), debug)
    return os.path.abspath(vcf_file)
def norm_bg(bgs: Dict, debug: bool):
    """Normalize the background probability distribution.

    The pseudocount `PSEUDObg` is added to every probability, and each
    value is divided by the overall total (input probabilities plus one
    pseudocount per symbol).

    Parameters
    ----------
    bgs : dict
        background probability distribution
    debug : bool
        trace the full error stack

    Returns
    -------
    dict
        normalized background probability distribution
    """
    if not isinstance(bgs, dict):
        errmsg = "Expected dict, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(bgs).__name__), debug)
    # total mass: one pseudocount per symbol plus the given probabilities
    tot = np.double(len(bgs) * PSEUDObg)
    for nuc in bgs:
        tot += np.double(bgs[nuc])
    assert tot > 0
    bgs_norm = {nuc: np.double((bgs[nuc] + PSEUDObg) / tot) for nuc in bgs}
    # Sanity check on the normalized distribution. It used to re-sum the raw
    # input, which rejected an all-zero background even though the
    # pseudocount had already turned it into a valid distribution.
    norm_tot = np.double(0)
    for prob in bgs_norm.values():
        norm_tot += prob
    assert norm_tot != 0
    return bgs_norm
def build_motif_JASPAR(motif_file: str, bg_file: str, pseudocount: float, no_reverse: bool, verbose: bool, debug: bool) -> Motif:
    """Build the Motif object from a JASPAR motif Position Weight Matrix.

    The motif is parsed with `read_JASPAR_motif` and then passed to
    `process_motif_for_logodds`, which computes its scoring matrix and its
    P-value matrix.

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or the `UNIF`
        constant
    pseudocount : float
        value to add to motif PWM counts; must be positive
    no_reverse : bool
        forwarded to `read_JASPAR_motif`; elsewhere in this file True means
        that only the forward strand is considered
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    Motif
        processed motif object
    """
    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__), debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isJaspar_ff(motif_file, debug):
        errmsg = "Required JASPAR motif PWM parsing, but {} is not in JASPAR format.\n"
        exception_handler(MotifFileFormatError, errmsg.format(motif_file), debug)
    if not isinstance(bg_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(bg_file).__name__), debug)
    if bg_file != UNIF and not os.path.isfile(bg_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(bg_file), debug)
    if pseudocount <= 0:
        errmsg = "Pseudocount value must be positive.\n"
        # the numeric pseudocount used to be passed in place of the message
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(no_reverse, bool):
        errmsg = "Expected bool, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(no_reverse).__name__), debug)
    # parse motif PWM
    motif: Motif = read_JASPAR_motif(
        motif_file, bg_file, pseudocount, no_reverse, verbose, debug
    )
    if verbose:
        start_mp: float = time.time()
    motif = process_motif_for_logodds(motif, debug)  # get log-odds values for motif
    if verbose:
        end_mp: float = time.time()
        print("Motif %s processed in %.2fs" % (motif.motifID, (end_mp - start_mp)))
    return motif
def construct_vg(buildvg_args: BuildVG, debug: bool) -> None:
    """Build and index one genome variation graph (VG) per chromosome.

    Steps, as performed by the code below:

    1. pick the reference FASTA and the VCF (downloaded in test mode);
    2. read the available chromosome names from the reference and check the
       requested ones against them;
    3. make sure the VCF has a tabix index (rebuilt when `reindex` is set);
    4. inside the output directory, for every chromosome: build the VG
       (`build_vg`), index it (`indexVG`) and delete the VG file, keeping
       only the indexes.

    The original working directory is restored before returning, also when
    an error interrupts the construction.

    Parameters
    ----------
    buildvg_args : BuildVG
        container for the arguments required to build the genome variation
        graph
    debug : bool
        trace the full error stack
    """
    errmsg: str
    if not isinstance(buildvg_args, BuildVG):
        errmsg = "Expected BuildVG object, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(buildvg_args).__name__), debug)
    # read the arguments to build the VGs
    reindex: bool = buildvg_args.reindex
    chroms: List[str] = buildvg_args.chroms
    chroms_prefix: str = buildvg_args.chroms_prefix
    namemap: Dict = buildvg_args.namemap
    threads: int = buildvg_args.cores
    outdir: str = buildvg_args.outdir
    verbose: bool = buildvg_args.verbose
    test: bool = buildvg_args.get_test()  # manually set in the code
    msg: str
    reference: str
    vcf: str
    if test:
        reference = get_reference_genome_from_ucsc(debug)
        vcf = get_1000GProject_vcf(debug)
    else:
        reference = buildvg_args.reference_genome
        vcf = buildvg_args.vcf
    if verbose:
        print("using reference genome: ", reference)
        print("Using VCF file: ", vcf, "\n\n")
        start_c: float = time.time()
        print("Reading chromosome names from {}...".format(reference))
    # read chromosome names in reference FASTA
    chroms_available: List[str] = get_chromlist(reference, debug)
    if verbose:
        end_c: float = time.time()
        print("done in %.2fs" % (end_c - start_c))
        print("Found chromosomes:\n", chroms_available, end="\n\n")
    if len(chroms) == 1 and chroms[0] == ALL_CHROMS:
        chroms = chroms_available
    else:
        # check user-defined chromosome names consistency with names in
        # reference
        for c in chroms:
            if c not in chroms_available:
                errmsg = "Chromosome \"{}\" not found among names in {}.\n"
                exception_handler(ValueError, errmsg.format(c, reference), debug)
    cwd: str = os.getcwd()
    cmd: str
    code: int
    # check if the VCF file has already been indexed with tabix
    if not tbiexist(vcf):
        msg = "TBI file not found for {}. Indexing VCF file with tabix..."
        print(msg.format(vcf.split('/')[-1]))
        cmd = 'tabix -p vcf {0}'.format(vcf)
        code = subprocess.call(cmd, shell=True)
        if code != 0:  # tabix didn't work
            errmsg = "An error occurred while executing \"{}\". Exiting.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)
    elif reindex:  # user asked to reindex VCF
        msg = "Reindexing {}...\n"
        print(msg.format(vcf.split('/')[-1]))
        # remove old index
        cmd = "rm {0}".format(''.join([vcf, ".tbi"]))
        code = subprocess.call(cmd, shell=True)
        if code != 0:
            errmsg = "An error occurred while executing \"{}\". Exiting.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)
        # reindex the VCF
        cmd = "tabix -p vcf {0}".format(vcf)
        code = subprocess.call(cmd, shell=True)
        if code != 0:  # tabix didn't work
            errmsg = "An error occurred while executing \"{}\". Exiting.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)
    # enter the output directory
    os.chdir(outdir)
    try:
        if chroms_prefix:
            assert not bool(namemap)
        if bool(namemap):
            assert chroms_prefix != "chr"
        # build the VG for each chromosome or only for those told by user
        for chrname in chroms:
            if not bool(namemap):
                chrom: str = "".join([chroms_prefix, chrname])
            else:
                try:
                    chrom = namemap[chrname]
                except KeyError:
                    errmsg = "Missing out name map for chromosome \"{}\".\n"
                    exception_handler(KeyError, errmsg.format(chrname), debug)
            vg: str = ''.join([".", chrom, '.vg'])
            # build VG for current chromosome
            if verbose:
                start_build: float = time.time()
            code = build_vg(vg, reference, vcf, chrname, threads)
            if code != 0:
                errmsg = "An error occurred during construction of {}.\n"
                exception_handler(VGError, errmsg.format(vg), debug)
            if verbose:
                end_build: float = time.time()
                msg = "Elapsed time to build {}:"
                print(msg.format(vg), "%.2fs" % (end_build - start_build), sep=" ")
            # index VG
            if verbose:
                start_index: float = time.time()
                msg = "Indexing {} VG and building the GBWT index..."
                print(msg.format(chrom))
            code = indexVG(vg, vcf, threads, verbose, debug)
            if code != 0:
                errmsg = "An error occurred while indexing {}."
                exception_handler(VGError, errmsg.format(vg), debug)
            if verbose:
                end_index: float = time.time()
                msg = "Elapsed time to index {}"
                print(msg.format(vg), "%.2fs" % (end_index - start_index), sep=" ")
            # The majority of applications work only with indexed graph,
            # so to save disk space is worth to delete the VGs and keep
            # only the XGs (is simple to go back using VG built-in functions)
            if verbose:
                print("Deleting {0}".format(vg))
            cmd = 'rm {0}'.format(vg)
            # keep the exit status: it used to be discarded, so the check
            # below tested the stale status of the indexing step
            code = subprocess.call(cmd, shell=True)
            if code != 0:
                errmsg = "An error occurred while executing \"{}\". Exiting.\n"
                exception_handler(SubprocessError, errmsg.format(cmd), debug)
    finally:
        # return to the original working directory
        os.chdir(cwd)
def writeGFF3(prefix: str, data: pd.DataFrame, no_qvalue: bool, debug: bool) -> None:
    """Write the results to "<prefix>.gff" in GFF3 format
    (https://www.ensembl.org/info/website/upload/gff3.html).

    The results are converted with `dftolist`; the code below reads them as
    a list of columns: 12 columns when q-values are reported, 11 otherwise.
    For hits on the "-" strand the start and stop columns are swapped in
    the written line.

    Parameters
    ----------
    prefix : str
        out filename prefix
    data : pandas.DataFrame
        analysis results
    no_qvalue : bool
        ignore q-values
    debug : bool
        trace the full error stack
    """
    if not isinstance(prefix, str):
        errmsg = "Expected str, got {}.\n"
        # `format_map` was called here with a plain string, which is invalid
        exception_handler(TypeError, errmsg.format(type(prefix).__name__), debug)
    if not isinstance(data, pd.DataFrame):
        errmsg = "Expected pandas.DataFrame, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(data).__name__), debug)
    if not isinstance(no_qvalue, bool):
        errmsg = "Expected bool, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(no_qvalue).__name__), debug)
    data_list = dftolist(data, no_qvalue, debug)
    # Validate the layout before touching the file system: inside the old
    # try block these errors were swallowed by the bare except and reported
    # as write errors.
    if not no_qvalue and len(data_list) != 12:
        errmsg = "Q-values columns seems to be missing.\n"
        exception_handler(ValueError, errmsg, debug)
    if no_qvalue and len(data_list) != 11:
        errmsg = "Too many or too few columns.\n"
        exception_handler(ValueError, errmsg, debug)
    gfffn = ".".join([prefix, "gff"])
    try:
        # the context manager closes the file on every path and is safe when
        # opening itself fails
        with open(gfffn, mode='w') as ofstream:
            ofstream.write("##gff-version 3\n")
            for row in zip(*data_list):
                motifID: str = row[0]
                motifName: str = row[1]
                seqname: str = row[2]
                chrom: str = seqname.split(":")[0]  # take only chromosome name
                strand: str = row[5]
                score: float = round(row[6], 1)
                if strand == "-":  # keep forward strand coordinates
                    start, stop = str(row[4]), str(row[3])
                else:
                    start, stop = str(row[3]), str(row[4])
                pvalue: str = np.format_float_scientific(row[7], exp_digits=2)
                sequence: str = row[8]
                reference: str = row[10]
                # gff line attributes
                atts_list = [
                    "".join(["Name=", motifID, "_", seqname, strand, ":", reference]),
                    "".join(["Alias=", motifName]),
                    "".join(["ID=", motifID, "-", motifName, "-", seqname]),
                    "".join(["pvalue=", str(pvalue)]),
                ]
                if not no_qvalue:
                    qvalue: str = np.format_float_scientific(row[11], exp_digits=2)
                    atts_list.append("".join(["qvalue=", str(qvalue)]))
                atts_list.append("".join(["sequence=", sequence, ";\n"]))  # end of gff line
                atts = ";".join(atts_list)
                # full gff line
                gffline: str = "\t".join([
                    chrom, SOURCE, TP, start, stop, str(score), strand, PHASE, atts
                ])
                ofstream.write(gffline)
    except Exception:
        errmsg = "An error ocurred while writing {}.\n"
        exception_handler(FileWriteError, errmsg.format(gfffn), debug)
def scale_pwm(motif_matrix: np.ndarray, alphabet: List[str], motif_width: int,
              nucsmap: dict,
              debug: bool) -> Tuple[np.ndarray, int, int, int, np.double]:
    """Convert the motif log-odds scores to scaled integer scores.

    The scores are shifted by an integer offset and multiplied by an
    integer scaling factor derived from RANGE (documented by the original
    authors as mapping the values in [0, 1000]). Integer scores speed up
    the scoring of potential motif occurrences and allow constant time
    p-value estimation.

    ...

    Parameters
    ----------
    motif_matrix : numpy.ndarray
        motif log-odds matrix
    alphabet: list
        DNA motif alphabet
    motif_width: int
        motif width
    nucsmap: dict
        nucleotide index map (nucleotide -> row of motif_matrix)
    debug : bool
        trace the full error stack

    Returns
    -------
    numpy.ndarray
        scaled motif score matrix
    int
        minimum value of the scaled score matrix
    int
        maximum value of the scaled score matrix
    int
        scaling factor
    numpy.double
        scaling offset
    """

    # ---- arguments check
    if not isinstance(motif_matrix, np.ndarray):
        exception_handler(
            TypeError,
            "Expected numpy.ndarray, got {}.\n".format(
                type(motif_matrix).__name__),
            debug)
    if motif_matrix.size == 0 or sum(sum(motif_matrix)) == 0:
        exception_handler(ValueError,
                          "The motif log-odds natrix is empty.\n", debug)
    if not isinstance(alphabet, list):
        exception_handler(
            TypeError,
            "Expected list, got {}.\n".format(type(alphabet).__name__), debug)
    if not isListEqual(alphabet, DNA_ALPHABET):
        exception_handler(ValueError,
                          "The motif is not built on DNA alphabet.\n", debug)
    if not isinstance(motif_width, int):
        exception_handler(
            TypeError,
            "Expected int, got {}.\n".format(type(motif_width).__name__),
            debug)
    if motif_width <= 0:
        exception_handler(ValueError, "Forbidden motif width.\n", debug)
    if not isinstance(nucsmap, dict):
        exception_handler(
            TypeError,
            "Expected dict, got {}.\n".format(type(nucsmap).__name__), debug)

    # ---- scaling parameters
    raw_max = motif_matrix.max()
    floor_min = motif_matrix.min()
    if floor_min == raw_max:
        # flat matrix: widen the interval to avoid a zero-width range
        floor_min = np.double(raw_max - 1)
    floor_min = np.floor(floor_min)
    offset = np.round(np.floor(floor_min))
    scale_factor = np.floor(RANGE / (raw_max - floor_min))

    # ---- scale every (nucleotide, position) entry
    scaled = np.zeros(motif_matrix.shape, dtype=np.double)
    for nuc in alphabet:
        row = nucsmap[nuc]
        for pos in range(motif_width):
            shifted = motif_matrix[row, pos] - (offset)
            scaled[row, pos] = np.round(shifted * scale_factor)
    scaled = scaled.astype(int)  # make sure the values are integers

    scaled_min = int(scaled.min())
    scaled_max = int(scaled.max())
    return scaled, scaled_min, scaled_max, int(scale_factor), offset
def get_motif_pwm(motif_file: str, args_obj: Findmotif, cores: int,
                  debug: bool) -> List[Motif]:
    """Build the Motif object(s) from a motif PWM file.

    The motif PWM is processed in order to obtain the corresponding
    scoring matrix and the corresponding P-value matrix, which is used to
    assign statistical significance to motif occurrence candidates
    scores. The parsing is delegated to build_motif_JASPAR or
    build_motif_MEME, depending on the detected file format.

    ...

    Parameters
    ----------
    motif_file : str
        path to motif PWM file (MEME or JASPAR format)
    args_obj : Findmotif
        arguments container
    cores : int
        CPU cores to use during motif processing (used only when
        processing MEME motif files with multiple PWMs)
    debug : bool
        trace the full error stack

    Returns
    -------
    List[Motif]
        Motif objects (a single motif is wrapped in a one-element list)
    """

    # forwarded to the builders as their ``bg_file`` argument
    bg_file: str = args_obj.bgfile
    pseudo: float = args_obj.pseudo
    no_reverse: bool = args_obj.noreverse
    verbose: bool = args_obj.verbose

    errmsg: str
    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)

    # sniff each format once and reuse the answers, instead of re-checking
    # the same file for every decision
    is_meme: bool = isMEME_ff(motif_file, debug)
    is_jaspar: bool = isJaspar_ff(motif_file, debug)

    # choose motif PWM parsing method (JASPAR takes precedence, as before)
    if is_jaspar:
        motif = build_motif_JASPAR(motif_file, bg_file, pseudo, no_reverse,
                                   verbose, debug)
    elif is_meme:
        motif = build_motif_MEME(motif_file, bg_file, pseudo, no_reverse,
                                 cores, verbose, debug)
    else:
        errmsg = "Motif PWM must be in MEME or JASPAR format.\n"
        exception_handler(MotifFileFormatError, errmsg, debug)

    # list instance required to proceed
    if not isinstance(motif, list):
        motif = [motif]
    return motif
def write_results(results: pd.DataFrame, motif: Motif, motif_num: int,
                  args_obj: Findmotif, debug: bool) -> None:
    """Write GRAFIMO results in three files (TSV report, HTML report,
    GFF3 file).

    The TSV and HTML reports store the found potential motif occurrences
    in tabular format. The GFF3 report stores annotations for the found
    motif occurrence candidates. When ``args_obj.top_graphs`` is greater
    than zero, the PNG images of the first distinct regions listed in the
    ``sequence_name`` column are also computed.

    The function temporarily changes the working directory to the output
    directory and restores the original one before returning.

    ...

    Parameters
    ----------
    results : pandas.DataFrame
        analysis results
    motif : Motif
        motif
    motif_num : int
        number of searched motifs
    args_obj : Findmotif
        commandline arguments container
    debug : bool
        trace the full error stack
    """

    if not isinstance(results, pd.DataFrame):
        errmsg = "Expected pandas.DataFrame, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(results).__name__),
                          debug)
    if len(results) == 0:
        errmsg = "No potential motif occurrence retreived.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(motif, Motif):
        errmsg = "Expected Motif, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif).__name__),
                          debug)
    if not isinstance(motif_num, int):
        errmsg = "Expected int, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_num).__name__),
                          debug)
    if motif_num <= 0:
        errmsg = "No motif searched. Probably something went wrong.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(args_obj, Findmotif):
        errmsg = "Expected Findmotif, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(args_obj).__name__),
                          debug)

    # get results storing arguments
    outdir: str = args_obj.outdir
    no_qvalue: bool = args_obj.noqvalue
    top_graphs: int = args_obj.top_graphs
    verbose: bool = args_obj.verbose
    # has_graphgenome_dir is a method and must be *called*: the bare
    # attribute is always truthy and made this check unreachable
    if not args_obj.has_graphgenome() and not args_obj.has_graphgenome_dir():
        errmsg = "No genome variation graph given.\n"
        exception_handler(VGError, errmsg, debug)

    dirname_default: bool = False
    cwd: str = os.getcwd()
    if outdir == DEFAULT_OUTDIR:
        # to make unique the output directory we add the PID
        # to the name.
        #
        # This is useful when calling grafimo in different runs on the
        # same machine.
        outdir = "_".join(["grafimo_out", str(os.getpid()), motif.motifID])
        dirname_default = True
    if not os.path.isdir(outdir):
        cmd = "mkdir -p {}".format(outdir)
        # argument list instead of a shell string: directory names with
        # spaces or shell metacharacters are passed verbatim
        code = subprocess.call(["mkdir", "-p", outdir])
        if code != 0:
            errmsg = "An error occurred while executing {}.\n"
            exception_handler(SubprocessError, errmsg.format(cmd), debug)
    # when the directory already exists its content is overwritten
    os.chdir(outdir)

    try:
        print("\nWriting results in %s.\n" % outdir)
        if not dirname_default and motif_num > 1:
            # each file is labeled with the motif ID
            prefix = "_".join(["grafimo_out", motif.motifID])
        else:
            prefix = "grafimo_out"

        if verbose:
            start_tsv: float = time.time()
        # write the TSV
        results.to_csv(".".join([prefix, "tsv"]), sep='\t', encoding='utf-8')
        if verbose:
            end_tsv: float = time.time()
            print("%s.tsv written in %.2fs" % (prefix, (end_tsv - start_tsv)))
            start_html: float = time.time()
        # write the HTML
        results.to_html(".".join([prefix, "html"]))
        if verbose:
            end_html: float = time.time()
            print("%s.html written in %.2fs" % (prefix,
                                                (end_html - start_html)))
            start_gff: float = time.time()
        # write the GFF3
        writeGFF3(prefix, results, no_qvalue, debug)
        if verbose:
            end_gff: float = time.time()
            print("%s.gff written in %.2fs" % (prefix, (end_gff - start_gff)))

        # get the graphs of the top n regions
        if top_graphs > 0:
            regions = set()
            for r in results["sequence_name"].tolist():
                if len(regions) >= top_graphs:
                    break  # enough distinct regions collected
                regions.add(r)  # avoid repeated regions
            if len(regions) == 0:
                errmsg = "No region obtained, the results seems to be empty.\n"
                exception_handler(ValueError, errmsg, debug)
            if len(regions) < top_graphs:
                warnmsg = "WARNING: requested %d regions, obtained %d.\n"
                print(warnmsg % (top_graphs, len(regions)))
            if verbose:
                print("Extracting %d region variation graphs" % len(regions))

            # create the directory for the regions images
            if motif_num > 1:
                image_dir = "_".join(["top_graphs", motif.motifID])
            else:
                image_dir = "top_graphs"
            if verbose:
                print("Graphs will be stored in %s." % image_dir)
            cmd = "mkdir -p {0}".format(image_dir)
            code = subprocess.call(["mkdir", "-p", image_dir])
            if code != 0:
                errmsg = "An error ocurred while executing {}."
                exception_handler(SubprocessError, errmsg.format(cmd), debug)
            os.chdir(image_dir)

            print("Writing the top %d graphs in %s\n" % (len(regions),
                                                         image_dir))
            region = None  # region being processed, for error reporting
            try:
                for region in regions:
                    if verbose:
                        print("Computing the PNG image of {}".format(region))
                    if args_obj.has_graphgenome():
                        get_region_graph(region,
                                         args_obj.chroms_prefix,
                                         args_obj.namemap,
                                         debug,
                                         graph_genome=args_obj.graph_genome)
                    elif args_obj.has_graphgenome_dir():
                        get_region_graph(
                            region,
                            args_obj.chroms_prefix,
                            args_obj.namemap,
                            debug,
                            graph_genome_dir=args_obj.graph_genome_dir)
                    else:
                        errmsg = "Unknown VG type. Unable to print regions PNG images.\n"
                        exception_handler(ValueError, errmsg, debug)
            except Exception:
                errmsg = "An error occurred while computing PNG image of {}.\n"
                exception_handler(VGError, errmsg.format(region), debug)
    finally:
        # always go back to the directory the caller was in
        os.chdir(cwd)
def _run_region_shell_cmd(cmd: str, debug: bool) -> None:
    """Run cmd through the shell; report SubprocessError on non-zero exit."""

    code = subprocess.call(cmd, shell=True)
    if code != 0:
        errmsg = "An error occurred while executing {}.\n"
        exception_handler(SubprocessError, errmsg.format(cmd), debug)


def get_region_graph(
    region: str,
    chroms_prefix: str,
    namemap: dict,
    debug: bool,
    graph_genome: Optional[str] = None,
    graph_genome_dir: Optional[str] = None,
) -> None:
    """Compute the PNG image of a genomic region encoded in genome
    variation graph(s).

    The region subgraph is extracted with ``vg find``, converted to DOT
    with ``vg view`` and rendered with ``dot``. The image is written to
    ``<region>.png`` in the current working directory; the hidden
    intermediate ``.*.vg`` and ``.*.dot`` files found there are removed
    afterwards.

    Exactly one between graph_genome and graph_genome_dir must be given.

    ...

    Parameters
    ----------
    region : str
        genomic region
    chroms_prefix : str
        chromosome prefix
    namemap : dict
        chromosome names map
    debug : bool
        trace the full error stack
    graph_genome : str
        path to genome variation graph
    graph_genome_dir : str
        path to directory of genome variation graphs
    """

    if not isinstance(region, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(region).__name__),
                          debug)
    if not isinstance(chroms_prefix, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError,
                          errmsg.format(type(chroms_prefix).__name__), debug)
    if not isinstance(namemap, dict):
        errmsg = "Expected dict, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(namemap).__name__),
                          debug)
    if graph_genome is None and graph_genome_dir is None:
        errmsg = "graph_genome and graph_genome_dir cannot be both None.\n"
        exception_handler(ValueError, errmsg, debug)
    if graph_genome is not None and graph_genome_dir is not None:
        errmsg = "graph_genome and graph_genome_dir cannot be both not None.\n"
        exception_handler(ValueError, errmsg, debug)
    if graph_genome is not None:
        if not isinstance(graph_genome, str):
            errmsg = "Expected str, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(graph_genome).__name__),
                              debug)
        if not os.path.isfile(graph_genome):
            errmsg = "Unable to locate {}.\n"
            exception_handler(FileNotFoundError, errmsg.format(graph_genome),
                              debug)
    if graph_genome_dir is not None:
        if not isinstance(graph_genome_dir, str):
            errmsg = "Expected str, got {}.\n"
            exception_handler(TypeError,
                              errmsg.format(type(graph_genome_dir).__name__),
                              debug)
        if not os.path.isdir(graph_genome_dir):
            errmsg = "Unable to locate {}."
            exception_handler(FileNotFoundError,
                              errmsg.format(graph_genome_dir), debug)

    # pick the graph index to query
    if graph_genome and graph_genome_dir is None:
        # single genome variation graph
        xg = graph_genome
    else:
        # one graph per chromosome: <graph_genome_dir>/<prefix><chrom>.xg
        chrom = region.split(":")[0]
        if bool(namemap):
            chrom = namemap[chrom]
        chrname = "".join([chroms_prefix, chrom])
        xg = os.path.join(graph_genome_dir, ".".join([chrname, "xg"]))

    # extract the region subgraph
    vgregion = "".join([".", region, ".vg"])
    _run_region_shell_cmd(
        "vg find -x {} -E -p {} > {}".format(xg, region, vgregion), debug)
    # convert it to DOT
    dotregion = "".join([".", region, ".dot"])
    _run_region_shell_cmd("vg view {} -dp > {}".format(vgregion, dotregion),
                          debug)
    # render the PNG image
    pngimage = ".".join([region, "png"])
    _run_region_shell_cmd("dot -Tpng {} -o {}".format(dotregion, pngimage),
                          debug)
    # remove unused files; a failure of either command is now reported
    # with the command that failed (the last one used to call format()
    # without arguments, which raised IndexError instead)
    _run_region_shell_cmd("rm -rf .*.vg", debug)
    _run_region_shell_cmd("rm -rf .*.dot", debug)
def read_JASPAR_motif(motif_file: str, bg_file: str, pseudocount: float,
                      no_reverse: bool, verbose: bool, debug: bool) -> Motif:
    """Read a motif PWM in JASPAR format.

    The header line provides the motif ID and name (tab-separated, after
    the first character); every following non-empty line provides the
    counts of one nucleotide. The data read are then used to build the
    motif probability matrix, to which background and pseudocount are
    applied, and the resulting Motif object.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in JASPAR format
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or the UNIF
        constant to use a uniform background.
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        forwarded to pseudo_bg. NOTE(review): the name suggests that True
        restricts the analysis to the forward strand -- confirm against
        pseudo_bg
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    Motif
        Motif object
    """

    nucs: List[str] = list()
    counts: List[List[float]] = list()
    if verbose:
        start_rm: float = time.time()

    try:
        # the context manager closes the file on every path and never
        # touches an unbound handle when open() itself fails
        with open(motif_file, mode="r") as ifstream:
            # begin parsing
            header: str = str(ifstream.readline().strip()[1:])
            if not header:  # empty file?
                errmsg = "{} seems to empty.\n"
                exception_handler(IOError, errmsg.format(motif_file), debug)
            motifID, motifName = header.split('\t')[0:2]
            readlines = 1  # header read; used to detect header-only files
            while True:
                line = ifstream.readline().strip()
                if not line:
                    break  # EOF or empty line
                nuc = line.strip()[:1]
                # drop the nucleotide symbol, then the first and the last
                # whitespace-separated tokens of the counts row
                count = list(map(float, line.strip()[1:].split()[1:][:-1]))
                nucs.append(nuc.upper())
                counts.append(count)
                readlines += 1
            if readlines <= 1:  # only header read ?
                errmsg = "{} seems to be empty.\n"
                exception_handler(IOError, errmsg.format(motif_file), debug)
    except Exception:
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(MotifFileReadError, errmsg.format(motif_file), debug)
    else:
        if any([len(c) != len(counts[0]) for c in counts]):
            errmsg = "Motif counts width mismatch.\n"
            exception_handler(ValueError, errmsg, debug)
        # nucleotide -> row index (used with np object)
        nucsmap = {nuc: i for i, nuc in enumerate(nucs)}
        motif_counts: pd.DataFrame = pd.DataFrame(
            data=counts, index=nucs)  # motif count matrix
        motif_width: int = int(len(counts[0]))
        alphabet: list = sorted(nucs)

        # compute background
        if bg_file == UNIF:
            bgs = get_uniformBG(alphabet, debug)
        elif os.path.isfile(bg_file):
            bgs = readBGfile(bg_file, debug)
        else:
            errmsg = "Unable to parse {}.\n"
            exception_handler(BGFileError, errmsg.format(bg_file), debug)
        bgs = pseudo_bg(bgs, no_reverse, debug)  # add pseudocount to bg

        # motif probability matrix
        motif_probs = (motif_counts / motif_counts.sum(0))
        motif_probs = norm_motif(motif_probs, motif_width, alphabet, debug)
        motif_probs = apply_pseudocount_jaspar(motif_counts.to_numpy(),
                                               motif_probs.to_numpy(),
                                               pseudocount, bgs, motif_width,
                                               alphabet, nucsmap, debug)
        motif: Motif = Motif(motif_probs, motif_width, alphabet, motifID,
                             motifName, nucsmap)
        motif.setBg(bgs)

        if verbose:
            end_rm: float = time.time()
            msg: str = "Read motif %s in %.2fs" % (motifID,
                                                   (end_rm - start_rm))
            print(msg)

    return motif
def build_motif_MEME(motif_file: str, bg_file: str, pseudocount: float,
                     no_reverse: bool, cores: int, verbose: bool,
                     debug: bool) -> List[Motif]:
    """Read and process the motif PWMs stored in a MEME file.

    The motifs are read with read_MEME_motif and each of them is then
    processed with process_motif_for_logodds. When the file stores at
    least as many motifs as the requested CPU cores, the processing is
    distributed over a pool of worker processes and a progress bar is
    printed.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html).
    pseudocount : float
        value to add to motif PWM counts
    no_reverse : bool
        forwarded to read_MEME_motif. NOTE(review): the name suggests
        that True restricts the analysis to the forward strand -- confirm
    cores : int
        number of CPU cores (used when MEME file has more than one PWM)
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    List[Motif]
        processed Motif objects, one for each PWM stored in motif_file
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isMEME_ff(motif_file, debug):
        errmsg = "Required MEME motif PWM parsing, but {} is not in MEME format.\n"
        exception_handler(MotifFileFormatError, errmsg.format(motif_file),
                          debug)

    if verbose:
        start_rm_all: float = time.time()
    motif_lst: List[Motif] = read_MEME_motif(motif_file, bg_file, pseudocount,
                                             no_reverse, verbose, debug)
    motif_num: int = len(motif_lst)
    if verbose:
        end_rm_all: float = time.time()
        print("Read all motifs in %s in %.2fs." %
              (motif_file, (end_rm_all - start_rm_all)))
    print("\nRead {} motifs in {}".format(motif_num, motif_file))
    print("\nProcessing motifs\n")

    complete_motifs: List[Motif] = list()  # fully processed motifs
    if verbose:
        start_mp_all: float = time.time()

    if motif_num >= cores:  # worth to use multiprocessing
        # workers are created while SIGINT is ignored, then the default
        # handler is restored in the parent to exit gracefully
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool: mp.Pool = mp.Pool(processes=cores)
        signal.signal(signal.SIGINT, original_sigint_handler)
        try:
            args = [(motif, debug) for motif in motif_lst]
            res = (pool.starmap_async(process_motif_for_logodds, args))
            # ---- progress bar
            # total number of pending chunks, fixed *before* polling so
            # that it is defined even if the pool finishes immediately;
            # never zero, to keep the bar well-defined
            tot = max(res._number_left, 1)
            while not res.ready():
                remaining = res._number_left
                printProgressBar((tot - remaining),
                                 tot,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)
                res.wait(1)  # returns as soon as the results are ready
            # when finished call for the last time printProgressBar()
            printProgressBar(tot,
                             tot,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)
            complete_motifs += res.get(60 * 60 * 60)  # does not ignore signals
        except KeyboardInterrupt:
            pool.terminate()
            sigint_handler()
        else:
            pool.close()
    else:
        for m in motif_lst:  # process each found motif
            complete_motifs.append(process_motif_for_logodds(m, debug))

    if verbose:
        end_mp_all: float = time.time()
        print("Processed motif(s) in %s in %.2fs" %
              (motif_file, (end_mp_all - start_mp_all)))
    return complete_motifs
def read_MEME_motif(motif_file: str, bg_file: str, pseudocount: float,
                    no_reverse: bool, verbose: bool,
                    debug: bool) -> List[Motif]:
    """Read the motif PWMs stored in a MEME file.

    Since a MEME file can contain one or more motifs, a Motif object is
    built for each stored PWM: the values read are normalized, the
    pseudocount is applied and the background distribution is attached
    to the motif. The resulting motifs are returned in a list, in the
    order they appear in the file.

    ...

    Parameters
    ----------
    motif_file : str
        path to the motif PWM in MEME format
    bg_file : str
        path to the background file in Markov Background Format
        (http://meme-suite.org/doc/bfile-format.html), or the UNIF
        constant to use a uniform background.
    pseudocount : float
        value to add to motif PWM counts (must be > 0)
    no_reverse : bool
        forwarded to pseudo_bg. NOTE(review): the name suggests that True
        restricts the analysis to the forward strand -- confirm against
        pseudo_bg
    verbose : bool
        print additional information
    debug : bool
        trace the full error stack

    Returns
    -------
    List[Motif]
        list of Motif objects
    """

    if not isinstance(motif_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif_file).__name__),
                          debug)
    if not os.path.isfile(motif_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(motif_file), debug)
    if not isinstance(bg_file, str):
        errmsg = "Expected str, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(bg_file).__name__),
                          debug)
    if bg_file != UNIF and not os.path.isfile(bg_file):
        errmsg = "Unable to locate {}.\n"
        exception_handler(FileNotFoundError, errmsg.format(bg_file), debug)
    if not isinstance(pseudocount, float):
        errmsg = "Expected float, got {}.\n"
        exception_handler(TypeError,
                          errmsg.format(type(pseudocount).__name__), debug)
    if pseudocount <= 0:
        errmsg = "The pseudocount must be > 0.\n"
        exception_handler(ValueError, errmsg, debug)
    if not isinstance(no_reverse, bool):
        errmsg = "Expected bool, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(no_reverse).__name__),
                          debug)

    motifs_raw = list()
    motifs: List[Motif] = list()
    # begin motif parsing
    try:
        # the context manager closes the file on every path and never
        # touches an unbound handle when open() itself fails
        with open(motif_file, mode="r") as ifstream:
            # alphabet shared by all motifs
            alphabet = __read_alphabet_meme(motif_file, ifstream, debug)
            # nucleotide -> row index (used with np object)
            nucsmap = {nuc: i for i, nuc in enumerate(alphabet)}
            while True:
                # move to the next motif header
                for line in ifstream:
                    if line.startswith("MOTIF"):
                        break  # new motif instance
                else:
                    break  # EOF: read all motifs
                if verbose:
                    start_rm = time.time()
                motifids = line.split()
                if len(motifids) == 2:  # only name
                    motif_id = motifids[1]
                    motif_name = motif_id
                else:  # assume first two fields: id, name
                    motif_id, motif_name = motifids[1:3]
                # the helpers keep reading from the same stream
                statistics = __read_statistics_meme(motif_file, ifstream,
                                                    debug)
                probs = __read_counts_meme(motif_file, ifstream,
                                           statistics["width"], debug)
                motifs_raw.append({
                    "motifId": motif_id,
                    "motifName": motif_name,
                    "statistics": statistics,
                    "counts": probs
                })
                if verbose:
                    end_rm = time.time()
                    print("Read motif %s in %.2fs." % (motif_name,
                                                       (end_rm - start_rm)))
    except Exception:
        errmsg = "An error occurred while reading {}.\n"
        exception_handler(MotifFileReadError, errmsg.format(motif_file), debug)
    else:
        if bg_file == UNIF:
            bgs = get_uniformBG(alphabet, debug)
        elif os.path.isfile(bg_file):
            bgs = readBGfile(bg_file, debug)
        else:
            errmsg = "Unable to parse {}.\n"
            exception_handler(BGFileError, errmsg.format(bg_file), debug)
        bgs = pseudo_bg(bgs, no_reverse, debug)  # add pseudocount to bg

        for raw in motifs_raw:
            width = raw["statistics"]["width"]
            # local name chosen not to shadow the multiprocessing alias
            motif_probs = pd.DataFrame(np.matrix(raw["counts"]))
            motif_probs.index = alphabet
            motif_probs = norm_motif(motif_probs, width, alphabet, debug)
            motif_probs = apply_pseudocount_meme(motif_probs.to_numpy(),
                                                 pseudocount,
                                                 raw["statistics"]["nsites"],
                                                 width, bgs, alphabet,
                                                 nucsmap, debug)
            motif: Motif = Motif(motif_probs, width, alphabet, raw["motifId"],
                                 raw["motifName"], nucsmap)
            motif.setBg(bgs)
            motifs.append(motif)

    return motifs
def main(cmdLineargs: Optional[List[str]] = None) -> None: try: # starting point of the execution time start: float = time.time() # read the command-line arguments parser: GRAFIMOArgumentParser = get_parser() if cmdLineargs is None: cmdLineargs: List[str] = sys.argv[1:] # get input args # no arguments given --> print help if len(cmdLineargs) == 0: parser.error_noargs() die(2) # the second argument must be buildvg or findmotif if ((cmdLineargs[0] != "-h" and cmdLineargs[0] != "--help") and cmdLineargs[0] != "--version" and (cmdLineargs[0] != "buildvg" and cmdLineargs[0] != "findmotif")): parser.error( "The second argument must be one between 'buildvg' and 'findmotif'" ) die(1) args: argparse.Namespace = parser.parse_args(cmdLineargs) if args.verbose: print("Parsing arguments...") start_args_parse: float = time.time() #--------------------------------------------------------------# # check commandline arguments consistency # #---------------------- general options -----------------------# # workflow type if args.workflow != "buildvg" and args.workflow != "findmotif": parser.error("Unexpected workflow given. Available options:\n" "\tbuildvg: construct VG from user data.\n" "\tfindmotif: scan VG for DNA motif(s) occurrences") die(1) # cpu cores if args.cores < 0: parser.error("Negative number of CPU cores given") elif args.cores == 0 and args.graph_genome: # when whole genome variation graph is given, it is safer to # use 1 CPU core by default. This beacuse of the space needed # to load the whole VG on RAM. 
# # CAVEAT: before requiring more CPU cores to be used, be sure # your system has enough memory args.cores = 1 elif args.cores == 0: # default option -> use all available CPU cores args.cores = mp.cpu_count() else: # args.cores > 0 if args.cores > mp.cpu_count(): parser.error("Too many CPU cores to use ({})".format( args.cores)) # verbosity if (not isinstance(args.verbose, bool) or (args.verbose != False and args.verbose != True)): parser.error( '\"--verbose\" does not accept any positional argument') # debugging if (not isinstance(args.debug, bool) or (args.debug != False and args.debug != True)): parser.error("\"--debug\" does not accept any positional argument") #---------------------- buildvg options -----------------------# buildvg_err_msg: str = "Unexpected arguments for \"grafimo buildvg\": \"{}\"" if args.workflow == "buildvg": if args.graph_genome_dir: parser.error(buildvg_err_msg.format("-d, --genome-graph-dir")) die(1) elif args.graph_genome: parser.error(buildvg_err_msg.format("-g, --genome-graph")) die(1) elif args.bedfile: parser.error(buildvg_err_msg.format("-b, --bedfile")) die(1) elif args.motif: parser.error(buildvg_err_msg.format("-m, --motif")) die(1) elif args.bgfile != UNIF: # if default ignored parser.error(buildvg_err_msg.format("-k, --bgfile")) die(1) elif args.pseudo != 0.1: # if default ignored parser.error(buildvg_err_msg.format("-p, --pseudo")) die(1) elif args.threshold != 1e-4: # if default ignored parser.error(buildvg_err_msg.format("-t, --thresh")) die(1) elif args.no_qvalue: parser.error(buildvg_err_msg.format("-q, --no-qvalue")) die(1) elif args.no_reverse: parser.error(buildvg_err_msg.format("-r, --no-reverse")) die(1) elif args.text_only: parser.error(buildvg_err_msg.format("-f, --text-only")) die(1) elif args.chroms_find: parser.error(buildvg_err_msg.format("--chroms-find")) die(1) elif args.chroms_prefix_find: parser.error(buildvg_err_msg.format("--chroms-prefix-find")) die(1) elif args.chroms_namemap_find != NOMAP: # if 
default ignored parser.error(buildvg_err_msg.format("--chroms-namemap-find")) die(1) elif args.qval_t: parser.error(buildvg_err_msg.format("--qvalueT")) die(1) elif args.recomb: parser.error(buildvg_err_msg.format("--recomb")) die(1) elif args.top_graphs != 0: # if default ignored parser.error(buildvg_err_msg.format("--top-graphs")) die(1) elif not args.linear_genome: parser.error("No reference genome given") die(1) elif not args.vcf: parser.error("No VCF file given") die(1) else: # arguments for buildvg are correct # reference genome if (args.linear_genome.split('.')[-1] != 'fa' and args.linear_genome.split('.')[-1] != 'fasta'): parser.error( "The reference genome file must be in FASTA format") die(1) else: if not os.path.isfile(args.linear_genome): parser.error("Unable to find {}".format( args.linear_genome)) die(1) if os.stat(args.linear_genome).st_size == 0: # empty file parser.error("{} seems to be empty.".format( args.linear_genome)) die(1) args.linear_genome = os.path.abspath(args.linear_genome) # VCF --> the VCF file must have been compressed with # bgzip (https://github.com/samtools/tabix) if (args.vcf.split(".")[-1] != "gz" and args.vcf.split(".")[-2] != "vcf"): parser.error( "Wrong VCF file given. The VCF file must have been " "compressed with bgzip (e.g. 
myvcf.vcf.gz)") die(1) else: if not os.path.isfile(args.vcf): parser.error('Unable to find {}'.format(args.vcf)) die(1) if os.stat(args.vcf).st_size == 0: # empty file parser.error("{} seems to be empty.".format(args.vcf)) die(1) args.vcf = os.path.abspath(args.vcf) # chromosome to construct VG if len(args.chroms_build) == 0: args.chroms_build = [ALL_CHROMS] # use all chromosome else: if anydup(args.chroms_build): parser.error( "Duplicated chromosome names given to \"--chroms-build\"" ) # chromosome name-map if args.chroms_namemap_build != NOMAP: if not os.path.isfile(args.chroms_namemap_build): parser.error("Unable to locate {}".format( args.chroms_namemap_build)) if (args.chroms_prefix_build and args.chroms_namemap_build != NOMAP): parser.error( "\"--chroms-prefix-build\" and \"chroms-namemap-build\" " "cannot used together. Choose one of those options") # if no out directory is specified the VGs are stored in # the current directory if args.out == "": args.out = os.path.abspath("./") workflow: BuildVG = BuildVG(args) if args.verbose: end_args_parse: float = time.time() print("Arguments parsed in %.2fs." 
% (end_args_parse - start_args_parse)) # end if # end if #---------------------- findmotif options -----------------------# findmotif_err_msg: str = "Unexpected arguments for \"grafimo findmotif\": \"{}\"" if args.workflow == "findmotif": if args.linear_genome: parser.error(findmotif_err_msg.format("-l, --linear-genome")) die(1) elif args.vcf: parser.error(findmotif_err_msg.format("-v, --vcf")) die(1) elif args.chroms_build: parser.error(findmotif_err_msg.format("--chroms-build")) elif args.chroms_prefix_build: parser.error(findmotif_err_msg.format("--chroms-prefix-build")) elif args.chroms_namemap_build != NOMAP: parser.error( findmotif_err_msg.format("--chroms-namemap-build")) elif args.reindex: # if default ignored parser.error(findmotif_err_msg.format("--reindex")) die(1) elif not args.graph_genome_dir and not args.graph_genome: parser.error( "No arguments given for both \"--genome-graph\" and \"--genome-graph-dir\"" ) die(1) elif not args.bedfile: parser.error("No BED file given") die(1) elif not args.motif: parser.error("No motif PWM given") die(1) else: # only one between graph_genome and graph_genome_dir is allowed if args.graph_genome and args.graph_genome_dir: parser.error( "Only one argument between \"--genome-graph\" and \"--genome-graph-dir\"" " can be used") die(1) # genome graph if args.graph_genome: if (args.graph_genome.split('.')[-1] != "xg" and args.graph_genome.split('.')[-1] != "vg"): parser.error( "Unrecognized genome variation graph format. 
Only" "VG and XG format are allowed") die(1) elif not os.path.isfile(args.graph_genome): parser.error("Unable to locate {}".format( args.graph_genome)) die(1) else: # using absolute path avoid potential problems args.graph_genome = os.path.abspath(args.graph_genome) # genome graphs directory if args.graph_genome_dir: if not os.path.isdir(args.graph_genome_dir): parser.error("Unable to locate {}".format( args.graph_genome_dir)) die(1) if len(glob(os.path.join(args.graph_genome_dir, "*.xg"))) <= 0: parser.error( "No genome variation graph found in {}".format( args.graph_genome_dir)) die(1) else: # using absolute path avoid potential problems args.graph_genome_dir = os.path.abspath( args.graph_genome_dir) # BED file if args.bedfile: if not isbed(args.bedfile, args.debug): parser.error( "The genomic coordinates must be given in UCSC BED files" ) die(1) else: if not os.path.isfile(args.bedfile): parser.error("Unable to locate {}".format( args.bedfile)) else: parser.error("No BED file given") # motif pwm if not args.motif: parser.error("No motif PWM given") else: motifs: List[str] = args.motif for m in motifs: if not isMEME_ff(m, args.debug) and not isJaspar_ff( m, args.debug): parser.error( "Unrecognized motif PWM file format. 
" "{} does not follow the MEME or JASPAR format rules" .format(m)) die(1) if not os.path.isfile(m): parser.error("Unable to locate {}".format(m)) # background file if args.bgfile != UNIF: if not os.path.isfile(args.bgfile): parser.error("Unable to locate {}".format(args.bgfile)) # pseudocount if args.pseudo <= 0: parser.error( "Pseudocount values must be > 0, got {}".format( args.pseudo)) die(1) # statistical significance threshold if args.threshold <= 0 or args.threshold > 1: parser.error( "Motif statistical significance threshold must be between 0 and 1" ) die(1) # q-value flag if (not isinstance(args.no_qvalue, bool) or (args.no_qvalue != False and args.no_qvalue != True)): parser.error( "\"--qvalue\" accepts only True or False values") die(1) # no reverse flag if (not isinstance(args.no_reverse, bool) or (args.no_reverse != False and args.no_reverse != True)): parser.error( "\"--no-reverse\" accepts only True or False values") die(1) # text only flag if (not isinstance(args.text_only, bool) or (args.text_only != False and args.text_only != True)): parser.error( "\"--text-only\" accepts only True or False values") die(1) # chromosome to consider during VG scan if len(args.chroms_find) == 0: args.chroms_find = [ALL_CHROMS] # use all chromosome else: if anydup(args.chroms_find): parser.error( "Duplicated chromosome names given to \"--chroms-find\"" ) # chromosome name-map if args.chroms_namemap_find != NOMAP: if not os.path.isfile(args.chroms_namemap_find): parser.error("Unable to locate {}".format( args.chroms_namemap_find)) if (args.chroms_prefix_find and args.chroms_namemap_find != NOMAP): parser.error( "\"--chroms-prefix-find\" and \"chroms-namemap-find\" " "cannot used together. 
Choose one of those options") # recomb flag if (not isinstance(args.recomb, bool) or (args.recomb != False and args.recomb != True)): parser.error( "\"--recomb\" accepts only True or False values") die(1) # out directory if args.out == "": # default option args.out = DEFAULT_OUTDIR print(args.out) # threshold on q-value flag if (not isinstance(args.qval_t, bool) or (args.qval_t != False and args.qval_t != True)): parser.error( "\"--qvalueT accepts only True or False values") die(1) elif args.no_qvalue == True and args.qval_t == True: parser.error( "Unable to apply statistical significance threshold on" " q-values if you don't want them") die(1) # number of graph regions to store as PNG images if args.top_graphs < 0: parser.error("Negative number of regions to display") workflow: Findmotif = Findmotif(args) if args.verbose: end_args_parse: float = time.time() print("Arguments parsed in %.2fs." % (end_args_parse - start_args_parse)) # end if # end if # chck that external dependencies are satisfied if args.verbose: sys.stderr.write( "Checking GRAFIMO external dependencies {}\n".format(EXT_DEPS)) start_deps: float = time.time() satisfied: bool deps_lack: List[str] satisfied, deps_lack = check_deps() if not satisfied and len(deps_lack) > 0: errmsg = "Some dependencies are not satisfied: {}.\nPlease solve them before running GRAFIMO.\n" exception_handler(DependencyError, errmsg.format(deps_lack), args.debug) elif not satisfied and len(deps_lack) <= 0: errmsg = "Dependencies satisfied, but unable to recover them.\n Be sure they are in system PATH.\n" exception_handler(DependencyError, errmsg, args.debug) if args.verbose and satisfied: end_deps: float = time.time() print("Dependencies satisfied.") print("Dependencies checked in %.2fs." 
% (end_deps - start_deps)) #--------------------------------------------------------------- # dependency check was ok, so we go to workflow selection: # * construction of the genome variation graph for # each chromosome or a user defined subset of them # * scan of a precomputed VG or a set of precomputed VG if isinstance(workflow, BuildVG): buildvg(workflow, args.debug) elif isinstance(workflow, Findmotif): findmotif(workflow, args.debug) else: errmsg = "Expected BuildVG or Findmotif, got {}.\n" exception_handler(TypeError, errmsg.format(type(workflow).__name__), args.debug) end: float = time.time() # GRAFIMO execution finishes here print("Elapsed time %.2fs." % (end - start)) except KeyboardInterrupt: sigint_handler() finally: pass
def scan_graph(
    motif: Motif,
    args_obj: Findmotif,
    debug: bool
) -> str:
    """Obtain all the sequences of length K from the genome variation graph.

    K is the motif width. The k-mers are extracted from the genomic regions
    defined in a UCSC BED file or ENCODE narrowPeak file. By default only
    those k-mers found on the alternative genome sequences encoded in the
    scanned genome variation graph(s) are extracted. It is possible to
    consider all the possible recombinants which can be obtained from the
    set of genetic variants encoded in the VG (--recomb option).

    To perform k-mers extraction the paths (haplotypes) encoded in VGs are
    followed (VGs are defined as (V,E,P), where V is the set of nodes, E the
    set of edges, and P the set of paths or haplotypes).

    For every region one ``vg find`` command line is built; the resulting
    list of commands is handed to ``get_kmers`` together with a
    multiprocessing pool. The commands redirect their output to TSV files
    named after the region, relative to a temporary directory (prefix
    ``grafimo_``) the process enters before building the queries. The
    original working directory is restored before returning.

    ...

    Parameters
    ----------
    motif : Motif
        DNA motif
    args_obj : Findmotif
        commandline arguments container
    debug : bool
        trace the full error stack

    Returns
    -------
    str
        location of sequences files (the temporary directory)
    """

    if not isinstance(motif, Motif):
        errmsg = "Expected Motif, got {}.\n"
        exception_handler(TypeError, errmsg.format(type(motif).__name__), debug)
    if not isinstance(args_obj, Findmotif):
        errmsg = "Expected Findmotif, got {}.\n"
        exception_handler(
            TypeError, errmsg.format(type(args_obj).__name__), debug
        )

    # bound up-front so the error path below can always name what was scanned
    vg: str = ""
    if args_obj.has_graphgenome():  # single VG
        vg = args_obj.graph_genome
        if not isVGindexed(vg, debug):
            errmsg = "The genome variation graph is not indexed, index it before proceeding.\n"
            exception_handler(VGError, errmsg, debug)
    elif args_obj.has_graphgenome_dir():
        vg = args_obj.graph_genome_dir
    else:
        errmsg = "Unexpected genome variation graph given.\n"
        exception_handler(VGError, errmsg, debug)
    bedfile: str = args_obj.bedfile
    chroms: List[str] = args_obj.chroms
    chroms_prefix: str = args_obj.chroms_prefix
    namemap: dict = args_obj.namemap
    cores: int = args_obj.cores
    motif_width: int = motif.width
    # modify global var value
    global verbose
    verbose = args_obj.verbose
    # message shared by every "graph/index file not found" check below
    missing_file_msg = "Unable to locate {}. Are your VGs named with \"chr\"? Consider using --chroms-prefix-find or chroms-namemap-find.\n"
    # `xg` is reported by the error handler of the directory workflow: give
    # it a value before the first region is processed, so that an early
    # failure (e.g. while parsing the BED file) does not turn into a
    # NameError hiding the real problem
    xg: str = vg
    # sequence extraction begin
    try:
        print("\nExtracting regions defined in {}.\n".format(bedfile))
        if verbose:
            start_bp = time.time()
        regions, region_num = get_regions_bed(bedfile, debug)
        if verbose:
            end_bp = time.time()
            print(
                "%s parsed in %.2fs. Found %d regions.\n" %
                (bedfile, (end_bp - start_bp), region_num)
            )
        if args_obj.chroms_num == 1 and chroms[0] == ALL_CHROMS:
            # no chromosome subset requested: use those found in the BED
            # NOTE(review): assumes every key of `regions` contains "chr"
            chroms = [c.split("chr")[1] for c in regions.keys()]
        tmpwd: str = tempfile.mkdtemp(prefix='grafimo_')  # create a tmp dir
        cwd: str = os.getcwd()  # get the current location
        os.chdir(tmpwd)  # enter the tmp dir
        # create a list of queries
        queries: List[str] = list()
        # overwrite the default SIGINT handler to exit gracefully
        # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool: mp.Pool = mp.Pool(processes=cores)  # use no. cores processes
        signal.signal(signal.SIGINT, original_sigint_handler)
        if args_obj.has_graphgenome_dir():
            # one XG/GBWT pair per chromosome, stored in directory `vg`
            for chrom in chroms:
                if not bool(namemap):
                    chrname = "".join([chroms_prefix, chrom])
                else:
                    try:
                        if chrom.startswith("chr"):
                            chrname = namemap[chrom.split("chr")[1]]
                        else:
                            chrname = namemap[chrom]
                    except KeyError:
                        errmsg = "Missing out name map for chromosome {}.\n"
                        exception_handler(
                            KeyError, errmsg.format(chrom), debug
                        )
                # `regions` is looked up with "chr"-prefixed names
                if chrom.startswith("chr"):
                    positions = regions[chrom]
                else:
                    positions = regions["".join(["chr", chrom])]
                for pos in positions:
                    start: int = pos[0]
                    stop: int = pos[1]
                    # `c` is the chromosome name used in the region query
                    if bool(namemap):
                        if chrom.startswith("chr"):
                            c = namemap[chrom.split("chr")[1]]
                        else:
                            c = chrom
                    elif chroms_prefix:
                        c = chrname.split(chroms_prefix)[1]
                    else:
                        c = chrname
                    region_index: str = "-".join(
                        [":".join([c, str(start)]), str(stop)]
                    )
                    region_name: str = "-".join(
                        ["_".join([chrname, str(start)]), str(stop)]
                    )
                    seqs: str = os.path.join(
                        ".", ".".join([region_name, "tsv"])
                    )
                    xg = os.path.join(vg, ".".join([chrname, "xg"]))
                    # the GBWT must have the same prefix as XG
                    gbwt: str = os.path.join(vg, ".".join([chrname, "gbwt"]))
                    if not os.path.isfile(xg):
                        exception_handler(
                            VGError, missing_file_msg.format(xg), debug
                        )
                    if not os.path.isfile(gbwt):
                        exception_handler(
                            VGError, missing_file_msg.format(gbwt), debug
                        )
                    query: str = "vg find -p {} -x {} -H {} -K {} -E > {}".format(
                        region_index, xg, gbwt, motif_width, seqs
                    )
                    queries.append(query)
            get_kmers(queries, pool, debug, verbose)
        elif args_obj.has_graphgenome():
            # a single whole-genome XG (with its GBWT) stored at `vg`
            for chrom in chroms:
                if not bool(namemap):
                    chrname = "".join([chroms_prefix, chrom])
                else:
                    try:
                        chrname = namemap[chrom]
                    except KeyError:
                        errmsg = "Missing out name map for chromosome {}.\n"
                        exception_handler(
                            KeyError, errmsg.format(chrom), debug
                        )
                if chrom.startswith("chr"):
                    if chrom not in regions.keys():
                        errmsg = "{} does not appear among the chromosomes available in {}.\n"
                        exception_handler(
                            KeyError, errmsg.format(chrom, bedfile), debug
                        )
                    positions = regions[chrom]
                else:
                    if ("".join(["chr", chrom])) not in regions.keys():
                        errmsg = "{} does not appear among the chromosomes available in {}.\n"
                        exception_handler(
                            KeyError, errmsg.format(chrom, bedfile), debug
                        )
                    positions = regions["".join(["chr", chrom])]
                for pos in positions:
                    start: int = pos[0]
                    stop: int = pos[1]
                    if chroms_prefix:
                        c = chrname.split(chroms_prefix)[1]
                    else:
                        c = chrname
                    region_index: str = "-".join(
                        [":".join([c, str(start)]), str(stop)]
                    )
                    region_name: str = "-".join(
                        ["_".join([chrname, str(start)]), str(stop)]
                    )
                    seqs: str = os.path.join(
                        ".", ".".join([region_name, "tsv"])
                    )
                    xg = vg
                    xg_prefix: str = xg.split(".xg")[0]
                    # the GBWT must have the same prefix as XG
                    gbwt: str = ".".join([xg_prefix, "gbwt"])
                    if not os.path.exists(xg):
                        exception_handler(
                            VGError, missing_file_msg.format(xg), debug
                        )
                    if not os.path.isfile(gbwt):
                        exception_handler(
                            VGError, missing_file_msg.format(gbwt), debug
                        )
                    query: str = "vg find -p {} -x {} -H {} -K {} -E > {}".format(
                        region_index, xg, gbwt, motif_width, seqs
                    )
                    queries.append(query)
            # same argument order as the directory workflow above: `debug`
            # was previously omitted here, so `verbose` was passed in its place
            get_kmers(queries, pool, debug, verbose)
    except Exception:
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit keep propagating to the caller instead of being
        # reported as a scan failure
        errmsg = "An error occurred while scanning {}.\n"
        if args_obj.has_graphgenome_dir():
            exception_handler(VGError, errmsg.format(xg), debug)
        elif args_obj.has_graphgenome():
            exception_handler(VGError, errmsg.format(vg), debug)
        else:
            errmsg = "Chromosome name mismatch. Check chromosome name consistency.\n"
            exception_handler(VGError, errmsg, debug)
    # we are still inside the tmp dir: that is where the sequences live
    sequence_loc: str = os.getcwd()
    os.chdir(cwd)
    return sequence_loc