class refcache:
    def __init__(self, fasta_file, cacheSize=5000000):
        self.fasta_file = fasta_file
        self.FA = FastaFile(fasta_file)
        self.chroms = self.FA.references
        self._get_offsets()
        self.chrom_qualities = {
            chrom: self.detect_quality(chrom)
            for chrom in self.chroms
        }
        self.chrom_lens = {
            c: self.FA.get_reference_length(c)
            for c in self.chroms
        }
        self.cacheSize = cacheSize
        self.start = {c: 0 for c in self.chroms}
        self.end = {c: min(cacheSize, self.chrom_lens[c]) for c in self.chroms}
        # Prefetch the first cacheSize bases of every contig
        self.chrom_caches = {
            c: self.FA.fetch(c, 0, self.end[c])
            for c in self.chroms
        }

    def __del__(self):
        self.FA.close()

    def _get_offsets(self):
        # Read the byte offset of each contig's sequence from the .fai index
        self.chrom_offsets = {}
        fai = '%s.fai' % (self.fasta_file)
        with open(fai, 'r') as FAI:
            for split_line in map(lambda x: x.rstrip('\n').split('\t'), FAI):
                self.chrom_offsets[split_line[0]] = int(split_line[2])

    def detect_quality(self, chrom):
        fasta_name = '>%s' % (chrom)
        with open(self.fasta_file, 'r') as FA:
            # Seek close to the contig's header line, then scan forward for it
            FA.seek(max(0, self.chrom_offsets[chrom] - 200))
            for line in filter(lambda x: x[0] == '>', FA):
                split_line = line.rstrip('\n').split(' ')
                if split_line[0] == fasta_name:
                    return _split2quality(split_line)

    def fetch(self, chrom, pos, pos2):
        assert pos2 <= self.chrom_lens[chrom]
        if pos2 - pos + 1 >= self.cacheSize:
            logger.debug(
                "Region was too large for refcache, you should consider increasing the cache size to %i"
                % ((pos2 - pos + 1) * 10))
            return self.FA.fetch(chrom, pos, pos2)
        if pos < self.start[chrom] or pos2 > self.end[chrom]:
            # Region falls outside the cached window; re-center the cache on pos
            self.start[chrom] = pos
            self.end[chrom] = min(pos + self.cacheSize, self.chrom_lens[chrom])
            self.chrom_caches[chrom] = self.FA.fetch(chrom, self.start[chrom],
                                                     self.end[chrom])
        assert pos >= self.start[chrom]
        sI = pos - self.start[chrom]
        eI = pos2 - self.start[chrom]
        return self.chrom_caches[chrom][sI:eI]
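# Usage sketch for refcache with hypothetical paths and coordinates. Assumes
# "ref.fa" has a samtools faidx index ("ref.fa.fai") next to it and that the
# surrounding module provides the `logger` and `_split2quality` helpers used
# above. __init__ prefetches the first cacheSize bases of every contig, so
# small fetches inside that window never touch the disk.
if __name__ == "__main__":
    rc = refcache("ref.fa", cacheSize=1000000)
    print(rc.fetch("chr1", 10000, 10100))  # sliced out of the prefetched window
    print(rc.fetch("chr1", 10050, 10150))  # same window: no disk access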
class FastaHandler:
    """
    Handles fasta files using the pysam API
    """
    def __init__(self, reference_file_path):
        """
        create fasta file object given file path to a fasta reference file
        :param reference_file_path: full path to a fasta reference file
        """
        self.fasta_file_path = reference_file_path
        try:
            self.fasta = FastaFile(self.fasta_file_path)
        except Exception:
            raise IOError("FASTA FILE READ ERROR")

    def get_sequence(self, chromosome_name, start, stop):
        """
        Return the sequence of a query region
        :param chromosome_name: Chromosome name
        :param start: Region start
        :param stop: Region end
        :return: Sequence of the region
        """
        return self.fasta.fetch(region=chromosome_name, start=start, end=stop).upper()

    def get_chr_sequence_length(self, chromosome_name):
        """
        Get sequence length of a chromosome. This is used for selecting windows of parallel processing.
        :param chromosome_name: Chromosome name
        :return: Length of the chromosome reference sequence
        """
        return self.fasta.get_reference_length(chromosome_name)

    def get_contig_names(self):
        return self.fasta.references

    def get_ref_of_region(self, contig, site):
        """
        Return a string containing reference of a site
        :param contig: Contig [ex chr3]
        :param site: Site [ex :100000-200000]
        :return:
        """
        ret_val = ""
        error_val = 0
        try:
            ret_val = self.fasta.fetch(region=contig + site).upper()
        except Exception:
            print("ERROR IN REF FETCH: ", contig, site)
            error_val = 1
        return ret_val, error_val

    def close(self):
        self.fasta.close()
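# Usage sketch for FastaHandler (hypothetical file and region). Note that
# get_ref_of_region concatenates contig and site, so `site` must already
# carry the samtools-style separator, e.g. ":100000-200000".
handler = FastaHandler("ref.fa")
seq = handler.get_sequence("chr3", 100000, 100100)   # upper-cased slice
length = handler.get_chr_sequence_length("chr3")     # contig length
ref, err = handler.get_ref_of_region("chr3", ":100000-200000")
handler.close()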
class IndexedFasta(DataSource):
    name = "indexed_bedfile"
    version = "0.1.0"
    container = "python"
    partition_access = True
    description = "A bgzipped and indexed fasta file"

    def __init__(self, urlpath, metadata=None):
        self._urlpath = urlpath
        self._dataset = None
        self._dtype = None
        self._chroms = None
        super().__init__(metadata=metadata)

    def _open_dataset(self):
        self._dataset = FastaFile(self._urlpath)

    def _get_schema(self):
        if self._dataset is None:
            self._open_dataset()
        self._chroms = list(self._dataset.references)
        chrom_lengths = [{"chrom": t[0], "length": t[1]}
                         for t in zip(self._dataset.references, self._dataset.lengths)]
        return Schema(
            datashape=None,
            dtype=None,
            shape=None,
            npartitions=len(self._chroms),
            extra_metadata={"chroms": chrom_lengths},
        )

    def _get_partition(self, i):
        # One partition per contig: the full sequence of that contig
        chrom = self._chroms[i]
        return [{"seqid": chrom, "seq": self._dataset.fetch(chrom)}]

    def read_chunked(self):
        self._load_metadata()
        for i in range(self.npartitions):
            yield self._get_partition(i)

    def to_dask(self):
        import dask
        from dask import bag as db
        self._load_metadata()
        # Wrap the function, not its result, so the fetch stays lazy
        return db.from_delayed([
            dask.delayed(self._get_partition)(i)
            for i in range(self.npartitions)
        ])

    def _close(self):
        # close any files, sockets, etc
        if self._dataset is not None:
            self._dataset.close()
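# Usage sketch for the intake driver above (hypothetical bgzipped FASTA).
# The schema's extra_metadata carries the contig names and lengths, and each
# partition yields one {"seqid": ..., "seq": ...} record per contig;
# to_dask() defers the per-contig fetches to dask workers.
src = IndexedFasta("ref.fa.gz")
schema = src._get_schema()
print(schema.extra_metadata["chroms"])   # [{"chrom": "chr1", "length": ...}, ...]
for records in src.read_chunked():       # one partition per contig
    print(records[0]["seqid"], len(records[0]["seq"]))
src._close()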
def _chrom_sizes(fasta_file):
    """Get the chromosome sizes for a fasta file
    """
    from collections import OrderedDict
    from pysam import FastaFile
    fa = FastaFile(fasta_file)
    chrom_lens = OrderedDict([(name, l) for name, l in zip(fa.references, fa.lengths)])
    if len(chrom_lens) == 0:
        raise ValueError(f"no chromosomes found in fasta file: {fasta_file}. "
                         "Make sure the file path is correct and that the fasta index "
                         f"file {fasta_file}.fai is up to date")
    fa.close()
    return chrom_lens
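# Usage sketch (hypothetical path): _chrom_sizes is convenient for emitting a
# chrom.sizes file of the kind expected by bedGraphToBigWig or pyBigWig.
sizes = _chrom_sizes("ref.fa")
with open("ref.chrom.sizes", "w") as out:
    for name, length in sizes.items():
        out.write(f"{name}\t{length}\n")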
def close(self):
    if self._fh:
        self._fh.close()
        self._fh = None
        subprocess.check_call([self._bgzip_exe, "--force", self._basepath])
        os.rename(self._basepath + ".gz", self.filename)

        # open file with FastaFile to create indexes, then make all read-only
        _fh = FastaFile(self.filename)
        _fh.close()
        os.chmod(self.filename, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
        os.chmod(self.filename + ".fai", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
        os.chmod(self.filename + ".gzi", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

        logger.info("{} written; added {} sequences".format(self.filename, len(self._added)))
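# Standalone sketch of the same finalize-and-index flow, assuming `bgzip` is
# on PATH and plain_fasta was written as uncompressed FASTA. Opening the
# bgzipped result with pysam.FastaFile builds the missing .fai/.gzi indexes
# as a side effect, which is exactly what close() above relies on.
import os
import stat
import subprocess
from pysam import FastaFile

def bgzip_and_index(plain_fasta, final_path):
    subprocess.check_call(["bgzip", "--force", plain_fasta])  # writes plain_fasta + ".gz"
    os.rename(plain_fasta + ".gz", final_path)
    FastaFile(final_path).close()  # triggers .fai/.gzi creation
    read_only = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
    for suffix in ("", ".fai", ".gzi"):
        os.chmod(final_path + suffix, read_only)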
def get_contig_list_from_fasta(fasta_path, with_length=False):
    """Obtain list of contigs from a fasta file,
    all alternative contigs are pooled into the string MISC_ALT_CONTIGS_SCMO

    Args:
        fasta_path (str or pysam.FastaFile) : Path or handle to fasta file

        with_length(bool): return list of lengths

    Returns:
        contig_list (list) : List of contigs + ['MISC_ALT_CONTIGS_SCMO'] if
            any alt contig is present in the fasta file
    """
    contig_list = []
    has_alt = False
    if with_length:
        lens = []

    if type(fasta_path) is str:
        fa = FastaFile(fasta_path)
    elif type(fasta_path) is FastaFile:
        fa = fasta_path
    else:
        raise TypeError('Supply pysam.FastaFile or str')

    for reference, length in zip(fa.references, fa.lengths):
        if is_main_chromosome(reference):
            contig_list.append(reference)
            if with_length:
                lens.append(length)
        else:
            has_alt = True

    # Close handle if we just opened one
    if type(fasta_path) is str:
        fa.close()

    if has_alt:
        contig_list.append('MISC_ALT_CONTIGS_SCMO')
        if with_length:
            lens.append(None)

    if with_length:
        return contig_list, lens
    return contig_list
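# Usage sketch. is_main_chromosome is an external helper of this codebase
# (roughly: returns False for alt/decoy scaffolds). With a hypothetical
# "ref.fa", the pooled alt bucket MISC_ALT_CONTIGS_SCMO, if present, is the
# last entry and carries a length of None:
contigs, lengths = get_contig_list_from_fasta("ref.fa", with_length=True)
for contig, length in zip(contigs, lengths):
    print(contig, length)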
def generate_header(reference_fa: str, tag: str) -> VariantHeader:
    """
    Generates the header for the minimal VCF.

    :param reference_fa: Path to reference fasta file.
    :param tag: The filter tag to use.
    """
    header = VariantHeader()
    header.filters.add(tag, None, None, "Failed dToxoG")

    fasta = FastaFile(reference_fa)
    try:
        for contig in fasta.references:
            header.contigs.add(contig, length=fasta.get_reference_length(contig))
    finally:
        fasta.close()

    return header
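# Usage sketch (hypothetical paths and tag). The returned header can seed an
# output VCF directly via pysam.VariantFile:
from pysam import VariantFile

header = generate_header("ref.fa", tag="dtoxog")
with VariantFile("flagged.vcf", "w", header=header) as vcf_out:
    pass  # filtered records would be written here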
class SequenceExtractor(object):
    """
    Extracting sequences from FASTA file by interval objects.
    """
    def __init__(self, path):
        self._fasta = FastaFile(path)

    def get_sequence(self, gi):
        seqs = []
        for x, y in gi.blocks:
            seqs.append(self._fasta.fetch(gi.chrom, x, y))
        seq = Seq("".join(seqs))
        if gi.reverse:
            seq = seq.reverse_complement()
        return seq

    def close(self):
        if not self._fasta.closed:
            self._fasta.close()

    def __del__(self):
        self.close()
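# Usage sketch. The interval object only needs .chrom, .blocks (a list of
# (start, end) pairs), and .reverse; GenomicInterval below is a hypothetical
# stand-in for whatever interval class the surrounding codebase uses, and
# Seq comes from Biopython (Bio.Seq).
from collections import namedtuple

GenomicInterval = namedtuple("GenomicInterval", ["chrom", "blocks", "reverse"])

extractor = SequenceExtractor("ref.fa")
gi = GenomicInterval(chrom="chr1", blocks=[(100, 200), (300, 400)], reverse=True)
spliced = extractor.get_sequence(gi)  # spliced, reverse-complemented Seq
extractor.close()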
def export_bw(self,
              regions,
              output_prefix,
              fasta_file=None,
              contrib_method='grad',
              pred_summaries=['profile/wn', 'counts/pre-act'],
              batch_size=512,
              scale_contribution=False,
              chromosomes=None):
    """Export predictions and model contributions to big-wig files

    Args:
      regions: list of genomic regions
      output_prefix: output file prefix
      batch_size: prediction batch size
      scale_contribution: if True, multiply the contribution scores by the
        predicted count value
      chromosomes: a list of chromosome names comprising the genome
    """
    from pysam import FastaFile
    # pred_summary: which operation to use for the profile gradients
    logger.info("Get model predictions and contribution scores")
    out = self.predict_regions(regions,
                               contrib_method=contrib_method,
                               pred_summaries=pred_summaries,
                               fasta_file=fasta_file,
                               batch_size=batch_size)

    logger.info("Setup bigWigs for writing")
    # Get the genome lengths
    if fasta_file is None:
        fasta_file = self.fasta_file
    fa = FastaFile(fasta_file)
    if chromosomes is None:
        genome = OrderedDict([(c, l) for c, l in zip(fa.references, fa.lengths)])
    else:
        genome = OrderedDict([(c, l) for c, l in zip(fa.references, fa.lengths)
                              if c in chromosomes])
    fa.close()

    output_feats = ['preds.pos', 'preds.neg', 'contrib.profile', 'contrib.counts']

    # make sure the regions are in the right order
    first_chr = list(np.unique(np.array([interval.chrom for interval in regions])))
    last_chr = [c for c, l in genome.items() if c not in first_chr]
    genome = [(c, genome[c]) for c in first_chr + last_chr]

    # open bigWigs for writing
    bws = {}
    for task in self.tasks:
        bws[task] = {}
        for feat in output_feats:
            delim = "." if not output_prefix.endswith("/") else ""
            bw_preds_pos = pyBigWig.open(f"{output_prefix}{delim}{task}.{feat}.bw", "w")
            bw_preds_pos.addHeader(genome)
            bws[task][feat] = bw_preds_pos

    def add_entry(bw, arr, interval, start_idx=0):
        """Macro for adding an entry to the bigwig file

        Args:
          bw: pyBigWig file handle
          arr: 1-dimensional numpy array
          interval: genomic interval pybedtools.Interval
          start_idx: how many starting values in the array to skip
        """
        assert arr.ndim == 1
        assert start_idx < len(arr)

        if interval.stop - interval.start != len(arr):
            logger.warning(f"interval.stop - interval.start "
                           f"({interval.stop - interval.start}) != len(arr) ({len(arr)})")
            logger.warning(f"Skipping the entry: {interval}")
            return
        bw.addEntries(interval.chrom,
                      interval.start + start_idx,
                      values=arr[start_idx:],
                      span=1,
                      step=1)

    def to_1d_contrib(hyp_contrib, seq):
        # mask the hyp_contrib + add them up
        return (hyp_contrib * seq).sum(axis=-1)

    # interval logic to handle overlapping intervals
    # assumption: all intervals are sorted w.r.t the start coordinate
    # strategy: don't write values at the same position twice (skip those)
    #
    # graphical representation:
    # ...      ]          - prev_stop
    #      [        ]     - new interval 1
    #          [    ]     - added chunk from interval 1
    #   [   ]             - new interval 2 - skip
    #                [  ] - new interval 3, fully add

    logger.info("Writing to bigWigs")
    prev_stop = None  # Keep track of what the previous interval already covered
    prev_chrom = None
    for i in tqdm(range(len(out))):
        interval = out[i]['interval']

        if prev_chrom != interval.chrom:
            # Encountered a new chromosome
            prev_stop = 0  # Restart the end-counter
            prev_chrom = interval.chrom

        if prev_stop >= interval.stop:
            # Nothing new to add to that range
            continue
        start_idx = max(prev_stop - interval.start, 0)

        for tid, task in enumerate(self.tasks):
            # Write predictions
            preds = out[i]['pred'][task]
            add_entry(bws[task]['preds.pos'], preds[:, 0], interval, start_idx)
            add_entry(bws[task]['preds.neg'], preds[:, 1], interval, start_idx)

            # Get the contribution scores
            seq = out[i]['seq']
            hyp_contrib = out[i]['contrib_score']

            if scale_contribution:
                si_profile = preds.sum()  # Total number of counts in the region
                si_counts = preds.sum()
            else:
                si_profile = 1
                si_counts = 1

            # profile - multiply the hypothetical contributions by the observed
            # one-hot sequence; skip regions whose sequence is not strictly
            # one-hot encoded
            if not np.all(seq.astype(bool).sum(axis=-1) == 1):
                continue
            add_entry(bws[task]['contrib.profile'],
                      to_1d_contrib(hyp_contrib[f'{task}/profile'], seq) * si_profile,
                      interval, start_idx)
            add_entry(bws[task]['contrib.counts'],
                      to_1d_contrib(hyp_contrib[f'{task}/count'], seq) * si_counts,
                      interval, start_idx)
        prev_stop = max(interval.stop, prev_stop)

    logger.info("Done writing. Closing bigWigs")
    # Close all the big-wig files
    for task in self.tasks:
        for feat in output_feats:
            bws[task][feat].close()
    logger.info(f"Done! Output files stored as: {output_prefix}{delim}*")
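# Hypothetical invocation of export_bw. `model` stands in for an instance of
# the class this method belongs to (with self.tasks and self.fasta_file set),
# and the regions must be pybedtools Intervals sorted by start coordinate, as
# the overlap-skipping logic above assumes.
from pybedtools import Interval

regions = [Interval("chr1", 1000, 2000),
           Interval("chr1", 1500, 2500)]  # overlapping on purpose
model.export_bw(regions,
                output_prefix="out/model",
                fasta_file="ref.fa",
                scale_contribution=True)
# writes out/model.<task>.preds.pos.bw ... out/model.<task>.contrib.counts.bw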
def write_final_vcf(int_duplication_candidates,
                    inversion_candidates,
                    tandem_duplication_candidates,
                    deletion_candidates,
                    novel_insertion_candidates,
                    breakend_candidates,
                    version,
                    contig_names,
                    contig_lengths,
                    types_to_output,
                    options):
    vcf_output = open(options.working_dir + '/variants.vcf', 'w')

    # Write header lines
    print("##fileformat=VCFv4.2", file=vcf_output)
    print("##fileDate={0}".format(time.strftime("%Y-%m-%d|%I:%M:%S%p|%Z|%z")), file=vcf_output)
    print("##source=SVIM-v{0}".format(version), file=vcf_output)
    for contig_name, contig_length in zip(contig_names, contig_lengths):
        print("##contig=<ID={0},length={1}>".format(contig_name, contig_length), file=vcf_output)
    if "DEL" in types_to_output:
        print("##ALT=<ID=DEL,Description=\"Deletion\">", file=vcf_output)
    if "INV" in types_to_output:
        print("##ALT=<ID=INV,Description=\"Inversion\">", file=vcf_output)
    if (not options.tandem_duplications_as_insertions and "DUP:TANDEM" in types_to_output) or \
       (not options.interspersed_duplications_as_insertions and "DUP:INT" in types_to_output):
        print("##ALT=<ID=DUP,Description=\"Duplication\">", file=vcf_output)
    if not options.tandem_duplications_as_insertions and "DUP:TANDEM" in types_to_output:
        print("##ALT=<ID=DUP:TANDEM,Description=\"Tandem Duplication\">", file=vcf_output)
    if not options.interspersed_duplications_as_insertions and "DUP:INT" in types_to_output:
        print("##ALT=<ID=DUP:INT,Description=\"Interspersed Duplication\">", file=vcf_output)
    if "INS" in types_to_output:
        print("##ALT=<ID=INS,Description=\"Insertion\">", file=vcf_output)
    if "BND" in types_to_output:
        print("##ALT=<ID=BND,Description=\"Breakend\">", file=vcf_output)
    print("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">", file=vcf_output)
    print("##INFO=<ID=CUTPASTE,Number=0,Type=Flag,Description=\"Genomic origin of interspersed duplication seems to be deleted\">", file=vcf_output)
    print("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">", file=vcf_output)
    print("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">", file=vcf_output)
    print("##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"Number of reads supporting this variant\">", file=vcf_output)
    print("##INFO=<ID=STD_SPAN,Number=1,Type=Float,Description=\"Standard deviation in span of merged SV signatures\">", file=vcf_output)
    print("##INFO=<ID=STD_POS,Number=1,Type=Float,Description=\"Standard deviation in position of merged SV signatures\">", file=vcf_output)
    print("##INFO=<ID=STD_POS1,Number=1,Type=Float,Description=\"Standard deviation of breakend 1 position\">", file=vcf_output)
    print("##INFO=<ID=STD_POS2,Number=1,Type=Float,Description=\"Standard deviation of breakend 2 position\">", file=vcf_output)
    if options.insertion_sequences:
        print("##INFO=<ID=SEQS,Number=.,Type=String,Description=\"Insertion sequences from all supporting reads\">", file=vcf_output)
    if options.read_names:
        print("##INFO=<ID=READS,Number=.,Type=String,Description=\"Names of all supporting reads\">", file=vcf_output)
    if options.zmws:
        print("##INFO=<ID=ZMWS,Number=1,Type=Integer,Description=\"Number of supporting ZMWs (PacBio only)\">", file=vcf_output)
    print("##FILTER=<ID=hom_ref,Description=\"Genotype is homozygous reference\">", file=vcf_output)
    print("##FILTER=<ID=not_fully_covered,Description=\"Tandem duplication is not fully covered by a single read\">", file=vcf_output)
print("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">", file=vcf_output) print("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth\">", file=vcf_output) print( "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Read depth for each allele\">", file=vcf_output) if not options.tandem_duplications_as_insertions and "DUP:TANDEM" in types_to_output: print( "##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number of tandem duplication (e.g. 2 for one additional copy)\">", file=vcf_output) print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + options.sample, file=vcf_output) # Open reference genome sequence file sequence_alleles = options.sequence_alleles if sequence_alleles: try: reference = FastaFile(options.genome) except ValueError: logging.warning( "The given reference genome is missing an index file ({path}.fai). Sequence alleles cannot be retrieved." .format(options.genome)) sequence_alleles = False except IOError: logging.warning( "The given reference genome is missing ({path}). Sequence alleles cannot be retrieved." .format(options.genome)) sequence_alleles = False else: reference = None # Prepare VCF entries depending on command-line parameters vcf_entries = [] if "DEL" in types_to_output: for candidate in deletion_candidates: vcf_entries.append( (candidate.get_source(), candidate.get_vcf_entry(sequence_alleles, reference, options.read_names, options.zmws), "DEL")) if "INV" in types_to_output: for candidate in inversion_candidates: vcf_entries.append( (candidate.get_source(), candidate.get_vcf_entry(sequence_alleles, reference, options.read_names, options.zmws), "INV")) if "INS" in types_to_output: for candidate in novel_insertion_candidates: vcf_entries.append( (candidate.get_destination(), candidate.get_vcf_entry(sequence_alleles, reference, options.insertion_sequences, options.read_names, options.zmws), "INS")) if options.tandem_duplications_as_insertions: if "INS" in types_to_output: for candidate in tandem_duplication_candidates: vcf_entries.append( (candidate.get_destination(), candidate.get_vcf_entry_as_ins(options.read_names, options.zmws), "INS")) else: if "DUP:TANDEM" in types_to_output: for candidate in tandem_duplication_candidates: vcf_entries.append( (candidate.get_source(), candidate.get_vcf_entry_as_dup(options.read_names, options.zmws), "DUP_TANDEM")) if options.interspersed_duplications_as_insertions: if "INS" in types_to_output: for candidate in int_duplication_candidates: vcf_entries.append( (candidate.get_destination(), candidate.get_vcf_entry_as_ins(options.read_names, options.zmws), "INS")) else: if "DUP:INT" in types_to_output: for candidate in int_duplication_candidates: vcf_entries.append( (candidate.get_source(), candidate.get_vcf_entry_as_dup(options.read_names, options.zmws), "DUP_INT")) if "BND" in types_to_output: for candidate in breakend_candidates: vcf_entries.append( ((candidate.get_source()[0], candidate.get_source()[1], candidate.get_source()[1] + 1), candidate.get_vcf_entry(options.read_names, options.zmws), "BND")) vcf_entries.append( ((candidate.get_destination()[0], candidate.get_destination()[1], candidate.get_destination()[1] + 1), candidate.get_vcf_entry_reverse(options.read_names, options.zmws), "BND")) if sequence_alleles: reference.close() # Sort and write entries to VCF svtype_counter = defaultdict(int) for source, entry, svtype in sorted_nicely(vcf_entries): variant_id = "svim.{svtype}.{number}".format( svtype=svtype, number=svtype_counter[svtype] + 1) entry_with_id = 
entry.replace("PLACEHOLDERFORID", variant_id, 1) svtype_counter[svtype] += 1 print(entry_with_id, file=vcf_output) vcf_output.close()
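# sorted_nicely is an external helper; a plausible sketch, assuming it
# natural-sorts entries by contig and coordinates so that "chr2" precedes
# "chr10" (each entry is ((contig, start, end), vcf_string, svtype)):
import re

def sorted_nicely(vcf_entries):
    """Sort VCF entries the way humans expect: chr2 before chr10."""
    def natural_key(entry):
        contig, start, end = entry[0]
        # Split the contig into digit and non-digit runs: "chr10" -> ["chr", 10, ""]
        parts = [int(tok) if tok.isdigit() else tok for tok in re.split(r"(\d+)", contig)]
        return (parts, start, end)
    return sorted(vcf_entries, key=natural_key)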