def annotate_indel_on_db(row, fasta, dbsnp, clnvr, chr_prefixed):
    """Check whether equivalent indels exist in dbSNP and ClinVar for each indel.
    If any exist, annotate the indel with the SNP information.

    Args:
        row (pandas.Series): with 'chr', 'pos', 'is_ins', 'indel_seq' labels
        fasta (str): path to .fa
        dbsnp (pysam.TabixFile): tabix-indexed 00-All.151.indel.vcf.gz
        clnvr (pysam.TabixFile): tabix-indexed clinvar.indel.vcf.gz
        chr_prefixed (bool): True if chromosome names in BAM are "chr"-prefixed
    Returns:
        report (IndelSnpFeatures): idl object reporting SNP info
    """
    chr = row["chr"]
    pos = row["pos"]
    idl_type = row["is_ins"]
    idl_seq = row["indel_seq"]

    # obj representing the indel in the reference genome
    idl = curate_indel_in_genome(fasta, chr, pos, idl_type, idl_seq, chr_prefixed)
    # obj representing the report of the indel
    report = IndelSnpFeatures(chr, pos, idl_type, idl_seq)

    # search for equivalent indels over pos +/- search_window nt
    search_window = 50
    start, end = pos - search_window, pos + search_window
    chr_vcf = row["chr"].replace("chr", "")

    for record in dbsnp.fetch(chr_vcf, start, end, parser=pysam.asTuple()):
        bambinos = vcf2bambino(record)
        for bb in bambinos:
            if idl_type == bb.idl_type and len(idl_seq) == len(bb.idl_seq):
                # indel on db represented in the reference genome
                db_idl = curate_indel_in_genome(
                    fasta, chr, bb.pos, bb.idl_type, bb.idl_seq, chr_prefixed
                )
                if idl == db_idl:
                    rs = record[2]
                    report.add_dbsnp_id(rs)
                    report.add_dbsnp_freq(dbsnp_freq(record))
                    # report.add_dbsnp_origin(dbsnp_origin(record))
                    report.add_dbsnp_common(dbsnp_common(record))

    for record in clnvr.fetch(chr_vcf, start, end, parser=pysam.asTuple()):
        bambinos = vcf2bambino(record)
        for bb in bambinos:
            if idl_type == bb.idl_type and len(idl_seq) == len(bb.idl_seq):
                db_idl = curate_indel_in_genome(
                    fasta, chr, bb.pos, bb.idl_type, bb.idl_seq, chr_prefixed
                )
                if idl == db_idl:
                    id = record[2]
                    report.add_clnvr_id(id)
                    report.add_clnvr_freq(clnvr_freq(record))
                    # report.add_clnvr_origin(clnvr_origin(record))
                    report.add_clnvr_info(cln_info(record))

    return report
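# Hedged usage sketch (not part of the original source): the file names, the genome
# FASTA, and the example row are hypothetical, and curate_indel_in_genome /
# IndelSnpFeatures are assumed to be importable from the surrounding package.
# The function above expects the dbSNP/ClinVar VCFs already opened as pysam.TabixFile.
import pandas as pd
import pysam

dbsnp = pysam.TabixFile("00-All.151.indel.vcf.gz")   # hypothetical tabix-indexed dbSNP indels
clnvr = pysam.TabixFile("clinvar.indel.vcf.gz")      # hypothetical tabix-indexed ClinVar indels
row = pd.Series({"chr": "chr7", "pos": 55242465, "is_ins": 1, "indel_seq": "A"})
report = annotate_indel_on_db(row, "GRCh38.fa", dbsnp, clnvr, chr_prefixed=True)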
def test_tabix_multi_ps_open(self): with open(self.tabix_ref, "rb") as fh1: with open(self.tabix_ref, "rb") as fh2: ps1 = pysam.tabix_file_iterator(fh1, pysam.asTuple()) ps2 = pysam.tabix_file_iterator(fh2, pysam.asTuple()) reader = MockReader(ps1, ps2, self.tabix_ref, tabix=True) for expected, found in zip(reader, self.bed_lines + self.bed_lines): self.assertEqual(expected.strip("\n"), found.strip("\n"))
def get_coverage(self, chrom, start, stop): if not self._binned: for row in self._tabixfile.fetch(chrom, start, stop+1, parser=pysam.asTuple()): yield json.loads(row[2]) else: # Right now we don't include the region_end column in our coverage files, # so there's no way to make sure we get the bin overlapping the start of our query region. # To deal with it for now, we'll just use start-50 # TODO: include region_end in coverage files. for row in self._tabixfile.fetch(chrom, max(1, start-50), stop+1, parser=pysam.asTuple()): d = json.loads(row[2]) if d['end'] < start or d['start'] > stop: continue d['start'] = max(d['start'], start) d['end'] = min(d['end'], stop) yield d
def get(self, chrom, position, ref, alt): if self.has_chr_prefix and not chrom.startswith('chr'): chrom = 'chr' + chrom elif not self.has_chr_prefix and chrom.startswith('chr'): chrom = chrom[3:] if not self.overlaps(chrom, position): self.chrom = chrom self.start = position self.stop = position + self.step_bp self.data = dict() for f in self.files: with pysam.Tabixfile(f, 'r') as tabix: for row in tabix.fetch(self.chrom, self.start - 1, self.stop + 1, parser=pysam.asTuple()): name = ':'.join(row[:4]) cadd_raw, cadd_phred = map(float, row[4:6]) if name in self.data: if self.data[name][1] < cadd_phred: self.data[name] = (cadd_raw, cadd_phred) else: self.data[name] = (cadd_raw, cadd_phred) return self.data.get(':'.join((chrom, str(position), ref, alt)), (None, None))
def _get_schema(self):
    if self._dataset is None:
        self._open_dataset()
    self._chroms = list(self._dataset.contigs)
    rec = next(self._dataset.fetch(self._chroms[0], parser=asTuple()))
    num_fields = len(rec)
    chrom_coord_dtype = np.int64
    dtypes = {
        "chrom": pd.CategoricalDtype(self._chroms + ["NULL"], ordered=True),
        "start": chrom_coord_dtype,
        "end": chrom_coord_dtype,
        "name": str,
        "score": np.float32,
        "strand": bool,
    }
    self._dtype = {
        key: dtypes[key] for key in list(dtypes.keys())[:num_fields]
    }
    return Schema(
        datashape=None,
        dtype=self._dtype,
        shape=(None, len(self._dtype)),
        npartitions=len(self._chroms),
        extra_metadata={},
    )
def getphastscores(phastconsbed, gcoords): scores = [] #all scores scoresd = {} #{geneid : [scores]} tbx = pysam.Tabixfile(phastconsbed) nt = 0 ntwithscores = 0 for gene in gcoords: pcscores = [] #list of all pc scores for coords in this gene chrm = gcoords[gene][0] coords = gcoords[gene][2] for coord in coords: nt += 1 for row in tbx.fetch(chrm, coord, coord + 1, parser=pysam.asTuple()): score = row[4] if score: ntwithscores += 1 scores.append(score) pcscores.append(score) if pcscores: scoresd[gene] = pcscores print 'Interrogated {0} nucleotides. Found phastcons scores for {1} ({2}%) of them.'.format( nt, ntwithscores, round((ntwithscores / float(nt)), 4) * 100) return scores, scoresd
def parse_annotations(chrom, pos): AF_supAFR = CSQ = 'NA' if chrom == 'X': if pos <= 2699520: replace = 'X_PAR1' elif pos >= 154931044: replace = 'X_PAR2' else: replace = 'X_nonPAR' else: replace = chrom path_vcf = '../../../SHAPEIT/out_annotate_2016Dec28/{}.minAC1.no_mask.without_related.vcf.gz'.format(replace) tbx = pysam.TabixFile(path_vcf) for row in tbx.fetch(chrom, pos - 1, pos, parser=pysam.asTuple()): for _ in row[7].split(';'): if _ == 'DB': continue k, v = _.split('=') if k == 'AF_supAFR': AF_supAFR = v elif k == 'CSQ': CSQ = v return AF_supAFR, CSQ
def testIteratorUncompressed(self): '''test iteration from uncompressed file.''' tmpfilename = 'tmp_testIteratorUncompressed' infile = gzip.open(self.filename, "rb") outfile = open(tmpfilename, "wb") outfile.write(infile.read()) outfile.close() infile.close() with open(tmpfilename) as infile: for x, r in enumerate(pysam.tabix_iterator(infile, pysam.asTuple())): self.assertEqual(self.compare[x], list(r)) self.assertEqual(len(self.compare[x]), len(r)) # test indexing for c in range(0, len(r)): self.assertEqual(self.compare[x][c], r[c]) # test slicing access for c in range(0, len(r) - 1): for cc in range(c + 1, len(r)): self.assertEqual(self.compare[x][c:cc], r[c:cc]) os.unlink(tmpfilename)
def annotate(self, bedline, genome): c = bedline.rstrip().rsplit("\t") chr = c[0] start = c[1] end = c[2] if not re.search('chr', chr): raise LookupError("chromosome names must start with chr: " + chr) return [] if (self.genome != genome): raise LookupError( "tried to compare a %s bedfile to a %s annotation." % (genome, self.genome)) return [] else: annotations = [] if (chr and start and end): if self.tabixContigs.has_key(chr): tabixTupleParse = self.tabix.fetch(reference=chr, start=int(start), end=int(end), parser=pysam.asTuple()) for tabixTuple in tabixTupleParse: annotations.append(tabixTuple[3]) return uniqann(annotations) else: return [] else: raise LookupError( "can't find chr,start,end. File must be tab-delimited") return []
def load_segmented_data(filepath, interval): res = genomic_interval_set() tabix = pysam.TabixFile(filepath) for row in tabix.fetch(interval.chrom, interval.start, interval.end, parser = pysam.asTuple()): chrom=row[0] start = int(row[1]) end = int(row[2]) try: name=row[3] except: name='.' try: score=float(row[4]) except: score=-np.inf try: strand=row[5] except: strand='+' res += genomic_interval(chrom, start, end, name=name, score=score, strand=strand) tabix.close() return res
def snp_cal(chromo, window_start, window_end): rows = tuple( parsevcf.fetch(region="%s:%s-%s" % (chromo, window_start, window_end), parser=pysam.asTuple())) sites_total, sites_unmasked, sites_passing, sites_variant = 0, 0, 0, 0 calls = [0] * len(samples) hets = [0] * len(samples) for line in rows: sites_total += 1 if "CpGRep" in line[6]: continue sites_unmasked += 1 if "FAIL" in line[6]: continue if "WARN" in line[6]: continue sites_passing += 1 if line[4] != '.': sites_variant += 1 for i in range(0, len(samples)): GT = line[i + 9] if GT[:1] != '.': calls[i] += 1 if GT[:3] == '0/1': hets[i] += 1 if GT[:3] == '0|1': hets[i] += 1 output.write( '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (chromo, window_start, sites_total, sites_unmasked, sites_passing, sites_variant, '\t'.join(map(str, calls)), '\t'.join(map(str, hets))))
def annotate_variants_list(args, select_cursor, update_cursor): """ Populate a new, user-defined column in the variants table with a INTEGER indicating the count of overlaps between the variant and the annotation file. """ add_requested_column(args.col_name, update_cursor) # For each, use Tabix to count overlaps with the user-defined # annotation file. Update the variant row with the count. annos = pysam.Tabixfile(args.anno_file) select_cursor.execute("SELECT chrom, start, end, variant_id FROM variants") for row in select_cursor: hit_list = [] for hit in annos.fetch(str(row['chrom']), int(row['start']), int(row['end']), parser=pysam.asTuple()): try: hit_list.append(hit[int(args.col_extract) - 1]) except IndexError: sys.exit("Column " + args.col_extract + " exceeds \ the number of columns in your \ annotation file. Exiting.") hits = ",".join(hit_list) if len(hit_list): update_qry = "UPDATE variants SET " + args.col_name + " = '" + hits + \ "' WHERE variant_id = " + str(row['variant_id']) else: update_qry = "UPDATE variants SET " + args.col_name + " = NULL" + \ " WHERE variant_id = " + str(row['variant_id']) update_cursor.execute(update_qry)
def load_counts(self, discfile, window_in, window_out): reg = '{0}:{1}-{2}' def _get_coords(pos, strand): if strand == '+': start, end = pos - window_out, pos + window_in else: start, end = pos - window_in, pos + window_out return start, end startA, endA = _get_coords(self.posA, self.strandA) startB, endB = _get_coords(self.posB, self.strandB) region = reg.format(self.chrA, startA, endA) counts = defaultdict(int) pairs = discfile.fetch(region=region, parser=pysam.asTuple()) for pair in pairs: pair = _DiscPair(*pair) # Pairs were selected based on window around chrA; check chrB if pair.chrB != self.chrB: continue if not (startB <= pair.posB < endB): continue # Require pairs match breakpoint strand if pair.strandA != self.strandA or pair.strandB != self.strandB: continue counts[pair.sample] += 1 self.pair_counts = pd.DataFrame.from_dict({'count': counts})
def testTabixIndexedTsvCreation(self): inFile = "testdata/ESP6500SI-V2.chr1.snps_indels.head.25.txt" destDir = "out" # chr, startPos, endPos resultIndexedFile = TabixIndexer.index(destDir=destDir, inputFilename=inFile, fileColumnNumList=[0, 1, 1]) self.assertTrue(os.path.exists(resultIndexedFile), "No index file was generated.") chrom = "1" start = "69594" end = "69594" tsvRecords = None tsvReader = pysam.Tabixfile(filename=resultIndexedFile) # initialize the tsv reader try: tsvRecords = tsvReader.fetch(chrom, int(start)-1, int(end), parser=pysam.asTuple()) except ValueError: pass tsvRecord = None for tsvRecord in tsvRecords: self.assertEqual(tsvRecord[5], "2,6190", "Value in column sixth does not match the expected value.") self.assertIsNotNone(tsvRecord, "No record for %s:%s-%s was found." % (chrom, start, end)) os.remove(resultIndexedFile)
def __iter__(self): from pysam import Tabixfile, asTuple f = Tabixfile(self.filename, mode='r') try: # header row if self.header is not None: yield self.header else: # assume last header line has fields h = list(f.header) if len(h) > 0: header_line = text_type(h[-1], encoding='ascii') yield tuple(header_line.split('\t')) # data rows for row in f.fetch(reference=self.reference, start=self.start, end=self.stop, region=self.region, parser=asTuple()): yield tuple(row) except: raise finally: f.close()
def get_snp_data(*args, **kwargs): ''' proxy for TabixFile.fetch ''' kwargs['multiple_iterators'] = True return TabixFile(SNP_FILE, parser=asTuple()).\ fetch(*args, **kwargs)
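# Hedged usage sketch: SNP_FILE is whatever tabix-indexed file the module points at;
# the region below and the assumed VCF-like column layout are illustrative only.
for rec in get_snp_data("1", 1000000, 1010000):
    rsid, ref, alt = rec[2], rec[3], rec[4]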
def vci_query(vci_file, region, fasta_file): # ./bin/g2gtools vciquery -v data/mm/REF2CAST.vci.gz -r "1:13009000-13009800" -d start = time.time() vci_file = g2g_utils.check_file(vci_file, 'r') LOG.info("VCI File: {}".format(vci_file)) LOG.info("Region: {}".format(region)) vci_f = VCIFile(vci_file, seq_ids=[region.seq_id]) vci_f.parse(False) mappings = vci_f.find_mappings(region.seq_id, region.start, region.end) for m in mappings: LOG.debug(m) start_pos = mappings[0].to_start end_pos = mappings[-1].to_end LOG.debug("Converted Region: {}:{}-{}".format(region.seq_id, start_pos + 1, end_pos + 1)) for line in vci_f.fetch(reference=region.seq_id, start=start_pos, end=end_pos, parser=pysam.asTuple()): print(str(line)) LOG.info("VCI parsed: {0}".format(g2g_utils.format_time( start, time.time())))
def fetch_highest_scores(self, chrom, pos_begin, pos_end): result = dict() stripped_chrom = handle_chrom_prefix(self.chr_prefix, chrom) try: for line in self.accessor.direct_infile.fetch( stripped_chrom, pos_begin - 1, pos_end, parser=pysam.asTuple()): line = LineAdapter(self.accessor.score_file, line) for column in self.score_names: score_index = self.schema.col_names.index(column) score_value = float(line[score_index]) \ if str.lower(line[score_index]) \ != self.config.general.no_score_value else np.nan result[column] = max(score_value, result.get(column, np.nan)) return result except ValueError as ex: print( f"could not find region {chrom}:{pos_begin}-{pos_end} " f"in {self.score_filename}: ", ex, file=sys.stderr, ) return result
def getMinMaxPositions(depthFile, contig): with closing(pysam.TabixFile(depthFile)) as tabix: first_entry = None for first_entry in tabix.fetch(contig, 0, parser=pysam.asTuple()): break last_Mbp = 0 while any(True for _ in tabix.fetch( contig, last_Mbp, parser=pysam.asTuple())): last_Mbp += 5000000 last_entry = None for last_entry in tabix.fetch(contig, last_Mbp - 5000000, parser=pysam.asTuple()): pass return (long(first_entry[1]) if first_entry is not None else None, long(last_entry[1]) if last_entry is not None else None)
def bed_regions(bed_file, chromo): open_bed = pysam.TabixFile(bed_file) for line in open_bed.fetch(chromo, parser=pysam.asTuple()): start = int(line[1]) end = int(line[2]) yield start, end
def _create_column_dict_from_tabix_index(self, mutation): mut_start = int(mutation.start) mut_end = int(mutation.end) chrom = mutation.chr vals = {} try: # tabix needs position - 1 tsv_records = self.tsv_reader.fetch(chrom, mut_start - 1, mut_end, parser=pysam.asTuple()) i = -1 for i, tsv_record in enumerate(tsv_records): if not tsv_record: # skip in case no records are found continue logging.getLogger(__name__).debug("Got a record.") # Determine whether the new tsv record matches mutation or not if self._is_matching(mutation, tsv_record): for colName in self.output_tsv_headers: if colName.strip() == "": continue val = tsv_record[self.tsv_headers[colName]] if colName not in vals: vals[colName] = [val] else: vals[colName] += [val] logging.getLogger(__name__).debug("Processed %d records." % (i + 1)) except ValueError as ve: msg = "Exception when looking for tsv records. Empty set of records being returned: " + repr( ve) logging.getLogger(__name__).debug(msg) return vals
def get_info_from_variants(chrom, start, stop, field):
    """Extract a FORMAT field from variants in a region of the VCF file

    Arguments:
        chrom (str): chromosome of search region
        start (str): start position of search region
        stop (str): stop position of search region
        field (str): field to extract from VCF file

    Returns:
        list (tuple): list of tuples by (chromosome, field value)
    """
    tuples = []
    format_index = get_format_index()
    field_index = get_format_field_index(field)
    try:
        for entry in tbx.fetch(chrom, int(start), int(stop), parser=asTuple()):
            if float(entry[5]) > 20:
                tuples.append(
                    (chrom, get_field_value(entry, format_index, field_index)))
    except ValueError:
        print("No variants found in region {}:{}:{}".format(
            chrom, start, stop))
        pass
    return tuples
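# Hedged usage sketch: relies on the module-level `tbx` TabixFile and the FORMAT
# helpers defined elsewhere in this module; region and field name are illustrative.
dp_values = get_info_from_variants("chr1", "1000000", "1001000", "DP")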
def annotate_indel_on_db(idl, idl_report, db, genome, chr_prefixed, vcf_chr_prefixed, preset):
    chr, pos, idl_type, idl_seq = idl.chr, idl.pos, idl.idl_type, idl.idl_seq

    # search for equivalent indels over pos +/- search_window nt
    search_window = 50
    start, end = pos - search_window, pos + search_window
    chr_vcf = chr.replace("chr", "")
    chr_vcf = "chr" + chr_vcf if vcf_chr_prefixed else chr_vcf

    for record in db.fetch(chr_vcf, start, end, parser=pysam.asTuple()):
        bambinos = vcf2bambino(record)
        for bb in bambinos:
            if idl_type == bb.idl_type and len(idl_seq) == len(bb.idl_seq):
                # indel on db represented in the reference genome
                db_idl = curate_indel_in_genome(
                    genome, chr, bb.pos, bb.idl_type, bb.idl_seq, chr_prefixed
                )
                if idl == db_idl:
                    if preset == "dbsnp":
                        idl_report.add_dbsnp_id(record[2])
                        idl_report.add_dbsnp_freq(dbsnp_freq(record))
                        # idl_report.add_dbsnp_origin(dbsnp_origin(record))
                        idl_report.add_dbsnp_common(dbsnp_common(record))
                    elif preset == "clinvar":
                        idl_report.add_clnvr_id(record[2])  # was `id`, which is undefined here
                        idl_report.add_clnvr_freq(clnvr_freq(record))
                        # idl_report.add_clnvr_origin(clnvr_origin(record))
                        idl_report.add_clnvr_info(cln_info(record))
                    else:
                        idl_report.add_germline_id(record[2])

    return idl_report
def testCopy(self): a = self.tabix.fetch(parser=pysam.asTuple()).next() b = copy.copy(a) self.assertEqual(a, b) a = self.tabix.fetch(parser=pysam.asGTF()).next() b = copy.copy(a) self.assertEqual(a, b)
def readDepthChunk(depthFile, contig, start, end): chunk = dict() if start > 0: start -= 1 with closing(pysam.Tabixfile(depthFile)) as tabix: for row in tabix.fetch(contig, start, end, parser=pysam.asTuple()): chunk[long(row[1])] = int(row[3]) return chunk
def testTuple( self ): for x, r in enumerate(self.tabix.fetch( parser = pysam.asTuple() )): self.assertEqual( self.compare[x], list(r) ) self.assertEqual( len(self.compare[x]), len(r) ) for c in range(0,len(r)): self.assertEqual( self.compare[x][c], r[c] )
def aggregate(self, chrom): import pysam filepath = self.filepath binsize = self.gs.binsize idmap = self.gs.idmap chromsizes = self.gs.chromsizes chrom_binoffset = self.gs.chrom_binoffset chrom_abspos = self.gs.chrom_abspos start_abspos = self.gs.start_abspos C2, P2 = self.C2, self.P2 these_bins = self.gs.fetch(chrom) rows = [] with pysam.TabixFile(filepath, 'r', encoding='ascii') as f: parser = pysam.asTuple() accumulator = Counter() for bin1_id, bin1 in these_bins.iterrows(): for line in f.fetch(chrom, bin1.start, bin1.end, parser=parser): chrom2 = line[C2] pos2 = int(line[P2]) try: cid2 = idmap[chrom2] except KeyError: # this chrom2 is not requested continue if binsize is None: lo = chrom_binoffset[cid2] hi = chrom_binoffset[cid2 + 1] bin2_id = lo + np.searchsorted( start_abspos[lo:hi], chrom_abspos[cid2] + pos2, side='right') - 1 else: bin2_id = chrom_binoffset[cid2] + (pos2 // binsize) accumulator[bin2_id] += 1 if not accumulator: continue rows.append( pandas.DataFrame( { 'bin1_id': bin1_id, 'bin2_id': list(accumulator.keys()), 'count': list(accumulator.values()) }, columns=['bin1_id', 'bin2_id', 'count']).sort_values('bin2_id')) accumulator.clear() logger.info(chrom) return pandas.concat(rows, axis=0) if len(rows) else None
def bed_regions(bed_file, chromo): open_bed = pysam.TabixFile(bed_file) coords = [] for line in open_bed.fetch(chromo, parser=pysam.asTuple()): start = int(line[1]) end = int(line[2]) coords += range(start, end) return coords
def by_region(region, version, species, limit=None): """Perform the search by region. Args: region (str): The region to look for SNPs. version (int): The Ensembl version number. species (str): The Ensembl species identifier. limit (int, optional): Maximum number of SNPs to return, ``None`` for all. Returns: list: All the SNPs in `region`. Each element is another ``list`` with the following values: * chromosome * position * SNP identifier * reference allele * alternate allele Raises: ValueError: When `region` is empty. """ LOG = utils.get_logger() LOG.debug(sqlite3.version_info) LOG.debug(sqlite3.version) LOG.debug('range={}'.format(region)) LOG.debug('version={}'.format(version)) LOG.debug('species_id={}'.format(species)) LOG.debug('limit={}'.format(limit)) try: if not region: raise ValueError('no ids were passed in') new_region = fetch_utils.str_to_region(region) tabix_file = fetch_utils.get_tabix_file(version, species) tbx = pysam.TabixFile(tabix_file) start_time = time.time() snps = [] for row in tbx.fetch('{}'.format(new_region.chromosome), new_region.start_position, new_region.end_position, parser=pysam.asTuple()): snps.append(list(row[:5])) LOG.info('Done: {}'.format(utils.format_time(start_time, time.time()))) return snps except Exception as e: LOG.error('Error: {}'.format(e)) return None
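# Hedged usage sketch: the Ensembl version and species identifier below are
# placeholders for whatever fetch_utils.get_tabix_file expects in this project.
snps = by_region("1:3000000-3100000", version=90, species="Mm")
for chrom, pos, snp_id, ref, alt in snps or []:
    print(chrom, pos, snp_id, ref, alt)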
def testUnset(self): for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): self.assertEqual(self.compare[x], list(r)) c = list(r) e = list(r) for y in range(len(r)): r[y] = c[y] = None e[y] = "" self.assertEqual(c, list(r)) self.assertEqual("\t".join(e), str(r))
def match(self, chrm, position, indels=True): ''' match single position in CADD file; default indels to true b/c will be using this more with the indel file''' tbxFh = self._indel if indels else self._snv caddChr = 'MT' if chrm == 'M' else xstr(chrm) try: return tbxFh.fetch(caddChr, int(position) - 1, int(position), parser=pysam.asTuple()) except ValueError as e: # happens sometimes on chrm M/MT warning("WARNING", e) return []
def _region_reset(self, region): region = handle_chrom_prefix(self._has_chrom_prefix, region) try: self.lines_iterator = self.infile.fetch(region=region, parser=pysam.asTuple()) except ValueError as ex: print(f"could not find region {region} in {self.filename}:", ex, file=sys.stderr) self.lines_iterator = None
def main(): parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('vcf') parser.add_argument('famfile', type=argparse.FileType('r')) parser.add_argument('-c', '--countfile', required=True) parser.add_argument('-d', '--discfile', required=True) parser.add_argument('--discfile-index') parser.add_argument('--countfile-index') parser.add_argument('--background', type=int, default=160) parser.add_argument('--max-parents', type=float, default=10) parser.add_argument('petest', type=argparse.FileType('w'), help='fout') parser.add_argument('srtest', type=argparse.FileType('w'), help='fout') args = parser.parse_args() vcf = pysam.VariantFile(args.vcf) fam = parse_famfile(args.famfile) if args.discfile_index is None: discfile = pysam.TabixFile(args.discfile, parser=pysam.asTuple()) else: discfile = pysam.TabixFile(args.discfile, index=args.discfile_index, parser=pysam.asTuple()) if args.countfile_index is None: countfile = pysam.TabixFile(args.countfile, parser=pysam.asTuple()) else: countfile = pysam.TabixFile(args.countfile, index=args.countfile_index, parser=pysam.asTuple()) header = 'name sample log_pval called_median bg_median'.split() args.petest.write('\t'.join(header) + '\n') header = 'name sample coord pos log_pval called_median bg_median'.split() args.srtest.write('\t'.join(header) + '\n') runner = DenovoTestRunner(vcf, fam, countfile, discfile, args.petest, args.srtest, args.background, args.max_parents) runner.run()
def get_gapped_wnds(self, chr, tbx_gaps): gapped_wnds = [] for t in tbx_gaps.fetch(chr, parser=pysam.asTuple()): _chr, start, end = t wnd_start = np.searchsorted(self.starts, start) wnd_end = np.searchsorted(self.starts, end) gapped_wnds.append(tuple([wnd_start, wnd_end])) return gapped_wnds
def get_ref_alt_from_dbSNP(chrom, pos, path_vcf):
    tbx = pysam.TabixFile(path_vcf)
    for row in tbx.fetch(chrom, pos - 1, pos, parser=pysam.asTuple()):
        if len(row[3]) == 1 and 1 in map(len, row[4].split(',')):
            break
    else:
        # the original code halted here via an undefined name; raise explicitly instead
        raise ValueError('position {}:{} not found in dbSNP'.format(chrom, pos))
    assert ',' not in row[4], row
    return row[3], row[4]
def __init__(self, task_queue, results_queue, family, args): multiprocessing.Process.__init__(self) self.task_queue = task_queue self.family = family self.results_queue = results_queue self.verbosity = args.verbose self.phased = args.phased self.cadd_file = args.cadd_file[0] self.chr_prefix = args.chr_prefix if self.cadd_file: self.cadd_file = Tabixfile(self.cadd_file, parser = asTuple())
def get_overlapping_wnds(self,chr,tbx): wnd_starts, wnd_ends = self.get_wnds_by_chr(chr) bnds = np.array([ [int(l[1]),int(l[2])] for l in tbx.fetch(chr,parser=pysam.asTuple()) ]) start_idxs = np.searchsorted(wnd_starts,bnds[:,0]) end_idxs = np.searchsorted(wnd_starts,bnds[:,1]) #print start_idxs #print end_idxs ret = np.c_[start_idxs,end_idxs] return ret
def parse_dbSNP(args, chrom, pos, ref, alt): ## todo: check this function... I might miss variants in dbSNP... tbx = pysam.TabixFile(args.dbSNP) row = None for row in tbx.fetch(chrom, pos - 1 - 1, pos, parser=pysam.asTuple()): print(row, file=sys.stderr) if any([ ## SNP. row[3] == ref and alt in row[4].split(','), ## SNP in dbSNP and MNP in preselected.txt all([ row[3] == ref[0], len(set(alt[0].split(',')) & set(row[4].split(','))) > 0, len(ref) in map(len, alt.split(',')), len(row[3]) == 1, ]), ## Insertion. all([ int(row[1]) == pos, len(row[3]) == 1, ref == '-', ## One or more ALTs overlap (e.g. rs3835252). len(set(x[1:] for x in row[4].split(',')) & set(alt.split(','))) >= 1, ]), ## Deletion. all([ int(row[1]) == pos, len(row[4]) == 1, alt == '-', row[3][:1] == ref, len(row[3]) > 1, len(row[4]) == 1, ]), ## Deletion. all([ int(row[1]) + 1 == pos, len(row[3]) == len(ref) + 1, set(map(len, row[4].split(','))) == set([1]), alt == '-', row[3][1:] == ref, ]), ]): rsID = row[2] break ## Not found in dbSNP. else: rsID = '' return rsID
def testRead(self): for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): self.assertEqual(self.compare[x], list(r)) self.assertEqual(len(self.compare[x]), len(r)) # test indexing for c in range(0, len(r)): self.assertEqual(self.compare[x][c], r[c]) # test slicing access for c in range(0, len(r) - 1): for cc in range(c + 1, len(r)): self.assertEqual(self.compare[x][c:cc], r[c:cc])
def get_dup_overlap(self, tbx_dups): #assuming dup file is collapsed min_s = min(self.all_starts) max_e = max(self.all_ends) t = 0 for l in tbx_dups.fetch(self.chr,min_s,max_e,parser=pysam.asTuple()): c,s,e = l s,e = int(s), int(e) curr_s = s>min_s and s or min_s curr_e = e<max_e and e or max_e t += curr_e-curr_s if t==0: return 0.0 return float(t) / float(max_e - min_s)
def testWrite(self): for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): self.assertEqual(self.compare[x], list(r)) c = list(r) for y in range(len(r)): r[y] = "test_%05i" % y c[y] = "test_%05i" % y self.assertEqual([x for x in c], list(r)) self.assertEqual("\t".join(c), str(r)) # check second assignment for y in range(len(r)): r[y] = "test_%05i" % y self.assertEqual([x for x in c], list(r)) self.assertEqual("\t".join(c), str(r))
def __init__(self, inFile, parser=pysam.asTuple()): # inFile is passed in but is never used, and yet the TabixReader works. How?! # This inFile magic is because Tabixfile is a Cython object that uses def __cinit__ # rather than def __init__; the former is called automatically exactly once for the # base class prior to any use of __init__. Use of subsequent __init__ should obey # normal inheritance rules (assuming inheritance from object and it's not an old-style class). # So __cinit__ sets the input file, and then our __init__ (which doesn't override a base method) # is called and sets the parser. # This could all break in the future if pysam moves away from __cinit__, but doing so would # reduce performance and so it seems unlikely. # See: # https://github.com/cython/cython/blob/master/docs/src/userguide/special_methods.rst#id19 # https://github.com/pysam-developers/pysam/blob/master/pysam/libctabix.pyx#L331 #super(TabixReader, self).__init__(inFile, parser=parser) self.parser = parser
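# Hedged illustration (assumed, not from the source): TabixReader presumably
# subclasses pysam.Tabixfile, so the file is opened in __cinit__ and the __init__
# above only records the parser for later fetch() calls. The path is hypothetical.
reader = TabixReader("calls.bed.gz")  # bgzipped, tabix-indexed file
for rec in reader.fetch("chr1", 100000, 200000, parser=reader.parser):
    print(rec[0], rec[1], rec[2])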
def testIteratorCompressed(self): """test iteration from compressed file.""" with gzip.open(self.filename) as infile: for x, r in enumerate(pysam.tabix_iterator(infile, pysam.asTuple())): self.assertEqual(self.compare[x], list(r)) self.assertEqual(len(self.compare[x]), len(r)) # test indexing for c in range(0, len(r)): self.assertEqual(self.compare[x][c], r[c]) # test slicing access for c in range(0, len(r) - 1): for cc in range(c + 1, len(r)): self.assertEqual(self.compare[x][c:cc], r[c:cc])
def plot_GC(chr,tbx_gc,cp_vect,starts,ends): F=open("GC.txt",'w') cp_vect = cp_vect.astype(np.float64) var = get_windowed_variance(cp_vect,50) F.write("var\tgc\n") for i in xrange(50,starts.shape[0],101): s = starts[i-50] e = ends[i+50] gc = np.mean(np.array([float(l[3]) for l in tbx_gc.fetch(chr,s,e,parser=pysam.asTuple())])) if var[i] != 0: print >>F,"%f\t%f"%( var[i],gc ) exit(1)
def load_data(filepath, interval, data_columns=[5], dtype=np.float): """ Loads numeric data columns from a BED-format TABIX file """ res = np.zeros((len(interval), len(data_columns)), dtype = dtype) tabix = pysam.TabixFile(filepath) for row in tabix.fetch(interval.chrom, interval.start, interval.end, parser = pysam.asTuple()): i = int(row[1])- interval.start for j, col in enumerate(data_columns): res[i, j] = dtype(row[col-1]) tabix.close() return res
def get_snps(pid): ''' return sequences mentioned in SNPData.csv ''' coords = map(make_coord_string, snps.COORDINATES.values()) search_args = { 'coordinate': ','.join(coords), 'patient': pid, '_count': 100000 } seq_bundle = call_api('/Sequence', search_args) seqs = (entry['content'] for entry in seq_bundle['entry']) translation_f = TabixFile(SNP_TRANSLATION_FNAME, parser=asTuple()) return jsonify({ get_rsid(translation_f, seq): seq['observedSeq'] for seq in seqs })
def get_callset(self, exclude_tbxs=[], min_exclude_ratio=0.3, min_exclude_len=20000): """ return segments and their copies in genome coordinates adding subtraction of gaps """ c=callset() wnd_starts,wnd_ends,cps = self.segment_edges wnd_starts,wnd_ends,cps = np.array(wnd_starts), np.array(wnd_ends), np.array(cps) for i in xrange(len(wnd_starts)-1): start, end = self.starts[wnd_starts[i]], self.ends[wnd_ends[i]] wnd_start, wnd_end = wnd_starts[i], wnd_ends[i] #exclude totally anything in these tbxs for exclude_tbx in exclude_tbxs: ex_starts, ex_ends = [], [] for l in exclude_tbx.fetch(self.chr,start,end,parser=pysam.asTuple()): _chr,_s,_e = l _s, _e = int(_s), int(_e) if _e-_s > min_exclude_len: ex_starts.append(_s) ex_ends.append(_e) n_exclude = len(ex_starts) if n_exclude: ex_coords = self.get_exclude_coords(ex_starts, ex_ends) wnd_start_ends = self.subtract_excluded(wnd_start, wnd_end, ex_coords) else: wnd_start_ends = [tuple([wnd_start, wnd_end])] for i in xrange(len(wnd_start_ends)): wnd_start = wnd_start_ends[i][0] wnd_end = wnd_start_ends[i][1] c.add_call(self.chr, self.starts[wnd_start], self.ends[wnd_end], np.mean(self.cp_data[wnd_start:wnd_end]), wnd_start, wnd_end, self.cp_data[wnd_start:wnd_end]) return c
def testRead(self): for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())): c = self.compare[x] self.assertEqual(c, list(r)) self.assertEqual(len(c), len(r)) # test indexing for y in range(0, len(r)): self.assertEqual(c[y], r[y]) # test slicing access for y in range(0, len(r) - 1): for cc in range(y + 1, len(r)): self.assertEqual(c[y:cc], r[y:cc]) self.assertEqual("\t".join(map(str, c)), str(r))
def _tabix_iteradaptor(stream): """Open `stream` as an iterator over a `tabix`_ file, returning raw strings from tabix data. Parameters ---------- streams : open file-like, :class:`pysam.ctabix.tabix_file_iterator` Returns ------- generator Generator of tab-delimited string records in `tabix`_ file """ if not isinstance(stream,(pysam.ctabix.tabix_generic_iterator, pysam.ctabix.tabix_file_iterator) ): stream = pysam.tabix_file_iterator(stream,pysam.asTuple()) return (str(X) for X in stream)
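# Hedged usage sketch: the BED path is hypothetical; the adaptor accepts either an
# open file-like object or an existing pysam tabix iterator and yields raw
# tab-delimited strings.
with open("annotations.bed", "rb") as fh:
    for line in _tabix_iteradaptor(fh):
        fields = line.rstrip("\n").split("\t")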
def process(self): if self.genelist: gene_list = pysam.TabixFile(self.genelist) with open(self.input, 'r') as fin: with open(self.output, 'w') as fout: for line in fin: line = line.strip() if line.startswith('#chr'): header = line.split('\t') if self.genelist: header += ['PIDD_GENE', 'Inheritance', 'Phenotype'] fout.write('\t'.join(header) + '\n') continue elif line.startswith('##'): continue try: row = OrderedDict(zip(header, line.split('\t'))) except: continue if self.genelist: PIDD_GENE = [] Inheritance = [] Phenotype = [] try: for genepanel_line in gene_list.fetch(row['#chr'], int(row['start']), int(row['end']), parser=pysam.asTuple()): PIDD_GENE += [genepanel_line[3]] Inheritance += [genepanel_line[4]] Phenotype += [genepanel_line[5]] row['PIDD_GENE'] = "|".join(PIDD_GENE) if PIDD_GENE else 'NA' row['Inheritance'] = "|".join(Inheritance) if Inheritance else 'NA' row['Phenotype'] ="|".join(Phenotype) if Phenotype else 'NA' except ValueError: pass if self.filter_line(row): fout.write('\t'.join(row.values()) + '\n')
def _get_hits(coords, annotation, parser_type): """Retrieve BED information, recovering if BED annotation file does have a chromosome. """ if parser_type == "bed": parser = pysam.asBed() elif parser_type == "vcf": parser = pysam.asVCF() elif parser_type == "tuple": parser = pysam.asTuple() elif parser_type is None: parser = None else: raise ValueError("Unexpected parser type: %s" % parser) chrom, start, end = coords try: hit_iter = annotation.fetch(str(chrom), start, end, parser=parser) # catch invalid region errors raised by ctabix except ValueError: hit_iter = [] return hit_iter
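# Hedged example (not from the original code): `annotation` is assumed to be an open
# pysam.TabixFile over a bgzipped, tabix-indexed BED file; the coordinates are made up.
import pysam

annotation = pysam.TabixFile("regions.bed.gz")
for hit in _get_hits(("chr1", 10000, 20000), annotation, "bed"):
    print(hit.contig, hit.start, hit.end)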
def __init__(self, filepath, chromsizes, bins): try: import pysam except ImportError: raise ImportError("pysam is required to read tabix files") n_bins = len(bins) self.idmap = pandas.Series(index=chromsizes.keys(), data=range(len(chromsizes))) self.bins = bins self.binsize = get_binsize(bins) self.pairsfile = pysam.TabixFile(filepath, 'r', encoding='ascii') self.parser = pysam.asTuple() # number of lines in file p1 = subprocess.Popen(['pigz', '-p', '8', '-dc', filepath], stdout=subprocess.PIPE) p2 = subprocess.Popen(['wc', '-l'], stdin=p1.stdout, stdout=subprocess.PIPE) self.n_records = int(p2.communicate()[0]) # convert genomic coords of bin starts to absolute self.idmap = pandas.Series(index=chromsizes.keys(), data=range(len(chromsizes))) bin_chrom_ids = self.idmap[bins['chrom']].values self.cumul_length = np.r_[0, np.cumsum(chromsizes)] self.abs_start_coords = self.cumul_length[bin_chrom_ids] + bins['start'] # chrom offset index: chrom_id -> offset in bins chrom_nbins = bins.groupby(bin_chrom_ids, sort=False).size() self.chrom_offset = np.r_[0, np.cumsum(chrom_nbins)]