def read_table(filename, optfile=None):
    """Load allele frequencies (AC / AN) from a tab-separated VCF-style file.

    Blank lines and lines starting with '#' are skipped.  For every other
    line the chromosome (column 1), position (column 2), first ALT allele
    (column 5) and the AC / AN entries of the INFO column (column 8) are
    read.

    Args:
        filename: path handed to maybe_gzip_open for reading.
        optfile: optional path; when given and no file exists there yet,
            the parsed table is pickled to it.

    Returns:
        defaultdict(dict): chrom -> {(pos, alt): float(AC) / AN}.

    Raises:
        AssertionError: if a data line lacks an AC= or AN= INFO entry.
        ZeroDivisionError: if AN is 0.
    """
    data = defaultdict(dict)
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            tokens = line.split('\t')
            chrom, pos = tokens[:2]
            alt = tokens[4].split(',')[0]
            info = tokens[7]

            # Pull AC and AN out of the INFO field; the first occurrence of
            # each key wins and scanning stops once both are known.
            ac = an = None
            for field in info.split(';'):
                if ac is None and field.startswith('AC='):
                    # AC may list one count per ALT allele; only the first
                    # ALT allele is kept above, so keep the matching count.
                    ac = int(field.split('=')[1].split(',')[0])
                elif an is None and field.startswith('AN='):
                    an = int(field.split('=')[1])
                if ac is not None and an is not None:
                    break

            assert ac is not None and an is not None, \
                "Error: entry without AC and AN: %s" % line

            # NOTE(review): lstrip('chr') removes any leading run of the
            # characters 'c', 'h', 'r', not just the literal prefix.  It is
            # left unchanged because table lookups must normalise the
            # chromosome name the same way.
            data[chrom.lstrip('chr')][(int(pos), alt)] = float(ac) / an

    if optfile and not os.path.isfile(optfile):
        print >>sys.stderr, "Saving optimized table to:", optfile
        with maybe_gzip_open(optfile, 'wb') as ofp:
            cPickle.dump(data, ofp, cPickle.HIGHEST_PROTOCOL)

    return data
def read_table(filename, optfile=None):
    """Load allele frequencies (AC / AN) from a tab-separated VCF-style file.

    Blank lines and lines starting with '#' are skipped.  For every other
    line the chromosome (column 1), position (column 2), first ALT allele
    (column 5) and the AC / AN entries of the INFO column (column 8) are
    read.

    Args:
        filename: path handed to maybe_gzip_open for reading.
        optfile: optional path; when given and no file exists there yet,
            the parsed table is pickled to it.

    Returns:
        defaultdict(dict): chrom -> {(pos, alt): float(AC) / AN}.

    Raises:
        AssertionError: if a data line lacks an AC= or AN= INFO entry.
        ZeroDivisionError: if AN is 0.
    """
    data = defaultdict(dict)
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            tokens = line.split('\t')
            chrom, pos = tokens[:2]
            alt = tokens[4].split(',')[0]
            info = tokens[7]

            # Pull AC and AN out of the INFO field; the first occurrence of
            # each key wins and scanning stops once both are known.
            ac = an = None
            for field in info.split(';'):
                if ac is None and field.startswith('AC='):
                    # AC may list one count per ALT allele; only the first
                    # ALT allele is kept above, so keep the matching count.
                    ac = int(field.split('=')[1].split(',')[0])
                elif an is None and field.startswith('AN='):
                    an = int(field.split('=')[1])
                if ac is not None and an is not None:
                    break

            assert ac is not None and an is not None, \
                "Error: entry without AC and AN: %s" % line

            # NOTE(review): lstrip('chr') removes any leading run of the
            # characters 'c', 'h', 'r', not just the literal prefix.  It is
            # left unchanged because table lookups must normalise the
            # chromosome name the same way.
            data[chrom.lstrip('chr')][(int(pos), alt)] = float(ac) / an

    if optfile and not os.path.isfile(optfile):
        print >> sys.stderr, "Saving optimized table to:", optfile
        with maybe_gzip_open(optfile, 'wb') as ofp:
            cPickle.dump(data, ofp, cPickle.HIGHEST_PROTOCOL)

    return data
def read_table(filename, optfile=None):
    """Read a whitespace-separated "value chrom pos" table into nested dicts.

    Blank lines and lines starting with '#' are ignored.

    Args:
        filename: path handed to maybe_gzip_open for reading.
        optfile: optional path; when given and no file exists there yet,
            the parsed table is pickled to it.

    Returns:
        defaultdict(dict): chrom -> {int(pos): float(value)}.
    """
    table = defaultdict(dict)
    with maybe_gzip_open(filename) as handle:
        for raw in handle:
            record = raw.strip()
            if record and not record.startswith('#'):
                score, chrom, pos = record.split()
                # NOTE(review): lstrip('chr') strips any leading 'c'/'h'/'r'
                # characters rather than the literal "chr" prefix.
                table[chrom.lstrip('chr')][int(pos)] = float(score)

    need_cache = optfile and not os.path.isfile(optfile)
    if need_cache:
        print >>sys.stderr, "Saving optimized table to:", optfile
        with maybe_gzip_open(optfile, 'wb') as out:
            cPickle.dump(table, out, cPickle.HIGHEST_PROTOCOL)

    return table
def read_table(filename, optfile=None):
    """Read a whitespace-separated "value chrom pos" table into nested dicts.

    Blank lines and lines starting with '#' are ignored.

    Args:
        filename: path handed to maybe_gzip_open for reading.
        optfile: optional path; when given and no file exists there yet,
            the parsed table is pickled to it.

    Returns:
        defaultdict(dict): chrom -> {int(pos): float(value)}.
    """
    table = defaultdict(dict)
    with maybe_gzip_open(filename) as handle:
        for raw in handle:
            record = raw.strip()
            if record and not record.startswith('#'):
                score, chrom, pos = record.split()
                # NOTE(review): lstrip('chr') strips any leading 'c'/'h'/'r'
                # characters rather than the literal "chr" prefix.
                table[chrom.lstrip('chr')][int(pos)] = float(score)

    need_cache = optfile and not os.path.isfile(optfile)
    if need_cache:
        print >> sys.stderr, "Saving optimized table to:", optfile
        with maybe_gzip_open(optfile, 'wb') as out:
            cPickle.dump(table, out, cPickle.HIGHEST_PROTOCOL)

    return table
def script(filename, quiet=False, verbose=False, **kwargs): fields = ['MES', 'dMES', 'MES+', 'MES-', 'MEC-MC?', 'MEC-CS?', 'MES-KM?'] print '#%s' % '\t'.join(fields) NULL = [None] * len(fields) seqs = [] with maybe_gzip_open(filename) as ifp: for line in ifp: line = line.strip() if line: seqs.append(Seq(line)) sites = {} scores = {} # Accumulate sites for each side for side in [3, 5]: sites[side] = set() for s in seqs: sites[side].update(s.iter_seqs(side)) # Score ALL sites at once! scores[side] = score_sites(side, sites[side]) # Print stats for each object, given queried scores for s in seqs: # Compute field-wise max of rows for 3' and 5' max_row = imap(max, s.score(3, scores[3]), s.score(5, scores[5])) print_row(max_row)
def get_genes(gene_filename=None, cache_filename=None, genome_filename=None, **kwargs): """Loads (potentially cached) dict: gene_name -> set(genes) If not cached, genome_filename FASTA expected to provide sequence data """ assert gene_filename and genome_filename or cache_filename if cache_filename is not None and os.path.isfile(cache_filename): print >> sys.stderr, "Loading genes from pickled file: %s" % cache_filename with maybe_gzip_open(cache_filename) as ifp: genes = cPickle.load(ifp) else: genome = Genome(genome_filename) genes = defaultdict(set) missed_chroms = set() n_zero_len = 0 for entry in iter_ucsc_genes(gene_filename): chrom = entry['chrom'] if not chrom.startswith('chr'): chrom = 'chr%s' % chrom if chrom not in genome: if chrom not in missed_chroms: print >>sys.stderr, "Could not find sequence for %s" \ " in: %s" % (chrom, genome_filename) missed_chroms.add(chrom) continue # Substitute id with gene name entry['seq'] = genome[chrom] try: t = Transcript(**entry) except AssertionError, e: if "Zero-length CDS" in str(e): n_zero_len += 1 else: print >>sys.stderr, "Skipping transcript: %s: %s" \ % (entry['gene'], e) continue if t.valid(): genes[entry['gene']].add(t) if n_zero_len: print >>sys.stderr, "Skipped %d transcripts with zero-length CDS" \ " annotations" % n_zero_len if missed_chroms: print >>sys.stderr, "Missing sequences with gene annotations: %s" \ % ', '.join(sorted(missed_chroms)) genes = dict(genes) # remove defaultdict if cache_filename: print >> sys.stderr, "Saving genes to pickled file: %s" % cache_filename with open(cache_filename, 'wb') as ofp: cPickle.dump(genes, ofp, cPickle.HIGHEST_PROTOCOL)
def read_examples(filename):
    """Read a '#'-headed, whitespace-separated numeric table.

    The first line must start with '#' and names the columns.  Every
    following non-blank line must be a row of floats with the same number
    of columns as the first data row.

    Args:
        filename: path handed to maybe_gzip_open for reading.

    Returns:
        (header, class_col, data): header is the first line with every '#'
        removed, class_col is 0 when the first column is named 'class' and
        None otherwise, and data is the float array built from the rows.

    Raises:
        AssertionError: on a missing header, a '#' line after the header,
            or a row whose column count differs from the first row.
        ValueError: if a token is not a float or the array cannot be built.
    """
    lines = []
    ncols = None
    with maybe_gzip_open(filename) as ifp:
        header = ifp.readline().strip()
        assert header.startswith('#')
        header = header.replace('#', '')
        for line in ifp:
            line = line.strip()
            if not line:
                continue

            assert not line.startswith('#')
            tokens = [float(val) for val in line.split()]
            if ncols is None:
                ncols = len(tokens)
            else:
                assert ncols == len(tokens), \
                    "Found row in %s with %d columns (%d expected)" % (filename, len(tokens), ncols)

            lines.append(tokens)

    try:
        data = array(lines, dtype=float)
    except ValueError:
        # Report and propagate.  Swallowing the error here used to leave
        # `data` unbound and fail later with an unrelated NameError.
        print >> sys.stderr, "Error: could not build example array from: %s" % filename
        raise

    # The class column, when present, is the first one; an empty header
    # simply has no class column.
    cols = header.split()
    if cols and cols[0] == 'class':
        class_col = 0
    else:
        class_col = None

    return header, class_col, data
def read_examples(filename):
    """Read a '#'-headed, whitespace-separated numeric table.

    The first line must start with '#' and names the columns.  Every
    following non-blank line must be a row of floats with the same number
    of columns as the first data row.

    Args:
        filename: path handed to maybe_gzip_open for reading.

    Returns:
        (header, class_col, data): header is the first line with every '#'
        removed, class_col is 0 when the first column is named 'class' and
        None otherwise, and data is the float array built from the rows.

    Raises:
        AssertionError: on a missing header, a '#' line after the header,
            or a row whose column count differs from the first row.
        ValueError: if a token is not a float or the array cannot be built.
    """
    lines = []
    ncols = None
    with maybe_gzip_open(filename) as ifp:
        header = ifp.readline().strip()
        assert header.startswith('#')
        header = header.replace('#', '')
        for line in ifp:
            line = line.strip()
            if not line:
                continue

            assert not line.startswith('#')
            tokens = [float(val) for val in line.split()]
            if ncols is None:
                ncols = len(tokens)
            else:
                assert ncols == len(tokens), \
                    "Found row in %s with %d columns (%d expected)" % (filename, len(tokens), ncols)

            lines.append(tokens)

    try:
        data = array(lines, dtype=float)
    except ValueError:
        # Report and propagate.  Swallowing the error here used to leave
        # `data` unbound and fail later with an unrelated NameError.
        print >>sys.stderr, "Error: could not build example array from: %s" % filename
        raise

    # The class column, when present, is the first one; an empty header
    # simply has no class column.
    cols = header.split()
    if cols and cols[0] == 'class':
        class_col = 0
    else:
        class_col = None

    return header, class_col, data
def iter_sequences(filename, domain=None, **kwargs):
    """Yield (pre-mRNA, mRNA) reference/mutant sequence pairs per input line.

    Each line is upper-cased and is expected to contain one "[old/new]"
    mutation marker, with '|' separating alternating chunks.  The pre-mRNA
    is the line with every '|' removed; the mRNA keeps only the
    even-numbered chunks.  For each of the two, a (reference, mutant) tuple
    is produced.

    When *domain* is truthy the flanks are cut to a window of 2 * domain
    bases in total, borrowing from the longer flank when one side is
    shorter than *domain*.

    Lines that cannot be parsed, or that are too short for the requested
    window, are reported on stderr and yield None instead.
    """
    def window_pair(marked):
        # Split "LEFT[old/new]RIGHT" into its four parts.
        left, right = marked.split('/')
        left, ref = left.split('[')
        alt, right = right.split(']')
        if domain:
            n_left = min(len(left), domain)
            n_right = min(len(right), domain)
            # Near one end of the sequence: extend the opposite flank.
            if n_left < domain:
                n_right = min(len(right), 2 * domain - n_left)
            if n_right < domain:
                n_left = min(len(left), 2 * domain - n_right)
            left = left[-n_left:]
            right = right[:n_right]
            assert len(left) + len(right) == 2 * domain
        return left + ref + right, left + alt + right

    with maybe_gzip_open(filename) as handle:
        for raw in handle:
            seq = raw.strip().upper()
            try:
                chunks = seq.split('|')
                premrna = seq.replace('|', '')
                postmrna = ''.join(chunks[::2])
                yield window_pair(premrna), window_pair(postmrna)
            except (ValueError, AssertionError):
                print >> sys.stderr, "Error, invalid sequence: %s" % seq
                yield None
def annotate_variants(genes, filename): fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx', 'strand', 'codon', 'frame', 'premrna'] print '#%s' % '\t'.join(fields) with maybe_gzip_open(filename) as ifp: for line in ifp: line = line.rstrip() if not line or line.startswith('#'): continue tokens = line.split() chrom, pos, id, ref, alt, gene_id, tx_id = tokens[:7] chrom = chrom[3:] if chrom.startswith('chr') else chrom pos = int(pos) tx = get_transcript(genes, pos, ref, alt, gene_id, tx_id) if not tx: logging.warning('Transcript not found for variant: %s' % line) continue # Get codon, frame, and mrna cds_offset = tx.project_to_cds(pos) aa_pos = int(cds_offset / 3) + 1 codon = tx.get_codon(aa_pos) frame = cds_offset % 3 mut_str = tx.mutation_str(pos, ref, alt) print '\t'.join([chrom, str(pos), id, ref, alt, tx.gene(), tx.tx(), tx.strand(), codon, str(frame), mut_str] + tokens[7:])
def iter_sequences(filename, domain=None, **kwargs):
    """Yield (pre-mRNA, mRNA) reference/mutant sequence pairs per input line.

    Each line is upper-cased and is expected to contain one "[old/new]"
    mutation marker, with '|' separating alternating chunks.  The pre-mRNA
    is the line with every '|' removed; the mRNA keeps only the
    even-numbered chunks.  For each of the two, a (reference, mutant) tuple
    is produced.

    When *domain* is truthy the flanks are cut to a window of 2 * domain
    bases in total, borrowing from the longer flank when one side is
    shorter than *domain*.

    Lines that cannot be parsed, or that are too short for the requested
    window, are reported on stderr and yield None instead.
    """
    def window_pair(marked):
        # Split "LEFT[old/new]RIGHT" into its four parts.
        left, right = marked.split('/')
        left, ref = left.split('[')
        alt, right = right.split(']')
        if domain:
            n_left = min(len(left), domain)
            n_right = min(len(right), domain)
            # Near one end of the sequence: extend the opposite flank.
            if n_left < domain:
                n_right = min(len(right), 2 * domain - n_left)
            if n_right < domain:
                n_left = min(len(left), 2 * domain - n_right)
            left = left[-n_left:]
            right = right[:n_right]
            assert len(left) + len(right) == 2 * domain
        return left + ref + right, left + alt + right

    with maybe_gzip_open(filename) as handle:
        for raw in handle:
            seq = raw.strip().upper()
            try:
                chunks = seq.split('|')
                premrna = seq.replace('|', '')
                postmrna = ''.join(chunks[::2])
                yield window_pair(premrna), window_pair(postmrna)
            except (ValueError, AssertionError):
                print >>sys.stderr, "Error, invalid sequence: %s" % seq
                yield None
def annotate_variants(genes, filename): fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx', 'strand', 'codon', 'frame', 'premrna'] print '#%s' % '\t'.join(fields) with maybe_gzip_open(filename) as ifp: for line in ifp: line = line.rstrip() if not line or line.startswith('#'): continue tokens = line.split() chrom, pos, id, ref, alt, gene_id, tx_id = tokens[:7] chrom = chrom[3:] if chrom.startswith('chr') else chrom pos = int(pos) tx = get_transcript(genes, pos, ref, alt, gene_id, tx_id) if not tx: continue # Get codon, frame, and mrna cds_offset = tx.project_to_cds(pos) aa_pos = int(cds_offset / 3) + 1 codon = tx.get_codon(aa_pos) frame = cds_offset % 3 mut_str = tx.mutation_str(pos, ref, alt) print '\t'.join([chrom, str(pos), id, ref, alt, tx.gene(), tx.tx(), tx.strand(), codon, str(frame), mut_str] + tokens[7:])
def script(filename, quiet=False, verbose=False, **kwargs): fields = ['f_premrna', 'f_mrna'] #, 'splice_dist'] print '#%s' % '\t'.join(fields) with maybe_gzip_open(filename) as ifp: for line in ifp: line = line.strip().upper() assert line.count('/') == 1 pre, post = line.split('/') # Trim off mutation nucs and brackets pre = pre[:-2] # e,g, '[A' post = post[2:] # e.g. 'C]' pre_chunks = pre.split('|') post_chunks = post.split('|') # Assume mutation is in exon premrna_f = min(len(pre), len(post)) \ / (len(pre) + len(post) + 1) pre_cds = ''.join(pre_chunks[::2]) post_cds = ''.join(post_chunks[::2]) mrna_f = min(len(pre_cds), len(post_cds)) \ / (len(pre_cds) + len(post_cds) + 1) #splice_dist = min(len(pre_chunks[-1]), len(post_chunks[0])) print '%.4f\t%.4f' % (premrna_f, mrna_f) #, splice_dist)
def get_genes(gene_filename=None, cache_filename=None, genome_filename=None, **kwargs): """Loads (potentially cached) dict: gene_name -> set(genes) If not cached, genome_filename FASTA expected to provide sequence data """ assert gene_filename and genome_filename or cache_filename if cache_filename is not None and os.path.isfile(cache_filename): print >>sys.stderr, "Loading genes from pickled file: %s" % cache_filename with maybe_gzip_open(cache_filename) as ifp: genes = cPickle.load(ifp) else: genome = Genome(genome_filename) genes = defaultdict(set) missed_chroms = set() n_zero_len = 0 for entry in iter_ucsc_genes(gene_filename): chrom = entry['chrom'] if not chrom.startswith('chr'): chrom = 'chr%s' % chrom if chrom not in genome: if chrom not in missed_chroms: print >>sys.stderr, "Could not find sequence for %s" \ " in: %s" % (chrom, genome_filename) missed_chroms.add(chrom) continue # Substitute id with gene name entry['seq'] = genome[chrom] try: t = Transcript(**entry) except AssertionError, e: if "Zero-length CDS" in str(e): n_zero_len += 1 else: print >>sys.stderr, "Skipping transcript: %s: %s" \ % (entry['gene'], e) continue if t.valid(): genes[entry['gene']].add(t) if n_zero_len: print >>sys.stderr, "Skipped %d transcripts with zero-length CDS" \ " annotations" % n_zero_len if missed_chroms: print >>sys.stderr, "Missing sequences with gene annotations: %s" \ % ', '.join(sorted(missed_chroms)) genes = dict(genes) # remove defaultdict if cache_filename: print >>sys.stderr, "Saving genes to pickled file: %s" % cache_filename with open(cache_filename, 'wb') as ofp: cPickle.dump(genes, ofp, cPickle.HIGHEST_PROTOCOL)
def load_data(vector_filename, log=sys.stderr):
    """Load a numeric matrix and split off its first column.

    Args:
        vector_filename: path handed to maybe_gzip_open for reading; its
            contents are parsed with loadtxt as floats.
        log: file-like object that receives the progress messages.

    Returns:
        (solutions, data): the first column of the loaded matrix, and the
        remaining columns.
    """
    print >>log, "Loading vector data from file: %s" % vector_filename
    with maybe_gzip_open(vector_filename) as handle:
        matrix = loadtxt(handle, dtype=float)

    # Column 0 holds the solutions; everything else is feature data.
    labels = matrix[:, 0]
    features = matrix[:, 1:]

    print >>log, "Loaded data with %d examples and %d features" % features.shape
    return labels, features
def iter_sequences(filename):
    """Yield (pre, old, new, post) for the mutated chunk of each line.

    Each line is upper-cased and split on '|'; exactly one chunk must
    contain a '/'.  That chunk is searched for "<pre>[<old>/<new>]<post>",
    where every part consists of A/C/G/T and old/new are single bases.
    When the pattern is not found, an error is written to stderr and None
    is yielded.

    Raises:
        AssertionError: if a line does not have exactly one chunk with '/'.
    """
    pattern = re.compile(r'([ACGT]*)\[([ACGT])/([ACGT])\]([ACGT]*)')
    with maybe_gzip_open(filename) as handle:
        for raw in handle:
            seq = raw.strip().upper()
            candidates = [part for part in seq.split('|') if '/' in part]
            assert len(candidates) == 1

            found = pattern.search(candidates[0])
            if not found:
                print >> sys.stderr, "Error, invalid sequence: %s" % seq
                yield None
                continue

            pre, old, new, post = found.groups()
            yield pre, old, new, post
def iter_sequences(filename):
    """Yield (pre, old, new, post) for the mutated chunk of each line.

    Each line is upper-cased and split on '|'; exactly one chunk must
    contain a '/'.  That chunk is searched for "<pre>[<old>/<new>]<post>",
    where every part consists of A/C/G/T and old/new are single bases.
    When the pattern is not found, an error is written to stderr and None
    is yielded.

    Raises:
        AssertionError: if a line does not have exactly one chunk with '/'.
    """
    pattern = re.compile(r'([ACGT]*)\[([ACGT])/([ACGT])\]([ACGT]*)')
    with maybe_gzip_open(filename) as handle:
        for raw in handle:
            seq = raw.strip().upper()
            candidates = [part for part in seq.split('|') if '/' in part]
            assert len(candidates) == 1

            found = pattern.search(candidates[0])
            if not found:
                print >>sys.stderr, "Error, invalid sequence: %s" % seq
                yield None
                continue

            pre, old, new, post = found.groups()
            yield pre, old, new, post
def iter_lines(filename):
    """Yield (codon, mutated_codon) for each variant line in *filename*.

    Blank lines and lines starting with '#' are skipped.  The first five
    whitespace-separated columns are ref, alt, strand, codon and offset.
    On the '-' strand ref and alt are mapped through COMPLEMENT before the
    base at codon[offset] is checked against ref and replaced by alt.

    Raises:
        AssertionError: if strand is not one of '+', '-', '.', if ref or
            alt is not a single character, if codon is not three
            characters long, or if codon[offset] differs from ref.
    """
    valid_strands = set(['+', '-', '.'])
    with maybe_gzip_open(filename) as handle:
        for raw in handle:
            record = raw.strip()
            if not record or record.startswith('#'):
                continue

            ref, alt, strand, codon, offset = record.split()[:5]
            assert strand in valid_strands
            assert len(ref) == len(alt) == 1
            assert len(codon) == 3

            offset = int(offset)
            if strand == '-':
                ref, alt = COMPLEMENT[ref], COMPLEMENT[alt]

            assert codon[offset] == ref
            mutated = ''.join([codon[:offset], alt, codon[offset + 1:]])
            yield codon, mutated
def iter_lines(filename):
    """Yield (codon, mutated_codon) for each variant line in *filename*.

    Blank lines and lines starting with '#' are skipped.  The first five
    whitespace-separated columns are ref, alt, strand, codon and offset.
    On the '-' strand ref and alt are mapped through COMPLEMENT before the
    base at codon[offset] is checked against ref and replaced by alt.

    Raises:
        AssertionError: if strand is not one of '+', '-', '.', if ref or
            alt is not a single character, if codon is not three
            characters long, or if codon[offset] differs from ref.
    """
    valid_strands = set(['+', '-', '.'])
    with maybe_gzip_open(filename) as handle:
        for raw in handle:
            record = raw.strip()
            if not record or record.startswith('#'):
                continue

            ref, alt, strand, codon, offset = record.split()[:5]
            assert strand in valid_strands
            assert len(ref) == len(alt) == 1
            assert len(codon) == 3

            offset = int(offset)
            if strand == '-':
                ref, alt = COMPLEMENT[ref], COMPLEMENT[alt]

            assert codon[offset] == ref
            mutated = ''.join([codon[:offset], alt, codon[offset + 1:]])
            yield codon, mutated
def random_controls(genes, filename, match_cpg=False, avoid_splice=False): fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx'] print '#%s' % '\t'.join(fields) with maybe_gzip_open(filename) as ifp: for line in ifp: line = line.rstrip() if not line or line.startswith('#'): continue tokens = line.split() chrom, pos, id, ref, alt, gene_id, tx_id = tokens[:7] chrom = chrom[3:] if chrom.startswith('chr') else chrom pos = int(pos) tx = get_transcript(genes, pos, ref, alt, gene_id, tx_id) if not tx: continue if match_cpg: offset = tx.project_to_premrna(pos) pre = tx.premrna()[offset - 1:offset] post = tx.premrna()[offset + 1:offset + 2] tx_ref = ref tx_alt = alt if tx.strand() == '-': tx_ref = ref.translate(COMPLEMENT_TAB) tx_alt = alt.translate(COMPLEMENT_TAB) assert tx_ref == tx.premrna()[offset] cpg = bool((pre and pre[0] == 'C' and (tx_ref == 'G' or tx_alt == 'G')) or (post and post[0] == 'G' and (tx_ref == 'C' or tx_alt == 'C'))) else: cpg = None cds_offset, new_ref, new_alt = \ random_synonymous_site(tx, cpg=cpg, avoid_splice=avoid_splice) new_pos = tx.project_from_cds(cds_offset) if tx.strand() == '-': new_ref = COMPLEMENT[new_ref] new_alt = COMPLEMENT[new_alt] print '\t'.join( [chrom, str(new_pos), id, new_ref, new_alt, gene_id, tx.tx()] + tokens[7:])
def random_controls(genes, filename, match_cpg=False, avoid_splice=False): fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx'] print '#%s' % '\t'.join(fields) with maybe_gzip_open(filename) as ifp: for line in ifp: line = line.rstrip() if not line or line.startswith('#'): continue tokens = line.split() chrom, pos, id, ref, alt, gene_id, tx_id = tokens[:7] chrom = chrom[3:] if chrom.startswith('chr') else chrom pos = int(pos) tx = get_transcript(genes, pos, ref, alt, gene_id, tx_id) if not tx: continue if match_cpg: offset = tx.project_to_premrna(pos) pre = tx.premrna()[offset-1:offset] post = tx.premrna()[offset+1:offset+2] tx_ref = ref tx_alt = alt if tx.strand() == '-': tx_ref = ref.translate(COMPLEMENT_TAB) tx_alt = alt.translate(COMPLEMENT_TAB) assert tx_ref == tx.premrna()[offset] cpg = bool((pre and pre[0] == 'C' and (tx_ref == 'G' or tx_alt == 'G')) or (post and post[0] == 'G' and (tx_ref == 'C' or tx_alt == 'C'))) else: cpg=None cds_offset, new_ref, new_alt = \ random_synonymous_site(tx, cpg=cpg, avoid_splice=avoid_splice) new_pos = tx.project_from_cds(cds_offset) if tx.strand() == '-': new_ref = COMPLEMENT[new_ref] new_alt = COMPLEMENT[new_alt] print '\t'.join([chrom, str(new_pos), id, new_ref, new_alt, gene_id, tx.tx()] + tokens[7:])
def iter_ucsc_genes(filename):
    """Yield one dict per transcript row of a UCSC-style gene table.

    Blank lines and lines starting with '#' are skipped.  Every other line
    must have at least 13 whitespace-separated columns, read in the order
    bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount,
    exonStarts, exonEnds, score, name2.

    Yields:
        dict with keys 'chrom' (leading 'chr' removed), 'tx_start',
        'tx_end', 'strand', 'cds_start', 'cds_end', 'exon_starts',
        'exon_ends' (lists split on ','), 'gene' (name2) and 'tx' (name).
        All values are left as strings; nothing is converted to int here.

    Raises:
        ValueError: if a data line has fewer than 13 columns.
    """
    with maybe_gzip_open(filename) as ifp:
        for line in ifp:
            # Strip first so blank lines and indented comments are skipped
            # instead of failing the 13-way unpack below.
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            tokens = line.split()
            (_bin, name, chrom, strand, tx_start, tx_end, cds_start, cds_end,
             _exon_count, exon_starts, exon_ends, _score, name2) = tokens[:13]

            chrom = chrom[3:] if chrom.startswith('chr') else chrom
            # The exon lists carry a trailing comma; drop it before splitting.
            exon_starts = exon_starts.strip(',').split(',')
            exon_ends = exon_ends.strip(',').split(',')

            yield {'chrom': chrom, 'tx_start': tx_start, 'tx_end': tx_end,
                   'strand': strand, 'cds_start': cds_start,
                   'cds_end': cds_end, 'exon_starts': exon_starts,
                   'exon_ends': exon_ends, 'gene': name2, 'tx': name}
line = line.strip() if not line or line.startswith('#'): continue value, chrom, pos = line.split() data[chrom.lstrip('chr')][int(pos)] = float(value) if optfile and not os.path.isfile(optfile): print >> sys.stderr, "Saving optimized table to:", optfile with maybe_gzip_open(optfile, 'wb') as ofp: cPickle.dump(data, ofp, cPickle.HIGHEST_PROTOCOL) return data if optfile and os.path.isfile(optfile): print >> sys.stderr, "Loading optimized table from:", optfile with maybe_gzip_open(optfile, 'rb') as ifp: table = cPickle.load(ifp) else: print >> sys.stderr, "Loading table from:", tablefile table = read_table(tablefile, optfile) print '#GERP++' for line in sys.stdin: line = line.strip() if not line or line.startswith('#'): continue chrom, pos = line.split(None) try: value = '%.4f' % table[chrom.lstrip('chr')][int(pos)] except (IndexError, KeyError): value = 'na'
def filter_variants(genes, filename, protein_coords=False): # Do chromosomal binning to efficiently lookup overlapping transcripts n_bins = 2048 tx_locations = defaultdict(lambda: defaultdict(list)) for gene, txs in genes.iteritems(): for tx in txs: assert tx.gene() == gene start = tx._cds_start + 1 end = tx._cds_end i = int(start / n_bins) j = int(end / n_bins) for bin in xrange(i, j+1): tx_locations[tx.chrom()][bin].append((start, end, tx)) def find_overlapping_transcripts(chrom, pos): bin = int(pos / n_bins) txs = tx_locations[chrom][bin] return [tx for (start, end, tx) in txs if start <= pos <= end] fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx'] print '#%s' % '\t'.join(fields) n_total = 0 n_kept = 0 with maybe_gzip_open(filename) as ifp: for line in ifp: line = line.rstrip() if not line or line.startswith('#'): continue n_total += 1 tokens = line.split() if protein_coords: gene, codon, aa, mut = tokens[:4] rest = tokens[1:] match = get_transcript_from_protein(genes, gene, codon, aa, mut) if match is None: tx = None else: (tx, chrom, pos, ref, alt) = match id = '.' else: chrom, pos, id, ref, alts = tokens[:5] rest = tokens[5:] chrom = chrom[3:] if chrom.startswith('chr') else chrom alt = alts.split(',')[0] # Only process SNVs if len(ref) != 1 or len(alt) != 1: continue pos = int(pos) txs = [] for tx in find_overlapping_transcripts(chrom, pos): try: if tx.is_synonymous(pos, ref, alt): txs.append(tx) except AssertionError: continue tx = max(txs) if txs else None # Take longest valid transcript if not tx: continue n_kept += 1 print '\t'.join([chrom, str(pos), id, ref, alt, tx.gene(), tx.tx()] + rest) print >>sys.stderr, "Found %d synonymous variants (%d dropped)" % \ (n_kept, n_total - n_kept)
def filter_variants(genes, filename, protein_coords=False): # Do chromosomal binning to efficiently lookup overlapping transcripts n_bins = 2048 tx_locations = defaultdict(lambda: defaultdict(list)) for gene, txs in genes.iteritems(): for tx in txs: assert tx.gene() == gene start = tx._cds_start + 1 end = tx._cds_end i = int(start / n_bins) j = int(end / n_bins) for bin in xrange(i, j+1): tx_locations[tx.chrom()][bin].append((start, end, tx)) def find_overlapping_transcripts(chrom, pos): bin = int(pos / n_bins) txs = tx_locations[chrom][bin] return [tx for (start, end, tx) in txs if start <= pos <= end] fields = ['chrom', 'pos', 'id', 'ref', 'alt', 'gene', 'tx'] print '#%s' % '\t'.join(fields) n_total = 0 n_kept = 0 with maybe_gzip_open(filename) as ifp: for line in ifp: line = line.rstrip() if not line or line.startswith('#'): continue n_total += 1 tokens = line.split() if protein_coords: gene, codon, aa, mut = tokens[:4] rest = tokens[1:] match = get_transcript_from_protein(genes, gene, codon, aa, mut) if match is None: tx = None else: (tx, chrom, pos, ref, alt) = match id = '.' else: chrom, pos, id, ref, alts = tokens[:5] rest = tokens[5:] chrom = chrom[3:] if chrom.startswith('chr') else chrom alt = alts.split(',')[0] # Only process SNVs ref = ref.strip() alt = alt.strip() if len(ref) != 1 or len(alt) != 1: logging.debug('Dropping non-SNP line: %s' % line) continue pos = int(pos) txs = [] for tx in find_overlapping_transcripts(chrom, pos): try: if tx.is_synonymous(pos, ref, alt): txs.append(tx) except AssertionError: continue tx = max(txs) if txs else None # Take longest valid transcript if not tx: logging.debug('Variant is not synonymous on any transcript: %s' % line) continue n_kept += 1 print '\t'.join([chrom, str(pos), id, ref, alt, tx.gene(), tx.tx()] + rest) logging.info("Found %d synonymous variants (%d dropped)" % \ (n_kept, n_total - n_kept))
line = line.strip() if not line or line.startswith('#'): continue value, chrom, pos = line.split() data[chrom.lstrip('chr')][int(pos)] = float(value) if optfile and not os.path.isfile(optfile): print >>sys.stderr, "Saving optimized table to:", optfile with maybe_gzip_open(optfile, 'wb') as ofp: cPickle.dump(data, ofp, cPickle.HIGHEST_PROTOCOL) return data if optfile and os.path.isfile(optfile): print >>sys.stderr, "Loading optimized table from:", optfile with maybe_gzip_open(optfile, 'rb') as ifp: table = cPickle.load(ifp) else: print >>sys.stderr, "Loading table from:", tablefile table = read_table(tablefile, optfile) print '#GERP++' for line in sys.stdin: line = line.strip() if not line or line.startswith('#'): continue chrom, pos = line.split(None) try: value = '%.4f' % table[chrom.lstrip('chr')][int(pos)] except (IndexError, KeyError): value = 'na'