def populate_graph(self):
    """
    Add a donor -> acceptor splice edge for every pair of exons yielded
    by ``utils.iter_by_pair`` for each transcript in ``self.tables``, and
    record the zero-based exon number of both units in the pair.

    Each table item is unpacked as ``(chrom, startvals, endvals, strand,
    gene)``, where ``startvals`` and ``endvals`` are comma-separated
    strings. Start coordinates are shifted by +1 (the tables are 0-based
    at the start). For "-" strand transcripts the exons are walked in
    reverse, i.e. in order of transcription, and each unit is built as
    (end, start) instead of (start, end).

    Exon numbering: within a pair, the donor is exon N and the acceptor
    is exon N + 1, counting from 0 in the order the exons are walked.

    Prints progress messages and the elapsed time; returns nothing.
    """
    print("Populating graph...")
    t1 = time.time()
    for table_name in self.tables:
        print("Adding splice edges from table %s" % (table_name))
        for item in self.tables[table_name]:
            chrom, startvals, endvals, strand, gene = item
            # The last comma-separated field is dropped (presumably the
            # empty string left by a trailing comma -- confirm against
            # the table format).
            # Adds +1 since downloaded UCSC tables are 0-based start!
            startvals = [str(int(x) + 1)
                         for x in startvals.split(",")[:-1]]
            endvals = endvals.split(",")[:-1]
            indices = range(len(startvals))
            if strand == "-":
                # Minus strand: walk the transcript from its end (order
                # of transcription) and flip start/end inside each unit.
                first_coords = endvals[::-1]
                second_coords = startvals[::-1]
            else:
                first_coords = startvals
                second_coords = endvals
            pairs = utils.iter_by_pair(indices, step=1)
            # curr_exon_num is the zero-based exon number of the donor
            for curr_exon_num, (curr_i, next_i) in enumerate(pairs):
                # Splice from end of current exon to start of next exon
                donor_unit = Unit((chrom, first_coords[curr_i], strand),
                                  (chrom, second_coords[curr_i], strand))
                acceptor_unit = Unit((chrom, first_coords[next_i], strand),
                                     (chrom, second_coords[next_i], strand))
                # Donor is exon number N
                self.add_unit_number(donor_unit, curr_exon_num, strand)
                # Acceptor is exon number N+1. This must be the acceptor
                # unit: passing the donor again would number it twice and
                # leave the final exon of the transcript unnumbered.
                self.add_unit_number(acceptor_unit, curr_exon_num + 1,
                                     strand=strand)
                # Record splice site as edge
                self.add_edge(donor_unit, acceptor_unit, strand=strand)
    t2 = time.time()
    print("Populating graph took %.2f seconds" % (t2 - t1))
def prob_score(self, subseq):
    """
    Return the probability score of ``subseq``.

    The first base contributes ``self.base_freqs[base]``. Every pair
    produced by ``utils.iter_by_pair(subseq, 1)`` (presumably consecutive
    (previous, next) bases -- confirm against the helper) contributes the
    value of that dinucleotide in ``self.du`` divided by the sum of
    ``self.get_dinuc_freqs_from(previous)``. Terms are accumulated in log
    space and exponentiated at the end.

    An empty ``subseq`` scores 0.
    """
    if not len(subseq):
        return 0
    # Start from the log frequency of the first base
    log_total = np.log(self.base_freqs[subseq[0]])
    for prev_base, next_base in utils.iter_by_pair(subseq, 1):
        dinuc = "%s%s" % (prev_base, next_base)
        # Transition value, normalized by the total over all
        # transitions leaving the previous base
        numer = self.du[dinuc]
        denom = np.sum(self.get_dinuc_freqs_from(prev_base))
        log_total += (np.log(numer) - np.log(denom))
    return np.exp(log_total)
def prob_score(self, subseq):
    """
    Score ``subseq`` as a product of a first-base frequency and
    per-step dinucleotide transition ratios.

    score = base_freqs[s0] * prod( du[prev + next] / sum(freqs_from(prev)) )

    where the (prev, next) pairs come from
    ``utils.iter_by_pair(subseq, 1)`` and ``freqs_from`` is
    ``self.get_dinuc_freqs_from``. The product is computed as a sum of
    logs and converted back with ``np.exp``.

    Returns 0 when ``subseq`` is empty.
    """
    if len(subseq) == 0:
        # Nothing to score
        return 0

    first_base = subseq[0]
    running_log = np.log(self.base_freqs[first_base])

    for prev_base, next_base in utils.iter_by_pair(subseq, 1):
        pair_key = "%s%s" % (prev_base, next_base)
        pair_freq = self.du[pair_key]
        # Sum over every transition that starts at the previous base
        outgoing_total = np.sum(self.get_dinuc_freqs_from(prev_base))
        log_transition = np.log(pair_freq) - np.log(outgoing_total)
        running_log += log_transition

    total_score = np.exp(running_log)
    return total_score
def populate_graph(self):
    """
    Add a donor -> acceptor splice edge for every pair of exons yielded
    by ``utils.iter_by_pair`` for each transcript in ``self.tables``.

    Each table item is unpacked as ``(chrom, startvals, endvals, strand,
    gene)`` with comma-separated coordinate strings. Start coordinates
    are shifted by +1 (0-based start in the tables). On the "-" strand
    the exons are walked in reverse and each unit is built as
    (end, start) rather than (start, end).

    Prints progress messages and the elapsed time; returns nothing.
    """
    print("Populating graph...")
    began = time.time()
    for table_name in self.tables:
        print("Adding splice edges from table %s" % (table_name))
        for chrom, startvals, endvals, strand, gene in self.tables[table_name]:
            # Last comma-separated field is discarded in both lists
            # Adds +1 since downloaded UCSC tables are 0-based start!
            starts = [str(int(raw) + 1) for raw in startvals.split(",")[:-1]]
            ends = endvals.split(",")[:-1]
            indices = range(len(starts))
            if strand == "-":
                # Walk minus-strand transcripts in order of transcription
                # and swap the two coordinates of every unit
                lead, trail = ends[::-1], starts[::-1]
            else:
                lead, trail = starts, ends
            for here, there in utils.iter_by_pair(indices, step=1):
                # Splice from the current exon to the next one
                donor_unit = Unit((chrom, lead[here], strand),
                                  (chrom, trail[here], strand))
                acceptor_unit = Unit((chrom, lead[there], strand),
                                     (chrom, trail[there], strand))
                # Record splice site as edge
                self.add_edge(donor_unit, acceptor_unit, strand=strand)
    finished = time.time()
    print("Populating graph took %.2f seconds" % (finished - began))