def generic_merge(splicegraph_gff_fname,
                  gff_fname,
                  output_gff_fname,
                  genome,
                  exon_id_suffix,
                  coords_diff_cutoff=10,
                  SE_merge=False):
    """
    Merge SpliceGraph events into a new GFF annotation.

    Exons whose ID ends with 'exon_id_suffix' are taken from both GFF
    files and intersected (same strand only). A SpliceGraph trio is
    kept for the merged annotation when the largest overlap between
    its exon and any exon of the new annotation is LESS than
    'coords_diff_cutoff'. The kept trio IDs are then handed to
    output_combined_gff_events() together with both input filenames,
    the output filename and the genome.

    Parameters:
    - splicegraph_gff_fname: GFF filename of SpliceGraph events
    - gff_fname: GFF filename of the new annotation
    - output_gff_fname: filename passed on as the merged output
    - genome: passed through unchanged to output_combined_gff_events()
    - exon_id_suffix: suffix that selects the exon entries by their ID
    - coords_diff_cutoff: overlap threshold below which a SpliceGraph
      trio is kept (default 10)
    - SE_merge: accepted for interface compatibility; this function
      does not act on it

    Returns None.
    """
    def has_exon_suffix(feature):
        return feature.attrs["ID"].endswith(exon_id_suffix)

    # Load the relevant exons from SpliceGraph and from the new annotation
    splicegraph_exons = \
        pybedtools.BedTool(splicegraph_gff_fname).filter(has_exon_suffix)
    new_exons = pybedtools.BedTool(gff_fname).filter(has_exon_suffix)
    # Intersect SpliceGraph exons with the new exons; the last field
    # of every resulting record is read as the size of the overlap
    intersected_gff = splicegraph_exons.intersect(new_exons,
                                                  wao=True,
                                                  s=True)
    # Map each exon ID to the (overlap, record) pairs found for it
    exons_to_overlaps = defaultdict(list)
    for exon in intersected_gff:
        curr_overlap = int(exon.fields[-1])
        exons_to_overlaps[exon.attrs["ID"]].append((curr_overlap, exon))
    # Names of SpliceGraph trios to include in the merged annotation
    splicegraph_trios_to_add = []
    # Mapping from potentially redundant SpliceGraph trios to the
    # trios they overlap with.
    # NOTE: collected for a future SE-specific merge; nothing in this
    # function consumes it yet.
    sg_redundant_trios = defaultdict(list)
    for exon_id, overlap_records in exons_to_overlaps.items():
        # Trio ID is the exon ID without its last two "."-separated parts
        trio_id = exon_id.rsplit(".", 2)[0]
        overlaps = [record[0] for record in overlap_records]
        _, max_overlap = utils.max_item(overlaps)
        if max_overlap < coords_diff_cutoff:
            splicegraph_trios_to_add.append(trio_id)
            continue
        # Record every distinct trio it overlaps with, exactly once
        # per overlap record (identical trios are skipped)
        for _, overlap_exon in overlap_records:
            overlapping_trio_id = overlap_exon.attrs["ID"].rsplit(".", 2)[0]
            if overlapping_trio_id != trio_id:
                sg_redundant_trios[trio_id].append(overlapping_trio_id)
    num_sg_trios = len(splicegraph_trios_to_add)
    print("Added %d trios from SpliceGraph" % (num_sg_trios))
    output_combined_gff_events(splicegraph_gff_fname,
                               splicegraph_trios_to_add,
                               gff_fname,
                               output_gff_fname,
                               genome)
def generic_merge(splicegraph_gff_fname,
                  gff_fname,
                  output_gff_fname,
                  genome,
                  exon_id_suffix,
                  coords_diff_cutoff=10,
                  SE_merge=False):
    """
    Merge SpliceGraph events into a new GFF annotation.

    Exons whose ID ends with 'exon_id_suffix' are taken from both GFF
    files and intersected (same strand only). A SpliceGraph trio is
    kept for the merged annotation when the largest overlap between
    its exon and any exon of the new annotation is LESS than
    'coords_diff_cutoff'. The kept trio IDs are then handed to
    output_combined_gff_events() together with both input filenames,
    the output filename and the genome.

    Parameters:
    - splicegraph_gff_fname: GFF filename of SpliceGraph events
    - gff_fname: GFF filename of the new annotation
    - output_gff_fname: filename passed on as the merged output
    - genome: passed through unchanged to output_combined_gff_events()
    - exon_id_suffix: suffix that selects the exon entries by their ID
    - coords_diff_cutoff: overlap threshold below which a SpliceGraph
      trio is kept (default 10)
    - SE_merge: accepted for interface compatibility; this function
      does not act on it

    Returns None.
    """
    def has_exon_suffix(feature):
        return feature.attrs["ID"].endswith(exon_id_suffix)

    # Load the relevant exons from SpliceGraph and from the new annotation
    splicegraph_exons = \
        pybedtools.BedTool(splicegraph_gff_fname).filter(has_exon_suffix)
    new_exons = pybedtools.BedTool(gff_fname).filter(has_exon_suffix)
    # Intersect SpliceGraph exons with the new exons; the last field
    # of every resulting record is read as the size of the overlap
    intersected_gff = splicegraph_exons.intersect(new_exons,
                                                  wao=True,
                                                  s=True)
    # Map each exon ID to the (overlap, record) pairs found for it
    exons_to_overlaps = defaultdict(list)
    for exon in intersected_gff:
        curr_overlap = int(exon.fields[-1])
        exons_to_overlaps[exon.attrs["ID"]].append((curr_overlap, exon))
    # Names of SpliceGraph trios to include in the merged annotation
    splicegraph_trios_to_add = []
    # Mapping from potentially redundant SpliceGraph trios to the
    # trios they overlap with.
    # NOTE: collected for a future SE-specific merge; nothing in this
    # function consumes it yet.
    sg_redundant_trios = defaultdict(list)
    for exon_id, overlap_records in exons_to_overlaps.items():
        # Trio ID is the exon ID without its last two "."-separated parts
        trio_id = exon_id.rsplit(".", 2)[0]
        overlaps = [record[0] for record in overlap_records]
        _, max_overlap = utils.max_item(overlaps)
        if max_overlap < coords_diff_cutoff:
            splicegraph_trios_to_add.append(trio_id)
            continue
        # Record every distinct trio it overlaps with, exactly once
        # per overlap record (identical trios are skipped)
        for _, overlap_exon in overlap_records:
            overlapping_trio_id = overlap_exon.attrs["ID"].rsplit(".", 2)[0]
            if overlapping_trio_id != trio_id:
                sg_redundant_trios[trio_id].append(overlapping_trio_id)
    num_sg_trios = len(splicegraph_trios_to_add)
    print("Added %d trios from SpliceGraph" % (num_sg_trios))
    output_combined_gff_events(splicegraph_gff_fname,
                               splicegraph_trios_to_add,
                               gff_fname,
                               output_gff_fname,
                               genome)
def obs_over_exp_counts_dinuc(self, subseqs):
    """
    Get observed over expected ratio of counts (non-log!)
    of subsequences in all sequences.

    For every sequence in self.seqs, the observed and expected counts
    of 'subseqs' are obtained from self.count_subseqs() and divided
    element-wise.

    Returns a pandas DataFrame indexed by "header" (sequence name)
    with columns "max_ratio", "max_ratio_obs_count", "obs_counts",
    "exp_counts" and "ratios", sorted by "max_ratio" in descending
    order. The last three columns hold comma-separated strings.
    """
    col_names = ["header",
                 "max_ratio",
                 "max_ratio_obs_count",
                 "obs_counts",
                 "exp_counts",
                 "ratios"]
    entries = []
    t1 = time.time()
    num_seqs = 0
    for curr_seq in self.seqs:
        # Sequence name is FASTA header without leading '>'
        seq_name = curr_seq[0][1:]
        # Observed and expected counts for all subseqs
        # in the current sequence
        obs_counts, exp_counts = self.count_subseqs(curr_seq[1], subseqs)
        ratios = obs_counts / exp_counts
        # The maximum ratio and the observed count of the
        # subsequence that achieves it
        max_ratio_indx, max_ratio = utils.max_item(ratios)
        max_ratio_obs_count = int(obs_counts[max_ratio_indx])
        ratios_str = ",".join(["%.3f" % (r) for r in ratios])
        obs_counts_str = ",".join(["%d" % (int(oc)) for oc in obs_counts])
        exp_counts_str = ",".join(["%.2f" % (ec) for ec in exp_counts])
        entries.append([seq_name,
                        max_ratio,
                        max_ratio_obs_count,
                        obs_counts_str,
                        exp_counts_str,
                        ratios_str])
        num_seqs += 1
        # NOTE(review): hard cap of 100 sequences; looks like a
        # leftover debugging limit -- confirm before relying on
        # results for larger inputs.
        if num_seqs == 100:
            print("Quitting early!")
            print("=" * 10)
            break
    t2 = time.time()
    print("Counting occurrences in %d sequences took %.2f seconds"
          % (num_seqs, (t2 - t1)))
    # Build the frame from the row list directly: routing the rows
    # through np.array() would turn every cell into a string, making
    # the sort on "max_ratio" lexicographic instead of numeric.
    entries = \
        pandas.DataFrame(entries, columns=col_names).set_index("header")
    # Sort in descending order
    if hasattr(entries, "sort_values"):
        entries = entries.sort_values(by="max_ratio", ascending=False)
    else:
        # Older pandas releases without DataFrame.sort_values
        entries.sort(column=["max_ratio"], ascending=False, inplace=True)
    return entries
def obs_over_exp_counts_dinuc(self, subseqs):
    """
    Get observed over expected ratio of counts (non-log!)
    of subsequences in all sequences.

    For every sequence in self.seqs, the observed and expected counts
    of 'subseqs' are obtained from self.count_subseqs() and divided
    element-wise.

    Returns a pandas DataFrame indexed by "header" (sequence name)
    with columns "max_ratio", "max_ratio_obs_count", "obs_counts",
    "exp_counts" and "ratios", sorted by "max_ratio" in descending
    order. The last three columns hold comma-separated strings.
    """
    col_names = ["header",
                 "max_ratio",
                 "max_ratio_obs_count",
                 "obs_counts",
                 "exp_counts",
                 "ratios"]
    entries = []
    t1 = time.time()
    num_seqs = 0
    for curr_seq in self.seqs:
        # Sequence name is FASTA header without leading '>'
        seq_name = curr_seq[0][1:]
        # Observed and expected counts for all subseqs
        # in the current sequence
        obs_counts, exp_counts = self.count_subseqs(curr_seq[1], subseqs)
        ratios = obs_counts / exp_counts
        # The maximum ratio and the observed count of the
        # subsequence that achieves it
        max_ratio_indx, max_ratio = utils.max_item(ratios)
        max_ratio_obs_count = int(obs_counts[max_ratio_indx])
        ratios_str = ",".join(["%.3f" % (r) for r in ratios])
        obs_counts_str = ",".join(["%d" % (int(oc)) for oc in obs_counts])
        exp_counts_str = ",".join(["%.2f" % (ec) for ec in exp_counts])
        entries.append([seq_name,
                        max_ratio,
                        max_ratio_obs_count,
                        obs_counts_str,
                        exp_counts_str,
                        ratios_str])
        num_seqs += 1
        # NOTE(review): hard cap of 100 sequences; looks like a
        # leftover debugging limit -- confirm before relying on
        # results for larger inputs.
        if num_seqs == 100:
            print("Quitting early!")
            print("=" * 10)
            break
    t2 = time.time()
    print("Counting occurrences in %d sequences took %.2f seconds"
          % (num_seqs, (t2 - t1)))
    # Build the frame from the row list directly: routing the rows
    # through np.array() would turn every cell into a string, making
    # the sort on "max_ratio" lexicographic instead of numeric.
    entries = \
        pandas.DataFrame(entries, columns=col_names).set_index("header")
    # Sort in descending order
    if hasattr(entries, "sort_values"):
        entries = entries.sort_values(by="max_ratio", ascending=False)
    else:
        # Older pandas releases without DataFrame.sort_values
        entries.sort(column=["max_ratio"], ascending=False, inplace=True)
    return entries
def output_utr_table(tables_dir,
                     utr_gff_fname,
                     output_dir,
                     choice_rule="longest"):
    """
    Output a UTR table (one UTR per gene) given a
    UTR GFF file.

    Possible rules for choosing the UTR ('choice_rule'):

      - longest, uses longest UTR
      - shortest, uses shortest UTR

    Transcripts are mapped to genes through the "name" and "name2"
    columns of 'ensGene.kgXref.combined.txt' found in 'tables_dir'.
    Each chosen UTR entry is written with two extra attributes,
    "gene_id" and "region_len".

    Outputs a GFF file in 'output_dir' named after the input file
    and returns its filename.

    Raises Exception if 'utr_gff_fname' does not exist or if
    'choice_rule' is not one of the rules listed above.
    """
    print("Outputting UTR table from %s" % (utr_gff_fname))
    output_basename = os.path.basename(utr_gff_fname).rsplit(".", 1)[0]
    utils.make_dir(output_dir)
    output_fname = os.path.join(output_dir, "%s.gff" % (output_basename))
    print(" - Output file: %s" % (output_fname))
    if not os.path.isfile(utr_gff_fname):
        raise Exception("Cannot find %s" % (utr_gff_fname))
    # Reject a bad rule before any of the tables are loaded
    if choice_rule not in ("longest", "shortest"):
        raise Exception("Unsupported choice rule %s" % (choice_rule))
    # Load table and map transcripts to genes
    table_fname = os.path.join(tables_dir, "ensGene.kgXref.combined.txt")
    table_df = pandas.read_table(table_fname, sep="\t")
    trans_to_gene = dict(zip(table_df["name"], table_df["name2"]))
    # Mapping from gene ID to a dictionary mapping each
    # UTR to its length
    genes_to_utr_lens = defaultdict(dict)
    print("Computing lengths of UTRs..")
    for entry in pybedtools.BedTool(utr_gff_fname):
        # Transcript that the UTR belongs to, and the UTR's own id
        trans_id = entry.attrs["Parent"]
        utr_id = entry.attrs["ID"]
        gene_id = trans_to_gene[trans_id]
        genes_to_utr_lens[gene_id][utr_id] = len(entry)
    # Select one (utr_id, utr_len) pair for each gene
    gene_to_chosen_utr = {}
    for gene, utr_lens_by_id in genes_to_utr_lens.items():
        # Materialize as a list so it can be indexed
        all_utrs = list(utr_lens_by_id.items())
        utr_lens = [curr_utr[1] for curr_utr in all_utrs]
        if choice_rule == "longest":
            utr_indx = utils.max_item(utr_lens)[0]
        else:
            # shortest: first UTR having the minimum length
            utr_indx = utr_lens.index(min(utr_lens))
        gene_to_chosen_utr[gene] = all_utrs[utr_indx]
    # Now select the relevant entries for outputting. Also
    # add relevant information about genes/length
    with open(output_fname, "w") as gff_out:
        for entry in pybedtools.BedTool(utr_gff_fname):
            curr_utr_id = entry.attrs["ID"]
            curr_utr_gene = trans_to_gene[entry.attrs["Parent"]]
            chosen_utr_id, chosen_utr_len = gene_to_chosen_utr[curr_utr_gene]
            # Only the chosen UTR of each gene is written out
            if chosen_utr_id != curr_utr_id:
                continue
            entry.attrs["gene_id"] = curr_utr_gene
            entry.attrs["region_len"] = str(chosen_utr_len)
            gff_out.write(str(entry))
    return output_fname
def output_utr_table(tables_dir,
                     utr_gff_fname,
                     output_dir,
                     choice_rule="longest"):
    """
    Output a UTR table (one UTR per gene) given a
    UTR GFF file.

    Possible rules for choosing the UTR ('choice_rule'):

      - longest, uses longest UTR
      - shortest, uses shortest UTR

    Transcripts are mapped to genes through the "name" and "name2"
    columns of 'ensGene.kgXref.combined.txt' found in 'tables_dir'.
    Each chosen UTR entry is written with two extra attributes,
    "gene_id" and "region_len".

    Outputs a GFF file in 'output_dir' named after the input file
    and returns its filename.

    Raises Exception if 'utr_gff_fname' does not exist or if
    'choice_rule' is not one of the rules listed above.
    """
    print("Outputting UTR table from %s" % (utr_gff_fname))
    output_basename = os.path.basename(utr_gff_fname).rsplit(".", 1)[0]
    utils.make_dir(output_dir)
    output_fname = os.path.join(output_dir, "%s.gff" % (output_basename))
    print(" - Output file: %s" % (output_fname))
    if not os.path.isfile(utr_gff_fname):
        raise Exception("Cannot find %s" % (utr_gff_fname))
    # Reject a bad rule before any of the tables are loaded
    if choice_rule not in ("longest", "shortest"):
        raise Exception("Unsupported choice rule %s" % (choice_rule))
    # Load table and map transcripts to genes
    table_fname = os.path.join(tables_dir, "ensGene.kgXref.combined.txt")
    table_df = pandas.read_table(table_fname, sep="\t")
    trans_to_gene = dict(zip(table_df["name"], table_df["name2"]))
    # Mapping from gene ID to a dictionary mapping each
    # UTR to its length
    genes_to_utr_lens = defaultdict(dict)
    print("Computing lengths of UTRs..")
    for entry in pybedtools.BedTool(utr_gff_fname):
        # Transcript that the UTR belongs to, and the UTR's own id
        trans_id = entry.attrs["Parent"]
        utr_id = entry.attrs["ID"]
        gene_id = trans_to_gene[trans_id]
        genes_to_utr_lens[gene_id][utr_id] = len(entry)
    # Select one (utr_id, utr_len) pair for each gene
    gene_to_chosen_utr = {}
    for gene, utr_lens_by_id in genes_to_utr_lens.items():
        # Materialize as a list so it can be indexed
        all_utrs = list(utr_lens_by_id.items())
        utr_lens = [curr_utr[1] for curr_utr in all_utrs]
        if choice_rule == "longest":
            utr_indx = utils.max_item(utr_lens)[0]
        else:
            # shortest: first UTR having the minimum length
            utr_indx = utr_lens.index(min(utr_lens))
        gene_to_chosen_utr[gene] = all_utrs[utr_indx]
    # Now select the relevant entries for outputting. Also
    # add relevant information about genes/length
    with open(output_fname, "w") as gff_out:
        for entry in pybedtools.BedTool(utr_gff_fname):
            curr_utr_id = entry.attrs["ID"]
            curr_utr_gene = trans_to_gene[entry.attrs["Parent"]]
            chosen_utr_id, chosen_utr_len = gene_to_chosen_utr[curr_utr_gene]
            # Only the chosen UTR of each gene is written out
            if chosen_utr_id != curr_utr_id:
                continue
            entry.attrs["gene_id"] = curr_utr_gene
            entry.attrs["region_len"] = str(chosen_utr_len)
            gff_out.write(str(entry))
    return output_fname