def process_single_transcript_forSVM(input_transcript,
                                     path_to_harringtonine_reads,
                                     path_to_harringtonine_psite):
    print('Working on ' + input_transcript.get_name() + '...')

    # Set up harringtonine reads
    harringtonine_reads = BAMGenomeArray(path_to_harringtonine_reads)
    harringtonine_reads.set_mapping(
        VariableFivePrimeMapFactory.from_file(
            open(path_to_harringtonine_psite)))

    # Set up vectors to append into
    positive_vectors = []
    negative_vectors = []

    # Ensure the transcript is competent to be a test example. Ingolia uses
    # -18 nt on each side of the initiation site for the scoring window.
    # Since the scoring window spans -7 to +40 nt from a given site, the
    # farthest negative vector requires 150 + 40 = 190 nt on the positive
    # bound.
    start_codon_nt = input_transcript.cds_start
    if start_codon_nt - 25 <= 0 or \
            start_codon_nt + 190 >= input_transcript.get_length():
        return positive_vectors, negative_vectors

    # Create the vectors
    count_vector = input_transcript.get_counts(harringtonine_reads)
    positive_vectors.append(
        construct_Ingolia_vector(start_codon_nt, count_vector))
    for z in [-6, -3, 3, 9, 18, 30, 60, 90, 120, 150]:
        negative_vectors.append(
            construct_Ingolia_vector(start_codon_nt + z, count_vector))
    # print('...Done!')
    return positive_vectors, negative_vectors
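# A minimal usage sketch under assumed inputs: `transcripts` (a list of
# plastid Transcript objects) and the two file paths are hypothetical.
# Aggregates the positive/negative Ingolia vectors into an SVM training set.
all_positive = []
all_negative = []
for tx in transcripts:
    pos, neg = process_single_transcript_forSVM(
        tx, 'harr.bam', 'harr_p_offsets.txt')
    all_positive.extend(pos)
    all_negative.extend(neg)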
def __init__(self, alignment_file, bam_file):
    """Initializes an Alignment object from the given pysam.AlignmentFile.

    Creates and stores a plastid BAMGenomeArray from the alignment_file,
    using a plastid FivePrimeMapFactory.

    :param alignment_file: pysam.AlignmentFile
    :param bam_file: path to the underlying BAM file
    """
    self.bam_file = bam_file
    self.alignment_file = alignment_file
    self.bam_array = BAMGenomeArray(alignment_file,
                                    mapping=FivePrimeMapFactory())

    # check if the alignment file was read successfully
    if self.bam_array is None:
        error_message = ("Unknown problem occurred while reading the "
                         "alignment file %s" % alignment_file)
        self.logger.error(error_message)
        raise Exception(error_message)

    # set the number of chromosomes from the bam array
    self.num_chromosomes = len(self.bam_array.chroms())

    # report success
    self.logger.debug("Read in the alignment file %s with %d chromosomes" %
                      (alignment_file.filename, self.num_chromosomes))
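# A minimal construction sketch, assuming this __init__ belongs to a class
# named `Alignment` (per the docstring) whose `logger` attribute is defined
# elsewhere in the class; the path and mode below are hypothetical.
import pysam

bamfile = pysam.AlignmentFile('sample.bam', 'rb')
aln = Alignment(bamfile, 'sample.bam')
print(aln.num_chromosomes)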
def count_CHX_single_transcript(input_transcript, path_to_CHX_reads,
                                path_to_CHX_psite):
    print('Working on ' + str(input_transcript.get_name()) + '...')

    # Prepare reads and transcript object
    CHX_reads = BAMGenomeArray(path_to_CHX_reads)
    CHX_reads.set_mapping(
        VariableFivePrimeMapFactory.from_file(open(path_to_CHX_psite)))
    cds_transcript = input_transcript.get_cds()

    # Count features
    counts = np.nansum(cds_transcript.get_counts(CHX_reads))
    length = cds_transcript.get_length()
    rpnt_cds = counts / length

    print('...Done!')
    return input_transcript, counts, length, rpnt_cds
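# Because the BAMGenomeArray is opened inside the function (open BAM handles
# cannot be pickled), the function can be mapped over transcripts in
# parallel. A minimal sketch with hypothetical paths and a hypothetical
# `transcripts` list:
from multiprocessing import Pool
from functools import partial

worker = partial(count_CHX_single_transcript,
                 path_to_CHX_reads='CHX.bam',
                 path_to_CHX_psite='CHX_p_offsets.txt')
with Pool(4) as pool:
    results = pool.map(worker, transcripts)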
def fetch_vectors(filename):
    pickle_path = global_args.output_dir + \
        global_args.annotation_file[:-4].split("/")[-1] + ".sav"
    if not os.path.isfile(pickle_path):
        create_assembly_dill(global_args.annotation_file)
    gtf_coords_file = list(dill.load(open(pickle_path, "rb")))

    allowed_ids = set(allowed_transcript_ids(global_args.gene_set))

    alignments = BAMGenomeArray(filename, mapping=FivePrimeMapFactory())
    print("Genomes loaded for %s " % filename)

    vector_array = []
    name_array = []

    for transcript in gtf_coords_file:
        if any([transcript.attr.get('Name') in allowed_ids,
                transcript.get_name() in allowed_ids]):
            readvec = transcript.get_counts(alignments)
            if np.sum(readvec) > 30 and len(readvec) % 3 == 0:
                if global_args.annotation_file.endswith("gtf"):
                    name_array.append(transcript.get_name())
                elif global_args.annotation_file.endswith("gff"):
                    name_array.append(transcript.attr.get('Name'))
                readvec = np.reshape(readvec, (-1, 3))
                vector_array.append(np.sum(readvec, axis=0))

    vector_array = np.vstack(vector_array)
    sum_array = vector_array.sum(0) / np.sum(vector_array.sum(0))
    vector_array = vector_array / vector_array.sum(1)[:, np.newaxis]

    return vector_array, name_array, sum_array
def fetch_vectors(filename):
    allowed_ids = set(allowed_transcript_ids(global_args.gene_set))

    alignments = BAMGenomeArray(filename, mapping=FivePrimeMapFactory())

    count_vectors = []
    count_vectors_term = []

    for transcript in extend_gtf_frame(global_args.annotation_file):
        if any([transcript.attr.get('Name') in allowed_ids,
                transcript.get_name() in allowed_ids]):
            try:
                value_array = transcript.get_counts(alignments)
                count_vectors.append(value_array[:global_args.offset * 2])
                count_vectors_term.append(
                    value_array[-global_args.offset * 2:])
            except ValueError:
                pass

    vector_array = np.vstack(count_vectors)
    vector_array_term = np.vstack(count_vectors_term)

    if global_args.normalize:
        vector_array = vector_array[~np.all(vector_array == 0, axis=1)]
        vector_array = vector_array / vector_array.sum(1)[:, np.newaxis]
        vector_array_term = vector_array_term[
            ~np.all(vector_array_term == 0, axis=1)]
        vector_array_term = vector_array_term / vector_array_term.sum(
            1)[:, np.newaxis]

    metagene = vector_array.sum(axis=0)
    metagene_term = vector_array_term.sum(axis=0)

    return metagene, metagene_term
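# A minimal follow-on sketch (matplotlib assumed available): plot the start-
# and stop-anchored metagene profiles returned above, with positions
# expressed relative to global_args.offset; the BAM path is hypothetical.
import matplotlib.pyplot as plt

metagene, metagene_term = fetch_vectors('sample.bam')
positions = range(-global_args.offset, global_args.offset)
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.plot(positions, metagene)
ax1.set_xlabel('nt from start codon')
ax2.plot(positions, metagene_term)
ax2.set_xlabel('nt from stop codon')
plt.savefig('metagene.png')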
def main(args, loglevel):
    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)

    logging.debug("Building sequence dictionary")
    seq_dict = SeqIO.index(args.fasta, "fasta")

    logging.debug("Reading Annotations")
    transcripts = list(
        GFF3_TranscriptAssembler(open(args.gff),
                                 add_three_for_stop=args.add_three))

    logging.debug("Reading Alignments")
    alignments = BAMGenomeArray([args.bam])
    alignments.set_mapping(ThreePrimeMapFactory(offset=args.offset))
    alignments.add_filter(
        "size", SizeFilterFactory(min=args.min_length, max=args.max_length))

    outfh = open(args.outfile, 'w')
    outfh.write("%s\n" % "\t".join(
        ("gene_id", "gene_name", "codon_seq", "codon_index",
         "codon_count_sum", "position_1_count", "position_2_count",
         "position_3_count")))

    for (i, transcript) in enumerate(transcripts):
        if (i == 0 or (i + 1) % 100 == 0):
            logging.info("Evaluated %s genes" % (i + 1))
        logging.debug(transcript.get_name())
        logging.debug(pprint.pformat(transcript.attr))

        if len(transcript) <= 0:
            logging.warning("Transcript %s is length zero (0), skipping!",
                            transcript.get_name())
            continue
        if transcript.attr.get("pseudo", None) == "true":
            logging.warning("Transcript %s is a pseudogene, skipping!",
                            transcript.get_name())
            continue

        transcript_seq = transcript.get_sequence(seq_dict)
        transcript_counts = transcript.get_counts(alignments)

        num_codons = len(transcript_counts) / 3
        if num_codons != round(num_codons):
            logging.warning("Transcript %s length (%i) is not a multiple of "
                            "three, skipping!" %
                            (transcript.get_name(), len(transcript_counts)))
            continue
        logging.debug("Transcript length %i basepairs, %f codons" %
                      (len(transcript_counts), num_codons))

        for codon_index in range(1, int(numpy.floor(num_codons))):
            codon_start = (codon_index - 1) * 3
            codon_stop = codon_start + 3
            codon_seq = transcript_seq[codon_start:codon_stop]
            codon_counts = transcript_counts[codon_start:codon_stop]
            codon_count_sum = sum(codon_counts)
            outfh.write("%s\t%s\t%s\t%i\t%i\t%i\t%i\t%i\n" %
                        (transcript.get_name(),
                         transcript.attr.get("gene", ""), codon_seq,
                         codon_index, codon_count_sum, codon_counts[0],
                         codon_counts[1], codon_counts[2]))
def fetch_vectors(filenames):
    allowed_ids = set(allowed_transcript_ids(global_args.gene_set))

    alignments = BAMGenomeArray(filenames, mapping=FivePrimeMapFactory())
    print("Genomes loaded!")

    except_count = 0
    count_vectors_start = []
    count_vectors_term = []

    for transcript in extend_gtf_frame(global_args.annotation_file):
        if any([transcript.attr.get('Name') in allowed_ids,
                transcript.get_name() in allowed_ids]):
            try:
                value_array = transcript.get_counts(alignments)
                if global_args.shortest < (
                        len(value_array) -
                        global_args.offset * 2) < global_args.longest:
                    if np.sum(value_array[(-global_args.longest -
                                           global_args.offset):]) > 1:
                        count_vectors_term.append(
                            np.concatenate(
                                (np.zeros(global_args.longest, dtype=int),
                                 value_array))[-global_args.longest -
                                               global_args.offset:])
                    if np.sum(value_array[:global_args.longest +
                                          global_args.offset]) > 1:
                        count_vectors_start.append(
                            np.concatenate(
                                (value_array,
                                 np.zeros(global_args.longest, dtype=int))
                            )[:global_args.longest + global_args.offset])
            except ValueError:
                except_count += 1

    vector_array_start = np.vstack(count_vectors_start)
    vector_array_term = np.vstack(count_vectors_term)
    print("Vectors retrieved!")
    print("Removed %i transcripts!" % except_count)

    if global_args.normalize:
        vector_normsum_start = np.sum(vector_array_start, axis=1)
        vector_array_start = vector_array_start / \
            vector_normsum_start[:, np.newaxis]
        vector_normsum_term = np.sum(vector_array_term, axis=1)
        vector_array_term = vector_array_term / \
            vector_normsum_term[:, np.newaxis]

    metagene_start = vector_array_start.sum(axis=0)
    metagene_stack_start = np.reshape(metagene_start, (-1, 3))
    metagene_stack_start = np.hsplit(metagene_stack_start, 3)

    metagene_term = vector_array_term.sum(axis=0)
    metagene_stack_term = np.reshape(metagene_term, (-1, 3))
    metagene_stack_term = np.hsplit(metagene_stack_term, 3)

    frames_start = []
    for arr in metagene_stack_start:
        frame_vector = np.concatenate((np.zeros(2), arr.T[0], np.zeros(2)))
        window_iter = (np.sum(frame_vector[i:i + 5])
                       for i in range(len(frame_vector[2:-3])))
        frames_start.append(np.fromiter(window_iter, dtype=float))

    frames_term = []
    for arr in metagene_stack_term:
        frame_vector = np.concatenate((np.zeros(2), arr.T[0], np.zeros(2)))
        window_iter = (np.sum(frame_vector[i:i + 5])
                       for i in range(len(frame_vector[2:-3])))
        frames_term.append(np.fromiter(window_iter, dtype=float))
    print("Frames split")

    return frames_start, frames_term
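# The per-frame smoothing above is a centered five-codon sliding window. A
# toy check of the same construction on a short vector (note that, as
# written, it yields len(v) - 1 windows, dropping the final position):
import numpy as np

v = np.array([0., 1., 2., 3., 4.])
padded = np.concatenate((np.zeros(2), v, np.zeros(2)))
windows = np.fromiter(
    (np.sum(padded[i:i + 5]) for i in range(len(padded[2:-3]))),
    dtype=float)
# windows[i] sums v[i-2:i+3], with zero padding at the edges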
def main(args, loglevel):
    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)

    logging.debug("Building sequence dictionary")
    seq_dict = SeqIO.index(args.fasta, "fasta")

    logging.debug("Reading Annotations")
    if args.gff:
        transcripts = list(
            GFF3_TranscriptAssembler(open(args.gff),
                                     add_three_for_stop=args.add_three))
    elif args.gtf:
        transcripts = list(
            GTF2_TranscriptAssembler(open(args.gtf),
                                     add_three_for_stop=args.add_three))

    logging.debug("Reading Alignments")
    alignments = BAMGenomeArray([args.bam])
    if sum([args.threeprime, args.fiveprime]) != 1:
        logging.error("Must specify exactly one mapping type "
                      "(--fiveprime or --threeprime)")
        exit(1)
    if args.threeprime:
        alignments.set_mapping(ThreePrimeMapFactory(offset=args.offset))
    elif args.fiveprime:
        alignments.set_mapping(FivePrimeMapFactory(offset=args.offset))
    alignments.add_filter(
        "size", SizeFilterFactory(min=args.min_length, max=args.max_length))

    outfh = open(args.outfile, 'w')
    outfh.write("%s\n" % "\t".join(
        ("transcript_id", "gene_id", "codon_seq", "codon_index",
         "codon_count_sum", "position_1_count", "position_2_count",
         "position_3_count")))

    for (i, transcript) in enumerate(transcripts):
        if (i == 0 or (i + 1) % 100 == 0):
            logging.info("Evaluated %s genes" % (i + 1))
        logging.debug(transcript.get_name())
        logging.debug(pprint.pformat(transcript.attr))

        if (transcript.get_cds().get_length() <= 0):
            logging.info("Transcript %s has zero (0) length CDS, skipping!",
                         transcript.get_name())
            continue
        if transcript.attr.get("pseudo", None) == "true":
            logging.info("Transcript %s is a pseudogene, skipping!",
                         transcript.get_name())
            continue
        logging.debug('Transcript {} attributes: {}'.format(
            transcript.get_name(), transcript.attr))

        # Many Ensembl MT annotations have incomplete codon records.
        # These are coded with an `ensembl_end_phase` attribute.
        # They should be filled in with 'A's, which come from the polyA tail.
        transcript_cds = transcript.get_cds()
        transcript_seq = transcript_cds.get_sequence(seq_dict)
        end_phase = transcript_cds.get_length() % 3
        extra_bases = 0
        if end_phase != 0:
            extra_bases = 3 - end_phase
            logging.warning("Transcript %s CDS length (%i) is not a multiple "
                            "of three, adding %i \"A\" bases" %
                            (transcript.get_name(),
                             transcript_cds.get_length(), extra_bases))
            transcript_seq = transcript_seq + "A" * extra_bases
            last_segment = transcript_cds[-1]
            logging.debug(last_segment)
            transcript_cds.add_segments(
                GenomicSegment(last_segment.chrom, last_segment.end,
                               last_segment.end + extra_bases,
                               last_segment.strand))

        num_codons = int(numpy.floor(len(transcript_seq) / 3))
        logging.debug("Transcript %s length %i basepairs, %i codons" %
                      (transcript.get_name(), len(transcript_seq),
                       num_codons))
        logging.debug('>{} {}\n{}'.format(transcript.get_name(),
                                          transcript.get_gene(),
                                          transcript_seq.upper()))

        start_codon = transcript_seq[:3].upper()
        stop_codon = transcript_seq[-3:].upper()
        if start_codon not in args.start_codons:
            logging.error('Transcript {} start codon "{}" is not '
                          'valid'.format(transcript.get_name(), start_codon))
        if stop_codon not in args.stop_codons:
            logging.error('Transcript {} stop codon "{}" is not '
                          'valid'.format(transcript.get_name(), stop_codon))

        logging.debug(transcript_cds.as_gff3())
        transcript_counts = transcript_cds.get_counts(alignments)

        for codon_index in range(1, num_codons + 1):
            codon_start = (codon_index - 1) * 3
            codon_stop = codon_start + 3
            codon_seq = transcript_seq[codon_start:codon_stop]
            codon_counts = transcript_counts[codon_start:codon_stop]
            codon_count_sum = sum(codon_counts)

            transcript_id = transcript.get_name()
            if ":" in transcript_id:
                prefix, transcript_id = transcript_id.split(":", 1)
            gene_ids_raw = transcript.attr.get("Parent", [])
            gene_ids = []
            for gene_id_raw in gene_ids_raw:
                if ":" in gene_id_raw:
                    prefix, gene_id = gene_id_raw.split(":", 1)
                    gene_ids.append(gene_id)
                else:
                    gene_ids.append(gene_id_raw)

            outfh.write("%s\t%s\t%s\t%i\t%i\t%i\t%i\t%i\n" %
                        (transcript_id, ",".join(gene_ids),
                         codon_seq.upper(), codon_index, codon_count_sum,
                         codon_counts[0], codon_counts[1], codon_counts[2]))
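# A hedged sketch of the argument parser this main() appears to expect; the
# exact flags and defaults are assumptions reconstructed from the attributes
# used above (args.fasta, args.gff/args.gtf, args.bam, and so on).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--fasta", required=True)
parser.add_argument("--gff")
parser.add_argument("--gtf")
parser.add_argument("--bam", required=True)
parser.add_argument("--outfile", required=True)
parser.add_argument("--offset", type=int, default=12)
parser.add_argument("--min_length", type=int, default=25)
parser.add_argument("--max_length", type=int, default=35)
parser.add_argument("--add_three", action="store_true")
parser.add_argument("--fiveprime", action="store_true")
parser.add_argument("--threeprime", action="store_true")
parser.add_argument("--start_codons", nargs="+", default=["ATG"])
parser.add_argument("--stop_codons", nargs="+",
                    default=["TAA", "TAG", "TGA"])
main(parser.parse_args(), logging.INFO)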
# rpy: http://rpy.sourceforge.net/
# ROCR (in R): https://rocr.bioinf.mpi-sb.mpg.de/
from plastid import Transcript, BED_Reader, BAMGenomeArray, \
    FivePrimeMapFactory, VariableFivePrimeMapFactory, SizeFilterFactory
import os
import numpy

os.chdir('/Users/huf/')
orfBedFile = "tmp/plastid/data/orfs_Cxcr4_test.bed"
riboAlignmentFile_B_resting_chr1_29nt = "tmp/plastid/data/B_resting_chr1_29nt_12offset.bam"
riboAlignmentFile_B_resting_chr1 = "tmp/plastid/data/B_resting_chr1.bam"
psiteFile = "out/orf-discovery/B/ribo-seq/manuel/Resting/2016-May-18_10-44-28/plastid/psite/merged_q255_star_genome_p_offsets.txt"

orfs = list(BED_Reader(orfBedFile, return_type=Transcript))

alignments = BAMGenomeArray(riboAlignmentFile_B_resting_chr1)
#alignments.set_mapping(FivePrimeMapFactory(offset=12))
maprule = VariableFivePrimeMapFactory.from_file(open(psiteFile))
alignments.set_mapping(maprule)

size_filter = SizeFilterFactory(min=29, max=35)
alignments.add_filter("size", size_filter)

# create a holder for phasing
phasing = numpy.zeros(3)
#phasing_orf = numpy.zeros(3)

# start codons are hyper-phased; stop codons can have different
# phasing or even be de-phased depending on experimental protocol
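# A minimal sketch of the phasing tally the holder above feeds into. This is
# an assumption about the intended loop: per the comment, the first and last
# codons are trimmed so hyper-phased starts/stops do not dominate the tally.
for orf in orfs:
    counts = orf.get_counts(alignments)
    num_codons = len(counts) // 3
    if num_codons > 2:
        by_codon = counts[:num_codons * 3].reshape(-1, 3)
        phasing += by_codon[1:-1].sum(axis=0)  # drop first and last codon

print(phasing / phasing.sum())  # fraction of reads in each frame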
def assign_uORFs_from_harr_peaks(input_transcript,
                                 path_to_harringtonine_reads,
                                 mapping_for_harringtonine_reads,
                                 trained_SVM, scale_logical, scaler_function,
                                 genome_twobit, canonical_start_codon,
                                 nearcanonical_start_codon, stop_codon,
                                 reads_cutoff):
    """Wrapper function for uORF assignment from harringtonine peak data.

    Workflow as follows:
    (1) Calls tile_SVM_utr5 to create Ingolia vectors for all positions
        within the 5'UTR of the input transcript and classify them using the
        trained SVM. Aggregates positive nt locations into peaks.
    (2) Loops through peaks and calls find_start_in_peak to annotate
        canonical or non-canonical start codons.
    (3) Loops through UNIQUE start codons identified in peaks and pairs each
        with a stop codon by calling pair_start_with_stop.
    (4) Loops through start/stop uORF pairs and extracts uORFs as plastid
        SegmentChains, so long as the stop codon is not equal to the cds_stop
        for that transcript (avoids calling the CDS and N-terminal extensions
        of the CDS as uORFs).

    --Input--
    input_transcript: plastid transcript object
    path_to_harringtonine_reads: path to bam file for harringtonine reads.
        Used to make a BAMGenomeArray containing harringtonine read
        alignments.
    mapping_for_harringtonine_reads: path to the psite offset file generated
        by the psite script; applies the mapping to the BAMGenomeArray.
    trained_SVM: SVM trained to call start codons based on Ingolia vectors
    scale_logical: passed to tile_SVM_utr5. Was the SVM trained on a scaled
        training set? If true, you must provide a scaler function to
        transform the generated arrays prior to classification.
    scaler_function: passed to tile_SVM_utr5. Function of type
        preprocessing.StandardScaler().fit(training_array) for the given
        training_array.
    genome_twobit: twobit genome file
    canonical_start_codon: regular expression (re.compile) to find start
        codons, as required by find_start_in_peak
    nearcanonical_start_codon: regular expression (re.compile) to find
        near-canonical starts, which will be searched for in a peak if a
        canonical start cannot be found using find_start_in_peak
    stop_codon: regex (re.compile) for stop codons, passed internally to
        pair_start_with_stop
    reads_cutoff: passed to find_start_in_peak. Used for initial QC of
        discovered peaks, requiring that a peak has > this number of reads in
        the vector range (-7 to +40 relative to peak bounds)

    --Output--
    A list of found uORFs as plastid SegmentChains, named as
    input_transcript.get_name() + start codon location. If no uORFs are
    found, an empty list.
    """
    print('Working on ' + input_transcript.get_name() + ' ...')

    # Generate the harringtonine-read BAMGenomeArray internally, as the
    # object cannot otherwise be pickled.
    harringtonine_reads = BAMGenomeArray(path_to_harringtonine_reads)
    harringtonine_reads.set_mapping(
        VariableFivePrimeMapFactory.from_file(
            open(mapping_for_harringtonine_reads)))

    # Generate the count vector object used to construct Ingolia vectors
    count_vector = input_transcript.get_counts(harringtonine_reads)

    # Tile across the whole 5'UTR and construct vectors at each position,
    # testing with the SVM and storing only positions predicted as starts
    predicted_starts = tile_SVM_utr5(input_transcript, count_vector,
                                     trained_SVM, scale_logical,
                                     scaler_function)
    print('Found ' + str(len(predicted_starts)) + ' positive positions')
    if len(predicted_starts) == 0:
        print('...Done!')
        return []

    # Concatenate positive positions into peaks
    positive_ranges = [
        list(group) for group in mit.consecutive_groups(predicted_starts)
    ]
    print('Concatenated positions to ' + str(len(positive_ranges)) +
          ' peak(s)')

    # Get sequence of transcript
    transcript_sequence = input_transcript.get_sequence(genome_twobit)

    # Loop through peaks and find start codons
    called_start = []
    for peak in positive_ranges:
        identified_start = find_start_in_peak(
            peak_range=peak,
            transcript_sequence=transcript_sequence,
            count_vector=count_vector,
            reads_cutoff=reads_cutoff,
            canonical_start_codon=canonical_start_codon,
            nearcanonical_start_codon=nearcanonical_start_codon)
        # Append found starts
        if type(identified_start) is not str:
            called_start.append(identified_start)

    # Keep only unique start codons
    called_start = np.asarray(called_start)
    called_start = np.unique(called_start)
    if len(called_start) == 0:
        print('Identified 0 start codons.')
        print('...Done!')
        return []
    print('Identified ' + str(len(called_start)) + ' unique start codon(s)')

    # Identify uORFs
    orf_start, orf_stop = pair_start_with_stop(
        start_codon_array=called_start,
        transcript_sequence=transcript_sequence,
        stop_codon_regex=stop_codon)
    if len(orf_start) == 0 or len(orf_stop) == 0:
        print('Paired 0 start codons.')
        print('...Done!')
        return []
    print('Identified ' + str(len(orf_start)) + ' uORF(s)')
    # We probably do not need to look for unique stops, because peaks are
    # predicted from real data rather than computationally, where in-frame
    # methionines might be miscalled as new start codons.

    # Extract uORF sequences as segment chains
    count = 0
    segmentchain_list = []
    for start, stop in zip(orf_start, orf_stop):
        if stop != input_transcript.cds_end:
            # Cannot simply do my_transcript.get_subchain(start, stop, ID='x')
            # because the subchain ID is hardcoded as the transcript ID plus
            # 'subchain' and apparently cannot be overridden, so overwrite the
            # attribute afterwards. Use the START position to identify each
            # uORF uniquely, since uORFs are identified by unique start peak
            # and multiple uORFs may share the same stop.
            discovered_subchain = input_transcript.get_subchain(start, stop)
            discovered_subchain.attr['ID'] = (input_transcript.get_name() +
                                              '_' + str(start))
            segmentchain_list.append(discovered_subchain)
            count = count + 1
    # print(str(count) + ' uORF(s) passed QC and were appended to list')
    print('...Done!')
    return segmentchain_list
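# A minimal invocation sketch with hypothetical inputs: the regexes follow
# the docstring's re.compile convention, and the SVM/scaler are assumed to
# come from the training step built on process_single_transcript_forSVM
# above. `my_transcript` and all paths are hypothetical.
import re
from twobitreader import TwoBitFile  # assumed genome reader

genome = TwoBitFile('hg38.2bit')
uORFs = assign_uORFs_from_harr_peaks(
    input_transcript=my_transcript,
    path_to_harringtonine_reads='harr.bam',
    mapping_for_harringtonine_reads='harr_p_offsets.txt',
    trained_SVM=trained_svm,
    scale_logical=True,
    scaler_function=fitted_scaler,
    genome_twobit=genome,
    canonical_start_codon=re.compile('ATG'),
    nearcanonical_start_codon=re.compile('[ACGT]TG|A[ACGT]G|AT[ACGT]'),
    stop_codon=re.compile('TAA|TAG|TGA'),
    reads_cutoff=10)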
    # Calculate distances and save to dataframe
    distance_table.dist_uORFstart_to_CDS[
        index] = full_transcript.cds_start - start_position
    distance_table.dist_uORFend_to_CDS[
        index] = full_transcript.cds_start - stop_position
    distance_table.Len_uORF[index] = feature.get_length()

distance_table.to_csv(
    path_or_buf='./output_for_model/HEK293T_uORFs_distance_toCDSstart.tsv',
    sep='\t',
    header=True,
    index=False,
    index_label=False)

# Count CHX reads across all the uORFs
CHX_reads_array = BAMGenomeArray('file.path')
CHX_reads_array.set_mapping(
    VariableFivePrimeMapFactory.from_file(open('file.path')))

uORF_CHX_counts = pd.DataFrame(
    index=range(0, len(annotated_ORFs_final)),
    columns=['uORF_ID', 'transcript', 'Length_uORF', 'uORF_CHX_counts'])

for index, feature in enumerate(annotated_ORFs_final):
    print('Working on: ' + feature.get_name())
    print('Index is: ' + str(index))
    uORF_CHX_counts.uORF_ID[index] = feature.get_name()
    uORF_CHX_counts.transcript[index] = feature.attr['transcript_id']
    uORF_CHX_counts.Length_uORF[index] = feature.get_length()
    uORF_CHX_counts.uORF_CHX_counts[index] = np.nansum(
        feature.get_counts(CHX_reads_array))
        subchain_window = full_transcript.get_subchain(lower_bound,
                                                       upper_bound)
        # overwrite the attribute to identify each window uniquely
        subchain_window.attr['ID'] = (i.attr['ID'] + '_100bpwindow_' +
                                      str(roi_start_in_transcript -
                                          lower_bound))
        cds_windows_list.append(subchain_window)

fout = open('./updated_quantitation/CDS_100bpwindows_aroundstart.fasta', 'w')
for i in cds_windows_list:
    fout.write(i.get_fasta(genome_twobit))
fout.close()

# Folks wanted the CDS counts and uORF counts for everything from the CHX
# data. Import the data. Note: VariableFivePrimeMapFactory.from_file doesn't
# read a path string like it should, so the workaround is to give it an open
# file handle instead.
CHX_1 = BAMGenomeArray(['path.to.reads'])
CHX_1.set_mapping(
    VariableFivePrimeMapFactory.from_file(open('path.to.psite')))
CHX_2 = BAMGenomeArray(['path.to.reads'])
CHX_2.set_mapping(
    VariableFivePrimeMapFactory.from_file(open('path.to.psite')))
CHX_3 = BAMGenomeArray(['path.to.reads'])
CHX_3.set_mapping(
    VariableFivePrimeMapFactory.from_file(open('path.to.psite')))

# Count uORF reads across 3 CHX replicates
uORF_counts_table = pd.DataFrame(
    index=range(0, len(uORF_list)),
def count_all_meta_regions_from_genewise_dicts(gene_id, full_transcript_dict,
                                               meta_uORFs_computational_dict,
                                               meta_uORFs_experimental_dict,
                                               path_to_bam, path_to_psite):
    """An extremely specialized function that returns a list of pertinent
    values for gene-wise ribosome profiling.

    --Input--
    gene_id: gene_id for the gene of interest. Should be a key in all three
        dictionaries.
    full_transcript_dict: dictionary {gene_id: list of plastid transcript
        objects. Transcripts should be the entire annotation}
    meta_uORFs_computational_dict: dictionary {gene_id: SINGLE plastid
        transcript object with meta_roi for computationally predicted uORFs
        for that gene}
    meta_uORFs_experimental_dict: dictionary {gene_id: SINGLE plastid
        transcript object with meta_roi for experimentally predicted uORFs}
    path_to_bam: as described; passed to the function as required for
        multithreading.
    path_to_psite: table generated by the plastid psite script giving
        offsets. See note above.

    --Output--
    A vector of length 24:
    [gene_id, gene_name, utr5_counts, utr5_len, utr5_counts_maskedbyCDS,
     utr5_len_maskedbyCDS, cds_counts, cds_len, utr3_counts, utr3_len,
     utr3_counts_maskedbyCDS, utr3_len_maskedbyCDS, meta_uORF_comp_counts,
     meta_uORF_comp_len, meta_uORF_comp_counts_in5utronly,
     meta_uORF_comp_len_in5utronly, meta_uORF_comp_counts_in5utronly_noCDS,
     meta_uORF_comp_len_in5utronly_noCDS, meta_uORF_exp_counts,
     meta_uORF_exp_len, meta_uORF_exp_counts_in5utronly,
     meta_uORF_exp_len_in5utronly, meta_uORF_exp_counts_in5utronly_noCDS,
     meta_uORF_exp_len_in5utronly_noCDS]

    utr_'maskedbyCDS': only the portion of the meta-UTR window that does NOT
        overlap the CDS is counted.
    meta_uORF...'in5utronly': only the portion of the meta-uORF window in the
        meta-5'UTR is counted.
    meta_uORF...'in5utronly_noCDS': only the portion of the meta-uORF window
        in the meta-5'UTR and NOT in the meta-CDS is counted.
    """
    print('Working on ' + str(gene_id))

    # Import reads and set mapping. VariableFivePrimeMapFactory.from_file
    # doesn't read a path string like it should, so the workaround is to give
    # it an open file handle instead.
    CHX_reads = BAMGenomeArray(path_to_bam)
    CHX_reads.set_mapping(
        VariableFivePrimeMapFactory.from_file(open(path_to_psite)))

    # Set up meta-window annotations for the given gene using the full
    # transcript annotations
    transcript_list = full_transcript_dict[gene_id]
    CDS_list = [i.get_cds() for i in transcript_list]
    utr5_list = [i.get_utr5() for i in transcript_list]
    utr3_list = [i.get_utr3() for i in transcript_list]
    meta_CDS = meta_roi(CDS_list)
    meta_utr5 = meta_roi(utr5_list)
    meta_utr3 = meta_roi(utr3_list)

    # Mask the CDS overlap of the 5'UTR and 3'UTR windows, for calculating
    # 5'UTR/CDS/3'UTR ratios most efficiently
    meta_utr5.add_masks(*meta_CDS.segments)
    meta_utr3.add_masks(*meta_CDS.segments)

    # Calculate everything except the uORF parameters
    utr5_counts, utr5_len = get_counts_and_lengths_masked(
        input_segment_chain=meta_utr5,
        mapped_read_array=CHX_reads,
        masked_logical=False,
        keep_true='no')
    utr5_counts_maskedbyCDS, utr5_len_maskedbyCDS = \
        get_counts_and_lengths_masked(
            input_segment_chain=meta_utr5,
            mapped_read_array=CHX_reads,
            masked_logical=True,
            keep_true='no')
    cds_counts, cds_len = get_counts_and_lengths_masked(
        input_segment_chain=meta_CDS,
        mapped_read_array=CHX_reads,
        masked_logical=False,
        keep_true='no')
    utr3_counts, utr3_len = get_counts_and_lengths_masked(
        input_segment_chain=meta_utr3,
        mapped_read_array=CHX_reads,
        masked_logical=False,
        keep_true='no')
    utr3_counts_maskedbyCDS, utr3_len_maskedbyCDS = \
        get_counts_and_lengths_masked(
            input_segment_chain=meta_utr3,
            mapped_read_array=CHX_reads,
            masked_logical=True,
            keep_true='no')

    # Wrap in try/except because the gene may not have uORFs
    try:
        meta_uORF_comp = meta_uORFs_computational_dict[gene_id]
        meta_uORF_comp.add_masks(*meta_utr5.segments)
        meta_uORF_comp_counts, meta_uORF_comp_len = \
            get_counts_and_lengths_masked(
                input_segment_chain=meta_uORF_comp,
                mapped_read_array=CHX_reads,
                masked_logical=False,
                keep_true='no')
        meta_uORF_comp_counts_in5utronly, meta_uORF_comp_len_in5utronly = \
            get_counts_and_lengths_masked(
                input_segment_chain=meta_uORF_comp,
                mapped_read_array=CHX_reads,
                masked_logical=True,
                keep_true='yes')
        meta_uORF_comp_in5utr = meta_uORF_comp.get_masks_as_segmentchain()
        meta_uORF_comp_in5utr.add_masks(*meta_CDS.segments)
        meta_uORF_comp_counts_in5utronly_noCDS, \
            meta_uORF_comp_len_in5utronly_noCDS = \
            get_counts_and_lengths_masked(
                input_segment_chain=meta_uORF_comp_in5utr,
                mapped_read_array=CHX_reads,
                masked_logical=True,
                keep_true='no')
    except KeyError:
        meta_uORF_comp_counts, meta_uORF_comp_len = (np.nan, np.nan)
        meta_uORF_comp_counts_in5utronly, meta_uORF_comp_len_in5utronly = (
            np.nan, np.nan)
        meta_uORF_comp_counts_in5utronly_noCDS, \
            meta_uORF_comp_len_in5utronly_noCDS = (np.nan, np.nan)

    try:
        meta_uORF_exp = meta_uORFs_experimental_dict[gene_id]
        meta_uORF_exp.add_masks(*meta_utr5.segments)
        meta_uORF_exp_counts, meta_uORF_exp_len = \
            get_counts_and_lengths_masked(
                input_segment_chain=meta_uORF_exp,
                mapped_read_array=CHX_reads,
                masked_logical=False,
                keep_true='no')
        meta_uORF_exp_counts_in5utronly, meta_uORF_exp_len_in5utronly = \
            get_counts_and_lengths_masked(
                input_segment_chain=meta_uORF_exp,
                mapped_read_array=CHX_reads,
                masked_logical=True,
                keep_true='yes')
        meta_uORF_exp_in5utr = meta_uORF_exp.get_masks_as_segmentchain()
        meta_uORF_exp_in5utr.add_masks(*meta_CDS.segments)
        meta_uORF_exp_counts_in5utronly_noCDS, \
            meta_uORF_exp_len_in5utronly_noCDS = \
            get_counts_and_lengths_masked(
                input_segment_chain=meta_uORF_exp_in5utr,
                mapped_read_array=CHX_reads,
                masked_logical=True,
                keep_true='no')
    except KeyError:
        meta_uORF_exp_counts, meta_uORF_exp_len = (np.nan, np.nan)
        meta_uORF_exp_counts_in5utronly, meta_uORF_exp_len_in5utronly = (
            np.nan, np.nan)
        meta_uORF_exp_counts_in5utronly_noCDS, \
            meta_uORF_exp_len_in5utronly_noCDS = (np.nan, np.nan)

    output_vector = [
        gene_id, transcript_list[0].attr['gene_id'], utr5_counts, utr5_len,
        utr5_counts_maskedbyCDS, utr5_len_maskedbyCDS, cds_counts, cds_len,
        utr3_counts, utr3_len, utr3_counts_maskedbyCDS,
        utr3_len_maskedbyCDS, meta_uORF_comp_counts, meta_uORF_comp_len,
        meta_uORF_comp_counts_in5utronly, meta_uORF_comp_len_in5utronly,
        meta_uORF_comp_counts_in5utronly_noCDS,
        meta_uORF_comp_len_in5utronly_noCDS, meta_uORF_exp_counts,
        meta_uORF_exp_len, meta_uORF_exp_counts_in5utronly,
        meta_uORF_exp_len_in5utronly, meta_uORF_exp_counts_in5utronly_noCDS,
        meta_uORF_exp_len_in5utronly_noCDS
    ]
    print('...Done!')
    return output_vector
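# A minimal collection sketch with hypothetical inputs (`tx_dict`,
# `comp_dict`, `exp_dict`, and both paths): run the function per gene and
# assemble the 24-element rows into a DataFrame whose columns follow the
# docstring's output order.
columns = ['gene_id', 'gene_name', 'utr5_counts', 'utr5_len',
           'utr5_counts_maskedbyCDS', 'utr5_len_maskedbyCDS', 'cds_counts',
           'cds_len', 'utr3_counts', 'utr3_len', 'utr3_counts_maskedbyCDS',
           'utr3_len_maskedbyCDS', 'meta_uORF_comp_counts',
           'meta_uORF_comp_len', 'meta_uORF_comp_counts_in5utronly',
           'meta_uORF_comp_len_in5utronly',
           'meta_uORF_comp_counts_in5utronly_noCDS',
           'meta_uORF_comp_len_in5utronly_noCDS', 'meta_uORF_exp_counts',
           'meta_uORF_exp_len', 'meta_uORF_exp_counts_in5utronly',
           'meta_uORF_exp_len_in5utronly',
           'meta_uORF_exp_counts_in5utronly_noCDS',
           'meta_uORF_exp_len_in5utronly_noCDS']
rows = [count_all_meta_regions_from_genewise_dicts(
            g, tx_dict, comp_dict, exp_dict, 'CHX.bam', 'CHX_p_offsets.txt')
        for g in tx_dict]
region_counts = pd.DataFrame(rows, columns=columns)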