def process_single_transcript_forSVM(input_transcript,
                                         path_to_harringtonine_reads,
                                         path_to_harringtonine_psite):
        print('Working on ' + input_transcript.get_name() + '...')
        #Set up harringtonine reads
        harringtonine_reads = BAMGenomeArray(path_to_harringtonine_reads)
        harringtonine_reads.set_mapping(
            VariableFivePrimeMapFactory.from_file(
                open(path_to_harringtonine_psite)))

        #Set up vectors to append into
        positive_vectors = []
        negative_vectors = []

        # Ensure the transcript is competent to be a test example. Ingolia uses
        # 18 nt of padding on each side of the initiation-site scoring window.
        # The scoring window spans -7 to +40 nt from a given site, and the
        # farthest negative example sits at +150, so the bounds are
        # -7 - 18 = -25 nt and 150 + 40 = 190 nt.
        start_codon_nt = input_transcript.cds_start
        if (start_codon_nt - 25 <= 0
                or start_codon_nt + 190 >= input_transcript.get_length()):
            return positive_vectors, negative_vectors

        #Create the vectors
        count_vector = input_transcript.get_counts(harringtonine_reads)
        positive_vectors.append(
            construct_Ingolia_vector(start_codon_nt, count_vector))
        for z in [-6, -3, 3, 9, 18, 30, 60, 90, 120, 150]:
            negative_vectors.append(
                construct_Ingolia_vector(start_codon_nt + z, count_vector))
        #
        print('...Done!')
        return positive_vectors, negative_vectors
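
# A minimal driver sketch for the function above; `transcripts` and the two
# file paths are placeholders, and construct_Ingolia_vector is assumed to be
# defined elsewhere in this codebase.
all_positive = []
all_negative = []
for tx in transcripts:
    pos, neg = process_single_transcript_forSVM(
        tx, 'harringtonine.bam', 'harringtonine_p_offsets.txt')
    all_positive.extend(pos)
    all_negative.extend(neg)

# Stack into a feature matrix and label vector for SVM training
X = np.vstack(all_positive + all_negative)
y = np.array([1] * len(all_positive) + [0] * len(all_negative))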
Example #2
    def __init__(self, alignment_file, bam_file):

        """
        Initiates an Alignment class object with the given pysam.AlignmentFile.
        Creates and stores plastid BAMGenomeArray from the alignment_file, using a plastid fivePrimeMapping factory.

        :param alignment_file: pysam.AlignmentFile

        """
        self.bam_file = bam_file
        self.alignment_file = alignment_file
        self.bam_array = BAMGenomeArray(alignment_file, mapping=FivePrimeMapFactory())

        # check that the alignment file was read successfully
        if self.bam_array is None:
            error_message = "Unknown problem occurred while reading the alignment file %s" % alignment_file
            self.logger.error(error_message)
            raise Exception(error_message)

        # set the number of chromosomes from the bam array
        self.num_chromosomes = len(self.bam_array.chroms())

        # report success
        self.logger.debug("Read in the alignment file %s with %d chromosomes"
                            % (alignment_file.filename, self.num_chromosomes))
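
# A minimal usage sketch for the constructor above, assuming the class is
# named Alignment (per its docstring), that it defines a class-level logger,
# and that 'reads.bam' is an indexed BAM file:
import pysam

bam_path = 'reads.bam'
aln = Alignment(pysam.AlignmentFile(bam_path, 'rb'), bam_path)
print('Chromosomes in array: %d' % aln.num_chromosomes)

Example #3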
    def count_CHX_single_transcript(input_transcript, path_to_CHX_reads,
                                    path_to_CHX_psite):
        print('Working on ' + str(input_transcript.get_name()) + '...')

        #Prepare reads and transcript object
        CHX_reads = BAMGenomeArray(path_to_CHX_reads)
        CHX_reads.set_mapping(
            VariableFivePrimeMapFactory.from_file(open(path_to_CHX_psite)))
        cds_transcript = input_transcript.get_cds()

        #Count features
        counts = np.nansum(cds_transcript.get_counts(CHX_reads))
        length = cds_transcript.get_length()
        rpnt_cds = counts / length  # reads per nucleotide across the CDS
        print('...Done!')
        return input_transcript, counts, length, rpnt_cds
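
# A parallel driver sketch (an assumption, not from the source): because the
# function takes file paths rather than a BAMGenomeArray, which cannot be
# pickled, each worker rebuilds the array itself. `transcripts` is a
# placeholder list of plastid Transcript objects.
from functools import partial
import multiprocessing

worker = partial(count_CHX_single_transcript,
                 path_to_CHX_reads='CHX.bam',
                 path_to_CHX_psite='CHX_p_offsets.txt')
with multiprocessing.Pool(processes=4) as pool:
    results = pool.map(worker, transcripts)

Example #4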
def fetch_vectors(filename):
    pickle_path = global_args.output_dir + \
        global_args.annotation_file[:-4].split("/")[-1] + ".sav"
    if not os.path.isfile(pickle_path):
        create_assembly_dill(global_args.annotation_file)

    gtf_coords_file = list(dill.load(open(pickle_path, "rb")))
    allowed_ids = set(allowed_transcript_ids(global_args.gene_set))
    alignments = BAMGenomeArray(filename, mapping=FivePrimeMapFactory())
    print("Genomes loaded for %s " % filename)

    vector_array = []
    name_array = []

    for transcript in gtf_coords_file:
        if any([
                transcript.attr.get('Name') in allowed_ids,
                transcript.get_name() in allowed_ids
        ]):
            readvec = transcript.get_counts(alignments)
            if np.sum(readvec) > 30 and len(readvec) % 3 == 0:
                if global_args.annotation_file.endswith("gtf"):
                    name_array.append(transcript.get_name())
                elif global_args.annotation_file.endswith("gff"):
                    name_array.append(transcript.attr.get('Name'))
                readvec = np.reshape(readvec, (-1, 3))
                vector_array.append(np.sum(readvec, axis=0))

    vector_array = np.vstack(vector_array)
    sum_array = vector_array.sum(0) / np.sum(vector_array.sum(0))
    vector_array = vector_array / vector_array.sum(1)[:, np.newaxis]

    return vector_array, name_array, sum_array
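
# Example invocation sketch ('ribo_sample.bam' is a placeholder path):
vectors, names, frame_fractions = fetch_vectors('ribo_sample.bam')
# frame_fractions sums to 1 over the three codon positions; a strong skew
# toward one position indicates good triplet periodicity in the library.
print(frame_fractions)

Example #5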
def fetch_vectors(filename):
    allowed_ids = set(allowed_transcript_ids(global_args.gene_set))
    alignments = BAMGenomeArray(filename, mapping=FivePrimeMapFactory())

    count_vectors = []
    count_vectors_term = []

    for transcript in extend_gtf_frame(global_args.annotation_file):
        if any([
                transcript.attr.get('Name') in allowed_ids,
                transcript.get_name() in allowed_ids
        ]):
            try:
                value_array = transcript.get_counts(alignments)
                count_vectors.append(value_array[:global_args.offset * 2])
                count_vectors_term.append(value_array[-global_args.offset *
                                                      2:])
            except ValueError:
                pass

    vector_array = np.vstack(count_vectors)
    vector_array_term = np.vstack(count_vectors_term)

    if global_args.normalize:
        vector_array = vector_array[~np.all(vector_array == 0, axis=1)]
        vector_array = vector_array / vector_array.sum(1)[:, np.newaxis]
        vector_array_term = vector_array_term[
            ~np.all(vector_array_term == 0, axis=1)]
        vector_array_term = vector_array_term / vector_array_term.sum(
            1)[:, np.newaxis]

    metagene = vector_array.sum(axis=0)
    metagene_term = vector_array_term.sum(axis=0)

    return metagene, metagene_term
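
# A plotting sketch for the metagenes returned above. matplotlib is an
# assumption, 'ribo_sample.bam' is a placeholder, and the x-axis assumes
# extend_gtf_frame() pads each transcript by `offset` nt on either side.
import matplotlib.pyplot as plt

metagene, metagene_term = fetch_vectors('ribo_sample.bam')
positions = np.arange(-global_args.offset, global_args.offset)
plt.plot(positions, metagene, label='around start codon')
plt.plot(positions, metagene_term, label='around stop codon')
plt.axvline(0, linestyle='--', color='grey')
plt.legend()
plt.savefig('metagene.png')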
Example #6
def main(args, loglevel):

    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)

    logging.debug(b"Building sequence dictionary")
    seq_dict = SeqIO.index(args.fasta, "fasta")
    logging.debug("Reading Annotations")
    transcripts = list(
        GFF3_TranscriptAssembler(open(args.gff),
                                 add_three_for_stop=args.add_three))
    logging.debug("Reading Alignments")
    alignments = BAMGenomeArray([args.bam])
    alignments.set_mapping(ThreePrimeMapFactory(offset=args.offset))
    alignments.add_filter(
        "size", SizeFilterFactory(min=args.min_length, max=args.max_length))
    outfh = open(args.outfile, 'w')
    outfh.write("%s\n" % "\t".join(
        ("gene_id", "gene_name", "codon_seq", "codon_index",
         "codon_count_sum", "position_1_count", "position_2_count",
         "position_3_count")))
    for (i, transcript) in enumerate(transcripts):
        if (i == 0 or (i + 1) % 100 == 0):
            logging.info("Evaluated %s genes" % (i + 1))
        logging.debug(transcript.get_name())
        logging.debug(pprint.pformat(transcript.attr))
        if len(transcript) <= 0:
            logging.warn("Transcript %s is length zero (0), skipping!",
                         transcript.get_name())
            continue
        if transcript.attr.get("pseudo", None) == "true":
            logging.warn("Transcript %s is a pseudogene, skipping!",
                         transcript.get_name())
            continue
        transcript_seq = transcript.get_sequence(seq_dict)
        transcript_counts = transcript.get_counts(alignments)
        num_codons = len(transcript_counts) / 3
        if num_codons != round(num_codons):
            logging.warning("Transcript %s length (%i) is not a multiple of "
                            "three, skipping!" %
                            (transcript.get_name(), len(transcript_counts)))
            continue
        logging.debug("Transcript length %i basepairs, %f codons" %
                      (len(transcript_counts), num_codons))
        for codon_index in range(1, int(numpy.floor(num_codons))):
            codon_start = (codon_index - 1) * 3
            codon_stop = codon_start + 3
            codon_seq = transcript_seq[codon_start:codon_stop]
            codon_counts = transcript_counts[codon_start:codon_stop]
            codon_count_sum = sum(codon_counts)
            outfh.write(
                "%s\t%s\t%s\t%i\t%i\t%i\t%i\t%i\n" %
                (transcript.get_name(), transcript.attr.get(
                    "gene", ""), codon_seq, codon_index, codon_count_sum,
                 codon_counts[0], codon_counts[1], codon_counts[2]))
Example #7
def fetch_vectors(filenames):

    allowed_ids = set(allowed_transcript_ids(global_args.gene_set))
    alignments = BAMGenomeArray(filenames, mapping=FivePrimeMapFactory())
    print("Genomes loaded!")

    except_count = 0
    count_vectors_start = []
    count_vectors_term = []

    for transcript in extend_gtf_frame(global_args.annotation_file):
        if any([
                transcript.attr.get('Name') in allowed_ids,
                transcript.get_name() in allowed_ids
        ]):
            try:
                value_array = transcript.get_counts(alignments)
                if global_args.shortest < (
                        len(value_array) -
                        global_args.offset * 2) < global_args.longest:

                    if np.sum(value_array[(-global_args.longest -
                                           global_args.offset):]) > 1:
                        count_vectors_term.append(
                            np.concatenate(
                                (np.zeros(global_args.longest, dtype=int),
                                 value_array))[-global_args.longest -
                                               global_args.offset:])

                    if np.sum(value_array[:global_args.longest +
                                          global_args.offset]) > 1:
                        count_vectors_start.append(
                            np.concatenate(
                                (value_array,
                                 np.zeros(global_args.longest,
                                          dtype=int)))[:global_args.longest +
                                                       global_args.offset])
            except ValueError:
                except_count += 1

    vector_array_start = np.vstack(count_vectors_start)
    vector_array_term = np.vstack(count_vectors_term)

    print("Vectors retrieved!")
    print("Removed %i transcripts!" % except_count)

    if global_args.normalize:
        vector_normsum_start = np.sum(vector_array_start, axis=1)
        vector_array_start = vector_array_start / \
            vector_normsum_start[:, np.newaxis]

        vector_normsum_term = np.sum(vector_array_term, axis=1)
        vector_array_term = vector_array_term / \
            vector_normsum_term[:, np.newaxis]

    metagene_start = vector_array_start.sum(axis=0)
    metagene_stack_start = np.reshape(metagene_start, (-1, 3))
    metagene_stack_start = np.hsplit(metagene_stack_start, 3)

    metagene_term = vector_array_term.sum(axis=0)
    metagene_stack_term = np.reshape(metagene_term, (-1, 3))
    metagene_stack_term = np.hsplit(metagene_stack_term, 3)

    frames_start = []
    for arr in metagene_stack_start:
        frame_vector = np.concatenate((np.zeros(2), arr.T[0], np.zeros(2)))
        window_iter = (np.sum(frame_vector[i:i + 5])
                       for i in range(len(frame_vector[2:-3])))
        frames_start.append(np.fromiter(window_iter, dtype=float))

    frames_term = []
    for arr in metagene_stack_term:
        frame_vector = np.concatenate((np.zeros(2), arr.T[0], np.zeros(2)))
        window_iter = (np.sum(frame_vector[i:i + 5])
                       for i in range(len(frame_vector[2:-3])))
        frames_term.append(np.fromiter(window_iter, dtype=float))

    print("Frames split")
    return frames_start, frames_term
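
# A sketch overlaying the three per-frame smoothed profiles around the start
# (matplotlib is an assumption; 'sample.bam' is a placeholder path):
import matplotlib.pyplot as plt

frames_start, frames_term = fetch_vectors('sample.bam')
for frame_index, profile in enumerate(frames_start):
    plt.plot(profile, label='frame %i' % frame_index)
plt.legend()
plt.savefig('frames_around_start.png')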
Example #8
def main(args, loglevel):

    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)

    logging.debug("Building sequence dictionary")
    seq_dict = SeqIO.index(args.fasta, "fasta")
    logging.debug("Reading Annotations")

    if args.gff:
        transcripts = list(
            GFF3_TranscriptAssembler(open(args.gff),
                                     add_three_for_stop=args.add_three))
    elif args.gtf:
        transcripts = list(
            GTF2_TranscriptAssembler(open(args.gtf),
                                     add_three_for_stop=args.add_three))

    logging.debug("Reading Alignments")
    alignments = BAMGenomeArray([args.bam])

    if sum([args.threeprime, args.fiveprime]) != 1:
        logging.error("Must specify exactly one mapping type "
                      "(--fiveprime or --threeprime)")
        exit(1)

    if args.threeprime:
        alignments.set_mapping(ThreePrimeMapFactory(offset=args.offset))
    elif args.fiveprime:
        alignments.set_mapping(FivePrimeMapFactory(offset=args.offset))

    alignments.add_filter(
        "size", SizeFilterFactory(min=args.min_length, max=args.max_length))
    outfh = open(args.outfile, 'w')
    outfh.write("%s\n" % "\t".join(
        ("transcript_id", "gene_id", "codon_seq", "codon_index",
         "codon_count_sum", "position_1_count", "position_2_count",
         "position_3_count")))
    for (i, transcript) in enumerate(transcripts):
        if (i == 0 or (i + 1) % 100 == 0):
            logging.info("Evaluated %s genes" % (i + 1))
        logging.debug(transcript.get_name())
        logging.debug(pprint.pformat(transcript.attr))
        if (transcript.get_cds().get_length() <= 0):
            logging.info("Transcript %s has zero (0) length CDS, skipping!",
                         transcript.get_name())
            continue
        if transcript.attr.get("pseudo", None) == "true":
            logging.info("Transcript %s is a pseudogene, skipping!",
                         transcript.get_name())
            continue
        logging.debug('Transcript {} attributes: {}'.format(
            transcript.get_name(), transcript.attr))

        # Many Ensembl MT annotations have incomplete codon records. These are
        # coded with an `ensembl_end_phase` attribute and should be filled in
        # with "A"s, which come from the polyA tail.
        transcript_cds = transcript.get_cds()
        transcript_seq = transcript_cds.get_sequence(seq_dict)

        end_phase = transcript_cds.get_length() % 3
        extra_bases = 0
        if end_phase != 0:
            extra_bases = 3 - end_phase
            logging.warning("Transcript %s CDS length (%i) is not a multiple "
                            "of three, adding %i \"A\" bases" %
                            (transcript.get_name(),
                             transcript_cds.get_length(), extra_bases))
            transcript_seq = transcript_seq + "A" * extra_bases
            last_segment = transcript_cds[-1]
            logging.debug(last_segment)
            transcript_cds.add_segments(
                GenomicSegment(last_segment.chrom, last_segment.end,
                               last_segment.end + extra_bases,
                               last_segment.strand))

        num_codons = int(numpy.floor(len(transcript_seq) / 3))
        logging.debug("Trancript %s length %i basepairs, %i codons" %
                      (transcript.get_name(), len(transcript_seq), num_codons))
        logging.debug('>{} {}\n{}'.format(transcript.get_name(),
                                          transcript.get_gene(),
                                          transcript_seq.upper()))

        start_codon = transcript_seq[:3].upper()
        stop_codon = transcript_seq[-3:].upper()
        if start_codon not in args.start_codons:
            logging.error('Transcript {} start codon "{}" is not valid'.format(
                transcript.get_name(), start_codon))
        if stop_codon not in args.stop_codons:
            logging.error('Transcript {} stop codon "{}" is not valid'.format(
                transcript.get_name(), stop_codon))
        logging.debug(transcript_cds.as_gff3())

        transcript_counts = transcript_cds.get_counts(alignments)

        for codon_index in range(1, num_codons + 1):
            codon_start = (codon_index - 1) * 3
            codon_stop = codon_start + 3
            codon_seq = transcript_seq[codon_start:codon_stop]
            codon_counts = transcript_counts[codon_start:codon_stop]
            codon_count_sum = sum(codon_counts)
            transcript_id = transcript.get_name()
            if ":" in transcript_id:
                prefix, transcript_id = transcript_id.split(":", 1)
            gene_ids_raw = transcript.attr.get("Parent", [])
            gene_ids = []
            for gene_id_raw in gene_ids_raw:
                if ":" in gene_id_raw:
                    prefix, gene_id = gene_id_raw.split(":", 1)
                    gene_ids.append(gene_id)
                else:
                    gene_ids.append(gene_id_raw)
            outfh.write("%s\t%s\t%s\t%i\t%i\t%i\t%i\t%i\n" %
                        (transcript_id, ",".join(gene_ids), codon_seq.upper(),
                         codon_index, codon_count_sum, codon_counts[0],
                         codon_counts[1], codon_counts[2]))
Example #9
# rpy:			http://rpy.sourceforge.net/
# ROCR (in R)	https://rocr.bioinf.mpi-sb.mpg.de/

from plastid import Transcript, BED_Reader, BAMGenomeArray, FivePrimeMapFactory, VariableFivePrimeMapFactory, SizeFilterFactory
import os
import numpy

os.chdir('/Users/huf/')

orfBedFile = "tmp/plastid/data/orfs_Cxcr4_test.bed"
riboAlignmentFile_B_resting_chr1_29nt = "tmp/plastid/data/B_resting_chr1_29nt_12offset.bam"
riboAlignmentFile_B_resting_chr1 = "tmp/plastid/data/B_resting_chr1.bam"
psiteFile = "out/orf-discovery/B/ribo-seq/manuel/Resting/2016-May-18_10-44-28/plastid/psite/merged_q255_star_genome_p_offsets.txt"

orfs = list(BED_Reader(orfBedFile, return_type=Transcript))
alignments = BAMGenomeArray(riboAlignmentFile_B_resting_chr1)
#alignments.set_mapping(FivePrimeMapFactory(offset=12))

maprule = VariableFivePrimeMapFactory.from_file(open(psiteFile))
alignments.set_mapping(maprule)

size_filter = SizeFilterFactory(min=29, max=35)
alignments.add_filter("size", size_filter)

# create a holder for phasing
phasing = numpy.zeros(3)
#phasing_orf = numpy.zeros(3)

# start codons are hyper-phased; stop codons can have different
# phasing or even be de-phased depending on experimental protocol
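
# The tallying loop itself is not shown above; a minimal sketch of how it
# might proceed, assuming each BED record spans exactly one ORF so the whole
# count vector starts in frame 0:
for orf in orfs:
    counts = orf.get_counts(alignments)
    counts = counts[:3 * (len(counts) // 3)]      # trim any partial codon
    phasing += counts.reshape(-1, 3).sum(axis=0)  # tally by codon position

print(phasing / phasing.sum())  # fraction of reads in each frame
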
def assign_uORFs_from_harr_peaks(input_transcript, path_to_harringtonine_reads,
                                 mapping_for_harringtonine_reads, trained_SVM,
                                 scale_logical, scaler_function, genome_twobit,
                                 canonical_start_codon,
                                 nearcanonical_start_codon, stop_codon,
                                 reads_cutoff):
    """Wrapper function for uORF assigment from harringtonine peak data. Workflow as follows:
        (1) Calls tile_SVM_utr5 to create Ingolia vectors for all positions within the 5'UTR of the input transcript and classify using trained SVM. Aggregates positive nt locations into peaks. 
        (2) Loops through peaks and calls find_start_in_peak to annotate canonical or non-canonical start codons
        (3) Loops through UNIQUE start codons identified in peaks and pairs with stop codon by calling pair_start_with_stop
        (4) Loops through start/stop uORF pairs and extracts uORFs as plastid segmentchains, so long as the stop codon is not equal to the cds_stop for that transcript (avoids calling CDS and N-terminal extensions of CDS as uORFs)
        
    --Input--
    input_transcript: plastid transcript object
    path_to_harringtonine_reads: path to bam file for harringtonine reads. Used to make BAMGenomeArray containing Harringtonine read alignments
    mapping_for_harringtonine_reads: path to psite offset file generated by psite script, applies mapping to BAMGenomeArray. 
    trained_SVM: SVM trained to call start_codons based on Ingolia vector
    scale_logical: passed to tile_SVM_utr5. Was the SVM trained on a scaled training set? If true, you must provide a scaler function to transform the generated arrays prior to classification
    scaler_function: passed to tile_SVM_utr5. Function of type preprocessing.StandardScaler().fit(training_array) for given training_array.
    genome_twobit: twobit genome file
    canonical_start_codon: regular expression (re.compile) to find start codons, as required by find_start_in_peak
    nearcanonical_start_codon: regular expression (re.compile) to find near-canonical starts, which will be searched for in peak if a canonical start cannot be found using find_start_in_peak
    stop_codon: regex (re.compile) for stop codons, passed to pair_start_with_stop internally
    reads_cutoff: passed to find_start_in_peak. Used for initial QC of discovered peaks, requiring that a peak has > this number of reads in the vector range (-7 to +40 relative to peak bounds)
    
    --Output--
    A list with found uORFs as plastid segmentchains, named as input_transcript.get_name() + start codon location. If no uORFs are found, an empty list.
    """
    print('Working on ' + input_transcript.get_name() + ' ...')

    # Generate the harringtonine-read BAMGenomeArray internally, as the object cannot otherwise be pickled.
    harringtonine_reads = BAMGenomeArray(path_to_harringtonine_reads)
    harringtonine_reads.set_mapping(
        VariableFivePrimeMapFactory.from_file(
            open(mapping_for_harringtonine_reads)))

    #Generate the count vector object used to construct Ingolia vectors
    count_vector = input_transcript.get_counts(harringtonine_reads)

    # Tile across the entire 5'UTR, constructing a vector at each position, classifying it with the SVM, and keeping only positions predicted to be starts
    predicted_starts = tile_SVM_utr5(input_transcript, count_vector,
                                     trained_SVM, scale_logical,
                                     scaler_function)

    print('Found ' + str(len(predicted_starts)) + ' positive positions')
    if len(predicted_starts) == 0:
        print('...Done!')
        return []

    #Concatenate positive positions into peaks
    positive_ranges = [
        list(group) for group in mit.consecutive_groups(predicted_starts)
    ]
    print('Concatenated positions into ' + str(len(positive_ranges)) + ' peak(s)')

    #Get Sequence of transcript
    transcript_sequence = input_transcript.get_sequence(genome_twobit)

    #Loop through peaks and find start codons
    called_start = []
    for peak in positive_ranges:
        identified_start = find_start_in_peak(
            peak_range=peak,
            transcript_sequence=transcript_sequence,
            count_vector=count_vector,
            reads_cutoff=reads_cutoff,
            canonical_start_codon=canonical_start_codon,
            nearcanonical_start_codon=nearcanonical_start_codon)

        # Append found starts; a string return value signals no start was found
        if not isinstance(identified_start, str):
            called_start.append(identified_start)

    #Find only unique start codons
    called_start = np.asarray(called_start)
    called_start = np.unique(called_start)

    if len(called_start) == 0:
        print('Identified 0 start codons.')
        print('...Done!')
        return []

    print('Identified ' + str(len(called_start)) + ' unique start codon(s)')

    #Identify uORFs
    orf_start, orf_stop = pair_start_with_stop(
        start_codon_array=called_start,
        transcript_sequence=transcript_sequence,
        stop_codon_regex=stop_codon)

    if len(orf_start) == 0 or len(orf_stop) == 0:
        print('Paired 0 start codons.')
        print('...Done!')
        return []

    print('Identified ' + str(len(orf_start)) + ' uORF(s)')

    # No need to deduplicate stop codons: peaks are predicted from real data, so there is no risk of calling in-frame methionines as new start codons, as there would be in a purely computational scan.

    # Extract uORF sequences as segment chains
    count = 0
    segmentchain_list = []
    for start, stop in zip(orf_start, orf_stop):
        if stop != input_transcript.cds_end:
            # get_subchain() hardcodes the subchain ID to the transcript ID
            # plus 'subchain' and offers no way to override it at call time,
            # so the ID attribute is overwritten afterwards. The START
            # coordinate is used this time because uORFs are identified by
            # unique start peaks and several may share the same stop codon.
            discovered_subchain = input_transcript.get_subchain(start, stop)
            discovered_subchain.attr['ID'] = (input_transcript.get_name() +
                                              '_' + str(start))
            segmentchain_list.append(discovered_subchain)
            count = count + 1

    print(str(count) + ' uORF(s) passed QC and were appended to the list')
    print('...Done!')
    return segmentchain_list
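
# A minimal invocation sketch; every name below is a placeholder for an
# object built elsewhere in this workflow (transcript, trained SVM, scaler,
# twobit genome), and the near-canonical start regex is illustrative only.
import re

found_uORFs = assign_uORFs_from_harr_peaks(
    input_transcript=my_transcript,
    path_to_harringtonine_reads='harringtonine.bam',
    mapping_for_harringtonine_reads='harringtonine_p_offsets.txt',
    trained_SVM=trained_SVM,
    scale_logical=True,
    scaler_function=scaler_function,
    genome_twobit=genome_twobit,
    canonical_start_codon=re.compile('ATG'),
    nearcanonical_start_codon=re.compile('CTG|GTG|TTG|ACG'),
    stop_codon=re.compile('TAA|TAG|TGA'),
    reads_cutoff=10)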
    # Calculate distances and save them to the dataframe
    distance_table.loc[index, 'dist_uORFstart_to_CDS'] = \
        full_transcript.cds_start - start_position
    distance_table.loc[index, 'dist_uORFend_to_CDS'] = \
        full_transcript.cds_start - stop_position
    distance_table.loc[index, 'Len_uORF'] = feature.get_length()

distance_table.to_csv(
    path_or_buf='./output_for_model/HEK293T_uORFs_distance_toCDSstart.tsv',
    sep='\t',
    header=True,
    index=False,
    index_label=False)

#Count CHX Reads Across all the uORFs
CHX_reads_array = BAMGenomeArray('file.path')
CHX_reads_array.set_mapping(
    VariableFivePrimeMapFactory.from_file(open('file.path')))

uORF_CHX_counts = pd.DataFrame(
    index=range(0, len(annotated_ORFs_final)),
    columns=['uORF_ID', 'transcript', 'Length_uORF', 'uORF_CHX_counts'])
for index, feature in enumerate(annotated_ORFs_final):
    print('Working on: ' + feature.get_name())
    print('Index is: ' + str(index))
    uORF_CHX_counts.loc[index, 'uORF_ID'] = feature.get_name()
    uORF_CHX_counts.loc[index, 'transcript'] = feature.attr['transcript_id']
    uORF_CHX_counts.loc[index, 'Length_uORF'] = feature.get_length()
    uORF_CHX_counts.loc[index, 'uORF_CHX_counts'] = np.nansum(
        feature.get_counts(CHX_reads_array))
    #
    subchain_window = full_transcript.get_subchain(lower_bound, upper_bound)
    # Overwrite the ID attribute to identify each window uniquely
    subchain_window.attr['ID'] = (i.attr['ID'] + '_100bpwindow_' +
                                  str(roi_start_in_transcript - lower_bound))
    cds_windows_list.append(subchain_window)

fout = open('./updated_quantitation/CDS_100bpwindows_aroundstart.fasta', 'w')
for i in cds_windows_list:
    fout.write(i.get_fasta(genome_twobit))

fout.close()

#Folks wanted the CDS Counts and uORF counts for everything from the CHX data. Import the Data
CHX_1 = BAMGenomeArray(['path.to.reads'])
CHX_1.set_mapping(
    VariableFivePrimeMapFactory.from_file(open('path.to.psite'))
)  # from_file() expects an open file handle rather than a path string

CHX_2 = BAMGenomeArray(['path.to.reads'])
CHX_2.set_mapping(
    VariableFivePrimeMapFactory.from_file(open('path.to.psite'))
)  # from_file() expects an open file handle rather than a path string

CHX_3 = BAMGenomeArray(['path.to.reads'])
CHX_3.set_mapping(VariableFivePrimeMapFactory.from_file(open('path.to.psite')))

#Count uORF reads across 3 CHX replicates
uORF_counts_table = pd.DataFrame(
    index=range(0, len(uORF_list)),
Example #13
def count_all_meta_regions_from_genewise_dicts(gene_id, full_transcript_dict,
                                               meta_uORFs_computational_dict,
                                               meta_uORFs_experimental_dict,
                                               path_to_bam, path_to_psite):
    """An extremely specialized function to return a list of pertinant values relevant to gene-wise ribosome profiling
    
    --Input--
    gene_id: gene_id for gene of interest. Should be a key in all three dictionaries
    full_transcript_dict: dictionary {gene_id, list of plastid transcript objects. Transcripts should be entire annotation}
    meta_uORFs_computational_dict: dictionary {gene_id, SINGLE plastid transcript object with meta_roi for computationally predicted uORFs for that gene}
    meta_uORFs_experimental_dict: dictionary {gene_id, SINGLE plastid transcript object with meta_roi for experimentally predicted uORFs}
    path_to_bam: path to the bam file of CHX reads. Passed as a path (not a BAMGenomeArray) as required for multiprocessing, since BAMGenomeArrays cannot be pickled
    path_to_psite: table generated by the plastid psite script giving offsets. Passed as a path for the same reason as path_to_bam.
    
    --Output--
    A vector of length 24: [gene_id, gene_name, utr5_counts, utr5_len, utr5_counts_maskedbyCDS, utr5_len_maskedbyCDS, 
    cds_counts, cds_len, utr3_counts, utr3_len, utr3_counts_maskedbyCDS, utr3_len_maskedbyCDS, 
    meta_uORF_comp_counts, meta_uORF_comp_len, meta_uORF_comp_counts_in5utronly, meta_uORF_comp_len_in5utronly, meta_uORF_comp_counts_in5utronly_noCDS, meta_uORF_comp_len_in5utronly_noCDS,
    meta_uORF_exp_counts, meta_uORF_exp_len, meta_uORF_exp_counts_in5utronly, meta_uORF_exp_len_in5utronly, meta_uORF_exp_counts_in5utronly_noCDS, meta_uORF_exp_len_in5utronly_noCDS]
    
    utr_'maskedbyCDS': only portion of meta-utr window that does NOT overlap CDS is counted
    meta_uORF...'in5utronly': only portion of meta-uORF window in the meta-5utr is counted.
    meta_uORF...'in5utronly_noCDS': only portion of meta-uORF window in meta-5utr and NOT in meta-CDS is counted. 
    """

    print('Working on ' + str(gene_id))

    # Import reads and set mapping
    CHX_reads = BAMGenomeArray(path_to_bam)
    CHX_reads.set_mapping(
        VariableFivePrimeMapFactory.from_file(open(path_to_psite))
    )  # from_file() expects an open file handle rather than a path string

    #Set up meta-window annotations for given gene using full transcript annotations
    transcript_list = full_transcript_dict[gene_id]
    CDS_list = [i.get_cds() for i in transcript_list]
    utr5_list = [i.get_utr5() for i in transcript_list]
    utr3_list = [i.get_utr3() for i in transcript_list]
    meta_CDS = meta_roi(CDS_list)
    meta_utr5 = meta_roi(utr5_list)
    meta_utr3 = meta_roi(utr3_list)

    #Mask CDS overlap of 5'UTR and 3'UTR windows. For calculating 5'UTR/CDS/3'UTR ratios most efficiently
    meta_utr5.add_masks(*meta_CDS.segments)
    meta_utr3.add_masks(*meta_CDS.segments)

    #Calculate Everything Except for uORF Parameters
    utr5_counts, utr5_len = get_counts_and_lengths_masked(
        input_segment_chain=meta_utr5,
        mapped_read_array=CHX_reads,
        masked_logical=False,
        keep_true='no')
    utr5_counts_maskedbyCDS, utr5_len_maskedbyCDS = get_counts_and_lengths_masked(
        input_segment_chain=meta_utr5,
        mapped_read_array=CHX_reads,
        masked_logical=True,
        keep_true='no')
    cds_counts, cds_len = get_counts_and_lengths_masked(
        input_segment_chain=meta_CDS,
        mapped_read_array=CHX_reads,
        masked_logical=False,
        keep_true='no')
    utr3_counts, utr3_len = get_counts_and_lengths_masked(
        input_segment_chain=meta_utr3,
        mapped_read_array=CHX_reads,
        masked_logical=False,
        keep_true='no')
    utr3_counts_maskedbyCDS, utr3_len_maskedbyCDS = get_counts_and_lengths_masked(
        input_segment_chain=meta_utr3,
        mapped_read_array=CHX_reads,
        masked_logical=True,
        keep_true='no')

    # Wrap in try/except because a given gene may have no uORFs in these dictionaries
    try:
        meta_uORF_comp = meta_uORFs_computational_dict[gene_id]
        meta_uORF_comp.add_masks(*meta_utr5.segments)
        meta_uORF_comp_counts, meta_uORF_comp_len = get_counts_and_lengths_masked(
            input_segment_chain=meta_uORF_comp,
            mapped_read_array=CHX_reads,
            masked_logical=False,
            keep_true='no')
        meta_uORF_comp_counts_in5utronly, meta_uORF_comp_len_in5utronly = get_counts_and_lengths_masked(
            input_segment_chain=meta_uORF_comp,
            mapped_read_array=CHX_reads,
            masked_logical=True,
            keep_true='yes')
        meta_uORF_comp_in5utr = meta_uORF_comp.get_masks_as_segmentchain()
        meta_uORF_comp_in5utr.add_masks(*meta_CDS.segments)
        meta_uORF_comp_counts_in5utronly_noCDS, meta_uORF_comp_len_in5utronly_noCDS = get_counts_and_lengths_masked(
            input_segment_chain=meta_uORF_comp_in5utr,
            mapped_read_array=CHX_reads,
            masked_logical=True,
            keep_true='no')
    except KeyError:
        meta_uORF_comp_counts, meta_uORF_comp_len = (np.nan, np.nan)
        meta_uORF_comp_counts_in5utronly, meta_uORF_comp_len_in5utronly = (
            np.nan, np.nan)
        meta_uORF_comp_counts_in5utronly_noCDS, meta_uORF_comp_len_in5utronly_noCDS = (
            np.nan, np.nan)

    try:
        meta_uORF_exp = meta_uORFs_experimental_dict[gene_id]
        meta_uORF_exp.add_masks(*meta_utr5.segments)
        meta_uORF_exp_counts, meta_uORF_exp_len = get_counts_and_lengths_masked(
            input_segment_chain=meta_uORF_exp,
            mapped_read_array=CHX_reads,
            masked_logical=False,
            keep_true='no')
        meta_uORF_exp_counts_in5utronly, meta_uORF_exp_len_in5utronly = get_counts_and_lengths_masked(
            input_segment_chain=meta_uORF_exp,
            mapped_read_array=CHX_reads,
            masked_logical=True,
            keep_true='yes')
        meta_uORF_exp_in5utr = meta_uORF_exp.get_masks_as_segmentchain()
        meta_uORF_exp_in5utr.add_masks(*meta_CDS.segments)
        meta_uORF_exp_counts_in5utronly_noCDS, meta_uORF_exp_len_in5utronly_noCDS = get_counts_and_lengths_masked(
            input_segment_chain=meta_uORF_exp_in5utr,
            mapped_read_array=CHX_reads,
            masked_logical=True,
            keep_true='no')
    except KeyError:
        meta_uORF_exp_counts, meta_uORF_exp_len = (np.nan, np.nan)
        meta_uORF_exp_counts_in5utronly, meta_uORF_exp_len_in5utronly = (
            np.nan, np.nan)
        meta_uORF_exp_counts_in5utronly_noCDS, meta_uORF_exp_len_in5utronly_noCDS = (
            np.nan, np.nan)

    result = [
        gene_id, transcript_list[0].attr['gene_id'], utr5_counts, utr5_len,
        utr5_counts_maskedbyCDS, utr5_len_maskedbyCDS, cds_counts, cds_len,
        utr3_counts, utr3_len, utr3_counts_maskedbyCDS, utr3_len_maskedbyCDS,
        meta_uORF_comp_counts, meta_uORF_comp_len,
        meta_uORF_comp_counts_in5utronly, meta_uORF_comp_len_in5utronly,
        meta_uORF_comp_counts_in5utronly_noCDS,
        meta_uORF_comp_len_in5utronly_noCDS, meta_uORF_exp_counts,
        meta_uORF_exp_len, meta_uORF_exp_counts_in5utronly,
        meta_uORF_exp_len_in5utronly, meta_uORF_exp_counts_in5utronly_noCDS,
        meta_uORF_exp_len_in5utronly_noCDS
    ]

    print('...Done!')
    return result
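
# A parallel driver sketch (an assumption, not from the source): only file
# paths and picklable dictionaries cross the process boundary, since the
# BAMGenomeArray must be rebuilt inside every call. All names here are
# placeholders for objects built earlier in the workflow.
from functools import partial
import multiprocessing

worker = partial(count_all_meta_regions_from_genewise_dicts,
                 full_transcript_dict=full_transcript_dict,
                 meta_uORFs_computational_dict=meta_uORFs_computational_dict,
                 meta_uORFs_experimental_dict=meta_uORFs_experimental_dict,
                 path_to_bam='CHX.bam',
                 path_to_psite='CHX_p_offsets.txt')
with multiprocessing.Pool(processes=8) as pool:
    rows = pool.map(worker, list(full_transcript_dict))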