예제 #1
0
def ribo_seq_read_counting_raw(gene,
                               sqlite_path_organism,
                               sqlite_path_reads,
                               count_type="range",
                               unique=True):
    """Count reads over the ORF regions of a gene's protein-coding transcripts.

    Looks up the protein-coding transcript ids for ``gene``, fetches their
    genomic ORF coordinate ranges (optionally reduced by
    ``get_unique_regions``) and counts reads over those regions, using the
    part of each read selected by ``count_type``. The raw counts are printed
    to stdout and returned unchanged.

    Args:
        gene: Gene identifier, passed through to the lookup helpers.
        sqlite_path_organism: Handed to the organism-side helpers
            (presumably the path to the organism SQLite file -- confirm).
        sqlite_path_reads: Handed to the read-fetching helpers
            (presumably the path to the reads SQLite file -- confirm).
        count_type: Part of the read used for feature counting; one of
            ``"range"``, ``"fiveprime"`` or ``"asite"``.
        unique: When true, the ORF regions are passed through
            ``get_unique_regions`` before counting.

    Returns:
        Whatever the counting helper for the chosen ``count_type`` returns,
        or the string ``"ERROR"`` when ``count_type`` is not supported.
    """
    # Validate up front so an unsupported count_type is rejected before any
    # database lookups are made.
    if count_type not in ("range", "fiveprime", "asite"):
        print(
            "The count type must be one of 'range', 'fiveprime' or 'asite'. "
            "count_type refers to the part of the read that is used in the feature counting process"
        )
        return "ERROR"

    supported = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    # Reads are always fetched with filter=True in this function.
    exclude = True
    orf_regions = genomic_orf_coordinate_ranges(gene, sqlite_path_organism,
                                                supported)
    if unique:
        orf_regions = get_unique_regions(orf_regions)

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene,
            sqlite_path_reads,
            sqlite_path_organism,
            supported,
            filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(
            orf_regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(
            gene,
            sqlite_path_reads,
            sqlite_path_organism,
            supported,
            filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            orf_regions, genomic_read_positions)
    else:  # "asite" -- the only remaining validated value
        genomic_read_positions = get_reads_per_genomic_location_asite(
            gene,
            sqlite_path_reads,
            sqlite_path_organism,
            supported,
            filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            orf_regions, genomic_read_positions)

    # The raw counts are echoed to stdout as well as returned; kept because
    # existing callers may rely on that output.
    print(counts)
    return counts
예제 #2
0
def ribo_seq_read_counting(gene, sqlite_path_organism, sqlite_path_reads, count_type="range", unique=True):
    """Compute per-transcript average coverage over a gene's ORF regions.

    Looks up the protein-coding transcript ids for ``gene``, fetches their
    genomic ORF coordinate ranges (optionally reduced by
    ``get_unique_regions``), counts reads over those regions using the part
    of each read selected by ``count_type``, and then turns the counts into
    coverage via ``get_coverage_per_region`` followed by
    ``average_coverage_per_transcript``.

    Args:
        gene: Gene identifier, passed through to the lookup helpers.
        sqlite_path_organism: Handed to the organism-side helpers
            (presumably the path to the organism SQLite file -- confirm).
        sqlite_path_reads: Handed to the read-fetching helpers
            (presumably the path to the reads SQLite file -- confirm).
        count_type: Part of the read used for feature counting; one of
            ``"range"``, ``"fiveprime"`` or ``"asite"``.
        unique: When true, the ORF regions are passed through
            ``get_unique_regions`` before counting.

    Returns:
        The result of ``average_coverage_per_transcript``, or the string
        ``"ERROR"`` when ``count_type`` is not supported.
    """
    # Validate up front so an unsupported count_type is rejected before any
    # database lookups are made.
    if count_type not in ("range", "fiveprime", "asite"):
        print("The count type must be one of 'range', 'fiveprime' or 'asite'. "
                "count_type refers to the part of the read that is used in the feature counting process")
        return "ERROR"

    supported = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    # Reads are always fetched with filter=True in this function.
    exclude = True
    orf_regions = genomic_orf_coordinate_ranges(gene, sqlite_path_organism, supported)
    if unique:
        orf_regions = get_unique_regions(orf_regions)

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(gene, sqlite_path_reads, sqlite_path_organism, supported,
                                                               filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(orf_regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(gene, sqlite_path_reads, sqlite_path_organism,
                                                                          supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(orf_regions, genomic_read_positions)
    else:  # "asite" -- the only remaining validated value
        genomic_read_positions = get_reads_per_genomic_location_asite(gene, sqlite_path_reads, sqlite_path_organism,
                                                                      supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(orf_regions, genomic_read_positions)

    region_coverage = get_coverage_per_region(orf_regions, counts)
    return average_coverage_per_transcript(region_coverage)
예제 #3
0
def rna_seq_read_counting(gene, sqlite_path_organism, sqlite_path_reads, exclude=True, count_type="range"):
    """List the transcripts of a gene that are needed to explain its reads.

    Starts from the transcripts returned by ``explain_exon_junctions`` and
    adds every transcript whose unique regions have a positive read count.
    Also reports on stdout which transcript has the highest summed count.

    Args:
        gene: Gene identifier, passed through to the lookup helpers.
        sqlite_path_organism: Handed to the organism-side helpers
            (presumably the path to the organism SQLite file -- confirm).
        sqlite_path_reads: Handed to the read-fetching helpers
            (presumably the path to the reads SQLite file -- confirm).
        exclude: When true, the transcript list comes from
            ``filter_unsupported_transcripts``; otherwise the first field of
            every ``get_gene_info`` row is used. The flag is also forwarded
            as ``filter=`` to the read and junction helpers.
        count_type: Part of the read used for feature counting; one of
            ``"range"``, ``"fiveprime"`` or ``"asite"``.

    Returns:
        The list from ``explain_exon_junctions`` extended with the string
        form of each additional transcript that has reads, or the string
        ``"ERROR"`` when ``count_type`` is not supported.
    """
    # Validate up front so an unsupported count_type is rejected before any
    # database lookups or junction scoring are done.
    if count_type not in ("range", "fiveprime", "asite"):
        print("The count type must be one of 'range', 'fiveprime' or 'asite'. "
                "count_type refers to the part of the read that is used in the feature counting process")
        return "ERROR"

    if exclude:
        supported = filter_unsupported_transcripts(gene, sqlite_path_organism, sqlite_path_reads)
    else:
        gene_info = get_gene_info(gene, sqlite_path_organism)
        supported = [i[0] for i in gene_info]

    genomic_exon_coordinates = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, supported)
    unique_regions = get_unique_regions(genomic_exon_coordinates)
    all_junctions, unique_junctions = unique_exon_junctions(genomic_exon_coordinates)
    junction_scores = get_scores_per_exonjunction_for_gene(gene, sqlite_path_organism, sqlite_path_reads, supported, filter=exclude)
    transcripts_to_explain_reads = explain_exon_junctions(junction_scores, all_junctions, unique_junctions)

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(gene, sqlite_path_reads, sqlite_path_organism, supported,
                                                               filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(unique_regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(gene, sqlite_path_reads, sqlite_path_organism,
                                                                          supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(unique_regions, genomic_read_positions)
    else:  # "asite" -- the only remaining validated value
        genomic_read_positions = get_reads_per_genomic_location_asite(gene, sqlite_path_reads, sqlite_path_organism,
                                                                      supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(unique_regions, genomic_read_positions)

    maximum_sum = 0
    # Bound before the loop: with no transcripts, or only zero counts, the
    # loop never assigns it and the report below would otherwise raise
    # UnboundLocalError.
    maximum_transcript = None

    for transcript in counts:
        transcript_sum = sum(counts[transcript])
        if (transcript_sum > 0) and (transcript not in transcripts_to_explain_reads):
            transcripts_to_explain_reads.append(str(transcript))

        if transcript_sum > maximum_sum:
            maximum_sum = transcript_sum
            maximum_transcript = transcript

    if maximum_transcript is None:
        print("No transcript has uniquely mapped reads")
    else:
        print("The transcript with most uniquely mapped reads is {maximum_transcript} with a score of {maximum_sum}".format(maximum_transcript=str(maximum_transcript), maximum_sum=maximum_sum) )
    return transcripts_to_explain_reads
예제 #4
0
def classify_regions_pos_neg(gene,
                             sqlite_path_reads,
                             sqlite_path_organism,
                             regions,
                             supported,
                             exclude=True,
                             count_type="range"):
    """Split each transcript's regions into read-supported and unsupported.

    A region is ``'pos'`` when its count is greater than zero (one or more
    reads support it) and ``'neg'`` otherwise.

    Args:
        gene: Gene identifier, passed through to the read-fetching helpers.
        sqlite_path_reads: Handed to the read-fetching helpers
            (presumably the path to the reads SQLite file -- confirm).
        sqlite_path_organism: Handed to the read-fetching helpers
            (presumably the path to the organism SQLite file -- confirm).
        regions: Indexable by transcript; ``regions[transcript]`` is paired
            element-wise with that transcript's counts.
        supported: Transcript ids forwarded to the read-fetching helpers.
        exclude: Forwarded as ``filter=`` to the read-fetching helpers.
        count_type: Part of the read used for feature counting; one of
            ``"range"``, ``"fiveprime"`` or ``"asite"``.

    Returns:
        A nested dictionary: the outer keys are transcripts, the inner keys
        are ``'pos'`` and ``'neg'``, each mapping to a list of regions.

    Raises:
        ValueError: If ``count_type`` is not one of the supported values.
            Previously this surfaced as an ``UnboundLocalError`` because no
            branch assigned the counts.
    """
    if count_type not in ("range", "fiveprime", "asite"):
        raise ValueError(
            "The count type must be one of 'range', 'fiveprime' or 'asite'. "
            "count_type refers to the part of the read that is used in the feature counting process"
        )

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene,
            sqlite_path_reads,
            sqlite_path_organism,
            supported,
            filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(
            regions, genomic_read_ranges)
    elif count_type == "fiveprime":
        genomic_read_positions = get_reads_per_genomic_location_fiveprime(
            gene,
            sqlite_path_reads,
            sqlite_path_organism,
            supported,
            filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            regions, genomic_read_positions)
    else:  # "asite" -- the only remaining validated value
        genomic_read_positions = get_reads_per_genomic_location_asite(
            gene,
            sqlite_path_reads,
            sqlite_path_organism,
            supported,
            filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            regions, genomic_read_positions)

    classified = {}
    for transcript in counts:
        buckets = classified.setdefault(transcript, {'pos': [], 'neg': []})
        # zip stops at the shorter sequence, so regions without a matching
        # count (or the reverse) are left unclassified.
        for region, read_count in zip(regions[transcript], counts[transcript]):
            if read_count > 0:
                buckets['pos'].append(region)
            else:
                buckets['neg'].append(region)
    return classified