예제 #1
0
def ribo_seq_read_counting_raw(gene,
                               sqlite_path_organism,
                               sqlite_path_reads,
                               count_type="range",
                               unique=True):
    """Return raw read counts per ORF region for a gene's coding transcripts.

    Args:
        gene: Gene identifier handed to the annotation/read helpers.
        sqlite_path_organism: Path to the organism annotation sqlite file.
        sqlite_path_reads: Read sqlite file path(s), passed through to the
            read-fetching helpers unchanged.
        count_type: Which part of a read is used for counting; one of
            'range', 'fiveprime' or 'asite'.
        unique: When true, the ORF regions are first reduced with
            ``get_unique_regions``.

    Returns:
        The counts object produced by the counting helper (it is also
        printed), or the string "ERROR" when ``count_type`` is unsupported.
    """
    # Validate before touching the databases so a bad argument fails fast
    # instead of after several lookups.
    if count_type not in ("range", "fiveprime", "asite"):
        print(
            "The count type must be one of 'range', 'fiveprime' or 'asite'. "
            "count_type refers to the part of the read that is used in the feature counting process"
        )
        return "ERROR"

    supported = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    orf_regions = genomic_orf_coordinate_ranges(gene, sqlite_path_organism,
                                                supported)
    if unique:
        orf_regions = get_unique_regions(orf_regions)

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene,
            sqlite_path_reads,
            sqlite_path_organism,
            supported,
            filter=True)
        counts = count_readranges_supporting_exons_per_transcript(
            orf_regions, genomic_read_ranges)
    else:
        # 'fiveprime' and 'asite' differ only in how read positions are
        # fetched; the counting step is shared.
        if count_type == "fiveprime":
            fetch_positions = get_reads_per_genomic_location_fiveprime
        else:
            fetch_positions = get_reads_per_genomic_location_asite
        genomic_read_positions = fetch_positions(
            gene,
            sqlite_path_reads,
            sqlite_path_organism,
            supported,
            filter=True)
        counts = count_read_supporting_regions_per_transcript(
            orf_regions, genomic_read_positions)

    print(counts)
    return counts
예제 #2
0
def ribo_seq_read_counting(gene, sqlite_path_organism, sqlite_path_reads, count_type="range", unique=True):
    """Return the average coverage per transcript over a gene's ORF regions.

    Args:
        gene: Gene identifier handed to the annotation/read helpers.
        sqlite_path_organism: Path to the organism annotation sqlite file.
        sqlite_path_reads: Read sqlite file path(s), passed through to the
            read-fetching helpers unchanged.
        count_type: Which part of a read is used for counting; one of
            'range', 'fiveprime' or 'asite'.
        unique: When true, the ORF regions are first reduced with
            ``get_unique_regions``.

    Returns:
        The result of ``average_coverage_per_transcript`` applied to the
        per-region coverage, or the string "ERROR" when ``count_type`` is
        unsupported.
    """
    # Validate before touching the databases so a bad argument fails fast.
    if count_type not in ("range", "fiveprime", "asite"):
        print("The count type must be one of 'range', 'fiveprime' or 'asite'. "
                "count_type refers to the part of the read that is used in the feature counting process")
        return "ERROR"

    supported = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    orf_regions = genomic_orf_coordinate_ranges(gene, sqlite_path_organism, supported)
    if unique:
        orf_regions = get_unique_regions(orf_regions)

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(gene, sqlite_path_reads, sqlite_path_organism,
                                                               supported, filter=True)
        counts = count_readranges_supporting_exons_per_transcript(orf_regions, genomic_read_ranges)
    else:
        # 'fiveprime' and 'asite' share the counting step and differ only in
        # the helper that fetches read positions.
        if count_type == "fiveprime":
            fetch_positions = get_reads_per_genomic_location_fiveprime
        else:
            fetch_positions = get_reads_per_genomic_location_asite
        genomic_read_positions = fetch_positions(gene, sqlite_path_reads, sqlite_path_organism,
                                                 supported, filter=True)
        counts = count_read_supporting_regions_per_transcript(orf_regions, genomic_read_positions)

    region_coverage = get_coverage_per_region(orf_regions, counts)
    return average_coverage_per_transcript(region_coverage)
예제 #3
0
def rna_seq_read_counting(gene, sqlite_path_organism, sqlite_path_reads, exclude=True, count_type="range"):
    """Return the transcripts of a gene needed to explain its RNA-seq reads.

    The list starts from ``explain_exon_junctions`` and is extended with every
    transcript that has at least one read counted in its unique regions.

    Args:
        gene: Gene identifier handed to the annotation/read helpers.
        sqlite_path_organism: Path to the organism annotation sqlite file.
        sqlite_path_reads: Read sqlite file path(s), passed through to the
            read-fetching helpers unchanged.
        exclude: When true, only transcripts returned by
            ``filter_unsupported_transcripts`` are considered; the flag is
            also forwarded as ``filter`` to the read helpers.
        count_type: Which part of a read is used for counting; one of
            'range', 'fiveprime' or 'asite'.

    Returns:
        The list of transcripts, or the string "ERROR" when ``count_type``
        is unsupported.
    """
    # Validate before touching the databases so a bad argument fails fast.
    if count_type not in ("range", "fiveprime", "asite"):
        print("The count type must be one of 'range', 'fiveprime' or 'asite'. "
                "count_type refers to the part of the read that is used in the feature counting process")
        return "ERROR"

    if exclude:
        supported = filter_unsupported_transcripts(gene, sqlite_path_organism, sqlite_path_reads)
    else:
        gene_info = get_gene_info(gene, sqlite_path_organism)
        supported = [i[0] for i in gene_info]

    genomic_exon_coordinates = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, supported)
    unique_regions = get_unique_regions(genomic_exon_coordinates)
    all_junctions, unique_junctions = unique_exon_junctions(genomic_exon_coordinates)
    junction_scores = get_scores_per_exonjunction_for_gene(gene, sqlite_path_organism, sqlite_path_reads,
                                                           supported, filter=exclude)
    transcripts_to_explain_reads = explain_exon_junctions(junction_scores, all_junctions, unique_junctions)

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(gene, sqlite_path_reads, sqlite_path_organism,
                                                               supported, filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(unique_regions, genomic_read_ranges)
    else:
        # 'fiveprime' and 'asite' share the counting step and differ only in
        # the helper that fetches read positions.
        if count_type == "fiveprime":
            fetch_positions = get_reads_per_genomic_location_fiveprime
        else:
            fetch_positions = get_reads_per_genomic_location_asite
        genomic_read_positions = fetch_positions(gene, sqlite_path_reads, sqlite_path_organism,
                                                 supported, filter=exclude)
        counts = count_read_supporting_regions_per_transcript(unique_regions, genomic_read_positions)

    maximum_sum = 0
    # Stays None when no transcript has any reads; previously the name was
    # left unbound in that case and the report below crashed.
    maximum_transcript = None

    for transcript in counts:
        transcript_sum = sum(counts[transcript])
        if (transcript_sum > 0) and (transcript not in transcripts_to_explain_reads):
            transcripts_to_explain_reads.append(str(transcript))

        if transcript_sum > maximum_sum:
            maximum_sum = transcript_sum
            maximum_transcript = transcript

    if maximum_transcript is None:
        print("No transcript has uniquely mapped reads")
    else:
        print("The transcript with most uniquely mapped reads is {maximum_transcript} with a score of {maximum_sum}".format(maximum_transcript=str(maximum_transcript), maximum_sum=maximum_sum) )
    return transcripts_to_explain_reads
예제 #4
0
def classify_regions_pos_neg(gene,
                             sqlite_path_reads,
                             sqlite_path_organism,
                             regions,
                             supported,
                             exclude=True,
                             count_type="range"):
    """Split each transcript's regions into read-supported and unsupported.

    A region is 'pos' when its count is greater than zero and 'neg'
    otherwise.

    Args:
        gene: Gene identifier handed to the read helpers.
        sqlite_path_reads: Read sqlite file path(s), passed through unchanged.
        sqlite_path_organism: Path to the organism annotation sqlite file.
        regions: Mapping from transcript to its sequence of regions; indexed
            by the transcripts found in the counts.
        supported: Transcripts to consider, passed through to the helpers.
        exclude: Forwarded as ``filter`` to the read helpers.
        count_type: Which part of a read is used for counting; one of
            'range', 'fiveprime' or 'asite'.

    Returns:
        A nested dict ``{transcript: {'pos': [...], 'neg': [...]}}``.

    Raises:
        ValueError: If ``count_type`` is not one of the supported values.
            (Previously this surfaced as an UnboundLocalError further down.)
    """
    if count_type not in ("range", "fiveprime", "asite"):
        raise ValueError(
            "The count type must be one of 'range', 'fiveprime' or 'asite', got {!r}".format(count_type))

    if count_type == "range":
        genomic_read_ranges = get_read_ranges_genomic_location(
            gene,
            sqlite_path_reads,
            sqlite_path_organism,
            supported,
            filter=exclude)
        counts = count_readranges_supporting_exons_per_transcript(
            regions, genomic_read_ranges)
    else:
        # 'fiveprime' and 'asite' share the counting step and differ only in
        # the helper that fetches read positions.
        if count_type == "fiveprime":
            fetch_positions = get_reads_per_genomic_location_fiveprime
        else:
            fetch_positions = get_reads_per_genomic_location_asite
        genomic_read_positions = fetch_positions(
            gene,
            sqlite_path_reads,
            sqlite_path_organism,
            supported,
            filter=exclude)
        counts = count_read_supporting_regions_per_transcript(
            regions, genomic_read_positions)

    classified = {}
    for transcript in counts:
        buckets = classified.setdefault(transcript, {'pos': [], 'neg': []})
        # Regions and counts are paired positionally for each transcript.
        for region, count in zip(regions[transcript], counts[transcript]):
            if count > 0:
                buckets['pos'].append(region)
            else:
                buckets['neg'].append(region)
    return classified
예제 #5
0
import time
from tripsCountpy2 import count_read_supporting_regions_per_transcript
from tripsSplicepy2 import genomic_exon_coordinate_ranges
from tripsSplicepy2 import get_protein_coding_transcript_ids
from tripsSplicepy2 import get_reads_per_genomic_location_asite

from tripsCountpy2 import ribo_seq_read_counting
from tripsCountpy2 import ribo_seq_read_counting_raw

if __name__ == "__main__":
    # Time one ORFquant OPM run for a single gene against a single read file.
    started_at = time.time()

    gene = "phpt1"
    sqlite_path_organism = "homo_sapiens.v2.sqlite"
    sqlite_path_reads = ["SRR2433794.sqlite"]

    coding_transcripts = get_protein_coding_transcript_ids(
        gene, sqlite_path_organism)

    # A-site positions are used here; a five-prime variant of this helper
    # could be swapped in instead.
    asite_positions = get_reads_per_genomic_location_asite(
        gene,
        sqlite_path_reads,
        sqlite_path_organism,
        coding_transcripts,
        filter=True)
    exon_ranges = genomic_exon_coordinate_ranges(
        gene, sqlite_path_organism, coding_transcripts)
    exon_counts = count_read_supporting_regions_per_transcript(
        exon_ranges, asite_positions)

    # NOTE(review): orfQuant_OPM is not among the imports visible in this
    # snippet - confirm where it is defined.
    opm_result = orfQuant_OPM(gene,
                              sqlite_path_organism,
                              sqlite_path_reads,
                              coding_transcripts,
                              exon_counts,
                              filter=True)

    finished_at = time.time()
    elapsed = finished_at - started_at
    print("ORFquant OPM time: " + str(elapsed))
def _transcript_table_for_gene(cursor, gene, gene_rows, sqlite_path_organism, riboseq_paths):
    """Build the "TRANSCRIPTS:..." reply listing every transcript of a gene.

    Args:
        cursor: Cursor on the organism annotation database.
        gene: Gene name the user entered.
        gene_rows: Rows of the ``transcripts`` table matching the gene; the
            first column is used as the transcript id.
        sqlite_path_organism: Path to the organism annotation sqlite file.
        riboseq_paths: Mapping whose values are passed to the read helpers.

    Returns:
        "TRANSCRIPTS" followed by one ``:id,version,length,cds_start,cds_len,
        three_utr_len,coverage`` entry per transcript.
    """
    # The previous implementation opened (and so truncated) logfile.txt here
    # without writing to it; the side effect is kept in case something
    # depends on it, but the handle is now always released.
    with open("logfile.txt", "w"):
        pass

    coding = get_protein_coding_transcript_ids(gene, sqlite_path_organism)
    genomic_read_positions = get_reads_per_genomic_location_asite(
        gene, riboseq_paths.values(), sqlite_path_organism, coding, filter=True)
    exons = genomic_exon_coordinate_ranges(gene, sqlite_path_organism, coding)
    counts = count_read_supporting_regions_per_transcript(exons, genomic_read_positions)
    orfQuant_res = orfQuant(gene, sqlite_path_organism, riboseq_paths.values(), coding, counts, filter=True)

    entries = ["TRANSCRIPTS"]
    for transcript in gene_rows:
        transcript_id = transcript[0]
        cursor.execute(
            "SELECT length,cds_start,cds_stop,principal,version from transcripts WHERE transcript = ?",
            (transcript_id,))
        tranlen, cds_start, cds_stop, _principal, version = cursor.fetchone()
        # Transcripts without a CDS report "NULL" lengths.
        if cds_start == "NULL" or cds_start is None:
            cdslen = "NULL"
            threeutrlen = "NULL"
        else:
            cdslen = cds_stop - cds_start
            threeutrlen = tranlen - cds_stop
        if transcript_id in orfQuant_res:
            coverage = orfQuant_res[transcript_id]
        else:
            coverage = "NULL"
        entries.append("{},{},{},{},{},{},{}".format(
            transcript_id, version, tranlen, cds_start, cdslen, threeutrlen, coverage))
    return ":".join(entries)


def _plot_settings_for_user(user):
    """Return ``(settings, seq_rules)`` for *user*, falling back to config.

    ``settings`` maps plot-setting names to values; ``seq_rules`` maps
    sequence-type names to ``{"frame_breakdown": ...}``. When *user* is None,
    or has no stored rows, the defaults from ``config`` are returned.
    """
    setting_names = ("background_col", "uga_col", "uag_col", "uaa_col", "title_size",
                     "subheading_size", "axis_label_size", "marker_size", "cds_marker_size",
                     "cds_marker_colour", "legend_size", "ribo_linewidth")
    settings = dict(zip(setting_names, (
        config.BACKGROUND_COL, config.UGA_COL, config.UAG_COL, config.UAA_COL, config.TITLE_SIZE,
        config.SUBHEADING_SIZE, config.AXIS_LABEL_SIZE, config.MARKER_SIZE, config.CDS_MARKER_SIZE,
        config.CDS_MARKER_COLOUR, config.LEGEND_SIZE, config.RIBO_LINEWIDTH)))
    # Put any publicly available seq types (apart from riboseq and rnaseq) here
    seq_rules = {"proteomics": {"frame_breakdown": 1},
                 "conservation": {"frame_breakdown": 1},
                 "tcpseq": {"frame_breakdown": 0}}
    if user is None:
        return settings, seq_rules

    connection = sqlite3.connect('{}/trips.sqlite'.format(config.SCRIPT_LOC))
    try:
        connection.text_factory = str
        cursor = connection.cursor()
        # Bind the username as a parameter; the previous query text contained
        # no placeholder, so the user was never actually substituted.
        cursor.execute("SELECT user_id from users WHERE username = ?;", (user,))
        user_row = cursor.fetchone()
        if user_row is None:
            return settings, seq_rules
        user_id = user_row[0]

        # The original compared against a quoted value, so bind it as text.
        cursor.execute(
            "SELECT background_col,uga_col,uag_col,uaa_col,title_size,subheading_size,axis_label_size,"
            "marker_size,cds_marker_width,cds_marker_colour,legend_size,ribo_linewidth "
            "from user_settings WHERE user_id = ?;", (str(user_id),))
        settings_row = cursor.fetchone()
        if settings_row is not None:
            settings = dict(zip(setting_names, settings_row))

        # Rules for all custom seq types of this user.
        cursor.execute("SELECT * from seq_rules WHERE user_id = ?;", (user_id,))
        for row in cursor.fetchall():
            seq_rules[row[1]] = {"frame_breakdown": row[2]}
    finally:
        connection.close()
    return settings, seq_rules


def query():
    """Serve an interactive-plot request for a transcript or gene.

    The request body is parsed into a dict of plot options. The entered name
    is looked up as a transcript first and as a gene second:

    * a gene with several transcripts yields a "TRANSCRIPTS:..." table so the
      caller can pick one;
    * an unknown name, organism or annotation file yields an error string;
    * otherwise the result of ``riboflask.generate_plot`` is returned.

    Returns:
        A string (error message or transcript table) or whatever
        ``riboflask.generate_plot`` returns.
    """
    global user_short_passed
    # NOTE(review): literal_eval only accepts Python literals, so this is not
    # arbitrary code execution, but the payload is client-controlled - confirm
    # whether json.loads would be the more appropriate parser.
    data = ast.literal_eval(request.data)

    tran = data['transcript'].upper().strip()
    readscore = data['readscore']
    secondary_readscore = data['secondary_readscore']
    minread = int(data['minread'])
    maxread = int(data['maxread'])
    minfiles = int(data['minfiles'])
    organism = data['organism']
    seqhili = data['seqhili'].split(",")
    hili_start = int(data['hili_start'])
    hili_stop = int(data['hili_stop'])
    transcriptome = data['transcriptome']
    advanced = data["advanced"]

    # Send file_list (a list of integers intentionally encoded as strings due to javascript),
    # to be converted to a dictionary with riboseq/rnaseq lists of file paths.
    file_paths_dict = fetch_file_paths(data["file_list"], organism)

    primetype = data["primetype"]
    user_hili_starts = data["user_hili_starts"]
    user_hili_stops = data["user_hili_stops"]
    user_short = data["user_short"]

    # Look up who owns the organism/transcriptome; values are bound rather
    # than formatted into the SQL because they come from the request.
    connection = sqlite3.connect('{}/trips.sqlite'.format(config.SCRIPT_LOC))
    try:
        connection.text_factory = str
        cursor = connection.cursor()
        cursor.execute(
            "SELECT owner FROM organisms WHERE organism_name = ? and transcriptome_list = ?;",
            (organism, transcriptome))
        owner_row = cursor.fetchone()
    finally:
        connection.close()
    if owner_row is None:
        return "ERROR! Could not find organism {} with transcriptome {}".format(organism, transcriptome)
    owner = owner_row[0]

    if owner == 1:
        sqlite_path_organism = "{0}{1}/{1}.{2}.sqlite".format(config.ANNOTATION_DIR, organism, transcriptome)
        if not os.path.isfile(sqlite_path_organism):
            return "Cannot find annotation file {}.{}.sqlite".format(organism, transcriptome)
    else:
        sqlite_path_organism = "{0}transcriptomes/{1}/{2}/{3}/{2}_{3}.v2.sqlite".format(
            config.UPLOADS_DIR, owner, organism, transcriptome)

    # Resolve the entered name: transcript id first, gene name second.
    transhelve = sqlite3.connect(sqlite_path_organism)
    try:
        cursor = transhelve.cursor()
        cursor.execute("SELECT * from transcripts WHERE transcript = ?", (tran,))
        if cursor.fetchone() is None:
            cursor.execute("SELECT * from transcripts WHERE gene = ?", (tran,))
            gene_rows = cursor.fetchall()
            if not gene_rows:
                return "ERROR! Could not find any transcript corresponding to {}".format(tran)
            if len(gene_rows) > 1:
                return _transcript_table_for_gene(
                    cursor, tran, gene_rows, sqlite_path_organism, file_paths_dict["riboseq"])
            tran = str(gene_rows[0][0])
    finally:
        transhelve.close()

    # Optional flags are signalled purely by the presence of their key.
    lite = "y" if 'varlite' in data else "n"
    uga_diff = 'uga_diff' in data
    color_readlen_dist = 'color_readlen_dist' in data
    ribocoverage = 'ribocoverage' in data
    nucseq = "nucseq" in data
    mismatches = "mismatches" in data
    ambiguous = "ambig" if "ambiguous" in data else "unambig"
    pcr = "pcr" in data
    noisered = "noisered" in data

    # A user-supplied short code is honoured once; afterwards a fresh one is
    # generated for every request.
    if user_short == "None" or user_short_passed:
        short_code = generate_short_code(data, organism, data["transcriptome"], "interactive_plot")
    else:
        short_code = user_short
        user_short_passed = True

    try:
        user = current_user.name
    except Exception:
        # presumably an anonymous user without a name attribute - treat as
        # not logged in.
        user = None

    settings, seq_rules = _plot_settings_for_user(user)

    if tran != "":
        x = riboflask.generate_plot(
            tran, ambiguous, minread, maxread, lite, ribocoverage, organism, readscore, noisered, primetype,
            minfiles, nucseq, user_hili_starts, user_hili_stops, uga_diff, file_paths_dict, short_code,
            color_readlen_dist, settings["background_col"], settings["uga_col"], settings["uag_col"],
            settings["uaa_col"], advanced, config.ANNOTATION_DIR, seqhili, seq_rules, settings["title_size"],
            settings["subheading_size"], settings["axis_label_size"], settings["marker_size"], transcriptome,
            config.UPLOADS_DIR, settings["cds_marker_size"], settings["cds_marker_colour"],
            settings["legend_size"], settings["ribo_linewidth"], secondary_readscore, pcr, mismatches,
            hili_start, hili_stop)
    else:
        x = "ERROR! Could not find any transcript corresponding to whatever you entered"
    return x