def profile_plot(bed_file, variable="transposons", chrom='I', bar_width=None, savefig=False):
    '''Create a bar plot of transposon or read counts along one chromosome.

    The height of each bar is the summed count of all basepair positions that
    fall in that bin. A narrow strip below the bar plot marks every gene on the
    chromosome, color coded by whether the gene is in the known-essential list.

    Input:
        - bed_file: path to the bed file. Column 2 is used as the insertion
          position and, for variable="reads", column 5 is converted to a read
          count as (value - 100) / 20.
        - variable: either "transposons" or "reads". Any other value prints an
          error and exits with status 1.
        - chrom: roman numeral of the chromosome to plot.
        - bar_width: number of basepairs per bin (positive integer). Defaults
          to length_chromosome / 800, with a minimum of 1.
        - savefig: when true the figure is saved next to the bed file as
          <bed name>_transposonplot_chrom<chrom>.png or
          <bed name>_readplot_chrom<chrom>.png, otherwise it is shown.

    Few basepairs per bin may be slow to plot. Too many basepairs per bin may
    obscure areas with few transposons.

    Required data (read relative to the module level 'file_dirname'): the .gff3
    file (for 'chromosome_position' and 'gene_position') and the two essential
    gene lists (for 'list_known_essentials').

    Raises:
        ValueError: when chrom is not present in the chromosome length table or
        when bar_width is smaller than 1.
    '''
    #%% VALIDATE INPUT
    # Checked before any file is read so that a typo fails fast.
    if variable not in ("transposons", "reads"):
        print(
            "ERROR: No valid variable argument given. Use transposons or reads"
        )
        sys.exit(1)

    #%% USED FILES
    data_dir = os.path.join(file_dirname, '..', 'data_files')
    gff_file = os.path.join(data_dir,
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    essential_genes_files = [
        os.path.join(data_dir, 'Cerevisiae_EssentialGenes_List_1.txt'),
        os.path.join(data_dir, 'Cerevisiae_EssentialGenes_List_2.txt')
    ]

    #%% GET CHROMOSOME LENGTH
    chr_length_dict = chromosome_position(gff_file)[0]
    chrom_length = chr_length_dict.get(chrom)
    print('Chromosome length: ', chrom_length)
    if chrom_length is None:
        raise ValueError("Unknown chromosome %r" % (chrom, ))

    if bar_width is None:
        # At least one basepair per bin, also for chromosomes shorter than 800 bp.
        bar_width = max(1, int(chrom_length / 800))
    else:
        bar_width = int(bar_width)
        if bar_width < 1:
            raise ValueError("bar_width must be a positive integer")

    #%% GET ALL GENES IN CURRENT CHROMOSOME
    gene_pos_dict = gene_position(gff_file)
    genes_currentchrom_pos_list = [
        gene for gene, position in gene_pos_dict.items() if chrom in position
    ]
    # A set makes the per-gene membership test below O(1).
    genes_essential = set(list_known_essentials(essential_genes_files))

    #%% READ BED FILE
    with open(bed_file) as f:
        lines = f.readlines()

    #%% GET THE LINE RANGE OF THE CHROMOSOME IN THE BED FILE
    chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(
        bed_file)[1:3]
    first_line = chrom_start_index_dict.get(chrom)
    last_line = chrom_end_index_dict.get(chrom)

    #%% GET ALL COUNTS PER BASEPAIR
    allcounts_list = np.zeros(chrom_length + 1)
    for line in lines[first_line:last_line + 1]:
        fields = line.strip('\n').split()
        index = int(fields[1]) - 1
        if variable == "transposons":
            allcounts_list[index] += 1
        else:
            allcounts_list[index] += (int(fields[4]) - 100) / 20

    #%% BINNING
    # Plotting every basepair is slow, so the counts are summed in consecutive
    # groups of 'bar_width' basepairs. Every position ends up in exactly one
    # bin; the last bin holds the remainder of the chromosome.
    bin_starts = np.arange(0, len(allcounts_list), bar_width)
    allcounts_binnedlist = np.add.reduceat(allcounts_list, bin_starts)

    #%% PLOTTING
    print('Plotting chromosome ', chrom, '...')
    print('bar width for plotting is ', bar_width)

    textsize = 18
    textcolor = "#000000"

    plt.figure(figsize=(19, 9))
    grid = plt.GridSpec(20, 1, wspace=0.0, hspace=0.0)

    ax = plt.subplot(grid[0:19, 0])
    # align='edge' makes each bar cover exactly the basepairs that it sums.
    ax.bar(bin_starts,
           allcounts_binnedlist,
           width=bar_width,
           align='edge',
           color="#000000")
    ax.tick_params(axis='both', which='major', labelsize=textsize)
    ax.set_axisbelow(True)
    ax.grid(True)
    ax.set_xlim(0, chrom_length)
    ax.tick_params(axis='x', which='major', pad=30)
    ax.ticklabel_format(axis='x', style='sci', scilimits=(0, 0))
    ax.xaxis.get_offset_text().set_fontsize(textsize)
    ax.set_xlabel("Basepair position on chromosome " + chrom,
                  fontsize=textsize,
                  color=textcolor,
                  labelpad=10)
    if variable == "transposons":
        ax.set_ylabel('Transposon count',
                      fontsize=textsize,
                      color=textcolor,
                      labelpad=25)
    else:
        ax.set_ylabel('Read count',
                      fontsize=textsize,
                      color=textcolor,
                      labelpad=25)

    # Strip below the profile that marks the genes on this chromosome.
    axc = plt.subplot(grid[19, 0])
    for gene in genes_currentchrom_pos_list:
        gene_position_info = gene_pos_dict.get(gene)
        gene_start_pos = int(gene_position_info[1])
        gene_end_pos = int(gene_position_info[2])
        gene_color = "#00F28E" if gene in genes_essential else "#F20064"
        axc.axvspan(gene_start_pos,
                    gene_end_pos,
                    facecolor=gene_color,
                    alpha=0.8)

    axc.set_xlim(0, chrom_length)
    axc.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,  # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off
    axc.tick_params(
        axis='y',  # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left=False,  # ticks along the left edge are off
        right=False,  # ticks along the right edge are off
        labelleft=False)  # labels along the left edge are off

    #%% SAVE OR SHOW
    if savefig:
        if variable == "transposons":
            suffix = '_transposonplot_chrom'
        else:
            suffix = '_readplot_chrom'
        figure_path = os.path.splitext(bed_file)[0] + suffix + chrom + '.png'
        print('saving figure at %s' % figure_path)
        plt.savefig(figure_path, dpi=400)
        plt.close()
    else:
        plt.show()
def scatterplot(pergenefile):
    '''Plot the number of reads per insertion for every gene.

    The left panel is a scatter plot of the genes sorted by their number of
    reads per insertion, the right panel a histogram of the same values. Both
    are color coded by the annotated essentiality of the gene. The y-axis of
    both panels is limited to the range -1 to 100.

    Input:
        - pergenefile: path to a _pergene.txt file. The first line is skipped
          as header. Every other line holds a gene name, the number of
          insertions and the number of reads, separated by spaces or tabs.
          Lines that do not consist of exactly three fields are not parsed
          and show up as trailing rows filled with NaN.

    Output:
        - dataframe sorted by 'Nreadsperinsrt' with the columns gene_names,
          gene_essentiality, tn_per_gene, read_per_gene and Nreadsperinsrt.

    Requirements:
        - Cerevisiae_EssentialGenes_List_1.txt and
          Cerevisiae_EssentialGenes_List_2.txt in a 'data_files' directory one
          or two levels above the directory of this module or above the
          current working directory.

    Raises:
        AssertionError: when pergenefile does not exist.
        FileNotFoundError: when the essential gene lists cannot be located.
    '''
    #%% read file
    assert os.path.isfile(pergenefile), 'File not found at: %s' % pergenefile

    with open(pergenefile) as f:
        lines = f.readlines()[1:]  #skip header

    n_lines = len(lines)
    genenames_list = [np.nan] * n_lines
    tnpergene_list = [np.nan] * n_lines
    readpergene_list = [np.nan] * n_lines

    n_parsed = 0
    for line in lines:
        fields = [x for x in re.split(' |\t', line.strip('\n')) if x]
        if len(fields) == 3:
            genenames_list[n_parsed] = fields[0]
            tnpergene_list[n_parsed] = int(fields[1])
            readpergene_list[n_parsed] = int(fields[2])
            n_parsed += 1

    #%% determine number of reads per insertion per gene
    # Genes without insertions get 0 instead of a division by zero.
    readperinspergene_list = [
        0 if tn_count == 0 else read_count / tn_count
        for tn_count, read_count in zip(tnpergene_list, readpergene_list)
    ]

    #%% determine essential genes
    essential_list_names = ('Cerevisiae_EssentialGenes_List_1.txt',
                            'Cerevisiae_EssentialGenes_List_2.txt')
    base_dirs = []
    module_file = globals().get('__file__')  # not defined in interactive sessions
    if module_file:
        base_dirs.append(os.path.dirname(os.path.abspath(module_file)))
    base_dirs.append(os.getcwd())
    candidate_dirs = []
    for base_dir in base_dirs:
        candidate_dirs.append(os.path.join(base_dir, '..', 'data_files'))
        candidate_dirs.append(os.path.join(base_dir, '..', '..', 'data_files'))
    # Location that used to be hard-coded here; kept as a last resort so that
    # existing setups relying on it keep working.
    candidate_dirs.append(
        r"C:\Users\gregoryvanbeek\Documents\GitHub\LaanLab-SATAY-DataAnalysis\data_files"
    )

    data_dir = None
    for candidate in candidate_dirs:
        if os.path.isfile(os.path.join(candidate, essential_list_names[0])):
            data_dir = candidate
            break
    if data_dir is None:
        raise FileNotFoundError(
            'Could not find %s in any of: %s' %
            (essential_list_names[0], ', '.join(candidate_dirs)))

    known_essential_genes = set(
        list_known_essentials(input_files=[
            os.path.join(data_dir, name) for name in essential_list_names
        ]))

    geneessentiality_list = [
        name in known_essential_genes for name in genenames_list
    ]

    #%% create dataframe
    read_gene_dict = {
        "gene_names": genenames_list,
        "gene_essentiality": geneessentiality_list,
        "tn_per_gene": tnpergene_list,
        "read_per_gene": readpergene_list,
        "Nreadsperinsrt": readperinspergene_list
    }
    read_gene_df = pd.DataFrame(read_gene_dict, columns=list(read_gene_dict))

    #%% sort values
    read_gene_df = read_gene_df.sort_values(by=["Nreadsperinsrt"])
    n_genes = len(read_gene_df)
    x_lin = np.arange(n_genes)  # rank of each gene after sorting

    #%% plotting
    plt.figure(figsize=(19, 9))
    grid = plt.GridSpec(1, 20, wspace=0.0, hspace=0.0)

    ax1 = plt.subplot(grid[0, 0:15])
    # https://seaborn.pydata.org/generated/seaborn.diverging_palette.html
    colorpalette = sns.diverging_palette(10, 170, s=90, l=50, n=2)
    sns.scatterplot(x=x_lin,
                    y=read_gene_df.Nreadsperinsrt,
                    hue=read_gene_df.gene_essentiality,
                    palette=colorpalette,
                    alpha=0.5,
                    marker='|',
                    legend=True,
                    ax=ax1)
    ax1.grid(linestyle='-', alpha=0.8)
    ax1.set_xlim(-1, n_genes)
    ax1.set_ylim(-1, 100)
    ax1.set_xticklabels([])
    ax1.set_ylabel('Reads per insertion')
    ax1.set_xlabel('Genes')

    ax2 = plt.subplot(grid[0, 15:19])
    # The palette is reversed because the hue order is reversed (True first),
    # so both panels use the same color for the same essentiality.
    colorpalette = sns.diverging_palette(170, 10, s=90, l=50, n=2)
    sns.histplot(data=read_gene_df,
                 y="Nreadsperinsrt",
                 hue="gene_essentiality",
                 hue_order=[True, False],
                 palette=colorpalette,
                 alpha=0.5,
                 binwidth=1,
                 ax=ax2)
    legend = ax2.get_legend()
    if legend is not None:
        legend.remove()
    ax2.set_ylim(-1, 100)
    ax2.set_yticklabels([])
    ax2.set_ylabel('')
    ax2.grid(linestyle='-', alpha=0.8)

    return (read_gene_df)
def dataframe_from_pergenefile(pergenefile, verbose=True):
    '''Create a dataframe with the information from a pergene.txt file.

    Input:
        - pergenefile: path to a pergene.txt file. The first line is skipped
          as header. Every other line holds a gene name, the number of
          insertions and the number of reads, separated by spaces or tabs.
          Lines that do not consist of exactly three fields are not parsed
          and show up as trailing rows filled with NaN.
        - verbose: passed on to 'list_known_essentials'.

    Output is a dataframe where each row is a single gene, with the columns:
        - gene_names
        - gene_essentiality
        - tn_per_gene
        - read_per_gene
        - Nreadsperinsrt

    gene_essentiality is True for genes present in
    Cerevisiae_EssentialGenes_List_1.txt or Cerevisiae_EssentialGenes_List_2.txt.
    These files are looked up in '../../data_files' and then '../data_files'.
    The number of reads per insertion (Nreadsperinsrt) is read_per_gene divided
    by tn_per_gene, or 0 for genes without insertions.

    A more extensive version of this function is the python script
    genomicfeatures_dataframe.py found in the python_scripts folder.

    Raises:
        AssertionError: when pergenefile does not exist.
        FileNotFoundError: when the essential gene lists cannot be located.
    '''
    # NOTE(review): '__file__' is a quoted string here, so this resolves to the
    # current working directory rather than the directory of this module -
    # presumably on purpose for interactive use; confirm before changing.
    file_dirname = os.path.dirname(os.path.abspath('__file__'))
    modules_dir = os.path.join(file_dirname, 'python_modules')
    if modules_dir not in sys.path:  # do not grow sys.path on every call
        sys.path.insert(1, modules_dir)
    from essential_genes_names import list_known_essentials  #import essential_genes_names from python modules directory

    # read file
    assert os.path.isfile(pergenefile), 'File not found at: %s' % pergenefile

    with open(pergenefile) as f:
        lines = f.readlines()[1:]  #skip header

    n_lines = len(lines)
    genenames_list = [np.nan] * n_lines
    tnpergene_list = [np.nan] * n_lines
    readpergene_list = [np.nan] * n_lines

    n_parsed = 0
    for line in lines:
        fields = [x for x in re.split(' |\t', line.strip('\n')) if x]
        if len(fields) == 3:
            genenames_list[n_parsed] = fields[0]
            tnpergene_list[n_parsed] = int(fields[1])
            readpergene_list[n_parsed] = int(fields[2])
            n_parsed += 1

    # determine number of reads per insertion per gene
    # Genes without insertions get 0 instead of a division by zero.
    readperinspergene_list = [
        0 if tn_count == 0 else read_count / tn_count
        for tn_count, read_count in zip(tnpergene_list, readpergene_list)
    ]

    # determine essential genes
    essential_list_names = ('Cerevisiae_EssentialGenes_List_1.txt',
                            'Cerevisiae_EssentialGenes_List_2.txt')
    candidate_dirs = [
        os.path.join(file_dirname, '..', '..', 'data_files'),
        os.path.join(file_dirname, '..', 'data_files')
    ]
    data_dir = None
    for candidate in candidate_dirs:
        if os.path.isfile(os.path.join(candidate, essential_list_names[0])):
            data_dir = candidate
            break
    if data_dir is None:
        raise FileNotFoundError(
            'Could not find %s in any of: %s' %
            (essential_list_names[0], ', '.join(candidate_dirs)))

    known_essential_genes = set(
        list_known_essentials(input_files=[
            os.path.join(data_dir, name) for name in essential_list_names
        ],
                              verbose=verbose))

    geneessentiality_list = [
        name in known_essential_genes for name in genenames_list
    ]

    # create dataframe
    read_gene_dict = {
        "gene_names": genenames_list,
        "gene_essentiality": geneessentiality_list,
        "tn_per_gene": tnpergene_list,
        "read_per_gene": readpergene_list,
        "Nreadsperinsrt": readperinspergene_list
    }
    read_gene_df = pd.DataFrame(read_gene_dict, columns=list(read_gene_dict))

    return (read_gene_df)
def compareplot(bed_files=None, variable="insertions", chromosome=None, set_barwidth=None, set_logscale=False, savefig=False):
    '''Create a figure per chromosome that compares the counts of two bed files.

    Every figure holds two bar plots that share the same y-axis limit. The top
    plot shows the first bed file, the bottom plot (drawn upside down) shows
    the second bed file. On top of the dark count bars, light blue bars show
    how much the dataset in that plot exceeds the other dataset in each bin.
    The background is color coded for the genes on the chromosome (green for
    genes in the known-essential list, red for all other genes) and essential
    genes are labelled in the top plot.

    Input:
        - bed_files: list with the paths of two bed files [mandatory]. Only
          the first two entries are used.
        - variable: "insertions" (every bed line counts as one) or "reads"
          (column 5 of the bed file converted as (value - 100) / 20, the same
          conversion used by the profile plots).
        - chromosome: roman numeral, or list of roman numerals. When not given
          all chromosomes returned by 'chromosomename_roman_to_arabic' are
          plotted.
        - set_barwidth: number of basepairs per bin (positive integer).
          Defaults to length_chromosome / 500, with a minimum of 1.
        - set_logscale: when true the y-axes use a log scale instead of the
          shared linear limit.
        - savefig: when true every figure is saved next to the first bed file
          as <bed name>_compareplot_chrom<chromosome>.png and then closed.

    Few basepairs per bin may be slow to plot. Too many basepairs per bin may
    obscure areas with few transposons.

    Required data (read relative to the module level 'file_dirname'): the .gff3
    file (for 'chromosome_position' and 'gene_position'), the two essential
    gene lists (for 'list_known_essentials') and Yeast_Protein_Names.txt (for
    'gene_aliases').

    Raises:
        ValueError: when fewer than two bed files are given, when variable is
        not "insertions" or "reads", or when set_barwidth is smaller than 1.
    '''
    #%% VALIDATE INPUT
    if bed_files is None or len(bed_files) < 2:
        raise ValueError(
            "bed_files must be a list with the paths of two bed files")
    if variable not in ("insertions", "reads"):
        raise ValueError(
            "No valid variable argument given. Use insertions or reads")
    if set_barwidth is not None and int(set_barwidth) < 1:
        raise ValueError("set_barwidth must be a positive integer")

    #%% USED FILES
    data_dir = os.path.join(file_dirname, '..', 'data_files')
    gff_file = os.path.join(data_dir,
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    essential_genes_files = [
        os.path.join(data_dir, 'Cerevisiae_EssentialGenes_List_1.txt'),
        os.path.join(data_dir, 'Cerevisiae_EssentialGenes_List_2.txt')
    ]
    gene_information_file = os.path.join(data_dir, 'Yeast_Protein_Names.txt')

    #%% GET CHROMOSOME LENGTHS AND GENE INFORMATION
    chr_length_dict = chromosome_position(gff_file)[0]
    gene_pos_dict = gene_position(gff_file)
    # A set makes the per-gene membership test below O(1).
    genes_essential = set(
        list_known_essentials(essential_genes_files, verbose=False))
    gene_alias_list = gene_aliases(gene_information_file)[0]

    #%% DETERMINE WHICH CHROMOSOMES NEED TO BE ANALYZED
    if isinstance(chromosome, (list, tuple)):
        chrom_list = [name.upper() for name in chromosome]
    elif isinstance(chromosome, str):
        chrom_list = [chromosome.upper()]
    else:
        chrom_list = list(chromosomename_roman_to_arabic()[1])

    #%% READ BED FILES
    # Read once here instead of once per chromosome.
    bed_data = []
    for bed_file in bed_files[:2]:
        with open(bed_file) as f:
            lines = f.readlines()
        chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(
            bed_file)[1:3]
        bed_data.append(
            (bed_file, lines, chrom_start_index_dict, chrom_end_index_dict))

    if variable == "insertions":
        count_axis_label = 'Absolute insertion count'
        count_bar_label = 'Number of transposons'
    else:
        count_axis_label = 'Absolute read count'
        count_bar_label = 'Number of reads'
    font_size = 12

    #%% LOOP OVER THE CHROMOSOMES
    for chrom in chrom_list:
        print('')
        print('Analyzing chromosome: ', chrom)
        chrom_length = chr_length_dict.get(chrom)
        genes_currentchrom_pos_list = [
            gene for gene, position in gene_pos_dict.items()
            if chrom in position
        ]

        if set_barwidth is None:
            # At least one basepair per bin, also for short chromosomes.
            bar_width = max(1, int(chrom_length / 500))
        else:
            bar_width = int(set_barwidth)

        #%% GET ALL COUNTS AND BIN THEM
        # Both datasets use the same bins, so they can be compared per bin.
        # Every position ends up in exactly one bin of 'bar_width' basepairs;
        # the last bin holds the remainder of the chromosome.
        bin_starts = np.arange(0, chrom_length + 2, bar_width)
        binned_counts = []
        for bed_file, lines, chrom_start_index_dict, chrom_end_index_dict in bed_data:
            print("Processing file: %s" % bed_file)
            first_line = chrom_start_index_dict.get(chrom)
            last_line = chrom_end_index_dict.get(chrom)

            allcounts_list = np.zeros(chrom_length + 2)
            for line in lines[first_line:last_line + 1]:
                fields = line.strip('\n').split()
                position = int(fields[1])
                if variable == "insertions":
                    allcounts_list[position] += 1
                else:
                    allcounts_list[position] += (int(fields[4]) - 100) / 20

            binned_counts.append(np.add.reduceat(allcounts_list, bin_starts))

        #%% DETERMINE DIFFERENCE BETWEEN DATASETS
        difference = binned_counts[0] - binned_counts[1]
        positive_difference = np.clip(difference, 0, None)  # set1 > set2
        negative_difference = np.clip(-difference, 0, None)  # set2 > set1

        #%% PLOTTING
        print('Plotting chromosome ', chrom, '...')
        print('bar width for plotting is ', bar_width)

        # Same y-axis limit for both graphs, with 10% headroom.
        max_ylim = 1.1 * max(counts.max() for counts in binned_counts)

        plt.figure(figsize=(19, 9))
        grid = plt.GridSpec(2, 1, wspace=0.0, hspace=0.0)
        ax1 = plt.subplot(grid[0, 0])
        ax2 = plt.subplot(grid[1, 0])

        # Gene background for both graphs, gene names only in the top graph.
        for gene in genes_currentchrom_pos_list:
            gene_position_info = gene_pos_dict.get(gene)
            gene_start_pos = int(gene_position_info[1])
            gene_end_pos = int(gene_position_info[2])
            is_essential = gene in genes_essential
            gene_color = 'g' if is_essential else 'r'
            for axis in (ax1, ax2):
                axis.axvspan(gene_start_pos,
                             gene_end_pos,
                             facecolor=gene_color,
                             alpha=0.3)
            if is_essential:
                # Fall back to the gene name when no alias is known.
                aliases = gene_alias_list.get(gene)
                gene_label = aliases[0] if aliases else gene
                ax1.text(gene_start_pos, max_ylim, gene_label, rotation=45)

        # align='edge' makes each bar cover exactly the basepairs that it sums.
        ax1.bar(bin_starts,
                binned_counts[0],
                width=bar_width,
                align='edge',
                color=(0.2, 0.2, 0.2, 0.8))
        ax1.bar(bin_starts,
                positive_difference,
                width=bar_width,
                align='edge',
                color=(0.52, 0.71, 0.90, 0.8))

        ax2.bar(bin_starts,
                binned_counts[1],
                width=bar_width,
                align='edge',
                color=(0.2, 0.2, 0.2, 0.8),
                label=count_bar_label)
        ax2.bar(bin_starts,
                negative_difference,
                width=bar_width,
                align='edge',
                color=(0.52, 0.71, 0.90, 0.8),
                label='Absolute difference datasets (set1-set2)')

        for axis in (ax1, ax2):
            if set_logscale:
                axis.set_yscale('log')
            else:
                axis.set_ylim(0, max_ylim)
            axis.set_axisbelow(True)
            axis.grid(True)
            axis.set_ylabel(count_axis_label, fontsize=font_size)
            axis.set_xlim(0, chrom_length)

        ax2.set_xlabel('Basepair position on chromosome ' + chrom,
                       fontsize=font_size)
        ax2.invert_yaxis()  # mirror the second dataset below the first
        ax2.legend(loc='lower left', fontsize=font_size)

        plt.tight_layout()

        if savefig:
            # splitext removes only the extension; str.strip(".bed") would also
            # remove leading and trailing 'b', 'e', 'd' and '.' characters.
            bed_name = os.path.splitext(os.path.basename(bed_files[0]))[0]
            saving_name = os.path.join(
                os.path.dirname(bed_files[0]),
                bed_name + "_compareplot_chrom" + chrom + ".png")
            plt.savefig(saving_name)
            plt.close()
def profile_genome(bed_file=None, variable="transposons", bar_width=None, savefig=False):
    '''Create a bar plot of transposon or read counts along the entire genome.

    The sixteen chromosomes I to XVI are placed one after the other on the
    x-axis, separated by vertical lines. The height of each bar is the summed
    count of all basepair positions that fall in that bin. A narrow strip below
    the bar plot marks every gene, color coded by whether the gene is in the
    known-essential list.

    Input:
        - bed_file: path to the bed file [mandatory]. Column 2 is used as the
          insertion position and, for variable="reads", column 5 is converted
          to a read count as (value - 100) / 20.
        - variable: either "transposons" or "reads".
        - bar_width: number of basepairs per bin (positive integer). Defaults
          to genome_length / 1000, with a minimum of 1.
        - savefig: when true the figure is saved next to the bed file as
          <bed name>_transposonplot_genome.png or <bed name>_readplot_genome.png,
          otherwise it is shown.

    Few basepairs per bin may be slow to plot. Too many basepairs per bin may
    obscure areas with few transposons.

    Required data (read relative to the module level 'file_dirname'): the .gff3
    file (for 'chromosome_position' and 'gene_position') and the two essential
    gene lists (for 'list_known_essentials').

    Raises:
        ValueError: when no bed file is given, when variable is not
        "transposons" or "reads", or when bar_width is smaller than 1.
    '''
    #%% VALIDATE INPUT
    if bed_file is None:
        raise ValueError("bed_file must be the path to a bed file")
    if variable not in ("transposons", "reads"):
        raise ValueError(
            "No valid variable argument given. Use transposons or reads")

    #%% USED FILES
    data_dir = os.path.join(file_dirname, '..', 'data_files')
    gff_file = os.path.join(data_dir,
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    essential_genes_files = [
        os.path.join(data_dir, 'Cerevisiae_EssentialGenes_List_1.txt'),
        os.path.join(data_dir, 'Cerevisiae_EssentialGenes_List_2.txt')
    ]

    chrom_list = [
        'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI',
        'XII', 'XIII', 'XIV', 'XV', 'XVI'
    ]

    #%% GET GENOME LAYOUT
    chr_length_dict = chromosome_position(gff_file)[0]

    # Offset of the first basepair of every chromosome on the genome axis.
    summed_chr_length_dict = {}
    l_genome = 0
    for chrom in chrom_list:
        summed_chr_length_dict[chrom] = l_genome
        l_genome += int(chr_length_dict.get(chrom))
    print('Genome length: ', l_genome)

    if bar_width is None:
        bar_width = max(1, l_genome // 1000)
    else:
        bar_width = int(bar_width)
        if bar_width < 1:
            raise ValueError("bar_width must be a positive integer")

    # Middle of every chromosome, used to position the chromosome labels.
    boundaries = [summed_chr_length_dict[chrom] for chrom in chrom_list]
    boundaries.append(l_genome)
    middle_chr_position = [
        start + (end - start) / 2
        for start, end in zip(boundaries, boundaries[1:])
    ]

    gene_pos_dict = gene_position(gff_file)
    # A set makes the per-gene membership test below O(1).
    genes_essential = set(list_known_essentials(essential_genes_files))

    #%% READ BED FILE
    with open(bed_file) as f:
        lines = f.readlines()

    chrom_names_dict, chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(
        bed_file)

    # Reverse lookup from the chromosome name used in the bed file to the roman
    # numeral, built once instead of searching the dictionary for every line.
    roman_by_bed_name = {}
    for roman, bed_name in chrom_names_dict.items():
        roman_by_bed_name.setdefault(bed_name, roman)

    #%% GET ALL COUNTS PER BASEPAIR
    first_line = chrom_start_index_dict.get("I")
    last_line = chrom_end_index_dict.get("XVI")
    allcounts_list = np.zeros(l_genome)
    for line in lines[first_line:last_line + 1]:
        fields = line.strip('\n').split()
        chrom_name = roman_by_bed_name[fields[0].replace("chr", '')]
        index = summed_chr_length_dict.get(chrom_name) + int(fields[1]) - 1
        if variable == "transposons":
            allcounts_list[index] += 1
        else:
            allcounts_list[index] += (int(fields[4]) - 100) / 20

    #%% BINNING
    # Every position ends up in exactly one bin of 'bar_width' basepairs; the
    # last bin holds the remainder of the genome. The bar positions are taken
    # from the bins themselves, so both arrays always have the same length.
    bin_starts = np.arange(0, l_genome, bar_width)
    allcounts_binnedlist = np.add.reduceat(allcounts_list, bin_starts)

    #%% PLOTTING
    plt.figure(figsize=(19.0, 9.0))
    grid = plt.GridSpec(20, 1, wspace=0.0, hspace=0.0)

    textsize = 12
    textcolor = "#000000"

    ax = plt.subplot(grid[0:19, 0])
    # align='edge' makes each bar cover exactly the basepairs that it sums.
    ax.bar(bin_starts,
           allcounts_binnedlist,
           width=bar_width,
           align='edge',
           color="#333333")
    ax.grid(False)
    ax.set_xlim(0, l_genome)

    # Vertical line at the start of every chromosome.
    for chrom in chrom_list:
        ax.axvline(x=summed_chr_length_dict.get(chrom),
                   linestyle='-',
                   color=(0.9, 0.9, 0.9, 1.0))

    ax.set_xticks(middle_chr_position)
    ax.set_xticklabels(chrom_list, fontsize=textsize)
    ax.tick_params(axis='x', which='major', pad=30)
    if variable == "transposons":
        ax.set_ylabel('Transposon Count', fontsize=textsize, color=textcolor)
    else:
        ax.set_ylabel('Read Count', fontsize=textsize, color=textcolor)

    # Strip below the profile that marks the genes on the genome.
    axc = plt.subplot(grid[19, 0])
    for gene, gene_position_info in gene_pos_dict.items():
        chrom_offset = summed_chr_length_dict.get(gene_position_info[0])
        if chrom_offset is None:
            # Genes on a chromosome that is not plotted, such as 'Mito'.
            continue
        gene_start_pos = chrom_offset + int(gene_position_info[1])
        gene_end_pos = chrom_offset + int(gene_position_info[2])
        gene_color = "#00F28E" if gene in genes_essential else "#F20064"
        axc.axvspan(gene_start_pos,
                    gene_end_pos,
                    facecolor=gene_color,
                    alpha=0.8)

    axc.set_xlim(0, l_genome)
    axc.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,  # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off
    axc.tick_params(
        axis='y',  # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left=False,  # ticks along the left edge are off
        right=False,  # ticks along the right edge are off
        labelleft=False)  # labels along the left edge are off

    #%% SAVE OR SHOW
    if savefig:
        if variable == "transposons":
            suffix = '_transposonplot_genome.png'
        else:
            suffix = '_readplot_genome.png'
        figure_path = os.path.splitext(bed_file)[0] + suffix
        print('saving figure at %s' % figure_path)
        plt.savefig(figure_path, dpi=400)
        plt.close()
    else:
        plt.show()