def profile_plot(bed_file,
                 variable="transposons",
                 chrom='I',
                 bar_width=None,
                 savefig=False):
    '''Create a bar plot of the number of transposons or reads along one chromosome.

    The height of each bar is the sum of the counts of all basepair positions that fall in that bin.
    A narrow strip below the bar plot marks the gene locations; genes that occur in the
    essential gene lists get a different color than all other genes.

    Input:
        - bed_file: path to the bed file
        - variable: either "transposons" or "reads"
        - chrom: roman numeral indicating the chromosome that needs to be plotted
        - bar_width: number of basepairs per bin (integer, values below 1 are raised to 1).
          By default the bar_width is set to length_chromosome/800
        - savefig: when true the figure is saved as .png next to the bed file, otherwise it is shown

    The bar_width determines how many basepairs are put in one bin. Few basepairs per bin may be slow.
    With too many basepairs in one bin, areas with few transposons might be obscured.

    Required files (located relative to 'file_dirname'):
        - a .gff3 file (read by 'chromosome_position' and 'gene_position')
        - two lists with essential genes (read by 'list_known_essentials')

    When 'variable' is not "transposons" or "reads" an error is printed and sys.exit(1) is called.
    A ValueError is raised when 'chrom' is not present in the chromosome lengths taken from the gff file.
    '''
    if variable not in ("transposons", "reads"):
        print(
            "ERROR: No valid variable argument given. Use transposons or reads"
        )
        sys.exit(1)
    count_reads = variable == "reads"

    #%% USED FILES
    data_dir = os.path.join(file_dirname, '..', 'data_files')
    gff_file = os.path.join(data_dir,
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    essential_genes_files = [
        os.path.join(data_dir, 'Cerevisiae_EssentialGenes_List_1.txt'),
        os.path.join(data_dir, 'Cerevisiae_EssentialGenes_List_2.txt')
    ]

    #%% GET CHROMOSOME LENGTH
    chr_length_dict, _, _ = chromosome_position(gff_file)
    chrom_length = chr_length_dict.get(chrom)
    print('Chromosome length: ', chrom_length)
    if chrom_length is None:
        raise ValueError("Chromosome %s not found in %s" % (chrom, gff_file))

    # A bin has to contain at least one basepair; without the lower bound a
    # chromosome shorter than 800 bp would give a bar_width of zero.
    if bar_width is None:
        bar_width = int(chrom_length / 800)
    bar_width = max(1, int(bar_width))

    #%% GET ALL GENES IN CURRENT CHROMOSOME
    gene_pos_dict = gene_position(gff_file)
    genes_currentchrom_pos_list = [
        k for k, v in gene_pos_dict.items() if chrom in v
    ]
    genes_essential_list = list_known_essentials(essential_genes_files)

    #%% READ BED FILE
    with open(bed_file) as f:
        lines = f.readlines()

    #%% GET THE LINE INDICES OF EACH CHROMOSOME IN THE BED FILE
    chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(
        bed_file)[1:3]
    first_line = chrom_start_index_dict.get(chrom)
    last_line = chrom_end_index_dict.get(chrom)

    #%% GET ALL COUNTS PER BASEPAIR
    allcounts_list = np.zeros(chrom_length + 1)
    for line in lines[first_line:last_line + 1]:
        fields = line.split()
        position = int(fields[1]) - 1
        if count_reads:
            allcounts_list[position] += (int(fields[4]) - 100) / 20
        else:
            allcounts_list[position] += 1

    #%% BINNING OF THE COUNTS
    #THE ARRAY WITH THE COUNTS FOR EACH BASEPAIR IS TYPICALLY REALLY LARGE AND SLOW TO PLOT.
    #THE BASEPAIR POSITIONS ARE THEREFORE GROUPED IN BINS OF 'BAR_WIDTH' BASEPAIRS AND THE COUNTS IN EACH BIN ARE SUMMED.
    #EVERY POSITION ENDS UP IN EXACTLY ONE BIN; THE LAST BIN MAY BE SHORTER THAN BAR_WIDTH.
    bin_starts = np.arange(0, len(allcounts_list), bar_width)
    allcounts_binnedlist = np.add.reduceat(allcounts_list, bin_starts)
    # One x-position per bin, so the arrays given to ax.bar always have equal length.
    allinsertionsites_list = np.linspace(0, chrom_length,
                                         len(allcounts_binnedlist))

    #%% PLOTTING
    print('Plotting chromosome ', chrom, '...')
    print('bar width for plotting is ', bar_width)

    textsize = 18
    textcolor = "#000000"

    plt.figure(figsize=(19, 9))
    grid = plt.GridSpec(20, 1, wspace=0.0, hspace=0.0)

    # Upper 19/20 of the figure: the binned counts.
    ax = plt.subplot(grid[0:19, 0])
    ax.bar(allinsertionsites_list,
           allcounts_binnedlist,
           width=bar_width,
           color="#000000")
    ax.tick_params(axis='both', which='major', labelsize=textsize)
    ax.set_axisbelow(True)
    ax.grid(True)
    ax.set_xlim(0, chrom_length)
    ax.tick_params(axis='x', which='major', pad=30)
    ax.ticklabel_format(axis='x', style='sci', scilimits=(0, 0))
    ax.xaxis.get_offset_text().set_fontsize(textsize)
    ax.set_xlabel("Basepair position on chromosome " + chrom,
                  fontsize=textsize,
                  color=textcolor,
                  labelpad=10)
    ax.set_ylabel('Read count' if count_reads else 'Transposon count',
                  fontsize=textsize,
                  color=textcolor,
                  labelpad=25)

    # Lower 1/20 of the figure: colored strip with the gene locations.
    axc = plt.subplot(grid[19, 0])
    for gene in genes_currentchrom_pos_list:
        gene_info = gene_pos_dict.get(gene)
        if gene in genes_essential_list:
            gene_color = "#00F28E"
        else:
            gene_color = "#F20064"
        axc.axvspan(int(gene_info[1]),
                    int(gene_info[2]),
                    facecolor=gene_color,
                    alpha=0.8)
    axc.set_xlim(0, chrom_length)
    axc.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,  # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off

    axc.tick_params(
        axis='y',  # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left=False,  # ticks along the left edge are off
        right=False,  # ticks along the right edge are off
        labelleft=False)  # labels along the left edge are off

    #%% SAVE OR SHOW
    if savefig:
        if count_reads:
            name_extension = '_readplot_chrom'
        else:
            name_extension = '_transposonplot_chrom'
        figure_path = os.path.splitext(
            bed_file)[0] + name_extension + chrom + '.png'
        print('saving figure at %s' % figure_path)
        plt.savefig(figure_path, dpi=400)
        plt.close()
    else:
        plt.show()
def scatterplot(pergenefile, essential_genes_files=None):
    '''
    Create a scatterplot of the number of reads per insertion per gene combined with a histogram.
    The genes are sorted by the number of reads per insertion and are color coded based on whether
    the gene is present in the lists of known essential genes.

    Input:
        - pergenefile: path to _pergene.txt (after a header line, each line contains a gene name
          followed by its number of insertions and its number of reads, separated by spaces or tabs)
        - essential_genes_files: optional list of paths to the files with essential genes that is
          passed to 'list_known_essentials'. By default Cerevisiae_EssentialGenes_List_1.txt and
          Cerevisiae_EssentialGenes_List_2.txt in the data_files directory next to 'file_dirname' are used.

    Output:
        - dataframe sorted by Nreadsperinsrt with the columns gene_names, gene_essentiality,
          tn_per_gene, read_per_gene and Nreadsperinsrt.

    Lines that do not consist of exactly three fields are skipped. The dataframe still has one row
    for every line in the file; the rows that are left over at the end contain NaN.
    '''

    #%% read file
    assert os.path.isfile(pergenefile), 'File not found at: %s' % pergenefile

    with open(pergenefile) as f:
        lines = f.readlines()[1:]  #skip header

    genenames_list = [np.nan] * len(lines)
    tnpergene_list = [np.nan] * len(lines)
    readpergene_list = [np.nan] * len(lines)

    line_counter = 0
    for line in lines:
        fields = [x for x in re.split(' |\t', line.strip('\n')) if x]

        if len(fields) == 3:
            genenames_list[line_counter] = fields[0]
            tnpergene_list[line_counter] = int(fields[1])
            readpergene_list[line_counter] = int(fields[2])

            line_counter += 1

    #%% determine number of reads per insertion per gene
    # Genes without insertions get 0 to prevent a division by zero.
    readperinspergene_list = [
        0 if tn_count == 0 else read_count / tn_count
        for tn_count, read_count in zip(tnpergene_list, readpergene_list)
    ]

    #%% determine essential genes
    if essential_genes_files is None:
        # Same location of the data files as used by the other plotting functions in this file.
        essential_genes_files = [
            os.path.join(file_dirname, '..', 'data_files',
                         'Cerevisiae_EssentialGenes_List_1.txt'),
            os.path.join(file_dirname, '..', 'data_files',
                         'Cerevisiae_EssentialGenes_List_2.txt')
        ]

    # A set makes the membership test below independent of the number of essential genes.
    known_essential_genes = set(
        list_known_essentials(input_files=essential_genes_files))

    geneessentiality_list = [
        gene_name in known_essential_genes for gene_name in genenames_list
    ]

    #%% create dataframe
    read_gene_dict = {
        "gene_names": genenames_list,
        "gene_essentiality": geneessentiality_list,
        "tn_per_gene": tnpergene_list,
        "read_per_gene": readpergene_list,
        "Nreadsperinsrt": readperinspergene_list
    }

    read_gene_df = pd.DataFrame(read_gene_dict, columns=list(read_gene_dict))

    #%% sort values
    read_gene_df = read_gene_df.sort_values(by=["Nreadsperinsrt"])

    # The x-axis is just the rank of the gene after sorting.
    x_lin = np.linspace(0, len(read_gene_df) - 1, len(read_gene_df))

    #%% plotting
    plt.figure(figsize=(19, 9))
    grid = plt.GridSpec(1, 20, wspace=0.0, hspace=0.0)

    # Left panel: reads per insertion for each gene.
    ax1 = plt.subplot(grid[0, 0:15])
    colorpalette = sns.diverging_palette(
        10, 170, s=90, l=50, n=2
    )  #https://seaborn.pydata.org/generated/seaborn.diverging_palette.html#seaborn.diverging_palette
    sns.scatterplot(x=x_lin,
                    y=read_gene_df.Nreadsperinsrt,
                    hue=read_gene_df.gene_essentiality,
                    palette=colorpalette,
                    alpha=0.5,
                    marker='|',
                    legend=True,
                    ax=ax1)
    ax1.grid(linestyle='-', alpha=0.8)
    ax1.set_xlim(-1, len(read_gene_df))
    ax1.set_ylim(-1, 100)
    ax1.set_xticklabels([])
    ax1.set_ylabel('Reads per insertion')
    ax1.set_xlabel('Genes')

    # Right panel: histogram of the same values, sharing the y-range of the left panel.
    ax2 = plt.subplot(grid[0, 15:19])
    colorpalette = sns.diverging_palette(170, 10, s=90, l=50, n=2)
    sns.histplot(data=read_gene_df,
                 y="Nreadsperinsrt",
                 hue="gene_essentiality",
                 hue_order=[True, False],
                 palette=colorpalette,
                 alpha=0.5,
                 binwidth=1,
                 ax=ax2)
    histogram_legend = ax2.get_legend()
    if histogram_legend is not None:
        histogram_legend.remove()
    ax2.set_ylim(-1, 100)
    ax2.set_yticklabels([])
    ax2.set_ylabel('')
    ax2.grid(linestyle='-', alpha=0.8)

    return (read_gene_df)
# Example #3
# 0
def dataframe_from_pergenefile(pergenefile, verbose=True):
    '''
    Create a dataframe with the information from a pergene.txt file.

    Input:
        - pergenefile: path to a pergene.txt file (after a header line, each line contains a gene name
          followed by its number of insertions and its number of reads, separated by spaces or tabs)
        - verbose: passed on to 'list_known_essentials'

    Output is a dataframe where each row is a single gene and with the following columns:
        - gene_names
        - gene_essentiality
        - tn_per_gene
        - read_per_gene
        - Nreadsperinsrt

    The gene_essentiality is created based on the genes present in the Cerevisiae_EssentialGenes_List_1.txt and Cerevisiae_EssentialGenes_List_2.txt files.
    These files are searched in '../../data_files' and then in '../data_files'; a FileNotFoundError is raised when they are in neither.
    The number of reads per insertion (Nreadsperinsrt) is determined by dividing the read_per_gene column by the tn_per_gene column (0 for genes without insertions).

    Lines that do not consist of exactly three fields are skipped. The dataframe still has one row
    for every line in the file; the rows that are left over at the end contain NaN.

    A more extensive version of this function is the python script genomicfeatures_dataframe.py found in the python_scripts folder.
    '''

    # NOTE(review): '__file__' is a string literal here, so abspath resolves it against the
    # current working directory and not against the location of this module - confirm this is intended.
    file_dirname = os.path.dirname(os.path.abspath('__file__'))
    modules_dir = os.path.join(file_dirname, 'python_modules')
    if modules_dir not in sys.path:  # do not add the same entry again on every call
        sys.path.insert(1, modules_dir)
    from essential_genes_names import list_known_essentials  #import essential_genes_names from python modules directory

    # read file
    assert os.path.isfile(pergenefile), 'File not found at: %s' % pergenefile

    with open(pergenefile) as f:
        lines = f.readlines()[1:]  #skip header

    genenames_list = [np.nan] * len(lines)
    tnpergene_list = [np.nan] * len(lines)
    readpergene_list = [np.nan] * len(lines)

    line_counter = 0
    for line in lines:
        fields = [x for x in re.split(' |\t', line.strip('\n')) if x]

        if len(fields) == 3:
            genenames_list[line_counter] = fields[0]
            tnpergene_list[line_counter] = int(fields[1])
            readpergene_list[line_counter] = int(fields[2])

            line_counter += 1

    # determine number of reads per insertion per gene
    # Genes without insertions get 0 to prevent a division by zero.
    readperinspergene_list = [
        0 if tn_count == 0 else read_count / tn_count
        for tn_count, read_count in zip(tnpergene_list, readpergene_list)
    ]

    # determine essential genes
    essential_file_names = ('Cerevisiae_EssentialGenes_List_1.txt',
                            'Cerevisiae_EssentialGenes_List_2.txt')
    candidate_data_dirs = (
        os.path.join(file_dirname, '..', '..', 'data_files'),
        os.path.join(file_dirname, '..', 'data_files'),
    )
    for data_dir in candidate_data_dirs:
        # The presence of the first list decides which directory is used.
        if os.path.isfile(os.path.join(data_dir, essential_file_names[0])):
            break
    else:
        raise FileNotFoundError(
            '%s not found in any of: %s' %
            (essential_file_names[0], ', '.join(candidate_data_dirs)))

    # A set makes the membership test below independent of the number of essential genes.
    known_essential_genes = set(
        list_known_essentials(input_files=[
            os.path.join(data_dir, file_name)
            for file_name in essential_file_names
        ],
                              verbose=verbose))

    geneessentiality_list = [
        gene_name in known_essential_genes for gene_name in genenames_list
    ]

    # create dataframe
    read_gene_dict = {
        "gene_names": genenames_list,
        "gene_essentiality": geneessentiality_list,
        "tn_per_gene": tnpergene_list,
        "read_per_gene": readpergene_list,
        "Nreadsperinsrt": readperinspergene_list
    }

    read_gene_df = pd.DataFrame(read_gene_dict, columns=list(read_gene_dict))

    return (read_gene_df)
# Example #4
# 0
def compareplot(bed_files=None,
                variable="insertions",
                chromosome=None,
                set_barwidth=None,
                set_logscale=False,
                savefig=False):
    '''Create, for each requested chromosome, a figure that compares the counts of two bed-files.

    The figure shows two bar plots along the chromosome: the top one represents the first bed-file
    in the list, the bottom one (drawn upside down) the second bed-file. In the top plot the bins where
    the first dataset is larger are overlaid with the difference (set1-set2), in the bottom plot the bins
    where the second dataset is larger are overlaid with the difference.

    The input is as follows:
        - bed_files: list with the paths of the two bed-files to compare [mandatory]. Only the first two entries are used.
        - variable: either "insertions" or "reads"
        - chromosome: roman numeral (string) or list of roman numerals. When not given, all chromosomes
          returned by 'chromosomename_roman_to_arabic' are plotted.
        - set_barwidth: number of basepairs per bin (integer, values below 1 are raised to 1).
          By default length_chromosome/500
        - set_logscale: use a logarithmic y-axis when true
        - savefig: when true each figure is saved in the directory of the first bed-file as
          <name of first bed-file>_compareplot_chrom<chromosome>.png and closed afterwards

    The bar width determines how many basepairs are put in one bin. Few basepairs per bin may be slow.
    With too many basepairs in one bin, areas with few transposons might be obscured.

    The background of the graphs is color coded to indicate the gene locations (green for genes in the
    essential gene lists, red for all other genes). The essential genes are labeled in the top plot.
    For this a list of essential genes (used in 'list_known_essentials'), a .gff file (used in
    'chromosome_position' and 'gene_position') and a list of gene aliases (used in 'gene_aliases') are needed.

    A ValueError is raised when less than two bed-files are given or when 'variable' is not valid.
    '''
    #%% CHECK INPUT
    if bed_files is None or len(bed_files) < 2:
        raise ValueError(
            "bed_files must be a list containing the paths of the two bed-files to compare"
        )
    if variable not in ("insertions", "reads"):
        raise ValueError(
            "No valid variable argument given. Use insertions or reads")
    count_reads = variable == "reads"

    #%% USED FILES
    data_dir = os.path.join(file_dirname, '..', 'data_files')
    gff_file = os.path.join(data_dir,
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    essential_genes_files = [
        os.path.join(data_dir, 'Cerevisiae_EssentialGenes_List_1.txt'),
        os.path.join(data_dir, 'Cerevisiae_EssentialGenes_List_2.txt')
    ]
    gene_information_file = os.path.join(data_dir, 'Yeast_Protein_Names.txt')

    #%% GET CHROMOSOME LENGTHS AND POSITIONS
    chr_length_dict, _, _ = chromosome_position(gff_file)

    #%% GET GENE INFORMATION
    gene_pos_dict = gene_position(gff_file)
    genes_essential_list = list_known_essentials(essential_genes_files,
                                                 verbose=False)
    gene_alias_list = gene_aliases(gene_information_file)[0]

    #%% DETERMINE WHICH CHROMOSOMES NEED TO BE ANALYZED
    if isinstance(chromosome, (list, tuple)):
        chrom_list = list(chromosome)
    elif isinstance(chromosome, str):
        chrom_list = [chromosome.upper()]
    else:
        chrom_list = list(chromosomename_roman_to_arabic()[1])

    #%% READ BED FILES
    # The files do not change between chromosomes, so they are read only once.
    bed_data = []
    for bed_file in bed_files[:2]:
        with open(bed_file) as f:
            lines = f.readlines()
        chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(
            bed_file)[1:3]
        bed_data.append(
            (bed_file, lines, chrom_start_index_dict, chrom_end_index_dict))

    y_label = 'Absolute read count' if count_reads else 'Absolute insertion count'
    bar_label = 'Number of reads' if count_reads else 'Number of transposons'
    font_size = 12

    #%% LOOP OVER THE CHROMOSOMES
    for chrom in chrom_list:
        print('')
        print('Analyzing chromosome: ', chrom)
        chrom_length = chr_length_dict.get(chrom)

        # (gene name, start, end, essential or not) for all genes on this chromosome
        gene_spans = []
        for gene, gene_info in gene_pos_dict.items():
            if chrom in gene_info:
                gene_spans.append((gene, int(gene_info[1]), int(gene_info[2]),
                                   gene in genes_essential_list))

        # A bin has to contain at least one basepair; without the lower bound a
        # chromosome shorter than 500 bp would give a bar_width of zero.
        if set_barwidth is None:
            bar_width = int(chrom_length / 500)
        else:
            bar_width = int(set_barwidth)
        bar_width = max(1, bar_width)

        #%% GET ALL COUNTS AND BIN THEM
        allinsertionsites_allfiles_list = []
        alltransposoncounts_allfiles_binnedlist = []
        for bed_file, lines, chrom_start_index_dict, chrom_end_index_dict in bed_data:
            print("Processing file: %s" % bed_file)

            allcounts_list = np.zeros(chrom_length + 2)
            first_line = chrom_start_index_dict.get(chrom)
            last_line = chrom_end_index_dict.get(chrom)
            for line in lines[first_line:last_line + 1]:
                fields = line.split()
                if count_reads:
                    # NOTE(review): the raw value of the fifth column is summed here, whereas
                    # profile_plot converts it with (value - 100) / 20 - confirm which is intended.
                    allcounts_list[int(fields[1])] += int(fields[4])
                else:
                    allcounts_list[int(fields[1])] += 1

            # Sum the counts in bins of bar_width basepairs. Every position ends up in exactly
            # one bin; the last bin may be shorter than bar_width.
            bin_starts = np.arange(0, len(allcounts_list), bar_width)
            allcounts_binnedlist = np.add.reduceat(allcounts_list, bin_starts)
            # One x-position per bin, so the arrays given to bar() always have equal length.
            allinsertionsites_list = np.linspace(0, chrom_length,
                                                 len(allcounts_binnedlist))

            allinsertionsites_allfiles_list.append(allinsertionsites_list)
            alltransposoncounts_allfiles_binnedlist.append(
                allcounts_binnedlist)

        #%% DETERMINE DIFFERENCE BETWEEN DATASETS
        difference = (alltransposoncounts_allfiles_binnedlist[0] -
                      alltransposoncounts_allfiles_binnedlist[1])
        transposoncounts_positivedifference_list = np.where(
            difference >= 0, difference, 0)
        transposoncounts_negativedifference_list = np.where(
            difference < 0, -difference, 0)

        #%% PLOTTING
        print('Plotting chromosome ', chrom, '...')
        print('bar width for plotting is ', bar_width)

        #GET MAXIMUM VALUE FOR SETTING THE Y AXIS LIMIT EQUAL FOR BOTH GRAPHS
        max_ylim = max(counts.max()
                       for counts in alltransposoncounts_allfiles_binnedlist)
        max_ylim = max_ylim + 0.1 * max_ylim

        plt.figure(figsize=(19, 9))
        grid = plt.GridSpec(2, 1, wspace=0.0, hspace=0.0)

        # Top graph: first bed-file.
        ax1 = plt.subplot(grid[0, 0])
        for gene, gene_start_pos, gene_end_pos, is_essential in gene_spans:
            ax1.axvspan(gene_start_pos,
                        gene_end_pos,
                        facecolor='g' if is_essential else 'r',
                        alpha=0.3)
            if is_essential:
                # Fall back on the gene name itself when no alias is known.
                aliases = gene_alias_list.get(gene)
                ax1.text(gene_start_pos,
                         max_ylim,
                         aliases[0] if aliases else gene,
                         rotation=45)

        ax1.bar(allinsertionsites_allfiles_list[0],
                alltransposoncounts_allfiles_binnedlist[0],
                width=bar_width,
                color=(0.2, 0.2, 0.2, 0.8))
        ax1.bar(allinsertionsites_allfiles_list[0],
                transposoncounts_positivedifference_list,
                width=bar_width,
                color=(0.52, 0.71, 0.90, 0.8))

        if set_logscale:
            ax1.set_yscale('log')
        else:
            ax1.set_ylim(0, max_ylim)
        ax1.set_axisbelow(True)
        ax1.grid(True)
        ax1.set_ylabel(y_label, fontsize=font_size)
        ax1.set_xlim(0, chrom_length)

        # Bottom graph: second bed-file, drawn upside down.
        ax2 = plt.subplot(grid[1, 0])
        for gene, gene_start_pos, gene_end_pos, is_essential in gene_spans:
            ax2.axvspan(gene_start_pos,
                        gene_end_pos,
                        facecolor='g' if is_essential else 'r',
                        alpha=0.3)

        ax2.bar(allinsertionsites_allfiles_list[1],
                alltransposoncounts_allfiles_binnedlist[1],
                width=bar_width,
                color=(0.2, 0.2, 0.2, 0.8),
                label=bar_label)
        ax2.bar(allinsertionsites_allfiles_list[1],
                transposoncounts_negativedifference_list,
                width=bar_width,
                color=(0.52, 0.71, 0.90, 0.8),
                label='Absolute difference datasets (set1-set2)')

        if set_logscale:
            ax2.set_yscale('log')
        else:
            ax2.set_ylim(0, max_ylim)
        ax2.set_axisbelow(True)
        ax2.grid(True)
        ax2.set_ylabel(y_label, fontsize=font_size)
        ax2.set_xlabel('Basepair position on chromosome ' + chrom,
                       fontsize=font_size)
        ax2.set_xlim(0, chrom_length)
        ax2.invert_yaxis()
        ax2.legend(loc='lower left', fontsize=font_size)

        plt.tight_layout()

        if savefig:
            # Only remove the '.bed' extension. str.strip(".bed") would remove all leading and
            # trailing '.', 'b', 'e' and 'd' characters from the file name.
            figure_basename = os.path.basename(bed_files[0])
            if figure_basename.endswith(".bed"):
                figure_basename = figure_basename[:-len(".bed")]
            saving_name = os.path.join(
                os.path.dirname(bed_files[0]),
                figure_basename + "_compareplot_chrom" + chrom + ".png")
            plt.savefig(saving_name)
            plt.close()
def _bin_genome_counts(counts, bar_width):
    '''Sum per-basepair counts into consecutive bins of about bar_width basepairs.

    A new bin is opened at every index n for which int(n % bar_width) == 0.
    The count found at a bin-opening index belongs to the bin it opens.

    Returns a 1-D numpy array holding a leading 0 followed by one sum per bin.
    The leading 0 keeps the number of bars (and therefore the bar positions
    derived from it) the same as in earlier versions of the plot.
    '''
    counts = np.asarray(counts)
    if counts.size == 0:
        return np.zeros(1)

    positions = np.arange(counts.size)
    bin_starts = np.flatnonzero((positions % bar_width).astype(int) == 0)
    # reduceat sums counts[start_i:start_(i+1)]; the last bin runs to the end.
    return np.concatenate(([0.0], np.add.reduceat(counts, bin_starts)))


def profile_genome(bed_file=None,
                   variable="transposons",
                   bar_width=None,
                   savefig=False):
    '''This function creates a bar plot along the entire genome.
    The height of each bar represents the number of transposons or reads at the genomic position indicated on the x-axis.
    The input is as follows:
        - bed_file: path to the bed file (required, despite the None default)
        - variable: either 'transposons' or 'reads'
        - bar_width: number of basepairs per bin. By default this is set to length_genome/1000
        - savefig: whether to save the figure next to the bed file (True) or show it (False)

    The bar_width determines how many basepairs are put in one bin. Too many basepairs in one bin and possible low transposon areas might be obscured.
    The strip below the bar plot is color coded per gene: one color for genes found in the essential genes lists, another for all other genes. Genes on 'Mito' are skipped.
    For this a list for essential genes is needed (used in 'list_known_essentials' function) and a .gff file is required (used in 'chromosome_position' and 'gene_position').

    For variable='reads' the read count of an insertion is taken from the fifth column of the bed file as (value - 100) / 20.

    Raises:
        - TypeError when bed_file is None
        - ValueError when variable is not 'transposons' or 'reads', or when bar_width is not positive
    '''
    # Fail fast, before the gff file is parsed. open(None) would raise a
    # TypeError further down anyway, so the exception type is unchanged.
    if bed_file is None:
        raise TypeError("profile_genome() requires the path to a bed file, "
                        "but bed_file is None")
    if variable not in ("transposons", "reads"):
        raise ValueError("variable must be 'transposons' or 'reads', got %r" %
                         (variable, ))
    if bar_width is not None and bar_width <= 0:
        raise ValueError("bar_width must be a positive number, got %r" %
                         (bar_width, ))

    #%% USED FILES
    gff_file = os.path.join(file_dirname, '..', 'data_files',
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    essential_genes_files = [
        os.path.join(file_dirname, '..', 'data_files',
                     'Cerevisiae_EssentialGenes_List_1.txt'),
        os.path.join(file_dirname, '..', 'data_files',
                     'Cerevisiae_EssentialGenes_List_2.txt')
    ]

    chrom_list = [
        'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI',
        'XII', 'XIII', 'XIV', 'XV', 'XVI'
    ]

    #%% GENOME LAYOUT
    chr_length_dict, _, _ = chromosome_position(gff_file)

    # Offset of the first basepair of every chromosome on the genome-wide axis.
    summed_chr_length_dict = {}
    summed_chr_length = 0
    for c in chrom_list:
        summed_chr_length_dict[c] = summed_chr_length
        summed_chr_length += chr_length_dict.get(c)

    l_genome = sum(int(chr_length_dict.get(c)) for c in chrom_list)
    print('Genome length: ', l_genome)
    if bar_width is None:
        bar_width = l_genome / 1000

    # Tick positions: the middle of every chromosome on the genome-wide axis.
    chrom_starts = [summed_chr_length_dict.get(c) for c in chrom_list]
    chrom_ends = chrom_starts[1:] + [l_genome]
    middle_chr_position = [
        start + (end - start) / 2
        for start, end in zip(chrom_starts, chrom_ends)
    ]

    gene_pos_dict = gene_position(gff_file)
    # A set makes the per-gene membership test below O(1).
    genes_essential = set(list_known_essentials(essential_genes_files))

    #%% READ BED FILE
    with open(bed_file) as f:
        lines = f.readlines()

    chrom_names_dict, chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(
        bed_file)

    # Reverse lookup from the chromosome name used in the bed file to the
    # roman numeral. setdefault keeps the first roman numeral for a given name.
    bed_name_to_chrom = {}
    for roman, bed_name in chrom_names_dict.items():
        bed_name_to_chrom.setdefault(bed_name, roman)

    count_reads = variable == "reads"
    first_line = chrom_start_index_dict.get("I")
    last_line = chrom_end_index_dict.get("XVI") + 1

    allcounts_list = np.zeros(l_genome)
    for line in lines[first_line:last_line]:
        fields = line.strip('\n').split()
        chrom_name = bed_name_to_chrom[fields[0].replace("chr", '')]
        # NOTE(review): a bed position of 0 on chromosome I gives index -1,
        # which wraps to the last basepair of the genome - confirm that the
        # positions in the bed file are never 0.
        position = summed_chr_length_dict.get(chrom_name) + int(fields[1]) - 1
        if count_reads:
            allcounts_list[position] += (int(fields[4]) - 100) / 20
        else:
            allcounts_list[position] += 1

    #%% BINNING
    allcounts_binnedlist = _bin_genome_counts(allcounts_list, bar_width)

    # One x position per bar. Taking the number of positions from the number
    # of bins keeps both arrays the same length for every bar_width.
    allinsertionsites_list = np.linspace(0, l_genome,
                                         len(allcounts_binnedlist))

    #%% PLOTTING
    plt.figure(figsize=(19.0, 9.0))
    grid = plt.GridSpec(20, 1, wspace=0.0, hspace=0.0)

    textsize = 12
    textcolor = "#000000"

    ax = plt.subplot(grid[0:19, 0])
    ax.bar(allinsertionsites_list,
           allcounts_binnedlist,
           width=bar_width,
           color="#333333")
    ax.grid(False)
    ax.set_xlim(0, l_genome)

    # Light vertical line at the start of every chromosome.
    for chrom_start in summed_chr_length_dict.values():
        ax.axvline(x=chrom_start, linestyle='-', color=(0.9, 0.9, 0.9, 1.0))

    ax.set_xticks(middle_chr_position)
    ax.set_xticklabels(chrom_list, fontsize=textsize)
    ax.tick_params(axis='x', which='major', pad=30)
    ax.set_ylabel('Read Count' if count_reads else 'Transposon Count',
                  fontsize=textsize,
                  color=textcolor)

    # Thin strip below the bar plot that marks the position of every gene.
    axc = plt.subplot(grid[19, 0])
    for gene, gene_info in gene_pos_dict.items():
        if gene_info[0] == 'Mito':
            continue
        chrom_offset = summed_chr_length_dict.get(gene_info[0])
        gene_start_pos = chrom_offset + int(gene_info[1])
        gene_end_pos = chrom_offset + int(gene_info[2])
        gene_color = "#00F28E" if gene in genes_essential else "#F20064"
        axc.axvspan(gene_start_pos,
                    gene_end_pos,
                    facecolor=gene_color,
                    alpha=0.8)
    axc.set_xlim(0, l_genome)
    axc.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,  # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off

    axc.tick_params(
        axis='y',  # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left=False,  # ticks along the left edge are off
        right=False,  # ticks along the right edge are off
        labelleft=False)  # labels along the left edge are off

    #%% SAVE OR SHOW
    if savefig:
        if count_reads:
            suffix = '_readplot_genome.png'
        else:
            suffix = '_transposonplot_genome.png'
        saving_name = os.path.splitext(bed_file)[0] + suffix
        print('saving figure at %s' % saving_name)
        plt.savefig(saving_name, dpi=400)
        plt.close()
    else:
        plt.show()