示例#1
0
def create_essentialgenes_list(inputfiles_list=None,
                               gene_information_file=None):
    '''
    Combine several essential-gene lists into one file of unique gene names.

    Input:
        - inputfiles_list: list of paths to files containing essential genes
          (a single path given as a string is accepted as well).
          Each input file has to have the following layout:
            - Three header lines (can be empty or containing any text)
            - Each next line contains one gene name in either oln or designation naming convention.
        - gene_information_file: optional path to Yeast_Protein_Names.txt. When omitted,
          the original hard-coded location is used.

    This function depends on the following custom made module:
        - gene_names.py (provides gene_aliases, requires the file Yeast_Protein_Names.txt)

    Output:
        A text file containing all uniquely found genes in all input files given.
        The file is stored at the same location as the first file of the input list with
        the name 'Cerevisiae_AllEssentialGenes_List.txt'.
        In this file each line contains one gene and it has a single header line containing
        all the filenames that were used to create this file.
        Names that are neither a key of the alias dictionary nor listed among the aliases
        of a key are left out; a warning with the number of such names is printed.

    Raises:
        ValueError: when no input paths are given (None or an empty list).
    '''

    if inputfiles_list is None or len(inputfiles_list) == 0:
        raise ValueError('Input list containing one or more paths is missing.')

    # A bare string would otherwise be iterated character by character.
    if isinstance(inputfiles_list, str):
        files = [inputfiles_list]
    else:
        files = list(inputfiles_list)

    if gene_information_file is None:
        gene_information_file = r"C:\Users\gregoryvanbeek\Documents\GitHub\LaanLab-SATAY-DataAnalysis\Python_scripts\Data_Files\Yeast_Protein_Names.txt"

    path = os.path.dirname(files[0])
    filename_list = [os.path.basename(file) for file in files]

    #%% READ ALL GENE NAMES (ASSUMES THREE HEADER LINES PER FILE)
    all_genes_list = []
    for file in files:
        with open(file) as f:
            lines = f.readlines()

        # Strip all surrounding whitespace (also '\r' and trailing spaces) and skip
        # blank lines, so that names are not silently lost during the alias lookup.
        genes_in_file = [line.strip() for line in lines[3:] if line.strip()]
        print('Number of genes found in %s: %i' %
              (os.path.basename(file), len(genes_in_file)))
        all_genes_list.extend(genes_in_file)

    #%% TRANSLATE ALL NAMES TO THE KEYS OF THE ALIAS DICTIONARY
    gene_aliases_dict = gene_aliases(gene_information_file)[0]

    unique_genes = set()
    unresolved_genes = []
    for gene in set(all_genes_list):  #EACH DISTINCT NAME ONLY NEEDS ONE LOOKUP
        if gene in gene_aliases_dict:
            unique_genes.add(gene)
            continue
        # Not a key itself: use the first key that lists this name as an alias.
        for key, val in gene_aliases_dict.items():
            if gene in val:
                unique_genes.add(key)
                break
        else:
            unresolved_genes.append(gene)

    if unresolved_genes:
        print('WARNING: %i gene name(s) not found in %s and therefore skipped.' %
              (len(unresolved_genes), gene_information_file))

    #%% SORT
    unique_genes_list = sorted(unique_genes)

    print('Number of unique essential genes found : %i' %
          len(unique_genes_list))

    #%% WRITE OUTPUT FILE
    save_filename = r'Cerevisiae_AllEssentialGenes_List.txt'
    save_file = os.path.join(path, save_filename)

    print('Creating text file with all unique genes at %s' % save_file)

    with open(save_file, 'w') as f:
        f.write('All essential genes found in lists:' + str(filename_list) +
                '\n')
        for gene in unique_genes_list:
            f.write(gene + '\n')
def gene_reads(gene_name=None, region=None, bed_file=None, savefigure=False):
    '''Make a profile plot of the number of reads per transposon for a specific genomic region.

    Input:
        - gene_name: name of a gene (e.g. 'bem1'); 'holocus' or 'ho-locus' selects the HO-locus.
          When a gene name is given it takes precedence over region and its position is
          searched in a .gff file (downloaded from yeastgenome.org), if needed via its aliases.
        - region: list of three elements, the chromosome name, the start and the end
          position respectively (e.g. ['I',1,4000]). Only used when gene_name is None.
        - bed_file: path to the .bed file from the output of the Matlab code from the Kornmann-lab.
        - savefigure: when true, the figure is stored in the hard-coded figure folder.

    Output:
        A figure with a bar plot of the number of reads divided by the number of transposons
        (binned) and a violin plot comparing the distances between insertions in the region
        with those in the whole chromosome. Nothing is returned.

    Raises:
        ValueError: when neither gene_name nor region is given, or when no position can be
        found for gene_name.
    '''
    #%% USED FILES
    gff_file = r"X:\tnw\BN\LL\Shared\Gregory\Gene_Database\Saccharomyces_cerevisiae.R64-1-1.99.gff3"
    gene_information_file = r'X:\tnw\BN\LL\Shared\Gregory\Gene_Database\Yeast_Protein_Names.txt'
    #%% SAVE FILES
    save_figure_path = r'X:\tnw\BN\LL\Shared\Gregory\Python\Python Figures\gene_reads_figures'  #ONLY USED WHEN SAVEFIGURE IS SET

    #%% GET START AND END POSITION OF GENE
    if gene_name is None and region is None:
        raise ValueError('Either gene_name or region has to be specified.')

    gene_orien = None  #ONLY KNOWN FOR GENES THAT ARE LOOKED UP IN THE GFF FILE
    if gene_name is not None and gene_name.upper() in ('HOLOCUS', 'HO-LOCUS'):
        gene_pos = ['IV', 46271, 48031]
        gene_name = 'HOlocus'

    elif gene_name is not None:
        gene_pos_dict = gene_position(
            gff_file)  #GET POSITION INFORMATION OF ALL GENES

        gene_name = gene_name.upper()  #CAPITALIZE GENE NAME
        gene_pos = gene_pos_dict.get(gene_name)
        if gene_pos is None:  #GENE_NAME DOES NOT EXIST IN GENE_POS_DICT, CHECK IF ANY OF THE ALIASES EXISTS
            gene_alias_dict = gene_aliases(gene_information_file)[0]
            gene_alias_key = [
                k for k, v in gene_alias_dict.items() if gene_name in v
            ]
            if not gene_alias_key:
                raise ValueError('Gene name %s is not found.' % gene_name)
            print('gene_alias_key ', gene_alias_key[0])

            gene_pos = gene_pos_dict.get(gene_alias_key[0])
            if gene_pos is None:  #KEY DOES ALSO NOT EXIST IN GENE_POS_DICT, CHECK THE OTHER ALIASES OF GENE_NAME
                for gene_alias in gene_alias_dict.get(gene_alias_key[0]):
                    if gene_pos_dict.get(gene_alias) is not None:
                        gene_pos = gene_pos_dict.get(gene_alias)
                        print('The alias ', gene_alias,
                              ' is used for the gene ', gene_name)
            else:
                print('The alias ', gene_alias_key[0],
                      ' is used for the gene ', gene_name)

            if gene_pos is None:
                raise ValueError('No position found for gene %s.' % gene_name)

        gene_orien = gene_pos[3]

    else:
        gene_pos = region

    gene_chr = gene_pos[0]
    gene_start = int(gene_pos[1])
    gene_end = int(gene_pos[2])
    if gene_name is not None:
        print(gene_name, ' starts at basepair ', gene_start,
              ' and ends at basepair ', gene_end, ' in chromosome', gene_chr)
    else:
        print('Selected region starts at basepair ', gene_start,
              ' and ends at basepair ', gene_end, ' in chromosome', gene_chr)

    #%% READ THE BED FILE
    with open(bed_file) as f:
        lines = f.readlines()

    #%% GET POSITION FOR THE CHROMOSOMES IN THE BED FILE
    chrom_start_line_dict, chrom_end_line_dict = chromosome_props_bedfile(
        lines)[1:3]

    #%% GET ALL READS WITHIN THE GENE
    insertion_list = []
    read_list = []
    for line in lines[chrom_start_line_dict.get(gene_chr):chrom_end_line_dict.
                      get(gene_chr)]:
        line_list = line.strip('\n').split()
        if gene_start <= int(line_list[1]) <= gene_end:
            insertion_list.append(int(line_list[1]))

            read_value = (
                int(line_list[4]) - 100
            ) / 20  #the matlab script by benoit takes the number of reads*20+100. This line makes this undone
            read_list.append(read_value)

    #%% ACCOUNT FOR DOUBLE INSERTIONS FOR PLOTTING
    #see for example chromosome I, bp 3891
    insertion_arr = np.asarray(insertion_list)
    positions, position_counts = np.unique(insertion_arr, return_counts=True)
    duplicate_insertion_list = positions[
        position_counts >
        1]  #EACH DUPLICATED POSITION ONCE, ALSO WHEN THERE ARE MORE THAN TWO INSERTIONS AT THE SAME LOCATION

    if len(duplicate_insertion_list) > 0:
        number_of_duplicates_list = [1] * len(
            insertion_list
        )  #LIST OF ONES WITH SAME LENGTH AS INSERTION_LIST FOR STORING NUMBER OF DUPLICATES
        delete_index = []
        for dup in duplicate_insertion_list:
            ind_list = np.where(insertion_arr == dup)[
                0]  #ALL INDICES (ASCENDING) WHERE THIS POSITION OCCURS
            ind_list_max = ind_list[-1]  #THE LAST INDEX OF THE DUPLICATES
            for ind in ind_list[:-1]:
                read_list[ind_list_max] += read_list[
                    ind]  #ADD UP THE READS TO THE LAST DUPLICATE
                delete_index.append(ind)
            number_of_duplicates_list[ind_list_max] = len(
                ind_list)  #UPDATE NUMBER OF DUPLICATES

        #DELETE FROM HIGH TO LOW INDEX SO THAT THE REMAINING INDICES STAY VALID
        for del_ind in sorted(delete_index, reverse=True):
            del read_list[
                del_ind]  #DELETES THE INDEX WHICH IS NOW ADDED UP TO THE LAST INDEX
            del insertion_list[
                del_ind]  #DELETES THE SAME INDICES IN THE INSERTION LISTS.
            del number_of_duplicates_list[del_ind]

        readspertransposon_list = [
            x / y for x, y in zip(read_list, number_of_duplicates_list)
        ]  #DIVIDE THE NUMBER OF READS BY THE NUMBER OF TRANSPOSONS
    else:
        readspertransposon_list = read_list

    #%% MAKE LIST OF ALL LOCATIONS IN THE GENE WITH THE NUMBER OF READS IN EACH LOCATION
    gene_length = gene_end - gene_start
    print('Length of region of interest is ', gene_length)
    insertion_roi_list = list(range(gene_start, gene_end + 1))
    reads_roi_list = list(np.zeros(gene_length + 1))

    for position, reads in zip(insertion_list, readspertransposon_list):
        #insertion_roi_list is the contiguous range gene_start..gene_end, so the index is the offset
        reads_roi_list[position - gene_start] = float(reads)

    #%% CALCULATE SOME STATISTICAL VALUES FOR THE SELECTED REGION
    #insertion_roi_list := list of all potential insertion sites in the region
    #reads_roi_list := number of reads in the selected region.

    bp_between_tn_insertions_chr_dict = statistics_perchromosome.chromosome_insertion_periodicity(
        gene_chr, bed_file)
    bp_between_tn_insertions_chr = bp_between_tn_insertions_chr_dict.get(
        gene_chr)
    insertion_chromosome_avgperiodicity = np.nanmean(
        bp_between_tn_insertions_chr)
    insertion_chromosome_stdperiodicity = np.nanstd(
        bp_between_tn_insertions_chr)
    insertion_chromosome_firstquartileperiodicity = np.nanpercentile(
        bp_between_tn_insertions_chr, 25)
    insertion_chromosome_medperiodicity = np.nanpercentile(
        bp_between_tn_insertions_chr, 50)
    insertion_chromosome_thirdquartileperiodicity = np.nanpercentile(
        bp_between_tn_insertions_chr, 75)

    coverage_percentage = (len(read_list) / len(insertion_roi_list) * 100)
    coverage_line = r'Transposon coverage = %.2f percent' % (
        coverage_percentage)

    if insertion_list:
        bp_between_tn_insertions = [
            abs(y - x) for x, y in zip(insertion_list[:-1], insertion_list[1:])
        ]
        bp_between_tn_insertions.insert(0, insertion_list[0] -
                                        gene_start)  #ADD START OF GENE (bp=0)
        bp_between_tn_insertions.append(
            gene_end - insertion_list[-1]
        )  #ADD END OF GENE (bp=INDEX LAST TN - GENE LENGTH)

        max_empty_region = max(bp_between_tn_insertions)

        insertion_avgperiodicity = np.nanmean(bp_between_tn_insertions)
        insertion_stdperiodicity = np.nanstd(bp_between_tn_insertions)
        insertion_firstquartileperiodicity = np.nanpercentile(
            bp_between_tn_insertions, 25)
        insertion_medperiodicity = np.nanpercentile(bp_between_tn_insertions,
                                                    50)
        insertion_thirdquartileperiodicity = np.nanpercentile(
            bp_between_tn_insertions, 75)

        #THE SAME STATISTICS LINES ARE PRINTED AND SHOWN IN THE TEXT BOX OF THE PLOT
        statistics_lines = [
            r'Mean transposon insertion frequency in gene is %.2f, %.2f ' %
            (insertion_avgperiodicity, insertion_stdperiodicity),
            r'Mean transposon insertion frequency in chromosome is %.2f, %.2f'
            % (insertion_chromosome_avgperiodicity,
               insertion_chromosome_stdperiodicity),
            r'Quartiles transposon insertion frequency in gene is %.2f, %.2f, %.2f '
            % (insertion_firstquartileperiodicity, insertion_medperiodicity,
               insertion_thirdquartileperiodicity),
            r'Quartiles transposon insertion frequency in chromosome is %.2f, %.2f, %.2f '
            % (insertion_chromosome_firstquartileperiodicity,
               insertion_chromosome_medperiodicity,
               insertion_chromosome_thirdquartileperiodicity)
        ]

        print('')
        print('Percentage of coverage is %.2f' % coverage_percentage)
        print('')
        print(statistics_lines[0])
        print(statistics_lines[1])
        print('')
        print(statistics_lines[2])
        print(statistics_lines[3])
    else:
        #NO INSERTIONS: THE WHOLE REGION IS EMPTY AND THERE ARE NO DISTANCES TO DO STATISTICS ON
        max_empty_region = gene_length
        bp_between_tn_insertions = []
        statistics_lines = []

    #%% BINNING OF THE READS_ROI_LIST
    ### DETERMINE VARIABLE BIN WIDTH SUCH THAT EITHER EACH BIN INCLUDES 8 TN OR THE WIDTH OF THE BAR IS EIGTH TIMES THE AVERAGE DISTANCE BETWEEN TN IN THE CHROMOSOME.

    reads_roi_binnedlist = []  #STORES THE HEIGHT OF THE BARS
    insertion_roi_binnedlist = []  #STORES THE X POSITION OF THE BARS
    bin_width = []  #STORES THE WIDTH OF THE BARS

    max_bin_length = int(insertion_chromosome_avgperiodicity * 8)

    currentbin_reads_list = []
    currentbin_insertion_list = []
    reads_currentbin = 0
    insertion_counter = 0
    for position, reads in zip(insertion_roi_list, reads_roi_list):
        currentbin_reads_list.append(reads)
        currentbin_insertion_list.append(position)
        if reads > 0:
            reads_currentbin += 1

        bin_is_full = reads_currentbin == 8  #STOP IF THERE ARE 8 INSERTIONS FOUND
        bin_is_wide = insertion_counter == max_bin_length  #STOP IF THE LENGTH OF THE CURRENTBIN EXCEEDS AVERAGE PERIODICITY OF THE CHROMOSOME * 8
        is_last_position = position == gene_end  #ACCOUNT FOR THE FINAL BIN IN THE GENE.

        if bin_is_full or bin_is_wide or is_last_position:
            reads_roi_binnedlist.append(sum(currentbin_reads_list))
            bin_width.append(currentbin_insertion_list[-1] -
                             currentbin_insertion_list[0])
            insertion_roi_binnedlist.append(position - bin_width[-1] / 2)

            currentbin_reads_list = []
            currentbin_insertion_list = []
            reads_currentbin = 0
            insertion_counter = 0

        insertion_counter += 1

    #%% MAKE BAR PLOT FOR READS IN CHROMOSOME

    if gene_name is not None:
        print('Plotting reads for gene ', gene_name, '...')
    else:
        print('Plotting reads in range ', gene_start, '..', gene_end,
              'in chromosome ', gene_chr, '...')

    text_size = 12

    plt.figure(figsize=(19, 9))
    grid = plt.GridSpec(1, 3, wspace=0.4, hspace=0.3)

    ax = plt.subplot(grid[0, :2])
    ax.bar(insertion_roi_binnedlist,
           reads_roi_binnedlist,
           width=bin_width,
           facecolor=np.array([126.0, 164.0, 179.0]) / 255,
           edgecolor='w')
    ax.set_axisbelow(True)
    ax.grid(True)
    if gene_name is not None:
        ax.set_title(gene_name, fontweight='bold', fontsize=text_size)
    elif region == ['IV', 46271, 48031]:
        ax.set_title('HO-locus', fontweight='bold', fontsize=text_size)
    else:
        ax.set_title(str(gene_chr) + str(gene_start) + '-' + str(gene_end))
    ax.set_xlabel('Basepair position in chromosome ' + gene_chr,
                  fontsize=text_size)
    ax.set_ylabel('Read/Tn', fontsize=text_size)
    ax.set_xlim(gene_start, gene_end)

    if gene_name is not None and gene_name != 'HOlocus':
        #GENES FROM THE GFF FILE: ORIENTATION, FOLLOWED BY THE STATISTICS WHEN THERE ARE INSERTIONS
        text_lines = [r'Reading orientation of gene: ' + gene_orien]
        if insertion_list:
            text_lines += [coverage_line] + statistics_lines
    else:
        #REGION OR HO-LOCUS: WITHOUT INSERTIONS ONLY THE COVERAGE IS DEFINED
        text_lines = [coverage_line] + statistics_lines
    textstr = '\n'.join(text_lines)

    props = dict(boxstyle='round', facecolor='grey', alpha=0.8)
    ax.text(0.05,
            0.9,
            textstr,
            transform=ax.transAxes,
            fontsize=text_size,
            verticalalignment='top',
            horizontalalignment='left',
            bbox=props)

    for ins in insertion_list:  #PLOT TICKS ON THE X AXIS TO INDICATE THE EXACT INSERTION LOCATIONS
        ax.axvline(x=ins, ymax=0.05, linewidth=1, color='k')

    # COMPARE DISTRIBUTION OF BASEPAIRS BETWEEN INSERTIONS FOR THE CHROMOSOME AND THE GENE USING VIOLINPLOT
    plt.subplot(grid[0, 2])

    if gene_name is None:
        if region == ['IV', 46271, 48031]:
            gene_name = 'HOlocus'
        else:
            gene_name = str(gene_chr) + str(gene_start) + '-' + str(gene_end)

    bp_between_tn_insertions_dict = {}
    bp_between_tn_insertions_dict[gene_chr] = bp_between_tn_insertions

    df_chr = pd.DataFrame(bp_between_tn_insertions_chr_dict)
    df = pd.DataFrame(bp_between_tn_insertions_dict)
    if len(df) != 0:
        df_concat = pd.concat([df, df_chr], axis=0, ignore_index=True)
        names_list = ['gene'
                      ] * len(bp_between_tn_insertions) + ['chromosome'] * len(
                          bp_between_tn_insertions_chr_dict.get(gene_chr))
        df_concat['label'] = names_list
        xlabel = gene_name + ' | ' + gene_chr
        df_concat[xlabel] = ''
        df_concat.columns = ['bp between tn', 'label', xlabel]

        sb.set(style="whitegrid", palette="pastel", color_codes=True)
        sb.violinplot(data=df_concat,
                      x=xlabel,
                      y='bp between tn',
                      hue='label',
                      inner='quartile',
                      scale='width',
                      gridsize=5000,
                      split=True,
                      cut=0,
                      palette={
                          "gene": "g",
                          "chromosome": "r"
                      })
        plt.ylim(0, max_empty_region + 10)
    else:
        sb.violinplot(data=df_chr,
                      inner='quartile',
                      orient='v',
                      scale='width',
                      gridsize=5000,
                      cut=0)

    #SAVE BEFORE SHOWING: AFTER plt.show() THE FIGURE MAY ALREADY BE CLOSED, WHICH WOULD SAVE AN EMPTY IMAGE
    if savefigure:
        plt.savefig(os.path.join(save_figure_path,
                                 gene_name + '_TnInsertions.png'),
                    dpi=300)
        print('Figure saved at ', save_figure_path)
    plt.show()
def dna_features(region,
                 wig_file,
                 pergene_insertions_file,
                 variable="reads",
                 normalize=True,
                 normalization_window_size=20000,
                 plotting=True,
                 savefigure=False,
                 verbose=True):
    '''This function inputs a wig file and pergene_insertions file created using transposonmapping_satay.py.
    Optional is to define with data is displayed, which can be either "insertions" or "reads".
    Output is a dataframe including major information about all genomic features and optionally a barplot indicating the number of transposons per genomic region.
    A genomic region is here defined as a gene (separated as annotated essential and not essential), telomere, centromere, ars etc.
    This can be used for identifying neutral regions (i.e. genomic regions that, if inhibited, do not influence the fitness of the cells).
    This function can be used for normalizing the transposon insertions per gene using the neutral regions.
    
    Input:
        - Region: e.g. chromosome number (either a normal number between 1 and 16 or in roman numerals between I and XVI), a list like ['V', 0, 14790] which creates a barplot between basepair 0 and 14790) or a genename.
        - wig_file: wiggle file from the output of transposonmapping.py that is used in the processing workflow.
        - pergene_insertions_file: text file from the output of transposonsmapping.py
        - variable: (only for plotting) either 'insertions' or 'reads', which determines what is being plotted.
        - normalize: (only for plotting) either True or False. Normalization only works for when variable is 'reads'. The normalized reads are plotted and adds a column to the dataframe.
        - normalization_window_size: Integer. Normalization relies on windows that corrects for inter chromosomal differences. This determine the size of those windows in terms of basepairs (default=10000)
        - plotting: Either True or False. Determines whether the barplot has to be created.
        - savefigure: (only for plotting) Whether to save the figure at the same location of this script.
        - Verbose: Either True of False. Determines how much textual feedback is given. When set to False, only warnings will be shown.

    Output:
        - dna_df2: Dataframe containing information about the selected chromosome.
    
    Required files (see next section):
        - essentials_file: https://github.com/Gregory94/LaanLab-SATAY-DataAnalysis/blob/master/Data_Files/Cerevisiae_AllEssentialGenes_List.txt
        - gene_information_file: https://github.com/Gregory94/LaanLab-SATAY-DataAnalysis/blob/master/Data_Files/Yeast_Protein_Names.txt
        - gff-file: https://github.com/Gregory94/LaanLab-SATAY-DataAnalysis/blob/master/Data_Files/Saccharomyces_cerevisiae.R64-1-1.99.gff3
        - sgd_features_file: https://github.com/Gregory94/LaanLab-SATAY-DataAnalysis/blob/master/Data_Files/SGD_features.tab
    '''
    #%% FILES
    essentials_file = os.path.join(file_dirname, '..', 'Data_Files',
                                   "Cerevisiae_AllEssentialGenes_List.txt")

    gene_information_file = os.path.join(file_dirname, '..', 'Data_Files',
                                         'Yeast_Protein_Names.txt')

    gff_file = os.path.join(file_dirname, '..', 'Data_Files',
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')

    sgd_features_file = os.path.join(file_dirname, '..', 'Data_Files',
                                     'SGD_features.tab')

    variable = variable.lower()
    if plotting == True:
        create_plottitle = ''

#%% DETERMINE INPUTTED REGION

    warningmessage = "WARNING: Specified chromosome or gene name not found. Enter chromosome as a number (or roman numeral) between 1 and 16 (I and XVI), a list in the form ['chromosome number, start_position, end_position'] or a valid gene name."

    if verbose == True:
        print('Selected region: ', region)

    if type(region) == str:
        if region.upper() in chromosomename_roman_to_arabic()[1]:
            chrom = region.upper()
            roi_start = None
            roi_end = None
            region_type = 'Chromosome'

        elif region.upper() in list_gene_names(gene_information_file):
            gene_pos_dict = gene_position(gff_file)
            region = region.upper()
            if region in gene_pos_dict:
                region_pos = gene_pos_dict.get(region)
                chrom = region_pos[0]
                roi_start = int(region_pos[1])
                roi_end = int(region_pos[2])
            else:
                gene_alias_dict = gene_aliases(gene_information_file)[0]
                region_alias = [
                    key for key, val in gene_alias_dict.items()
                    if region in val
                ]
                if not region_alias == [] and region_alias[0] in gene_pos_dict:
                    region_pos = gene_pos_dict.get(region_alias[0])
                    chrom = region_pos[0]
                    roi_start = int(region_pos[1]) - 100
                    roi_end = int(region_pos[2]) + 100
                    del (region_alias, gene_alias_dict)
                else:
                    print(warningmessage)
#                    return()
            if plotting == True:
                create_plottitle = region
            region_type = 'Gene'
            del (region_pos, gene_pos_dict)

        else:
            print(warningmessage)
#            return()

    elif type(region) == list:
        if type(region[0]) == str:
            chrom = region[0].upper()
        elif type(region[0]) == int:
            if region[0] in chromosomename_roman_to_arabic()[0]:
                chrom = chromosomename_roman_to_arabic()[0].get(region[0])
        else:
            print(warningmessage)
#            return()
        roi_start = region[1]
        roi_end = region[2]
        region_type = 'Chromosome'

    elif type(region) == int:
        if region in chromosomename_roman_to_arabic()[0]:
            chrom = chromosomename_roman_to_arabic()[0].get(region)
            roi_start = None
            roi_end = None
        else:
            print(warningmessage)
#            return()
        region_type = 'Chromosome'

    else:
        print(warningmessage)
#        return()

    del (warningmessage)

    #%% READ WIG FILE FOR GETTING LOCATIONS OF ALL TN INSERTIONS

    with open(wig_file, 'r') as f:
        lines = f.readlines()

    chrom_start_line_dict, chrom_end_line_dict = chromosome_name_wigfile(
        lines)[1:]

    insrt_in_chrom_list = []
    reads_in_chrom_list = []
    for l in lines[chrom_start_line_dict.get(chrom):chrom_end_line_dict.
                   get(chrom)]:
        insrt_in_chrom_list.append(int(l.strip('\n').split(' ')[0]))
        reads_in_chrom_list.append(int(l.strip('\n').split(' ')[1]))

    del (lines, l, f, chrom_start_line_dict, chrom_end_line_dict)

    #%% READ PERGENE_INSERTIONS FILE FOR LOCATION OF ALL INSERTIONS PER EACH GENE.

    with open(pergene_insertions_file) as f:
        lines = f.readlines()

    gene_position_dict = {}
    for line in lines[1:]:
        line_split = line.strip('\n').split('\t')

        if line_split[1] == chrom:
            genename = line_split[0]
            gene_chrom = line_split[1]
            gene_start = int(line_split[2])
            gene_end = int(line_split[3])

            gene_position_dict[genename] = [
                gene_chrom, gene_start, gene_end
            ]  #DICT CONTAINING ALL GENES WITHIN THE DEFINED CHROMOSOME INCLUDING ITS START AND END POSITION

            geneinserts_str = line_split[4].strip('[]')
            if not geneinserts_str == '':
                geneinserts_list = [
                    int(ins) for ins in geneinserts_str.split(',')
                ]
            else:
                geneinserts_list = []

            genereads_str = line_split[5].strip('[]')
            if not genereads_str == '':
                genereads_list = [
                    int(read) for read in genereads_str.split(',')
                ]
            else:
                genereads_list = []

            if len(geneinserts_list) != len(genereads_list):
                print(
                    'WARNING: %s has different number of reads compared with the number of inserts'
                    % genename)

    del (f, lines, line, line_split, genename, gene_chrom, gene_start,
         gene_end, geneinserts_list, geneinserts_str, genereads_str,
         genereads_list)

    #%% DETERMINE THE LOCATION GENOMIC FEATURES IN THE CURRENT CHROMOSOME AND STORE THIS IN A DICTIONARY

    len_chr = chromosome_position(gff_file)[0].get(chrom)
    start_chr = chromosome_position(gff_file)[1].get(chrom)
    end_chr = chromosome_position(gff_file)[2].get(chrom)

    dna_dict = {
    }  #for each bp in chromosome, determine whether it belongs to a noncoding or coding region
    for bp in range(
            start_chr, end_chr + 1
    ):  #initialize dna_dict with all basepair positions as ['noncoding', None]
        dna_dict[bp] = ['noncoding', None]  #form is: ['element_name', 'type']

    feature_orf_dict = sgd_features(sgd_features_file)[1]
    gene_alias_dict = gene_aliases(gene_information_file)[0]

    for gene in gene_position_dict:
        if gene in feature_orf_dict:
            if (not gene.endswith("-A")
                    and not feature_orf_dict.get(gene)[1] == 'Verified') and (
                        not gene.endswith("-B")
                        and not feature_orf_dict.get(gene)[1] == 'Verified'):
                for bp in range(
                        gene_position_dict.get(gene)[1] + start_chr,
                        gene_position_dict.get(gene)[2] + start_chr + 1):
                    dna_dict[bp] = [
                        gene, "Gene; " + feature_orf_dict.get(gene)[1]
                    ]
        else:
            gene_alias = [
                key for key, val in gene_alias_dict.items() if gene in val
            ][0]
            for bp in range(
                    gene_position_dict.get(gene)[1] + start_chr,
                    gene_position_dict.get(gene)[2] + start_chr + 1):
                dna_dict[bp] = [
                    gene_alias, "Gene; " + feature_orf_dict.get(gene_alias)[1]
                ]

    del (gff_file, gene, bp, gene_alias)

    #%% GET FEATURES FROM INTERGENIC REGIONS (-> SEE SGD_features.tab IN DATA_FILES IN GITHUB FOLDER)

    genomicregions_list = sgd_features(sgd_features_file)[0]

    i = 2
    for genomicregion in genomicregions_list[1:]:
        dna_dict = feature_position(
            sgd_features(sgd_features_file)[i], chrom, start_chr, dna_dict,
            genomicregion)
        i += 1

    ### TEST IF ELEMENTS IN FEATURE_ORF_DICT FOR SELECTED CHROMOSOME ARE THE SAME AS THE GENES IN GENE_POSITION_DICT BY CREATING THE DICTIONARY FEATURE_POSITION_DICT CONTAINING ALL THE GENES IN FEATURE_ORF_DICT WITH THEIR CORRESPONDING POSITION IN THE CHROMOSOME
    gene_alias_dict = gene_aliases(gene_information_file)[0]
    orf_position_dict = {}
    for feature in feature_orf_dict:
        if feature_orf_dict.get(feature)[5] == chrom:
            if feature in gene_position_dict:
                orf_position_dict[feature] = [
                    feature_orf_dict.get(feature)[6],
                    feature_orf_dict.get(feature)[7]
                ]
            else:
                for feature_alias in gene_alias_dict.get(feature):
                    if feature_alias in gene_position_dict:
                        orf_position_dict[feature_alias] = [
                            feature_orf_dict.get(feature)[6],
                            feature_orf_dict.get(feature)[7]
                        ]

    if sorted(orf_position_dict) == sorted(gene_position_dict):
        if verbose == True:
            #            print('Everything alright, just ignore me!')
            pass
        else:
            pass
    else:
        print(
            'WARNING: Genes in feature_list are not the same as the genes in the gene_position_dict. Please check!'
        )

    del (sgd_features_file, feature_orf_dict, orf_position_dict, feature,
         feature_alias, gene_position_dict)

    #%% DETERMINE THE NUMBER OF TRANSPOSONS PER BP FOR EACH FEATURE

    reads_loc_list = [0] * len(
        dna_dict
    )  # CONTAINS ALL READS JUST LIKE READS_IN_CHROM_LIST, BUT THIS LIST HAS THE SAME LENGTH AS THE NUMBER OF BP IN THE CHROMOSOME WHERE THE LOCATIONS WITH NO READS ARE FILLED WITH ZEROS
    i = 0
    for ins in insrt_in_chrom_list:
        reads_loc_list[ins] = reads_in_chrom_list[i]
        i += 1

    del (i, ins, insrt_in_chrom_list, reads_in_chrom_list)  #, dna_df)

    #%% CREATE DATAFRAME FOR EACH FEATURE (E.G. NONCODING DNA, GENE, ETC.) IN THE CHROMOSOME AND DETERMINE THE NUMBER OF INSERTIONS AND READS PER FEATURE.

    feature_NameAndType_list = []
    f_previous = dna_dict.get(start_chr)[0]
    f_type = dna_dict.get(start_chr)[1]
    N_reads = []
    N_reads_list = []
    N_reads_truncatedgene_list = []
    N_insrt_truncatedgene_list = []
    N_insrt_list = []
    N_bp = 1
    N_bp_list = []
    f_start = 0
    f_end = 0
    f_pos_list = []
    i = 0
    for bp in dna_dict:
        f_current = dna_dict.get(bp)[0]
        if f_current == f_previous:
            f_type = dna_dict.get(bp)[1]
            f_end += 1
            N_bp += 1
            N_reads.append(reads_loc_list[i])
        elif (f_current != f_previous or
              (i + start_chr) == end_chr):  # and not f_current.endswith('-A'):
            feature_NameAndType_list.append([f_previous, f_type])
            N_reads_list.append(sum(N_reads))
            N_insrt_list.append(len([ins for ins in N_reads if not ins == 0]))
            if not f_type == None and f_type.startswith('Gene'):
                N10percent = 100  #int(len(N_reads) * 0.1)
                N_reads_truncatedgene_list.append(
                    sum(N_reads[N10percent:-N10percent]))
                N_insrt_truncatedgene_list.append(
                    len([
                        ins for ins in N_reads[N10percent:-N10percent]
                        if not ins == 0
                    ]))
            else:
                N_reads_truncatedgene_list.append(sum(N_reads))
                N_insrt_truncatedgene_list.append(
                    len([ins for ins in N_reads if not ins == 0]))

            N_bp_list.append(N_bp)
            N_reads = []
            N_bp = 1
            f_pos_list.append([f_start, f_end + f_start])
            f_start = f_start + f_end + 1
            f_end = 0
            f_previous = f_current
        i += 1

#    N_reads_per_bp_list = []
#    N_reads_per_bp_central80p_list = []
#    N_insrt_per_bp_list = []
#    N_insrt_per_bp_central80p_list = []
    N_reads_per_ins_list = []
    N_reads_per_ins_truncatedgene_list = []
    for i in range(len(N_reads_list)):
        #        N_reads_per_bp_list.append(N_reads_list[i]/N_bp_list[i])
        #        N_insrt_per_bp_list.append(N_insrt_list[i]/N_bp_list[i])
        #        if not feature_NameAndType_list[i][1] == None and feature_NameAndType_list[i][1].startswith('Gene'):
        #            N_reads_per_bp_central80p_list.append(N_reads_truncatedgene_list[i]/(N_bp_list[i]-200))#*0.8
        #            N_insrt_per_bp_central80p_list.append(N_insrt_truncatedgene_list[i]/(N_bp_list[i]-200))#*0.8
        #        else:
        #            N_reads_per_bp_central80p_list.append(N_reads_list[i]/N_bp_list[i])
        #            N_insrt_per_bp_central80p_list.append(N_insrt_list[i]/N_bp_list[i])

        if N_insrt_list[i] == 0:
            N_reads_per_ins_list.append(0)
            N_reads_per_ins_truncatedgene_list.append(0)
        elif N_insrt_truncatedgene_list[i] == 0:
            N_reads_per_ins_list.append(N_reads_list[i] / N_insrt_list[i])
            N_reads_per_ins_truncatedgene_list.append(0)
        else:
            N_reads_per_ins_list.append(N_reads_list[i] / N_insrt_list[i])
            N_reads_per_ins_truncatedgene_list.append(
                N_reads_truncatedgene_list[i] / N_insrt_truncatedgene_list[i])

    #############get all essential genes together with their aliases##############
    with open(essentials_file, 'r') as f:
        essentials_temp_list = f.readlines()[1:]
    essentials_list = [
        essential.strip('\n') for essential in essentials_temp_list
    ]
    del essentials_temp_list

    gene_alias_dict = gene_aliases(gene_information_file)[0]
    for key, val in gene_alias_dict.items():
        if key in essentials_list:
            for alias in val:
                essentials_list.append(alias)

    #ADD
    essentiality_list = []
    for feature in feature_NameAndType_list:
        if not feature[0] == "noncoding":
            if feature[1] in genomicregions_list:
                essentiality_list.append(None)
            elif feature[0] in essentials_list:
                essentiality_list.append(True)
            else:
                essentiality_list.append(False)
        else:
            essentiality_list.append(None)

    del (key, val, alias, essentials_list, feature, gene_information_file
         )  #, gene_alias_dict)#, reads_loc_list)
    ##############################################################################

    feature_name_list = []
    feature_type_list = []
    feature_alias_list = []
    feature_standardname_list = []
    for feature_name in feature_NameAndType_list:
        feature_name_list.append(feature_name[0])
        feature_type_list.append(feature_name[1])
        if feature_name[1] != None and feature_name[1].startswith(
                'Gene') and feature_name[0] in gene_alias_dict:
            if gene_alias_dict.get(feature_name[0])[0] == feature_name[0]:
                feature_standardname_list.append(feature_name[0])
                feature_alias_list.append('')
            else:
                if len(gene_alias_dict.get(feature_name[0])) > 1:
                    feature_standardname_list.append(
                        gene_alias_dict.get(feature_name[0])[0])
                    feature_alias_list.append(
                        gene_alias_dict.get(feature_name[0])[1:])
                else:
                    feature_standardname_list.append(
                        gene_alias_dict.get(feature_name[0])[0])
                    feature_alias_list.append('')
        else:
            feature_standardname_list.append(feature_name[0])
            feature_alias_list.append('')

    all_features = {
        'Feature_name': feature_name_list,
        'Standard_name': feature_standardname_list,
        'Feature_alias': feature_alias_list,
        'Feature_type': feature_type_list,
        'Essentiality': essentiality_list,
        'Position': f_pos_list,
        'Nbasepairs': N_bp_list,
        'Ninsertions': N_insrt_list,
        'Ninsertions_truncatedgene': N_insrt_truncatedgene_list,
        'Nreads': N_reads_list,
        'Nreads_truncatedgene': N_reads_truncatedgene_list,
        #                    'Ninsertionsperbp':N_insrt_per_bp_list,
        #                    'Ninsertionsperbp_gene_central80p':N_insrt_per_bp_central80p_list,
        #                    'Nreadsperbp':N_reads_per_bp_list,
        #                    'Nreadsperbp_gene_central80p':N_reads_per_bp_central80p_list,
        'Nreadsperinsrt': N_reads_per_ins_list,
        'Nreadsperinsrt_truncatedgene': N_reads_per_ins_truncatedgene_list
    }

    dna_df2 = pd.DataFrame(
        all_features, columns=[column_name for column_name in all_features]
    )  #search for feature using: dna_df2.loc[dna_df2['Feature'] == 'CDC42']
    #CREATE NEW COLUMN WITH ALL DOMAINS OF THE GENE (IF PRESENT) AND ANOTHER COLUMN THAT INCLUDES LISTS OF THE BP POSITIONS OF THESE DOMAINS

    #PRINT INFORMATION FOR THE SELECTED GENE
    if region_type == 'Gene':
        for region_info in dna_df2.itertuples():
            if region_info.Feature_name == region.upper(
            ) or region_info.Standard_name == region.upper():
                print(region_info)

    del (dna_dict, feature_NameAndType_list, feature_name_list,
         feature_type_list, feature_name, f_type, f_previous, f_start, f_end,
         f_pos_list, f_current, N_reads, N_reads_list, N_insrt_list,
         N_reads_truncatedgene_list, N_insrt_truncatedgene_list, N10percent,
         N_bp, N_bp_list, bp, i, start_chr, end_chr, all_features,
         essentiality_list, essentials_file, genomicregions_list)

    #%% NORMALIZE USING WINDOWS

    dna_df2, window_edge_list = reads_normalization_fixed_window(
        dna_df2, len_chr, normalization_window_size, wig_file)

    #%% CREATE BAR PLOT
    if plotting == True:
        noncoding_color = "#002538"
        essential_color = "#10e372"
        nonessential_color = "#d9252e"
        codingdna_color = '#29a7e6'
        textcolor = "#000000"
        textsize = 20

        feature_middle_pos_list = []
        sum_bp = 0
        for x in dna_df2['Nbasepairs']:
            feature_middle_pos_list.append(x / 2 + sum_bp)
            sum_bp += x
        del (x, sum_bp)

        feature_width_list = list(dna_df2['Nbasepairs'])

        barcolor_list = []
        for feature in dna_df2['Feature_name']:
            if feature == 'noncoding':
                barcolor_list.append(noncoding_color)
            elif dna_df2.loc[dna_df2['Feature_name'] ==
                             feature]['Essentiality'].iloc[0] == False:
                barcolor_list.append(nonessential_color)
            elif dna_df2.loc[dna_df2['Feature_name'] ==
                             feature]['Essentiality'].iloc[0] == True:
                barcolor_list.append(essential_color)
            elif dna_df2.loc[dna_df2['Feature_name'] ==
                             feature]['Essentiality'].iloc[0] == None:
                barcolor_list.append(codingdna_color)
        del (feature)

        ###PLOTTING
        plt.figure(figsize=(19, 9))
        grid = plt.GridSpec(20, 1, wspace=0.0, hspace=0.01)

        ax = plt.subplot(grid[0:19, 0])
        if variable == "insertions":
            ax.bar(feature_middle_pos_list,
                   list(dna_df2['Ninsertions']),
                   feature_width_list,
                   color=barcolor_list)
            #        ax.set_ylim(0, max(dna_df2['Ninsertionsperbp']) + 0.1*max(dna_df2['Ninsertionsperbp']))
            ax.set_ylabel("Transposons per region",
                          fontsize=textsize,
                          color=textcolor)
        elif variable == "reads":
            if normalize == False:
                ax.bar(feature_middle_pos_list,
                       list(dna_df2['Nreads']),
                       feature_width_list,
                       color=barcolor_list)
                ax.set_ylabel("Reads per region",
                              fontsize=textsize,
                              color=textcolor)
#                ax.set_ylim(0.0,10.0)
            elif normalize == True:
                ax.bar(feature_middle_pos_list,
                       list(dna_df2['Nreads_normalized_byNCregions']),
                       feature_width_list,
                       color=barcolor_list)
                #                ax.bar(feature_middle_pos_list, list(dna_df2['Nreads_normalized']), feature_width_list, color=barcolor_list)
                #                ax.bar(feature_middle_pos_list, list(dna_df2['Nreads_normalized']), feature_width_list, color=barcolor_list)
                ax.set_ylabel("Normalized reads per region",
                              fontsize=textsize,
                              color=textcolor)
#                ax.set_ylim(0.0, 150.0)

        if roi_start != None and roi_end != None and roi_start < len_chr and roi_end < len_chr:
            ax.set_xlim(roi_start, roi_end)
        else:
            ax.set_xlim(0, len_chr)

        ax.grid(linestyle='-', alpha=1.0)
        ax.tick_params(labelsize=textsize)
        #    ax.set_xticklabels([])
        ax.tick_params(axis='x', which='major', pad=30)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(0, 0))
        ax.xaxis.get_offset_text().set_fontsize(textsize)
        ax.set_xlabel("Basepair position on chromosome " + chrom,
                      fontsize=textsize,
                      color=textcolor,
                      labelpad=10)
        ax.set_title(create_plottitle, fontsize=textsize, color=textcolor)
        legend_noncoding = mpatches.Patch(color=noncoding_color,
                                          label="Noncoding DNA")
        legend_essential = mpatches.Patch(color=essential_color,
                                          label="Annotated essential genes")
        legend_nonessential = mpatches.Patch(color=nonessential_color,
                                             label="Nonessential genes")
        legend_coding = mpatches.Patch(color=codingdna_color,
                                       label="Other genomic regions")
        leg = ax.legend(handles=[
            legend_noncoding, legend_essential, legend_nonessential,
            legend_coding
        ])  #ADD
        for text in leg.get_texts():
            text.set_color(textcolor)
        del text

        count = 0
        for i in range(len(window_edge_list) - 1):
            if count % 2 == 0:
                ax.axvspan(window_edge_list[i],
                           window_edge_list[i + 1],
                           facecolor=[0.0, 0.0, 0.0, 0.1])
            else:
                ax.axvspan(window_edge_list[i],
                           window_edge_list[i + 1],
                           facecolor=[0.0, 0.0, 0.0, 0.0])
            count += 1

        axc = plt.subplot(grid[19, 0])

        l = 0
        counter = 0
        for width in feature_width_list:
            if dna_df2.loc[counter][4] == True:
                axc.axvspan(l, l + width, facecolor=essential_color, alpha=0.3)
            elif dna_df2.loc[counter][
                    4] == False and not dna_df2.loc[counter][0] == 'noncoding':
                axc.axvspan(l,
                            l + width,
                            facecolor=nonessential_color,
                            alpha=0.3)
            elif dna_df2.loc[counter][
                    4] == None and not dna_df2.loc[counter][0] == 'noncoding':
                axc.axvspan(l, l + width, facecolor=codingdna_color, alpha=0.5)
            l += width
            counter += 1
        if roi_start != None and roi_end != None and roi_start < len_chr and roi_end < len_chr:
            axc.set_xlim(roi_start, roi_end)
        else:
            axc.set_xlim(0, len_chr)
        axc.tick_params(labelsize=textsize)
        axc.set_yticklabels([])
        axc.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom=False,  # ticks along the bottom edge are off
            top=False,  # ticks along the top edge are off
            labelbottom=False)  # labels along the bottom edge are off

        axc.tick_params(
            axis='y',  # changes apply to the y-axis
            which='both',  # both major and minor ticks are affected
            left=False,  # ticks along the bottom edge are off
            right=False,  # ticks along the top edge are off
            labelleft=False)  # labels along the bottom edge are off

        if savefigure == True:
            if normalize == True and variable == 'reads':
                saving_name = os.path.join(
                    file_dirname, 'GenomicFeaturesReads_Barplot_Chrom' +
                    chrom + '_Normalized_with_Windowsize_' +
                    str(normalization_window_size))
            elif normalize == False and variable == 'reads':
                saving_name = os.path.join(
                    file_dirname, 'GenomicFeaturesReads_Barplot_Chrom' +
                    chrom + '_NonNormalized')
            else:
                saving_name = os.path.join(
                    file_dirname, 'GenomicFeaturesInsertions_Barplot_Chrom' +
                    chrom + '_NonNormalized')
            plt.savefig(saving_name, orientation='landscape', dpi=200)
            plt.close()

#        del (barcolor_list, codingdna_color, essential_color, feature_middle_pos_list, feature_width_list, noncoding_color, nonessential_color, textcolor, textsize, l, counter, width, normalization_window_size)

#%% RETURN STATEMENT
    return (dna_df2)
示例#4
0
def _bin_counts(counts, bar_width):
    '''Sum per-basepair counts into bins of `bar_width` basepairs.

    Every position is counted exactly once: bin 0 holds position 0, bin k holds
    positions (k-1)*bar_width+1 up to and including k*bar_width, and a final
    shorter bin collects whatever remains after the last full bin.

    Returns a tuple (bin_positions, binned_counts) of two numpy arrays of equal
    length, where bin_positions[k] is the last basepair index included in bin k.
    '''
    counts = np.asarray(counts)
    if len(counts) == 0:
        return np.array([], dtype=int), np.array([])

    bin_positions = np.arange(0, len(counts), bar_width)
    if bin_positions[-1] != len(counts) - 1:
        # Keep the trailing remainder instead of silently dropping it.
        bin_positions = np.append(bin_positions, len(counts) - 1)

    # Differences of the running total at the bin ends give the sum per bin.
    cumulative_counts = np.cumsum(counts)
    binned_counts = np.diff(cumulative_counts[bin_positions], prepend=0)
    return bin_positions, binned_counts


def _shade_genes(ax,
                 genes,
                 gene_pos_dict,
                 essential_genes,
                 gene_alias_dict=None,
                 label_ypos=None):
    '''Shade the gene regions on `ax`: green for essential genes, red for all other genes.

    When `gene_alias_dict` is given, each essential gene is also labelled at
    height `label_ypos` with its first alias. The gene name itself is used when
    no alias is known for the gene.
    '''
    for gene in genes:
        gene_start_pos = int(gene_pos_dict.get(gene)[1])
        gene_end_pos = int(gene_pos_dict.get(gene)[2])
        if gene in essential_genes:
            ax.axvspan(gene_start_pos,
                       gene_end_pos,
                       facecolor='g',
                       alpha=0.3)
            if gene_alias_dict is not None:
                aliases = gene_alias_dict.get(gene)
                gene_label = aliases[0] if aliases else gene
                ax.text(gene_start_pos, label_ypos, gene_label, rotation=45)
        else:
            ax.axvspan(gene_start_pos,
                       gene_end_pos,
                       facecolor='r',
                       alpha=0.3)


def compareplot(bed_files=None,
                variable="insertions",
                chromosome=None,
                set_barwidth=None,
                set_logscale=False,
                savefig=False):
    '''This function creates a bar plot along a specified chromosome that compares the transposon counts of two datasets.
    The height of each bar represents the number of insertions or reads in a bin of basepairs at the genomic position indicated on the x-axis.
    The input is as follows:
        -The bed-files ('bed_files', a list containing two paths, each refering to a bed-file [mandatory]),
        -What to count ('variable', either "insertions" or "reads" [optional, default "insertions"]),
        -Which chromosome ('chromosome', indicated by roman numeral or list of roman numerals [optional]),
        -The width of the bars ('set_barwidth', number of basepairs per bin indicated by a positive integer [optional]),
        -Whether to use a logarithmic y-axis ('set_logscale', boolean [optional]),
        -Whether to save the figures ('savefig', boolean [optional]).

    The figure shows two graphs, the top one represents the first bed-file given in the list, the bottom plot (drawn upside down) the second bed-file in the list.
    On top of the counts, the top graph shows where the first dataset exceeds the second and the bottom graph shows where the second dataset exceeds the first.
    If the chromosome is not set by the user, it automatically loops over all chromosomes and creates a figure for each of them.
    If set_barwidth is not set, the chromosome is divided in approximately 500 bins.
    Few basepairs per bin may be slow. Too many basepairs in one bin and possible low transposon areas might be obscured.
    When savefig is True, each figure is stored in the folder of the first bed-file as <name of first bed-file>_compareplot_chrom<X>.png and closed afterwards.

    The background of the graph is color coded to indicate areas that code for genes (green for annotated essential genes, red for other genes).
    For this a list for essential genes is needed (used in 'list_known_essentials' function) and a .gff file is required (for the functions in 'chromosome_and_gene_positions.py') and a list for gene aliases (used in the function 'gene_aliases').

    Raises a ValueError when bed_files does not contain at least two paths, when variable is not "insertions" or "reads" or when set_barwidth is smaller than 1.
    '''
    #%% CHECK INPUT
    if bed_files is None or isinstance(bed_files, str) or len(bed_files) < 2:
        raise ValueError(
            'bed_files must be a list containing the paths to two bed-files.')
    if variable not in ("insertions", "reads"):
        raise ValueError("variable must be either 'insertions' or 'reads'.")
    if set_barwidth is not None and set_barwidth < 1:
        raise ValueError('set_barwidth must be a positive integer.')

    #%% USED FILES
    gff_file = os.path.join(file_dirname, '..', 'data_files',
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    essential_genes_files = [
        os.path.join(file_dirname, '..', 'data_files',
                     'Cerevisiae_EssentialGenes_List_1.txt'),
        os.path.join(file_dirname, '..', 'data_files',
                     'Cerevisiae_EssentialGenes_List_2.txt')
    ]
    gene_information_file = os.path.join(file_dirname, '..', 'data_files',
                                         'Yeast_Protein_Names.txt')

    #%% GET CHROMOSOME LENGTHS AND POSITIONS
    chr_length_dict, chr_start_pos_dict, chr_end_pos_dict = chromosome_position(
        gff_file)

    #%% GET ALL GENES, THE KNOWN ESSENTIAL GENES AND THE GENE ALIASES
    gene_pos_dict = gene_position(gff_file)
    # A set makes the essentiality check for every gene a constant time lookup.
    genes_essential_set = set(
        list_known_essentials(essential_genes_files, verbose=False))
    gene_alias_dict = gene_aliases(gene_information_file)[0]

    #%% DETERMINE WHICH CHROMOSOME NEEDS TO BE ANALYZED
    if isinstance(chromosome, (list, tuple)):
        chrom_list = [chrom_name.upper() for chrom_name in chromosome]
    elif isinstance(chromosome, str):
        chrom_list = [chromosome.upper()]
    else:
        chrom_list = list(chromosomename_roman_to_arabic()[1])

    #%% READ THE BED FILES ONCE (THE CONTENT IS THE SAME FOR EVERY CHROMOSOME)
    bed_data_list = []
    for bed_file in bed_files:
        with open(bed_file) as f:
            lines = f.readlines()
        chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(
            bed_file)[1:3]
        bed_data_list.append(
            (bed_file, lines, chrom_start_index_dict, chrom_end_index_dict))

    #%% LOOP OVER THE CHROMOSOMES
    for chrom in chrom_list:
        print('')
        print('Analyzing chromosome: ', chrom)
        genes_currentchrom_pos_list = [
            k for k, v in gene_pos_dict.items() if chrom in v
        ]
        chrom_length = chr_length_dict.get(chrom)

        if set_barwidth is None:
            # At least one basepair per bin, also for very short chromosomes.
            bar_width = max(1, int(chrom_length / 500))
        else:
            bar_width = int(set_barwidth)

        #%% COUNT AND BIN THE INSERTIONS OR READS FOR EACH BED FILE
        allinsertionsites_allfiles_list = []
        alltransposoncounts_allfiles_binnedlist = []
        for bed_file, lines, chrom_start_index_dict, chrom_end_index_dict in bed_data_list:
            print("Processing file: %s" % bed_file)

            allcounts_list = np.zeros(chrom_length + 2)
            for line in lines[chrom_start_index_dict.
                              get(chrom):chrom_end_index_dict.get(chrom) + 1]:
                line_split = line.strip('\n').split()
                if variable == "insertions":
                    allcounts_list[int(line_split[1])] += 1
                else:
                    # NOTE(review): the fifth column is summed as-is. If the bed-file
                    # stores reads as #reads*20+100 (as described in the docstring of
                    # transposonmapper), these are scores rather than read counts -- confirm.
                    allcounts_list[int(line_split[1])] += int(line_split[4])

            allinsertionsites_list, allcounts_binnedlist = _bin_counts(
                allcounts_list, bar_width)
            allinsertionsites_allfiles_list.append(allinsertionsites_list)
            alltransposoncounts_allfiles_binnedlist.append(
                allcounts_binnedlist)

        #%% DETERMINE DIFFERENCE BETWEEN DATASETS TRANSPOSONCOUNTS
        difference = (alltransposoncounts_allfiles_binnedlist[0] -
                      alltransposoncounts_allfiles_binnedlist[1])
        # Bins where set1 >= set2 go in the positive list, the others in the negative list.
        transposoncounts_positivedifference_list = np.clip(
            difference, 0, None)
        transposoncounts_negativedifference_list = np.clip(
            -difference, 0, None)

        #%% PLOTTING
        print('Plotting chromosome ', chrom, '...')
        print('bar width for plotting is ', bar_width)
        binsize = bar_width
        font_size = 12
        #GET MAXIMUM VALUE FOR SETTING THE Y AXIS LIMIT EQUAL FOR BOTH GRAPHS
        max_ylim = max(
            binned_counts.max()
            for binned_counts in alltransposoncounts_allfiles_binnedlist)
        max_ylim = max_ylim + 0.1 * max_ylim

        if variable == "insertions":
            ylabel = 'Absolute insertion count'
            counts_label = 'Number of transposons'
        else:
            ylabel = 'Absolute read count'
            counts_label = 'Number of reads'

        plt.figure(figsize=(19, 9))
        grid = plt.GridSpec(2, 1, wspace=0.0, hspace=0.0)

        # TOP GRAPH: FIRST DATASET
        ax1 = plt.subplot(grid[0, 0])
        _shade_genes(ax1,
                     genes_currentchrom_pos_list,
                     gene_pos_dict,
                     genes_essential_set,
                     gene_alias_dict=gene_alias_dict,
                     label_ypos=max_ylim)

        ax1.bar(allinsertionsites_allfiles_list[0],
                alltransposoncounts_allfiles_binnedlist[0],
                width=binsize,
                color=(0.2, 0.2, 0.2, 0.8))
        ax1.bar(allinsertionsites_allfiles_list[0],
                transposoncounts_positivedifference_list,
                width=binsize,
                color=(0.52, 0.71, 0.90, 0.8))

        if set_logscale == True:
            ax1.set_yscale('log')
        else:
            ax1.set_ylim(0, max_ylim)
        ax1.set_axisbelow(True)
        ax1.grid(True)
        ax1.set_ylabel(ylabel, fontsize=font_size)
        ax1.set_xlim(0, chrom_length)

        # BOTTOM GRAPH: SECOND DATASET (DRAWN UPSIDE DOWN)
        ax2 = plt.subplot(grid[1, 0])
        _shade_genes(ax2, genes_currentchrom_pos_list, gene_pos_dict,
                     genes_essential_set)

        ax2.bar(allinsertionsites_allfiles_list[1],
                alltransposoncounts_allfiles_binnedlist[1],
                width=binsize,
                color=(0.2, 0.2, 0.2, 0.8),
                label=counts_label)
        ax2.bar(allinsertionsites_allfiles_list[1],
                transposoncounts_negativedifference_list,
                width=binsize,
                color=(0.52, 0.71, 0.90, 0.8),
                label='Absolute difference datasets (set1-set2)')

        if set_logscale == True:
            ax2.set_yscale('log')
        else:
            ax2.set_ylim(0, max_ylim)
        ax2.set_axisbelow(True)
        ax2.grid(True)
        ax2.set_ylabel(ylabel, fontsize=font_size)
        ax2.set_xlabel('Basepair position on chromosome ' + chrom,
                       fontsize=font_size)
        ax2.set_xlim(0, chrom_length)
        ax2.invert_yaxis()
        ax2.legend(loc='lower left', fontsize=font_size)

        plt.tight_layout()

        if savefig == True:
            # Remove only the '.bed' suffix; str.strip(".bed") would also eat
            # any leading or trailing '.', 'b', 'e' and 'd' of the name itself.
            bed_basename = os.path.basename(bed_files[0])
            if bed_basename.endswith('.bed'):
                bed_basename = bed_basename[:-len('.bed')]
            saving_name = os.path.join(
                os.path.dirname(bed_files[0]),
                bed_basename + "_compareplot_chrom" + chrom + ".png")
            plt.savefig(saving_name)
            plt.close()
示例#5
0
def transposonmapper(bamfile=bam_arg,
                     gfffile=None,
                     essentialfiles=None,
                     genenamesfile=None):
    '''
    Map transposon insertions from an aligned SATAY dataset (Saccharomyces Cerevisiae) and write summary files.

    The following files are written next to the input bam file, using the name of the bam file with an added extension:
        - .bed-file: Includes all individual basepair locations of the whole genome where at least one transposon has been mapped and the number of insertions for each locations (the number of reads) according to the Browser Extensible Data (bed) format.
                    A distinction is made between reads that had a different reading orientation during sequencing. The number of reads are stored using the equation #reads*20+100 (e.g. 2 reads is stored as 140).
        - .wig-file: Includes all individual basepair locations of the whole genome where at least one transposon has been mapped and the number of insertions for each locations (the number of reads) according to the Wiggle (wig) format.
                    In this file no distinction is made between reads that had a different reading orientation during sequencing. The number of reads are stored as the absolute count.
        - _pergene.txt-file: Includes all genes with the total number of insertions and number of reads within the genomic region of the gene.
        - _peressential.txt-file: Includes all annotated essential genes with the total number of insertions and number of reads within the genomic region of the gene.
        - _pergene_insertions.txt-file: Includes all genes with their genomic location (i.e. chromosome number, start and end position) and the locations of all insertions within the gene location. It also includes the number of reads per insertion.
        - _peressential_insertions.txt-file: Includes all essential genes with their genomic location (i.e. chromosome number, start and end position) and the locations of all insertions within the gene location. It also includes the number of reads per insertion.
    In the two '_pergene'/'_peressential' count files the insertion with the highest read count within each gene is left out of the read total (noise reduction).

    The function assumes that the reads are already aligned to a reference genome.
    The input data should be a .bam-file and the location where the .bam-file is stored should also contain an index file (.bam.bai-file, which for example can be created using sambamba).
    This function takes the following inputs:
        - bamfile [required]: Path to the bamfile. This location should also contain the .bam.bai index file (does not need to be input in this function).
        - gfffile [optional]: Path to a .gff-file including all gene information (e.g. downloaded from SGD). Default file is 'Saccharomyces_cerevisiae.R64-1-1.99.gff3'.
        - essentialfiles [optional]: Path to a .txt file containing a list all essential genes. Every line should consist of a single essential gene and the file should have one header line. Ideally this file is created using 'Create_EssentialGenes_list.py'. Default file is 'Cerevisiae_AllEssentialGenes_List.txt'.
        - genenamesfile [optional]: Path to text file that includes aliases for all genes. Default file is 'Yeast_Protein_Names.txt'.
    When the arguments for the optional files are not given, the files are used that are stored at the following location:
        "path_current_pythonscript/../data_files"
    When an optional file argument is given, that file is the one that is read.

    An AssertionError is raised when the bam file or one of the additional files does not exist.
    The function does not return anything; all results are written to disk.

    The function uses the pysam package for handling bam files (see pysam.readthedocs.io/en/latest/index.html) and therefore this function only runs on Linux systems with SAMTools installed.
    '''

    #%% LOADING BAM FILE
    if bamfile is None:
        path = os.path.join('/home', 'gregoryvanbeek', 'Documents',
                            'data_processing')
        filename = 'SRR062634.filt_trimmed.sorted.bam'
        bamfile = os.path.join(path, filename)
    else:
        filename = os.path.basename(bamfile)

    assert os.path.isfile(
        bamfile
    ), 'Bam file not found at: %s' % bamfile  #check if given bam file exists

    #%% LOADING ADDITIONAL FILES
    files_path = os.path.join(dirname, '..', 'data_files')

    #LOADING GFF-FILE
    if gfffile is None:
        gfffile = os.path.join(files_path,
                               'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    assert os.path.isfile(gfffile), 'Path to GFF-file does not exist.'

    #LOADING TEXT FILES WITH ESSENTIAL GENES
    if essentialfiles is None:
        essentialfiles = os.path.join(files_path,
                                      'Cerevisiae_AllEssentialGenes_List.txt')
    assert os.path.isfile(
        essentialfiles), 'Following path does not exist: %s' % essentialfiles

    #LOADING TEXT FILE WITH GENE NAME ALIASES
    if genenamesfile is None:
        genenamesfile = os.path.join(files_path, 'Yeast_Protein_Names.txt')
    assert os.path.isfile(
        genenamesfile), 'Following path does not exist: %s' % genenamesfile

    #%% READ BAM FILE
    bam = pysam.AlignmentFile(bamfile,
                              'rb')  #open bam formatted file for reading

    #%% GET NAMES OF ALL CHROMOSOMES AS STORED IN THE BAM FILE
    ref_tid_dict = {}  # 'I' | 0, 'II' | 1, ...
    ref_name_list = []  # 'I', 'II', ...
    for i in range(bam.nreferences):
        ref_name = bam.get_reference_name(i)
        ref_tid_dict[ref_name] = bam.get_tid(ref_name)
        ref_name_list.append(ref_name)

    #%% CONVERT CHROMOSOME NAMES IN DATA FILE TO ROMAN NUMERALS
    # Keys are whatever chromosomename_roman_to_arabic()[0] stores for number tid+1
    # (presumably the roman numeral - confirm in that helper); values are the
    # reference names used in the bam file.
    ref_romannums = chromosomename_roman_to_arabic()[0]
    ref_tid_roman_dict = {}
    for key, val in ref_tid_dict.items():
        ref_tid_roman_dict[ref_romannums[int(val) + 1]] = key

    #%% GET SEQUENCE LENGTHS OF ALL CHROMOSOMES
    chr_length_dict = {}  # 'I' | 230218, 'II' | 813184, ...
    chr_summedlength_dict = {}  # 'I' | 0, 'II' | 230218, 'III' |  1043402, ...
    ref_summedlength = 0
    for key in ref_tid_dict:
        ref_length = bam.get_reference_length(key)
        chr_length_dict[key] = ref_length
        chr_summedlength_dict[key] = ref_summedlength
        ref_summedlength += ref_length

    #%% GET NUMBER OF MAPPED, UNMAPPED AND TOTAL AMOUNT OF READS PER CHROMOSOME
    stats = bam.get_index_statistics()
    chr_mappedreads_dict = {}  # 'I' | [mapped, unmapped, total reads]
    for stat in stats:
        chr_mappedreads_dict[stat[0]] = [stat[1], stat[2], stat[3]]
        if stat[2] != 0:
            warnings.warn('Unmapped reads found in chromosome ' + stat[0])

    #%% GET ALL READS WITHIN A SPECIFIED GENOMIC REGION
    tnnumber_dict = {}
    # Insertions are collected in plain lists and converted to arrays once after
    # the loop; growing a numpy array row by row copies the array every time.
    tncoordinates_list = []  # [chromosome number (tid+1), position, orientation]
    readnumb_list = []  # number of reads for each entry in tncoordinates_list
    for kk in ref_name_list:
        timer_start = timeit.default_timer()
        read_counter = 0

        N_reads_kk = chr_mappedreads_dict[kk][2]
        start_array = np.empty(shape=(N_reads_kk), dtype=int)
        flag_array = np.empty(shape=(N_reads_kk), dtype=int)
        readlength_array = np.empty(shape=(N_reads_kk), dtype=int)

        #RETREIVING ALL THE READS FROM THE CURRENT CHROMOSOME.
        print('Getting reads for chromosome %s ...' % kk)
        for reads in bam.fetch(kk, 0, chr_length_dict[kk], until_eof=True):
            read = str(reads).split('\t')

            start_array[read_counter] = int(read[3]) + 1

            #GET FLAG FOR EACH READ. IF READ ON FORWARD STRAND, ASSIGN VALUE 1, IF READ ON REVERSE STRAND ASSIGN VALUE -1, IF READ UNMAPPED OR SECONDARY ALIGNMENT ASSIGN VALUE 0
            samprop = samflags(flag=int(read[1]), verbose=False)[1]
            if 'read reverse strand' in samprop:
                flag_array[read_counter] = -1
            else:
                flag_array[read_counter] = 1
            if 'not primary alignment' in samprop or 'read unmapped' in samprop:
                flag_array[read_counter] = 0

            #LENGTH OF THE READ ON THE REFERENCE: SUM OF CIGAR OPERATIONS 0 AND 2
            match_length = 0
            if reads.cigartuples is not None:
                for cigar_type, cigar_length in reads.cigartuples:
                    if cigar_type == 0 or cigar_type == 2:
                        match_length += cigar_length

            readlength_array[read_counter] = match_length

            read_counter += 1

        # The buffers were sized from the index statistics and created with
        # np.empty; drop any slots that were not filled so that uninitialised
        # values can never be interpreted as reads.
        start_array = start_array[:read_counter]
        flag_array = flag_array[:read_counter]
        readlength_array = readlength_array[:read_counter]

        #CORRECT STARTING POSITION FOR READS WITH REVERSED ORIENTATION
        flag0coor_array = np.where(
            flag_array == 1)  #coordinates reads 5' -> 3'
        flag16coor_array = np.where(
            flag_array == -1)  # coordinates reads 3' -> 5'

        startdirect_array = start_array[flag0coor_array]
        flagdirect_array = flag_array[flag0coor_array]

        startindirect_array = start_array[flag16coor_array] + readlength_array[
            flag16coor_array]
        flagindirect_array = flag_array[flag16coor_array]

        start2_array = np.concatenate((startdirect_array, startindirect_array),
                                      axis=0)
        flag2_array = np.concatenate((flagdirect_array, flagindirect_array),
                                     axis=0)

        del (flag0coor_array, flag16coor_array, startdirect_array,
             flagdirect_array, startindirect_array, flagindirect_array)

        start2_sortindices = start2_array.argsort(
            kind='mergesort')  #use mergesort for stable sorting
        start2_array = start2_array[start2_sortindices]
        flag2_array = flag2_array[start2_sortindices]

        del start2_sortindices

        #CREATE LIST OF START POSITION AND FLAGS OF ALL READS IN GENOME
        ref_tid_kk = int(ref_tid_dict[kk] + 1)

        mm = 0  # Number of extra reads merged into the insertion that is currently being built
        jj = 1  # Number of insertions counted in current chromosome
        n_reads = len(start2_array)
        for ii in range(1, n_reads):
            if abs(start2_array[ii] - start2_array[ii - 1]
                   ) <= 2 and flag2_array[ii] == flag2_array[ii - 1]:
                #If two subsequent reads are within two basepairs and have the same orientation, add them together.
                mm += 1
            else:
                #Read ii starts a new insertion; store the one that ended at read ii-1.
                avg_start_pos = abs(
                    round(np.mean(start2_array[ii - mm - 1:ii])))
                tncoordinates_list.append([
                    ref_tid_kk,
                    int(avg_start_pos),
                    int(flag2_array[ii - 1])
                ])
                readnumb_list.append(mm + 1)
                mm = 0
                jj += 1

            if ii == n_reads - 1:  #include last read
                #The still open insertion spans reads ii-mm up to and including ii.
                avg_start_pos = abs(
                    round(np.mean(start2_array[ii - mm:ii + 1])))
                tncoordinates_list.append([
                    ref_tid_kk,
                    int(avg_start_pos),
                    int(flag2_array[ii])
                ])
                readnumb_list.append(mm + 1)

        tnnumber_dict[kk] = jj

        del (start_array, flag_array, readlength_array, flag2_array,
             start2_array)

        timer_end = timeit.default_timer()
        print('Chromosome %s completed in %.3f seconds' %
              (kk, (timer_end - timer_start)))
        print('')

    bam.close()  #all information from the bam file has been collected

    tncoordinates_array = np.array(tncoordinates_list,
                                   dtype=int).reshape(-1, 3)
    readnumb_array = np.array(readnumb_list, dtype=int)
    del (tncoordinates_list, readnumb_list)

    tncoordinatescopy_array = np.array(tncoordinates_array, copy=True)

    #%% GET LIST OF ALL GENES AND ALL ESSENTIAL GENES
    print('Getting coordinates of all genes ...')

    # GET POSITION GENES
    genecoordinates_dict = gene_position(
        gfffile)  #'YAL069W' | ['I', 335, 649], ...

    # GET ALL ANNOTATED ESSENTIAL GENES
    essentialcoordinates_dict = {}
    with open(essentialfiles, 'r') as f:
        genes = f.readlines()[1:]  #skip the single header line
        for gene in genes:
            name = gene.strip('\n')
            #copy, so that the offsets added below are applied independently of genecoordinates_dict
            essentialcoordinates_dict[name] = genecoordinates_dict.get(
                name).copy()

    # GET ALIASES OF ALL GENES
    aliases_designation_dict = gene_aliases(genenamesfile)[
        0]  #'YMR056C' \ ['AAC1'], ...

    #%% CONCATENATE ALL CHROMOSOMES

    #FOR EACH INSERTION LOCATION, ADD THE LENGTH OF ALL PREVIOUS CHROMOSOMES.
    ll = 0
    for ii in range(1, len(ref_name_list)):
        ll += chr_length_dict[ref_name_list[ii - 1]]
        aa = np.where(tncoordinatescopy_array[:, 0] == ii + 1)
        tncoordinatescopy_array[aa, 1] = tncoordinatescopy_array[aa, 1] + ll

    #FOR EACH GENE LOCATION, ADD THE LENGTH OF ALL PREVIOUS CHROMOSOMES.
    for key in genecoordinates_dict:
        gene_chrom = ref_tid_roman_dict.get(genecoordinates_dict.get(key)[0])
        genecoordinates_dict[key][1] = genecoordinates_dict.get(
            key)[1] + chr_summedlength_dict.get(gene_chrom)
        genecoordinates_dict[key][2] = genecoordinates_dict.get(
            key)[2] + chr_summedlength_dict.get(gene_chrom)

    #FOR EACH ESSENTIAL GENE LOCATION, ADD THE LENGTH OF ALL PREVIOUS CHROMOSOMES.
    for key in essentialcoordinates_dict:
        gene_chrom = ref_tid_roman_dict.get(
            essentialcoordinates_dict.get(key)[0])
        essentialcoordinates_dict[key][1] = essentialcoordinates_dict.get(
            key)[1] + chr_summedlength_dict.get(gene_chrom)
        essentialcoordinates_dict[key][2] = essentialcoordinates_dict.get(
            key)[2] + chr_summedlength_dict.get(gene_chrom)

    #%% GET NUMBER OF TRANSPOSONS AND READS PER GENE
    print('Get number of insertions and reads per gene ...')

    #ALL GENES
    tnpergene_dict = {}
    readpergene_dict = {}
    tncoordinates_pergene_dict = {}
    for gene in genecoordinates_dict:
        xx = np.where(
            np.logical_and(
                tncoordinatescopy_array[:, 1] >=
                genecoordinates_dict.get(gene)[1],
                tncoordinatescopy_array[:, 1] <= genecoordinates_dict.get(gene)
                [2]))  #get all insertions within range of current gene
        tnpergene_dict[gene] = np.size(xx)
        readpergene_dict[gene] = sum(readnumb_array[xx]) - max(
            readnumb_array[xx],
            default=0)  #REMOVE LARGEST VALUE TO REDUCE NOISE

        if np.size(xx) > 0:
            tncoordinates_pergene_dict[gene] = [
                genecoordinates_dict.get(gene)[0],
                genecoordinates_dict.get(gene)[1],
                genecoordinates_dict.get(gene)[2],
                list(tncoordinatescopy_array[xx[0][0]:xx[0][-1] + 1, 1]),
                list(readnumb_array[xx])
            ]
        else:
            tncoordinates_pergene_dict[gene] = [
                genecoordinates_dict.get(gene)[0],
                genecoordinates_dict.get(gene)[1],
                genecoordinates_dict.get(gene)[2], [], []
            ]

    #ONLY ESSENTIAL GENES
    tnperessential_dict = {}
    readperessential_dict = {}
    tncoordinates_peressential_dict = {}
    for gene in essentialcoordinates_dict:
        xx = np.where(
            np.logical_and(
                tncoordinatescopy_array[:, 1] >=
                essentialcoordinates_dict.get(gene)[1],
                tncoordinatescopy_array[:, 1] <=
                essentialcoordinates_dict.get(gene)[2]))
        tnperessential_dict[gene] = np.size(xx)
        readperessential_dict[gene] = sum(readnumb_array[xx]) - max(
            readnumb_array[xx], default=0)

        if np.size(xx) > 0:
            tncoordinates_peressential_dict[gene] = [
                essentialcoordinates_dict.get(gene)[0],
                essentialcoordinates_dict.get(gene)[1],
                essentialcoordinates_dict.get(gene)[2],
                list(tncoordinatescopy_array[xx[0][0]:xx[0][-1] + 1, 1]),
                list(readnumb_array[xx])
            ]
        else:
            tncoordinates_peressential_dict[gene] = [
                essentialcoordinates_dict.get(gene)[0],
                essentialcoordinates_dict.get(gene)[1],
                essentialcoordinates_dict.get(gene)[2], [], []
            ]

    #%% CREATE BED FILE
    bedfile = bamfile + '.bed'
    print('Writing bed file at: ', bedfile)
    print('')

    with open(bedfile, 'w') as f:

        f.write('track name=' + filename + ' useScore=1\n')

        coordinates_counter = 0
        for tn in tncoordinates_array:
            #tn[0] holds tid+1; look up the reference name that belongs to this tid
            refname = [
                key for key, val in ref_tid_dict.items() if val == tn[0] - 1
            ][0]
            if refname == 'Mito':
                refname = 'M'
            f.write('chr' + refname + ' ' + str(tn[1]) + ' ' + str(tn[1] + 1) +
                    ' . ' +
                    str(100 + readnumb_array[coordinates_counter] * 20) + '\n')
            coordinates_counter += 1

    #%% CREATE TEXT FILE WITH TRANSPOSONS AND READS PER GENE
    # NOTE THAT THE TRANSPOSON WITH THE HIGHEST READ COUNT IS IGNORED.
    # E.G. IF THIS FILE IS COMPARED WITH THE _PERGENE_INSERTIONS.TXT FILE THE READS DON'T ADD UP (SEE https://groups.google.com/forum/#!category-topic/satayusers/bioinformatics/uaTpKsmgU6Q)
    # TO REMOVE THIS HACK, CHANGE THE INITIALIZATION OF THE VARIABLE readpergene
    pergenefile = bamfile + '_pergene.txt'
    print('Writing pergene.txt file at: ', pergenefile)
    print('')

    with open(pergenefile, 'w') as f:

        f.write(
            'Gene name\tNumber of transposons per gene\tNumber of reads per gene\n'
        )

        for gene in tnpergene_dict:
            tnpergene = tnpergene_dict[gene]
            readpergene = readpergene_dict[gene]
            if gene in aliases_designation_dict:
                gene_alias = aliases_designation_dict.get(gene)[0]
            else:
                gene_alias = gene
            f.write(gene_alias + '\t' + str(tnpergene) + '\t' +
                    str(readpergene) + '\n')

    #%% CREATE TEXT FILE TRANSPOSONS AND READS PER ESSENTIAL GENE
    peressentialfile = bamfile + '_peressential.txt'
    print('Writing peressential.txt file at: ', peressentialfile)
    print('')

    with open(peressentialfile, 'w') as f:

        f.write(
            'Gene name\tNumber of transposons per gene\tNumber of reads per gene\n'
        )

        for essential in tnperessential_dict:
            tnperessential = tnperessential_dict[essential]
            readperessential = readperessential_dict[essential]
            if essential in aliases_designation_dict:
                essential_alias = aliases_designation_dict.get(essential)[0]
            else:
                essential_alias = essential
            f.write(essential_alias + '\t' + str(tnperessential) + '\t' +
                    str(readperessential) + '\n')

    #%% CREATE TEXT FILE WITH LOCATION OF INSERTIONS AND READS PER GENE
    pergeneinsertionsfile = bamfile + '_pergene_insertions.txt'
    print('Witing pergene_insertions.txt file at: ', pergeneinsertionsfile)
    print('')

    with open(pergeneinsertionsfile, 'w') as f:

        f.write(
            'Gene name\tChromosome\tStart location\tEnd location\tInsertion locations\tReads per insertion location\n'
        )

        for gene in tncoordinates_pergene_dict:
            gene_chrom = ref_tid_roman_dict.get(
                genecoordinates_dict.get(gene)[0])
            #subtract the summed length of the previous chromosomes again before writing
            tncoordinates = [
                ins - chr_summedlength_dict.get(gene_chrom)
                for ins in tncoordinates_pergene_dict[gene][3]
            ]

            if gene in aliases_designation_dict:
                gene_alias = aliases_designation_dict.get(gene)[0]
            else:
                gene_alias = gene

            f.write(gene_alias + '\t' +
                    str(tncoordinates_pergene_dict[gene][0]) + '\t' +
                    str(tncoordinates_pergene_dict[gene][1] -
                        chr_summedlength_dict.get(gene_chrom)) + '\t' +
                    str(tncoordinates_pergene_dict[gene][2] -
                        chr_summedlength_dict.get(gene_chrom)) + '\t' +
                    str(tncoordinates) + '\t' +
                    str(tncoordinates_pergene_dict[gene][4]) + '\n')

    #%% CREATE TEXT FILE WITH LOCATION OF INSERTIONS AND READS PER ESSENTIAL GENE
    peressentialinsertionsfile = bamfile + '_peressential_insertions.txt'
    print('Writing peressential_insertions.txt file at: ',
          peressentialinsertionsfile)
    print('')

    with open(peressentialinsertionsfile, 'w') as f:

        f.write(
            'Essential gene name\tChromosome\tStart location\tEnd location\tInsertion locations\tReads per insertion location\n'
        )

        for essential in tncoordinates_peressential_dict:
            gene_chrom = ref_tid_roman_dict.get(
                genecoordinates_dict.get(essential)[0])
            tncoordinates = [
                ins - chr_summedlength_dict.get(gene_chrom)
                for ins in tncoordinates_peressential_dict[essential][3]
            ]

            if essential in aliases_designation_dict:
                essential_alias = aliases_designation_dict.get(essential)[0]
            else:
                essential_alias = essential

            f.write(essential_alias + '\t' +
                    str(tncoordinates_peressential_dict[essential][0]) + '\t' +
                    str(tncoordinates_peressential_dict[essential][1] -
                        chr_summedlength_dict.get(gene_chrom)) + '\t' +
                    str(tncoordinates_peressential_dict[essential][2] -
                        chr_summedlength_dict.get(gene_chrom)) + '\t' +
                    str(tncoordinates) + '\t' +
                    str(tncoordinates_peressential_dict[essential][4]) + '\n')

    #%% ADD INSERTIONS AT SAME LOCATION BUT WITH DIFFERENT ORIENTATIONS TOGETHER (FOR STORING IN WIG-FILE)
    wigfile = bamfile + '.wig'
    print('Writing wig file at: ', wigfile)
    print('')

    readnumbwig_array = readnumb_array.copy()

    #Row indices (in tncoordinates_array) of the first occurrence of every location within each chromosome.
    unique_index_array = np.array([], dtype=int)
    ll = 0  #number of insertions in all chromosomes handled so far
    for kk in ref_name_list:
        index = np.where(tncoordinates_array[:, 0] == int(
            ref_tid_dict[kk] + 1))  #get indices for current chromosome.
        unique_index = np.unique(
            tncoordinates_array[index][:, 1], return_index=True
        )[1]  #get all insertion locations (in tncoordinates, all rows, column 1)

        unique_index_array = np.append(unique_index_array, (unique_index + ll),
                                       axis=0)

        ll += len(index[0])

    #Every row that is not a first occurrence repeats the location of the row directly before it.
    duplicate_array = np.setdiff1d(np.arange(len(tncoordinates_array)),
                                   unique_index_array)

    #SUM READNUMB VALUES AT INDEX IN DUPLICATE_ARRAY AND DUPLICATE_ARRAY-1
    #Walk from high to low index so that a run of adjacent duplicates is fully accumulated in the row that is kept.
    for ii in duplicate_array[::-1]:
        readnumbwig_array[ii -
                          1] = readnumbwig_array[ii -
                                                 1] + readnumbwig_array[ii]

    tncoordinateswig_duplicatesremoved_array = np.delete(tncoordinates_array,
                                                         duplicate_array,
                                                         axis=0)
    readnumbwig_duplicatesremoved_array = np.delete(readnumbwig_array,
                                                    duplicate_array,
                                                    axis=0)

    del (unique_index_array, duplicate_array, readnumbwig_array)

    #%% CREATING WIG FILE
    with open(wigfile, 'w') as f:
        f.write('track type=wiggle_0 ,maxheightPixels=60 name=' + filename +
                '\n')
        for kk in ref_name_list:
            f.write('VariableStep chrom=chr' + kk + '\n')

            index = np.where(tncoordinateswig_duplicatesremoved_array[:, 0] ==
                             int(ref_tid_dict[kk] +
                                 1))  #get indices for current chromosome.
            for ii in index[0]:
                f.write(
                    str(tncoordinateswig_duplicatesremoved_array[ii][1]) +
                    ' ' + str(readnumbwig_duplicatesremoved_array[ii]) + '\n')
def tninserts_analysis():
    '''
    IMPORTANT VARIABLES IN THIS FUNCTION:

    gene_position_dict: position of all genes
    gene_inserts_dict: insertion locations of transposons for all genes
    gene_reads_dict: number of reads for all tn inserts for all genes

    essential_position_dict: only for essential genes.
    essential_inserts_dict: only for essential genes.
    essential_reads_dict: only for essential genes.

    nonessential_position_dict: only for nonessential genes.
    nonessential_inserts_dict: only for nonessential genes.
    nonessential_reads_dict: only for nonessential genes.
    
    df: dataframe to store all information for analysis
    '''
    #%% READ FILE AND PUT ALL VALUES IN DICTIONARIES. DO NOT CHANGE THIS SECTION.
    #    filepath = r"C:\Users\gregoryvanbeek\Documents\testing_site\wt1_testfolder\align_out"
    #    filename = "ERR1533148_trimmed.sorted.bam_pergene_insertions.txt"
    filepath = r"C:\Users\gregoryvanbeek\Documents\testing_site\wt2_testfolder\WT2_dataset_analysis_temp202008051429_new2"
    filename = r"E-MTAB-4885.WT2.bam_pergene_insertions.txt"
    datafile = os.path.join(filepath, filename)

    with open(datafile) as f:
        lines = f.readlines()

    gene_position_dict = {}
    gene_inserts_dict = {}
    gene_reads_dict = {}

    gene_inserts_distance_dict = {}  #distance between subsequent inserts
    gene_inserts_trunc_dict = {
    }  #inserts in the gene where 10% of the edges is truncated (so, only the center part of the gene is considered).
    gene_reads_trunc_dict = {
    }  #reads in the gene where 10% of the edges is truncated (so, only the center part of the gene is considered).
    for line in lines[1:]:
        line_split = line.strip('\n').split('\t')
        genename = line_split[0]
        gene_chrom = line_split[1]
        gene_start = int(line_split[2])
        gene_end = int(line_split[3])

        gene_position_dict[genename] = [gene_chrom, gene_start, gene_end]

        geneinserts_str = line_split[4].strip('[]')
        if not geneinserts_str == '':
            geneinserts_list = [int(ins) for ins in geneinserts_str.split(',')]
        else:
            geneinserts_list = []
        gene_inserts_dict[genename] = geneinserts_list

        ins_list = []
        ins_indx_list = []
        for ins in geneinserts_list:  #GET INSERTIONS THAT ARE MORE THAN 10% OF LENGTH GENE AWAY FROM THE EDGES OF THE GENE.
            l = gene_end - gene_start
            if (gene_start + 0.1 * l) < ins < (gene_end - 0.1 * l):
                ins_list.append(ins)
                ins_indx_list.append(geneinserts_list.index(ins))
        gene_inserts_trunc_dict[genename] = ins_list

        if not len(geneinserts_list) < 2:
            d = []
            for i in range(1, len(
                    geneinserts_list)):  #DISTANCES BETWEEN SUBSEQUENT INSERTS
                d.append(geneinserts_list[i] - geneinserts_list[i - 1])
            gene_inserts_distance_dict[genename] = d
        elif len(geneinserts_list) == 1:
            gene_inserts_distance_dict[genename] = [0]  #[0] #only one insert
        else:
            gene_inserts_distance_dict[genename] = [0]  #[-1] #no insert

        genereads_str = line_split[5].strip('[]')
        if not genereads_str == '':
            genereads_list = [int(read) for read in genereads_str.split(',')]
        else:
            genereads_list = []
        gene_reads_dict[genename] = genereads_list
        gene_reads_trunc_dict[genename] = [
            genereads_list[i] for i in ins_indx_list
        ]

        if len(geneinserts_list) != len(genereads_list):
            print(
                'WARNING: %s has different number of reads compared with the number of inserts'
                % genename)

    del (datafile, lines, line, line_split, genename, gene_chrom, gene_start,
         gene_end, geneinserts_str, geneinserts_list, genereads_str,
         genereads_list, i, d, ins, ins_list, l)
    #remains: gene_inserts_dict, gene_position_dict, gene_reads_dict

    #%% GET ANNOTATED ESSENTIAL GENES. DO NOT CHANGE THIS SECTION.
    essentialsfile = r"C:\Users\gregoryvanbeek\Documents\GitHub\LaanLab-SATAY-DataAnalysis\Python_scripts\Data_Files\Cerevisiae_AllEssentialGenes_List.txt"

    with open(essentialsfile) as f:
        lines = f.readlines()

    aliases_dict = gene_aliases(
        r"C:\Users\gregoryvanbeek\Documents\GitHub\LaanLab-SATAY-DataAnalysis\Python_scripts\Data_Files\Yeast_Protein_Names.txt"
    )[0]

    essential_position_dict = {}
    essential_inserts_dict = {}
    essential_reads_dict = {}

    nonessential_position_dict = copy.deepcopy(gene_position_dict)
    nonessential_inserts_dict = copy.deepcopy(gene_inserts_dict)
    nonessential_reads_dict = copy.deepcopy(gene_reads_dict)

    for line in lines[1:]:
        essential = line.strip('\n')

        essentiality = 'nonessential'

        if essential in gene_position_dict:
            essentiality = 'essential'
            alias = essential
        else:
            for alias in aliases_dict.get(essential):
                if alias in gene_position_dict:
                    essentiality = 'essential'
                    break

        if essentiality == 'essential':
            essential_position_dict[alias] = gene_position_dict.get(alias)
            essential_inserts_dict[alias] = gene_inserts_dict.get(alias)
            essential_reads_dict[alias] = gene_reads_dict.get(alias)

            del nonessential_position_dict[alias]
            del nonessential_inserts_dict[alias]
            del nonessential_reads_dict[alias]

    del (essentialsfile, lines, line, aliases_dict, essential, essentiality,
         alias)
    #remain: essential_position_dict, essential_inserts_dict, essential_reads_dict, nonessential_position_dict, nonessential_inserts_dict, nonessential_reads_dict

    #%% CREATE DATAFRAME FOR ALL GENES. ADD STATISTICS HERE
    genename_list = []
    essentiality_list = []
    N_inserts_list = []
    N_inserts_trunc_list = []
    N_reads_trunc_list = []
    distance_max_inserts_list = []
    N_reads_list = []
    for gene in gene_position_dict:
        genename_list.append(gene)  #GENENAME LIST

        if gene in essential_position_dict:  #ESSENTIALITY_LIST
            essentiality_list.append(True)
        elif gene in nonessential_position_dict:
            essentiality_list.append(False)
        else:
            print('WARNING: %s not found.' % gene)

        N_inserts_list.append(len(gene_inserts_dict.get(
            gene)))  #N_INSERTS_LIST (NUMBER OF INSERTIONS)

        N_inserts_trunc_list.append(
            len(gene_inserts_trunc_dict.get(gene))
        )  #N_INSERTS_CENTER_LIST (NUMBER OF INSERTIONS IN THE GENE WHERE 10% OF THE GENE LENGTH IS TRUNCATED)

        N_reads_trunc_list.append(sum(gene_reads_trunc_dict.get(gene)))

        distance_max_inserts_list.append(
            np.nanmax(gene_inserts_distance_dict.get(gene)) /
            (gene_position_dict.get(gene)[2] - gene_position_dict.get(gene)[1])
        )  #DISTANCE_MAX_INSERTS_LIST (LARGEST DISTANCE BETWEEN SUBSEQUENT INSERTIONS NORMALIZED TO GENE LENGTH)

        N_reads_list.append(sum(gene_reads_dict.get(
            gene)))  #N_READS_LIST (TOTAL NUMBER OF READS IN GENE)

    allgenes = {
        'Gene_Name': genename_list,
        'Essentiality': essentiality_list,
        'Number_Insertions_Full_Gene': N_inserts_list,
        'Number_Insertions_Truncated_Gene': N_inserts_trunc_list,
        'Max_Insertion_Distance': distance_max_inserts_list,
        'Number_Reads_Full_Gene': N_reads_list,
        'Number_Reads_Truncated_Gene': N_reads_trunc_list
    }

    df = pd.DataFrame(allgenes,
                      columns=[column_name for column_name in allgenes])

    del (gene, genename_list, essentiality_list, N_inserts_list,
         N_inserts_trunc_list, distance_max_inserts_list, N_reads_list,
         N_reads_trunc_list, allgenes)

    #%%TEST GRAPH
    sns.set(style="whitegrid")

    #POTENTIALLY USEFUL; NUMBER OF INSERTIONS IN THE ENTIRE GENE.
    sns.boxplot(x='Essentiality', y='Number_Insertions_Full_Gene', data=df)

    #USEFUL; NUMBER OF INSERTIONS IN THE MIDDLE 80% OF THE GENE (I.E. INSERTIONS IN THE FIRST AND LAST 10% OF THE LENGTH OF THE GENE ARE NOT CONSIDERED)
    sns.violinplot(x='Essentiality',
                   y='Number_Insertions_Truncated_Gene',
                   data=df,
                   cut=0)
    sns.boxplot(x='Essentiality',
                y='Number_Insertions_Truncated_Gene',
                data=df)

    #NOT USEFUL (?); LARGEST DISTANCE BETWEEN SUBSEQUENT INSERTIONS FOR EACH GENE. Q: WHAT TO DO WITH CASES WHERE THERE IS ONLY A SINGLE OF NO INSERTIONS? -> IF THOSE SITUATIONS SET TO 0 IT DOES GIVE A CLEAR DISTINCTION BETWEEN ESSENTIALITY.
    ax = sns.stripplot(x='Essentiality',
                       y='Max_Insertion_Distance',
                       data=df,
                       alpha=0.23,
                       palette='coolwarm')
    sns.violinplot(x='Essentiality',
                   y='Max_Insertion_Distance',
                   data=df,
                   cut=0,
                   palette=['white'])
    df_select = df[df['Number_Insertions_Full_Gene'] > 1]
    sns.barplot(x='Essentiality', y='Max_Insertion_Distance', data=df_select)
    del (df_select, ax)

    #POTENTIALLY USEFUL;
    df_select = df[df['Number_Reads_Full_Gene'] < 10000]
    sns.boxplot(x='Essentiality', y='Number_Reads_Full_Gene', data=df_select)
    sns.boxplot(x='Essentiality',
                y='Number_Reads_Truncated_Gene',
                data=df_select)
    #    print('Number of outliers for essential genes is %i' % len(boxplot_stats(df.Number_Reads_Truncated_Gene).pop(0)['fliers']))
    del df_select
示例#7
0
def _resolve_gene_position(gene_name, gff_file, gene_information_file):
    '''Return the position entry of gene_name, trying its aliases when the name itself is unknown.

    gene_name has to be upper case already. The position entry is whatever
    gene_position() stores for a gene; the caller reads it as
    [chromosome, start, end].

    Raises ValueError when neither the name nor any of its aliases has a
    position entry.
    '''
    gene_pos_dict = gene_position(
        gff_file)  #GET POSITION INFORMATION OF ALL GENES

    gene_pos = gene_pos_dict.get(gene_name)
    if gene_pos is not None:
        return gene_pos

    #GENE_NAME IS NO KEY IN GENE_POS_DICT. FIND THE ALIAS ENTRY THAT LISTS IT.
    gene_alias_dict = gene_aliases(gene_information_file)[0]
    gene_alias_key = [k for k, v in gene_alias_dict.items() if gene_name in v]
    if not gene_alias_key:
        raise ValueError('Gene %s is not found in %s or %s.' %
                         (gene_name, gff_file, gene_information_file))
    alias_key = gene_alias_key[0]
    print('gene_alias_key ', alias_key)

    gene_pos = gene_pos_dict.get(alias_key)
    if gene_pos is not None:
        print('The alias ', alias_key, ' is used for the gene ', gene_name)
        return gene_pos

    #THE KEY ITSELF HAS NO POSITION EITHER, TRY ALL OTHER ALIASES STORED UNDER THAT KEY.
    #NO BREAK ON PURPOSE: WHEN SEVERAL ALIASES HAVE A POSITION THE LAST ONE IS USED.
    for gene_alias in gene_alias_dict.get(alias_key):
        alias_pos = gene_pos_dict.get(gene_alias)
        if alias_pos is not None:
            gene_pos = alias_pos
            print('The alias ', gene_alias, ' is used for the gene ',
                  gene_name)

    if gene_pos is None:
        raise ValueError('No position found for gene %s or any of its aliases.'
                         % gene_name)
    return gene_pos


def _merge_duplicate_insertions(insertion_list, read_list):
    '''Collapse insertions that share a position into one entry with the summed reads.

    A position that occurs more than once is kept at the place of its LAST
    occurrence; the reads of the earlier occurrences are added to it.
    Returns two new lists (positions, summed reads). The input lists are not
    modified and do not have to be sorted.
    '''
    #LATER OCCURRENCES OVERWRITE EARLIER ONES, SO THIS HOLDS THE LAST INDEX OF EVERY POSITION
    last_index = {
        position: index
        for index, position in enumerate(insertion_list)
    }

    earlier_reads = {}  #POSITION -> READS OF ALL BUT THE LAST OCCURRENCE
    merged_insertions = []
    merged_reads = []
    for index, (position, reads) in enumerate(zip(insertion_list, read_list)):
        if index != last_index[position]:
            earlier_reads.setdefault(position, []).append(reads)
            continue
        for previous_reads in earlier_reads.get(position, ()):
            reads += previous_reads  #ADD UP THE READS TO THE LAST DUPLICATE
        merged_insertions.append(position)
        merged_reads.append(reads)

    return merged_insertions, merged_reads


def hit_free_region(gene_name='None', region=None, bed_file=None):
    '''Collect the transposon insertions in a gene or genomic region and determine the largest insertion free stretch.

    Input is a region and a .bed file (according to the original notes the
    output of the Matlab code from the Kornmann-lab).
    The region can be defined either as a gene name (e.g. 'bem1') or as a list consisting of three elements where the first element is the chromosome name, the start and end position respectively (e.g. ['I',1,4000]).
    If a gene name is input, the script searches in a .gff file (downloaded from yeastgenome.org) and, if the name is not found there, in the aliases from Yeast_Protein_Names.txt.
    The names 'holocus' and 'ho-locus' (any case) select a hard coded region on chromosome IV.
    A gene name takes precedence over a region.

    Parameters:
        - gene_name: name of the gene. The string 'None' (default) or None means no gene is given.
        - region: list [chromosome, start, end], only used when no gene name is given.
        - bed_file: path to the .bed file.

    Returns a tuple with:
        - insertion_list: insertion positions within the region. Insertions at the same position are merged into one entry.
        - read_list: number of reads for each entry of insertion_list (summed over merged insertions).
        - max_empty_region: largest number of basepairs without insertion, including the stretches between the region boundaries and the first and last insertion. Equals the region length when there are no insertions.
        - bp_between_tn_insertions: all insertion free stretches that max_empty_region is taken from (empty list when there are no insertions).

    Raises ValueError when no bed file is given, when neither a gene name nor a region is given, when the gene cannot be found or when the chromosome is not present in the bed file.
    '''
    if bed_file is None:
        raise ValueError('Path to a bed file is missing.')

    #THE STRING 'None' IS THE DEFAULT FOR 'NO GENE GIVEN'. TREAT A REAL None THE SAME WAY.
    if gene_name is None:
        gene_name = 'None'

    #%% USED FILES
    datafile_dirname = os.path.join(file_dirname, '..')

    gff_file = os.path.join(datafile_dirname, 'Data_Files',
                            'Saccharomyces_cerevisiae.R64-1-1.99.gff3')
    gene_information_file = os.path.join(datafile_dirname, 'Data_Files',
                                         'Yeast_Protein_Names.txt')

    #%% GET START AND END POSITION OF GENE
    if gene_name.upper() in ('HOLOCUS', 'HO-LOCUS'):
        gene_pos = ['IV', 46271, 48031]
        gene_name = 'HOlocus'

    elif gene_name != 'None':
        gene_name = gene_name.upper()  #CAPITALIZE GENE NAME
        gene_pos = _resolve_gene_position(gene_name, gff_file,
                                          gene_information_file)

    elif region is not None:
        gene_pos = region

    else:
        raise ValueError('Either a gene name or a region has to be given.')

    gene_chr = gene_pos[0]
    gene_start = int(gene_pos[1])
    gene_end = int(gene_pos[2])
    if gene_name != 'None':
        print(gene_name, ' starts at basepair ', gene_start,
              ' and ends at basepair ', gene_end, ' in chromosome', gene_chr)
    else:
        print('Selected region starts at basepair ', gene_start,
              ' and ends at basepair ', gene_end, ' in chromosome', gene_chr)

    #%% READ THE BED FILE
    with open(bed_file) as f:
        lines = f.readlines()

    #%% GET POSITION FOR THE CHROMOSOMES IN THE BED FILE
    chrom_start_line_dict, chrom_end_line_dict = chromosome_name_bedfile(
        lines)[1:3]

    #WITHOUT THIS CHECK A MISSING CHROMOSOME GIVES A START LINE OF None AND THE SLICE BELOW WOULD SILENTLY RUN OVER THE WHOLE FILE
    if gene_chr not in chrom_start_line_dict:
        raise ValueError('Chromosome %s is not found in %s.' %
                         (gene_chr, bed_file))

    #%% GET ALL READS WITHIN THE GENE
    insertion_list = []
    read_list = []
    for line in lines[chrom_start_line_dict.get(gene_chr):chrom_end_line_dict.
                      get(gene_chr)]:
        line_list = line.strip('\n').split()
        insertion = int(line_list[1])  #SECOND COLUMN IS USED AS INSERTION POSITION
        if gene_start <= insertion <= gene_end:
            insertion_list.append(insertion)

            read_value = (
                int(line_list[4]) - 100
            ) / 20  #the matlab script by benoit takes the number of reads*20+100. This line makes this undone
            read_list.append(read_value)

    #%% ACCOUNT FOR DOUBLE INSERTIONS
    #see for example chromosome I, bp 3891
    insertion_list, read_list = _merge_duplicate_insertions(
        insertion_list, read_list)

    #%% CALCULATE SOME STATISTICAL VALUES FOR THE SELECTED REGION
    gene_length = gene_end - gene_start
    print('Length of region of interest is ', gene_length)

    if insertion_list:
        bp_between_tn_insertions = [
            abs(y - x) for x, y in zip(insertion_list[:-1], insertion_list[1:])
        ]
        bp_between_tn_insertions.insert(
            0, insertion_list[0] -
            gene_start)  #DISTANCE FROM START OF REGION TO FIRST INSERTION
        bp_between_tn_insertions.append(
            gene_end - insertion_list[-1]
        )  #DISTANCE FROM LAST INSERTION TO END OF REGION

        max_empty_region = max(bp_between_tn_insertions)

    else:
        #NO INSERTIONS AT ALL, THE ENTIRE REGION IS EMPTY
        max_empty_region = gene_length
        bp_between_tn_insertions = []

    #%%
    print('insertion_list: ', insertion_list)
    print('read_list: ', read_list)
    print(
        'Basepairs between subsequent insertions: ', bp_between_tn_insertions
    )  #FIRST AND LAST DIFFERENCE IS NUMBER OF BASEPAIRS BETWEEN FIRST AND LAST INSERTION AND THE BEGINNING AND END OF THE REGION
    print('max_empty_region: ', max_empty_region)

    return (insertion_list, read_list, max_empty_region,
            bp_between_tn_insertions)