示例#1
0
def dist_pI(output, plot_suffix, EC):
    """
    Plot the distribution of the protein pI values held in ``output.pI``.

    The values are counted in bins of width 0.5 whose edges come from
    ``arange(0, 14.1, 0.5)`` and are drawn as a bar chart. Element 1 of
    ``EC_descriptions()[str(EC)]`` is used as the bar colour and element 0
    as the legend label. The figure and axes are then handed to
    ``dist_plot`` (defined elsewhere) together with ``plot_suffix``.

    """
    fig, ax = plt.subplots(figsize=(8, 5))

    # Histogram of the pI values.
    bin_width = 0.5
    requested_edges = arange(0, 14.1, bin_width)
    counts, edges = histogram(a=list(output.pI), bins=requested_edges)

    # Bars are anchored on the left edge of each bin, hence edges[:-1].
    bar_style = {
        'align': 'edge',
        'alpha': 0.4,
        'width': bin_width,
        'color': EC_descriptions()[str(EC)][1],
        'edgecolor': 'k',
        'label': EC_descriptions()[str(EC)][0],
    }
    ax.bar(edges[:-1], counts, **bar_style)

    dist_plot(fig, ax, name='pI', xlim=(0, 14), xtitle='pI',
              plot_suffix=plot_suffix)
示例#2
0
def dist_GRAVY(output, plot_suffix, EC):
    """
    Plot the distribution of the protein GRAVY values in ``output.GRAVY``.

    The values are counted in bins of width 0.05 whose edges come from
    ``arange(-2, 2.2, 0.05)`` and are drawn as a bar chart. Element 1 of
    ``EC_descriptions()[str(EC)]`` is used as the bar colour and element 0
    as the legend label. A 'hydrophobic' label with a right-pointing arrow
    is placed at half of the upper y-limit, after which the figure and axes
    are handed to ``dist_plot`` (defined elsewhere) with ``plot_suffix``.

    """
    fig, ax = plt.subplots(figsize=(8, 5))

    # Histogram of the GRAVY values.
    bin_width = 0.05
    requested_edges = arange(-2, 2.2, bin_width)
    counts, edges = histogram(a=list(output.GRAVY), bins=requested_edges)

    # Bars are anchored on the left edge of each bin, hence edges[:-1].
    bar_style = {
        'align': 'edge',
        'alpha': 0.4,
        'width': bin_width,
        'color': EC_descriptions()[str(EC)][1],
        'edgecolor': 'k',
        'label': EC_descriptions()[str(EC)][0],
    }
    ax.bar(edges[:-1], counts, **bar_style)

    # GRAVY specific visuals: the annotation is positioned relative to the
    # y-range that the bars produced, so it must be read after ax.bar().
    half_height = max(ax.get_ylim()) / 2
    ax.text(0.55, half_height + 0.05 * half_height, 'hydrophobic',
            fontsize=16)
    ax.arrow(0.5, half_height, 0.7, 0,
             head_width=0.05 * half_height,
             head_length=0.1,
             fc='k',
             ec='k')

    # Reference values that were previously drawn as vertical lines and are
    # currently disabled: average GRAVY = -0.4, catalase GRAVY = -0.605,
    # urease GRAVY = -0.1524.
    dist_plot(fig, ax, name='GRAVY', xlim=(-1.5, 1.5), xtitle='GRAVY',
              plot_suffix=plot_suffix)
示例#3
0
def dist_Iindex(output, plot_suffix, EC):
    """
    Plot the distribution of the protein instability index values in
    ``output.I_index``.

    The values are counted in bins of width 5 whose edges come from
    ``arange(0, 150, 5)`` and are drawn as a bar chart. Element 1 of
    ``EC_descriptions()[str(EC)]`` is used as the bar colour and element 0
    as the legend label. An 'unstable' label with a right-pointing arrow
    and a dashed vertical line at 40 are added, after which the figure and
    axes are handed to ``dist_plot`` (defined elsewhere) with
    ``plot_suffix``.

    """
    fig, ax = plt.subplots(figsize=(8, 5))

    # Histogram of the instability index values.
    bin_width = 5
    requested_edges = arange(0, 150, bin_width)
    counts, edges = histogram(a=list(output.I_index), bins=requested_edges)

    # Bars are anchored on the left edge of each bin, hence edges[:-1].
    bar_style = {
        'align': 'edge',
        'alpha': 0.4,
        'width': bin_width,
        'color': EC_descriptions()[str(EC)][1],
        'edgecolor': 'k',
        'label': EC_descriptions()[str(EC)][0],
    }
    ax.bar(edges[:-1], counts, **bar_style)

    # Instability specific visuals: the annotation is positioned relative to
    # the y-range that the bars produced, so it must be read after ax.bar().
    half_height = max(ax.get_ylim()) / 2
    ax.text(51, half_height + 0.05 * half_height, 'unstable', fontsize=16)
    ax.arrow(50, half_height, 30, 0,
             head_width=0.05 * half_height,
             head_length=4,
             fc='k',
             ec='k')

    # Dashed line marking the instability index cutoff.
    instability_cutoff = 40
    ax.axvline(x=instability_cutoff, c='k', alpha=1.0, linestyle='--', lw=2)

    # Reference values that were previously drawn as vertical lines and are
    # currently disabled: catalase II = 27.010, urease II = 31.75.
    dist_plot(fig, ax, name='Iindex', xlim=(0, 100),
              xtitle='instability index', plot_suffix=plot_suffix)
示例#4
0
def main_analysis(plot_suffix, fasta_file, output_file, EC):
    """Analyse all sequences in FASTA file from BRENDA.

    Asks on stdin whether the sequence properties should be calculated;
    answering ``t`` runs ``get_fasta_sequence_properties``. If ``EC`` is one
    of the keys of ``EC_descriptions()`` the plots are produced through
    ``fasta_plotting`` and the elapsed time is printed. Otherwise the
    output file is loaded with ``read_seq_output`` and passed to
    ``dist_TMindex_specific``.

    Arguments:
        plot_suffix: forwarded unchanged to the plotting functions.
        fasta_file: FASTA file to analyse; passed to
            ``get_fasta_sequence_properties`` and shown in the banner.
        output_file: file the calculated properties are written to and
            later read back from.
        EC: key used to select between the general and the specific
            analysis branch.

    """
    print('---------------------------------------------------------')
    print('Analyse properties of all sequences in FASTA:', fasta_file)
    print('---------------------------------------------------------')
    run_calculations = input('run calculations? (t/f)') == 't'

    # Start timing only once the prompt has been answered so the reported
    # duration covers the computation and not the time spent waiting for the
    # user. perf_counter() is used because it is meant for measuring
    # intervals and is not affected by system clock adjustments.
    start_time = time.perf_counter()
    if run_calculations:
        get_fasta_sequence_properties(output_file=output_file,
                                      fasta_file=fasta_file)

    # A list is used for the membership test so that an unhashable EC value
    # falls through to the else branch instead of raising TypeError.
    known_ECs = list(EC_descriptions().keys())
    if EC in known_ECs:
        # do plotting + analysis -- plots
        print('------------------------------------------------------')
        print('doing analysis...')
        # load existing data from this FASTA file
        fasta_plotting(output_file=output_file, plot_suffix=plot_suffix, EC=EC)
        elapsed = time.perf_counter() - start_time
        print('--- time taken =', '{0:.2f}'.format(elapsed), 's')
    else:
        print('------------------------------------------------------')
        print('doing specific sequence analysis...')
        output = read_seq_output(output_file)
        dist_TMindex_specific(output, plot_suffix, EC)
示例#5
0
def screen_pIs(database_names, redo_pI, redo_pI_plots, pI_csv, pI_output_dir,
               cutoff_pi, descriptors):
    """
    Screen the pI of all sequences with chosen EC numbers.

    For every file name in ``database_names`` the EC number is derived from
    the name, and the matching ``*_mod.fasta`` file is processed:

    * if ``redo_pI`` is ``True``, ``calculate_pI_from_file`` is run on it;
    * if ``redo_pI_plots`` is ``True``, the rows of the pI CSV belonging to
      that file are plotted with ``plot_EC_pI_dist``.

    After the loop, if ``redo_pI_plots`` is ``True``, the whole CSV is
    plotted with ``plot_pI_dist``.

    """
    # NOTE: descriptors is normalised here but not used further in this
    # function.
    descriptors = {} if descriptors is None else descriptors

    for EC_file in database_names:
        # Strip the directory and the file-name suffix, then turn the
        # underscores back into dots to recover the EC number; its first
        # component selects the entry in EC_descriptions().
        ec_number = EC_file.replace(pI_output_dir, '')
        ec_number = ec_number.replace('__BRENDA_sequences.fasta', '')
        ec_number = ec_number.replace('_', '.')
        top_level_EC = ec_number.partition('.')[0]

        # read the file but to avoid memory issues we will calculate
        # the pI on the fly using the bio python module
        print('doing:', EC_file)
        modified_fasta = EC_file.replace(".fasta", "_mod.fasta")

        if redo_pI is True:
            calculate_pI_from_file(modified_fasta, pI_output_dir, cutoff_pi,
                                   pI_csv)

        if redo_pI_plots is True:
            print('plot distribution of pIs')
            # The CSV is read again on every iteration.
            all_rows = pd.read_csv(pI_output_dir + pI_csv, index_col=False)
            rows_for_file = all_rows[all_rows['fasta_file'] == modified_fasta]
            plot_EC_pI_dist(
                rows_for_file,
                filename=modified_fasta.replace('.fasta', '.pdf'),
                title=EC_descriptions()[top_level_EC][0],
                cutoff_pi=cutoff_pi,
            )
        print('done')

    if redo_pI_plots is True:
        print('plot full distribution of pIs')
        all_rows = pd.read_csv(pI_output_dir + pI_csv, index_col=False)
        plot_pI_dist(all_rows,
                     filename='full_pI_dist.pdf',
                     output_dir=pI_output_dir,
                     cutoff_pi=cutoff_pi)
示例#6
0
def dist_Aindex(output, plot_suffix, EC):
    """
    Plot distribution of protein Aindex for all sequences in FASTA file.

    The values in ``output.A_index`` are counted in bins of width 5 that
    span 0 to 150 inclusive and are drawn as a bar chart. Element 1 of
    ``EC_descriptions()[str(EC)]`` is used as the bar colour and element 0
    as the legend label. A 'more stable' label with a right-pointing arrow
    is added, after which the figure and axes are handed to ``dist_plot``
    (defined elsewhere) together with ``plot_suffix``.

    """
    fig, ax = plt.subplots(figsize=(8, 5))
    width = 5
    upper_limit = 150
    # arange() excludes its stop value, so the stop is pushed one bin width
    # past the upper limit. Otherwise the last edge would be 145 and values
    # in (145, 150] would be dropped even though the x-axis runs to 150.
    X_bins = arange(0, upper_limit + width, width)
    hist, bin_edges = histogram(a=list(output.A_index), bins=X_bins)

    # Colour and legend label for this EC class.
    description = EC_descriptions()[str(EC)]
    # Bars are anchored on the left edge of each bin, hence bin_edges[:-1].
    ax.bar(bin_edges[:-1],
           hist,
           align='edge',
           alpha=0.4,
           width=width,
           color=description[1],
           edgecolor='k',
           label=description[0])

    # AI specific visuals: the annotation is positioned relative to the
    # y-range that the bars produced, so it must be read after ax.bar().
    half_height = max(ax.get_ylim()) / 2
    ax.text(10,
            half_height + 0.05 * half_height,
            'more stable',
            fontsize=16)
    ax.arrow(10,
             half_height,
             40,
             0,
             head_width=0.05 * half_height,
             head_length=5,
             fc='k',
             ec='k')

    # Reference values that were previously drawn as vertical lines and are
    # currently disabled: catalase AI = 68, urease AI = 90.476.
    dist_plot(fig,
              ax,
              name='Aindex',
              xlim=(0, upper_limit),
              xtitle='aliphatic index',
              plot_suffix=plot_suffix)
示例#7
0
def dist_TMindex(output, plot_suffix, EC):
    """
    Plot the distribution of the protein TM index values in
    ``output.TM_index``.

    The values are counted in bins of width 0.2 whose edges come from
    ``arange(-5, 5.1, 0.2)`` and are drawn as a bar chart. Element 1 of
    ``EC_descriptions()[str(EC)]`` is used as the bar colour and element 0
    as the legend label. The x-range 0 to 1 is shaded grey, after which the
    figure and axes are handed to ``dist_plot`` (defined elsewhere) with
    ``plot_suffix``.

    """
    fig, ax = plt.subplots(figsize=(8, 5))

    # Histogram of the TM index values.
    bin_width = 0.2
    requested_edges = arange(-5, 5.1, bin_width)
    counts, edges = histogram(a=list(output.TM_index), bins=requested_edges)

    # Bars are anchored on the left edge of each bin, hence edges[:-1].
    bar_style = {
        'align': 'edge',
        'alpha': 0.4,
        'width': bin_width,
        'color': EC_descriptions()[str(EC)][1],
        'edgecolor': 'k',
        'label': EC_descriptions()[str(EC)][0],
    }
    ax.bar(edges[:-1], counts, **bar_style)

    # Melting temperature index specific visuals: shade the cutoff band.
    band_start, band_end = 0, 1
    ax.axvspan(xmin=band_start, xmax=band_end, facecolor='grey', alpha=0.2)

    # Reference values that were previously drawn as vertical lines and are
    # currently disabled: catalase TMI = 1.22, urease TMI = 0.62.
    dist_plot(fig, ax, name='TMindex', xlim=(-5, 5),
              xtitle='thermostability index', plot_suffix=plot_suffix)
示例#8
0
def all_EC_violin_plot():
    """Do violin plots of all properties for all EC output files.

    For each of the five properties one figure is created in which the
    values from every ``<EC>__BRENDA_sequences_output.csv`` file (EC = 1 to
    6, read with ``read_seq_output``) are drawn as one violin at x = EC.
    Violin bodies are coloured with element 1 of ``EC_descriptions()[EC]``.
    Property-specific annotations are added for the TM, instability and
    aliphatic indices. Each figure is saved to ``violin_<property>.pdf`` in
    the current working directory and then closed.

    """
    properties = ['I_index', 'A_index', 'TM_index', 'pI', 'GRAVY']
    prop_label = [
        'instability index', 'aliphatic index', 'TM index', 'pI', 'GRAVY'
    ]
    prop_lim = [(0, 100), (0, 150), (-5, 5), (0, 14), (-1.5, 1.5)]
    ECs = ['1', '2', '3', '4', '5', '6']
    output_files = [ec + '__BRENDA_sequences_output.csv' for ec in ECs]

    for prop, label, limits in zip(properties, prop_label, prop_lim):
        print('doing', prop, '....')
        fig, ax = plt.subplots(figsize=(8, 5))
        for EC, out_file in zip(ECs, output_files):
            print(out_file)
            print(EC)
            # The file is read again for every property, so only one table
            # is held at a time.
            output = read_seq_output(out_file)
            parts = ax.violinplot(
                output[prop],
                [int(EC)],
                showmeans=False,
                showmedians=False,
                showextrema=False,
            )
            colour = EC_descriptions()[EC][1]
            for pc in parts['bodies']:
                pc.set_facecolor(colour)
                pc.set_edgecolor('black')
                pc.set_alpha(0.6)

        if prop == 'TM_index':
            # melting temperature index specific visuals
            TM_cutoff = (0, 1)
            ax.axhspan(ymin=TM_cutoff[0],
                       ymax=TM_cutoff[1],
                       facecolor='grey',
                       alpha=0.2)
        elif prop == 'I_index':
            # dashed line marking the instability index cutoff
            II_cutoff = 40
            ax.axhline(y=II_cutoff, c='k', alpha=1.0, linestyle='--', lw=2)
        elif prop == 'A_index':
            # vertical 'more stable' label next to an upward arrow, placed
            # left of the first violin
            ax.text(0.21,
                    60,
                    'more stable',
                    fontsize=16,
                    ha='left',
                    va='bottom',
                    rotation=90)
            ax.arrow(0.5,
                     40,
                     0,
                     80,
                     head_width=0.2,
                     head_length=10,
                     fc='k',
                     ec='k')

        ax.tick_params(axis='both', which='major', labelsize=16)
        ax.set_xlabel('EC number', fontsize=16)
        ax.set_ylabel(label, fontsize=16)
        # One unit of padding on either side of the first and last violin.
        ax.set_xlim(0, len(ECs) + 1)
        ax.set_ylim(limits)
        ax.set_xticks([int(ec) for ec in ECs])
        ax.set_xticklabels(ECs)
        fig.tight_layout()
        fig.savefig("violin_" + prop + ".pdf", dpi=720, bbox_inches='tight')
        # pyplot keeps every figure it creates alive until it is closed;
        # release it now that it has been written to disk.
        plt.close(fig)