# Example #1
0
def correctness_abacus_plot(output_directory, file_prefix, df,
                                   x_series_index, y_series_index, facet_index, peptide_index, series_color, plot_title = '', x_axis_label = '', y_axis_label = '',
                                   fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0,
                                   min_experimental_ddg = None,
                                   max_experimental_ddg = None):
    """ Creates a faceted ggplot2 point plot of the X-series values per peptide, with points styled by their categorization.

        The function appends three columns to a copy of the dataframe (Categorization, CategorizationShape, and
        CategorizationColor, taken from elements 0, 1, and 2 of the result of _determine_fraction_correct_class),
        sorts the rows by the facet and peptide columns, writes the table to <file_prefix>.txt and an R script to
        <file_prefix>.R inside output_directory, and then runs the R script with output_directory as the working directory.

        :param output_directory: The output directory. It is created if it does not exist.
        :param file_prefix: A prefix for the generated files (the data table, the R script, and the PNG named in the script).
        :param df: A pandas dataframe. The caller's dataframe is not modified. Note: The dataframe is zero-indexed.
        :param x_series_index: The numerical index of the series plotted on the Y-axis of each facet.
        :param y_series_index: The numerical index of the second series passed to _determine_fraction_correct_class.
        :param facet_index: The numerical index of the series used to split the plot into facets.
        :param peptide_index: The numerical index of the series plotted along the X-axis of each facet.
        :param series_color: Currently unused by this function.
        :param plot_title: Plot title.
        :param x_axis_label: X-axis label.
        :param y_axis_label: Y-axis label.
        :param fcorrect_x_cutoff: Currently unused by this function.
        :param fcorrect_y_cutoff: Currently unused by this function.
        :param min_experimental_ddg: If set, used to lower the R variable miny.
        :param max_experimental_ddg: If set, used to raise the R variable maxy.

        Note: the generated R script computes miny and maxy but the ggplot call below does not reference them.
    """
    # Only filesystem errors (e.g. the directory already exists) are tolerated; the assertion still guarantees
    # that the directory is present afterwards.
    try:
        os.mkdir(output_directory)
    except OSError:
        pass
    assert (os.path.exists(output_directory))

    # Work on a copy. The index bookkeeping below assumes that each new column is appended as the last column,
    # which would not hold on a second call if the caller's dataframe already carried these columns.
    df = df.copy()

    df['Categorization'] = df.apply(lambda r: _determine_fraction_correct_class(r[x_series_index], r[y_series_index])[0], axis = 1)
    categorization_index = len(df.columns.values) - 1
    df['CategorizationShape'] = df.apply(lambda r: _determine_fraction_correct_class(r[x_series_index], r[y_series_index])[1], axis = 1)
    categorization_shape_index = len(df.columns.values) - 1
    df['CategorizationColor'] = df.apply(lambda r: _determine_fraction_correct_class(r[x_series_index], r[y_series_index])[2], axis = 1)
    categorization_color_index = len(df.columns.values) - 1

    # Create the R script
    boxplot_r_script = '''
library(ggplot2)
library(gridExtra)
library(scales)
library(qualV)

# PNG generation
png('%(file_prefix)s.png', width=2560, height=2048, bg="white", res=600)
txtalpha <- 0.6
redtxtalpha <- 0.6

%(png_plot_commands)s
        '''

    xy_table_filename = '{0}.txt'.format(file_prefix)
    xy_table_filepath = os.path.join(output_directory, xy_table_filename)

    header_names = df.columns.values
    facet_series = header_names[facet_index]
    peptide_series = header_names[peptide_index]

    # Diagnostic output: the table before and after sorting is echoed to stdout.
    data_table = df.to_csv(header = True, index = False)
    print(data_table)

    df = df.sort_values([facet_series, peptide_series])
    data_table = df.to_csv(header = True, index = False)
    print(data_table)

    write_file(xy_table_filepath, data_table)

    main_plot_script = '''
# Set the margins
par(mar=c(5, 5, 1, 1))

xy_data <- read.csv('%(xy_table_filename)s', header=T)

names(xy_data)[%(x_series_index)d + 1] <- "xvalues"
names(xy_data)[%(y_series_index)d + 1] <- "yvalues"
names(xy_data)[%(facet_index)d + 1] <- "facets"
names(xy_data)[%(peptide_index)d + 1] <- "peptides"
names(xy_data)[%(categorization_index)d + 1] <- "categorization"
names(xy_data)[%(categorization_shape_index)d + 1] <- "categorization_shape"
names(xy_data)[%(categorization_color_index)d + 1] <- "categorization_color"


xy_data[%(peptide_index)d + 1]

peptide_names <- sort(xy_data[[%(peptide_index)d + 1]])

peptide_names
class(peptide_names)

first_peptide = peptide_names[1]
last_peptide = peptide_names[length(peptide_names)]

xlabel <- "%(x_axis_label)s"
ylabel <- "%(y_axis_label)s"
plot_title <- "%(plot_title)s"

xy_data

# Set graph limits and the position for the correlation value

miny <- min(0.0, min(xy_data$xvalues) - 0.1) # "X-axis" values are plotted on to Y-axis
maxy <- max(1.0, max(xy_data$xvalues) + 0.1)
'''

    if min_experimental_ddg is not None:
        main_plot_script += '''
miny <- min(miny  - 0.2, %(min_experimental_ddg)f  - 0.2)
'''
    # The upper bound is driven by max_experimental_ddg. The previous code tested and substituted
    # min_experimental_ddg here, so max_experimental_ddg was silently ignored.
    if max_experimental_ddg is not None:
        main_plot_script += '''
maxy <- max(maxy + 0.5, %(max_experimental_ddg)f  + 0.5)

first_peptide
last_peptide
'''

    main_plot_script += '''

#aes(color = categorization_color, shape = categorization_shape)

p <- ggplot(data=xy_data, aes(x=peptides, y = xvalues, color = categorization_color, shape = categorization_color, group = facets)) +
       theme(legend.position = "none") + # hide the legend
       annotate("rect", xmin = first_peptide, xmax = last_peptide, ymin = -1, ymax = +1, alpha = .2) +
       xlab(xlabel) +
       labs(title = "%(plot_title)s") +
       theme(plot.title = element_text(color = "#555555", size=rel(0.55))) +
       labs(x = xlabel, y = ylabel) +
       theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 3)) +
       geom_point() +
       scale_colour_manual(values = c("black", "blue", "green", "red")) +
       scale_shape_manual(values = c(16, 18, 25, 17)) +
       facet_wrap(~facets)

# Plot graph
p
dev.off()
        '''

    # Fill in the templates. The placeholders are resolved against the local variables of this function, so the
    # local names above must match the %(...)s names used in the templates.
    png_plot_commands = main_plot_script % locals()
    boxplot_r_script = boxplot_r_script % locals()
    r_script_filename = '{0}.R'.format(file_prefix)
    r_script_filepath = os.path.join(output_directory, r_script_filename)
    write_file(r_script_filepath, boxplot_r_script)

    # Run the R script from inside the output directory so that the relative file names in the script resolve there
    run_r_script(r_script_filename, cwd = output_directory)
def error_by_error_scatterplot(output_directory, file_prefix, df,
                             reference_series_index, x_series_index, y_series_index,
                             x_color, y_color,
                             x_series_name = None, y_series_name = None,
                             plot_title = '', x_axis_label = '', y_axis_label = '', similarity_range = 0.25,
                             add_similarity_range_annotation = True,
                             shape_by_category = False, shape_category_series_index = None, shape_category_title = 'Case',
                             label_series_index = None, label_outliers = True,
                             use_geom_text_repel = True,
                             ):

    """ Creates a scatterplot of error versus error intended to show which computational method (X or Y) has the least amount of error relative to a reference series.

        The difference vectors (reference_series - x_series, reference_series - y_series) are created and the absolute
        values of these differences (errors) are plotted against each other.

        :param output_directory: The output directory. It is created if it does not exist.
        :param file_prefix: A prefix for the generated files. A CSV file with the plot points and the R script are saved; the R script names the plot <file_prefix>.png.
        :param df: A pandas dataframe. The caller's dataframe is not modified. Note: The dataframe is zero-indexed.
        :param reference_series_index: The numerical index of the reference series e.g. experimental data.
        :param x_series_index: The numerical index of the X-axis series e.g. predictions from a computational method.
        :param y_series_index: The numerical index of the Y-axis series e.g. predictions from a second computational method.
        :param x_color: The color of the "method X is better" points.
        :param y_color: The color of the "method Y is better" points.
        :param x_series_name: A name for the X-series which is used in the classification legend. Required (must not be None).
        :param y_series_name: A name for the Y-series which is used in the classification legend. Required (must not be None).
        :param plot_title: Plot title.
        :param x_axis_label: X-axis label.
        :param y_axis_label: Y-axis label.
        :param similarity_range: Passed to _classify_smallest_error and used to draw the similarity region; intended meaning: a point (x, y) is considered as similar if |x - y| <= similarity_range.
        :param add_similarity_range_annotation: If true then the similarity range is included in the plot.
        :param shape_by_category: Boolean. If set then points are shaped by the column identified with shape_category_series_index. Otherwise, points are shaped by classification ("X is better", "Y is better", or "Similar"). Ignored unless shape_category_series_index is an integer.
        :param shape_category_series_index: The numerical index of the series used to choose point shapes.
        :param shape_category_title: The title of the shape legend.
        :param label_series_index: The numerical index of the series used to label outliers.
        :param label_outliers: Boolean. If set then label outliers using the column identified with label_series_index. Ignored unless label_series_index is an integer.
        :param use_geom_text_repel: Boolean. If set then the ggrepel package is used to avoid overlapping labels.

        This function was adapted from the Kortemme Lab covariation benchmark (https://github.com/Kortemme-Lab/covariation).
        todo: I need to check that ggplot2 is respecting the color choices. It may be doing its own thing.
    """
    # Only filesystem errors (e.g. the directory already exists) are tolerated; the assertion still guarantees
    # that the directory is present afterwards.
    try:
        os.mkdir(output_directory)
    except OSError:
        pass
    assert (os.path.exists(output_directory))

    # Options which depend on a series index are switched off when no usable index was supplied
    has_category_series = isinstance(shape_category_series_index, int)
    if not has_category_series:
        shape_by_category = False
    if not isinstance(label_series_index, int):
        label_outliers = False
    assert(x_series_name is not None and y_series_name is not None)

    df = df.copy()
    headers = df.columns.values

    # R point shapes, filled shapes first. list() keeps the concatenation valid on Python 3 where range is lazy.
    legal_shapes = list(range(15, 25 + 1)) + list(range(0, 14 + 1))
    if shape_by_category:
        # Only inspect the category column when it is actually used (the index may be None otherwise)
        num_categories = len(set(df.iloc[:, shape_category_series_index].values))
        if num_categories > len(legal_shapes):
            colortext.warning('Too many categories ({0}) to plot using meaningful shapes.'.format(num_categories))
            shape_by_category = False
        else:
            legal_shapes = legal_shapes[:num_categories]

    # The error columns are appended last so their positional index is the last column index at that point
    df['X_error'] = abs(df[headers[reference_series_index]] - df[headers[x_series_index]])
    x_error_index = len(df.columns.values) - 1
    df['Y_error'] = abs(df[headers[reference_series_index]] - df[headers[y_series_index]])
    y_error_index = len(df.columns.values) - 1

    # Classify each row by which series has the smaller error (or as similar)
    df['Classification'] = df.apply(lambda r: _classify_smallest_error(r['X_error'], r['Y_error'], similarity_range, x_series_name, y_series_name), axis = 1)
    error_classification_index = len(df.columns.values) - 1

    # Create the R script
    boxplot_r_script = '''
library(ggplot2)
library(gridExtra)
library(scales)
library(qualV)
library(grid)'''
    if use_geom_text_repel:
        boxplot_r_script +='''
library(ggrepel) # install with 'install.packages("ggrepel")' inside the R interactive shell.
'''
    boxplot_r_script += '''

# PNG generation
png('%(file_prefix)s.png', width=2560, height=2048, bg="white", res=600)
txtalpha <- 0.8
redtxtalpha <- 0.8

%(png_plot_commands)s
        '''

    xy_table_filename = '{0}.txt'.format(file_prefix)
    xy_table_filepath = os.path.join(output_directory, xy_table_filename)

    data_table = df.to_csv(header = True, index = False)
    write_file(xy_table_filepath, data_table)

    main_plot_script = '''
# Set the margins
par(mar=c(5, 5, 1, 1))

xy_data <- read.csv('%(xy_table_filename)s', header=T)

names(xy_data)[%(x_error_index)d + 1] <- "xerrors"
names(xy_data)[%(y_error_index)d + 1] <- "yerrors"
'''

    if label_outliers:
        main_plot_script +='''names(xy_data)[%(label_series_index)d + 1] <- "outlier_labels"'''
    if has_category_series:
        # The %d placeholder cannot format None so the rename is only emitted when a category column was given
        main_plot_script +='''
names(xy_data)[%(shape_category_series_index)d + 1] <- "categories"'''
    main_plot_script +='''

xy_data[%(x_error_index)d + 1]
xy_data[%(y_error_index)d + 1]

# coefs contains two values: (Intercept) and yerrors
coefs <- coef(lm(xerrors~yerrors, data = xy_data))
fitcoefs = coef(lm(xerrors~0 + yerrors, data = xy_data))
fitlmv_yerrors <- as.numeric(fitcoefs[1])
lmv_intercept <- as.numeric(coefs[1])
lmv_yerrors <- as.numeric(coefs[2])
lm(xy_data$yerrors~xy_data$xerrors)

xlabel <- "%(x_axis_label)s"
ylabel <- "%(y_axis_label)s"
plot_title <- "%(plot_title)s"
rvalue <- cor(xy_data$yerrors, xy_data$xerrors)

# Alphabetically, "Similar" < "X" < "Y" so the logic below works
countsim <- paste("Similar =", dim(subset(xy_data, Classification=="Similar"))[1])
countX <- paste("%(x_series_name)s =", dim(subset(xy_data, Classification=="%(x_series_name)s"))[1])
countY <- paste("%(y_series_name)s =", dim(subset(xy_data, Classification=="%(y_series_name)s"))[1])

countX
countY
countsim

# Set graph limits and the position for the correlation value

minx <- min(0.0, min(xy_data$xerrors) - 0.1)
miny <- min(0.0, min(xy_data$yerrors) - 0.1)
maxx <- max(1.0, max(xy_data$xerrors) + 0.1)
maxy <- max(1.0, max(xy_data$yerrors) + 0.1)

# Create a square plot (x-range = y-range)
minx <- min(minx, miny)
miny <- minx
maxx <- max(maxx, maxy)
maxy <- maxx

xpos <- maxx / 25.0
ypos <- maxy - (maxy / 25.0)
ypos_2 <- maxy - (2 * maxy / 25.0)


plot_scale <- scale_color_manual(
    "Counts",
    values = c( "Similar" = '#444444', "%(x_series_name)s" = '%(x_color)s', "%(y_series_name)s" ='%(y_color)s'),
    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY) )'''

    if add_similarity_range_annotation:
        main_plot_script += '''
# Polygon denoting the similarity range. We turn off plot clipping below (gt$layout$clip) so we need to be more exact than using 4 points when defining the region
boxy_mc_boxface <- data.frame(
  X = c(minx - 0,                        maxx - %(similarity_range)f, maxx + 0, maxx + 0,                       0 + %(similarity_range)f, 0),
  Y = c(minx - 0 + %(similarity_range)f, maxx + 0,                    maxx + 0, maxx + 0 -%(similarity_range)f, 0, 0 )
)'''
    else:
        main_plot_script += '''
# Polygon denoting the similarity range. We turn off plot clipping below (gt$layout$clip) so we need to be more exact than using 4 points when defining the region
boxy_mc_boxface <- data.frame(
  X = c(minx - 1, maxx + 1, maxx + 1, minx - 1),
  Y = c(minx - 1 + %(similarity_range)f, maxx + 1 + %(similarity_range)f, maxx + 1 - %(similarity_range)f, minx - 1 - %(similarity_range)f)
)'''

    if shape_by_category:
        main_plot_script += '''
# Plot
p <- qplot(main="", xerrors, yerrors, data=xy_data, xlab=xlabel, ylab=ylabel, alpha = I(txtalpha), shape=factor(categories), col=factor(Classification)) +'''
    else:
        main_plot_script += '''
# Plot
p <- qplot(main="", xerrors, yerrors, data=xy_data, xlab=xlabel, ylab=ylabel, alpha = I(txtalpha), shape=factor(Classification), col=factor(Classification)) +'''

    main_plot_script += '''
geom_polygon(data=boxy_mc_boxface, aes(X, Y), fill = "#bbbbbb", alpha = 0.4, color = "darkseagreen", linetype="blank", inherit.aes = FALSE, show.legend = FALSE) +
plot_scale +
geom_point() +
guides(col = guide_legend()) +
labs(title = "%(plot_title)s") +
theme(plot.title = element_text(color = "#555555", size=rel(0.75))) +
theme(axis.title = element_text(color = "#555555", size=rel(0.6))) +
theme(legend.title = element_text(color = "#555555", size=rel(0.45)), legend.text = element_text(color = "#555555", size=rel(0.4))) +
coord_cartesian(xlim = c(minx, maxx), ylim = c(miny, maxy)) + # set the graph limits
annotate("text", hjust=0, size = 2, colour="#222222", x = xpos, y = ypos, label = sprintf("R = %%0.2f", round(rvalue, digits = 4))) + # add correlation text; hjust=0 sets left-alignment. Using annotate instead of geom_text avoids blocky text caused by geom_text being run multiple times over the series'''

    # The legend title is needed by the default shape scale below regardless of whether outliers are labeled.
    # It was previously only defined inside the label_outliers branch, which made the template substitution fail
    # with a KeyError whenever outliers were not labeled and shapes were not chosen by category.
    counts_title = 'Counts'
    if add_similarity_range_annotation:
        counts_title += '*'

    if label_outliers:
        if use_geom_text_repel:
            main_plot_script += '''

# Label outliers
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text_repel(size=1.5, segment.size = 0.15, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +'''
        else:
            main_plot_script += '''

# Label outliers
geom_text(hjust = 0, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text(hjust = 0, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors <= maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +
geom_text(hjust = 1, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors >=maxy/2), aes(xerrors, yerrors-maxy/100, label=outlier_labels)) +
geom_text(hjust = 1, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yerrors - xerrors) > maxx/3 & xerrors > maxx / 2 & yerrors < maxy/2), aes(xerrors, yerrors+2*maxy/100, label=outlier_labels)) +'''

        # NOTE(review): this color-scale override is only emitted when outliers are labeled, as before. It may
        # have been intended to apply unconditionally - confirm before moving it out of this branch.
        main_plot_script += '''


#geom_text(hjust = 0, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yvalues - xvalues) > 2 & xvalues <= 0), aes(xvalues, yvalues+0.35, label=Origin_of_peptide), check_overlap = TRUE) + # label outliers
#geom_text(hjust = 1, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yvalues - xvalues) > 2 & xvalues > 0), aes(xvalues, yvalues+0.35, label=Origin_of_peptide), check_overlap = TRUE) + # label outliers




scale_colour_manual('%(counts_title)s', values = c('#444444', '%(x_color)s', '%(y_color)s'),
                    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY)) +'''

    if shape_by_category:
        legal_shapes_str = ', '.join(map(str, legal_shapes))
        main_plot_script += '''
scale_shape_manual('%(shape_category_title)s', values = c(%(legal_shapes_str)s),
                    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY))'''

    else:
        main_plot_script += '''
scale_shape_manual('%(counts_title)s', values = c(18, 16, 15),
                    labels = c( "Similar" = countsim,  "%(x_series_name)s" = countX,        "%(y_series_name)s" = countY))'''

    if add_similarity_range_annotation:
        main_plot_script += '''+
    # Add a caption
    annotation_custom(grob = textGrob(gp = gpar(fontsize = 5), hjust = 0, sprintf("* Similar \\u225d \\u00b1 %%0.2f", round(%(similarity_range)f, digits = 2))), xmin = maxx + (2 * maxx / 10), ymin = -1, ymax = -1)'''

    main_plot_script += '''

# Plot graph
p
    '''
    if add_similarity_range_annotation:
        main_plot_script += '''
# Code to override clipping
gt <- ggplot_gtable(ggplot_build(p))
gt$layout$clip[gt$layout$name=="panel"] <- "off"
grid.draw(gt)'''

    main_plot_script +='''
dev.off()
'''

    # Fill in the templates. The placeholders are resolved against the local variables of this function, so the
    # local names above must match the %(...)s names used in the templates.
    png_plot_commands = main_plot_script % locals()
    boxplot_r_script = boxplot_r_script % locals()
    r_script_filename = '{0}.R'.format(file_prefix)
    r_script_filepath = os.path.join(output_directory, r_script_filename)
    write_file(r_script_filepath, boxplot_r_script)

    # Run the R script from inside the output directory so that the relative file names in the script resolve there
    run_r_script(r_script_filename, cwd = output_directory)
def multicategory_scatterplot(
    output_directory,
    file_prefix,
    df,
    x_series_index,
    y_series_index,
    category_series_index,
    series_color,
    plot_title="",
    x_axis_label="",
    y_axis_label="",
    min_predicted_ddg=None,
    max_predicted_ddg=None,
    min_experimental_ddg=None,
    max_experimental_ddg=None,
):
    """Create a scatterplot of the Y-series against the X-series with per-category fit lines.

    The function appends the columns CategorizationColor, Categorization, and CategorizationShape to a copy of
    the dataframe, writes the table to <file_prefix>.txt and an R script to <file_prefix>.R inside
    output_directory, and runs the R script with output_directory as the working directory.

    This function was adapted from the covariation benchmark.

    NOTE(review): the R template is still specific to the original DDG dataset. It refers to columns named
    "PDB" and "Origin_of_peptide" and to the PDB IDs 1G9O (NHERF1) and 3QDO (SNX27) by name, and it does not
    use the CategorizationColor column computed here. See the todo list below.

    :param output_directory: The output directory. It is created if it does not exist.
    :param file_prefix: A prefix for the generated files.
    :param df: A pandas dataframe. The caller's dataframe is not modified. Note: The dataframe is zero-indexed.
    :param x_series_index: The numerical index of the X-axis series.
    :param y_series_index: The numerical index of the Y-axis series.
    :param category_series_index: The numerical index of the series whose unique values define the categories.
    :param series_color: Currently unused by this function.
    :param plot_title: Plot title.
    :param x_axis_label: X-axis label.
    :param y_axis_label: Y-axis label.
    :param min_predicted_ddg: If set, used to lower the Y-axis minimum.
    :param max_predicted_ddg: If set, the Y-axis range is raised and then fixed to [-6, 12.5] by the template.
    :param min_experimental_ddg: If set, used to lower the X-axis minimum.
    :param max_experimental_ddg: If set, used to raise the X-axis maximum.
    """

    # todo: Abstract this graph from the current usage (DDG measurements).
    # todo: make the capped value for unquantified but classified measurements (e.g. DDG > 7 kcal/mol) parameterizable
    # todo: add an option to identify outliers by standard deviations (over the set of errors |x - y|) rather than by fixed value
    # todo: add an option to use geom_text_repel to avoid/reduce overlapping text
    # todo: allow users to provide colors for the facets / categories

    # Changeset
    # todo: Add exception if number of cases > 2 so the general case can be implemented once we have test data.
    # todo: use one column as the category e.g. "PDB". assert that there is a maximum number of categories. Test with > 2 categories
    # todo: remove all references to SNX27 and NHERF1 below and loop over the set of categories instead

    # Only filesystem errors (e.g. the directory already exists) are tolerated; the assertion still guarantees
    # that the directory is present afterwards.
    try:
        os.mkdir(output_directory)
    except OSError:
        pass
    assert os.path.exists(output_directory)

    # Work on a copy so that the helper columns added below do not leak into the caller's dataframe
    df = df.copy()

    # Assign one color per distinct category value. A leftover debugging stop (print + sys.exit(0)) used to
    # terminate the whole process right after this step; it has been removed so the plot is generated.
    categories = list(df.iloc[:, category_series_index].unique())
    category_colors = get_spaced_plot_colors(len(categories))
    color_map = {}
    for position, category in enumerate(categories):
        color_map[category] = "#" + category_colors[position]

    df["CategorizationColor"] = df.apply(lambda r: color_map[r[category_series_index]], axis=1)
    df["Categorization"] = df.apply(
        lambda r: _determine_fraction_correct_class(r[x_series_index], r[y_series_index])[0], axis=1
    )
    df["CategorizationShape"] = df.apply(
        lambda r: _determine_fraction_correct_class(r[x_series_index], r[y_series_index])[1], axis=1
    )

    # Create the R script
    boxplot_r_script = """
library(ggplot2)
library(gridExtra)
library(scales)
library(qualV)

# PNG generation
png('%(file_prefix)s.png', width=2560, height=2048, bg="white", res=600)
txtalpha <- 0.6
redtxtalpha <- 0.6

%(png_plot_commands)s
    """

    xy_table_filename = "{0}.txt".format(file_prefix)
    xy_table_filepath = os.path.join(output_directory, xy_table_filename)
    # The table is now taken from the dataframe; the previous code referenced the data_table_headers and
    # data_table parameters which no longer exist in this function's signature.
    write_file(xy_table_filepath, df.to_csv(header=True, index=False))

    single_plot_commands = """
# Set the margins
par(mar=c(5, 5, 1, 1))

xy_data <- read.csv('%(xy_table_filename)s', header=T)

names(xy_data)[%(x_series_index)d + 1] <- "xvalues"
names(xy_data)[%(y_series_index)d + 1] <- "yvalues"

# coefs contains two values: (Intercept) and yvalues
coefs <- coef(lm(xvalues~yvalues, data = xy_data))
fitcoefs = coef(lm(xvalues~0 + yvalues, data = xy_data))
fitlmv_yvalues <- as.numeric(fitcoefs[1])
lmv_intercept <- as.numeric(coefs[1])
lmv_yvalues <- as.numeric(coefs[2])
lm(xy_data$yvalues~xy_data$xvalues)

xlabel <- "%(x_axis_label)s"
ylabel <- "%(y_axis_label)s"
plot_title <- "%(plot_title)s"
rvalue <- cor(xy_data$yvalues, xy_data$xvalues)
rvalue
xy_data

#3QDO = SNX27
#1G9O = NHERF1

valid_xy_data <- xy_data[which(xy_data$xvalues < 6.99),]
rvalue <- cor(valid_xy_data$yvalues, valid_xy_data$xvalues)
rvalue
valid_xy_data

valid_xy_data_NHERF1 <- xy_data[which(xy_data$xvalues < 6.99 & xy_data$PDB == '1G9O'),]
rvalue_NHERF1 <- cor(valid_xy_data_NHERF1$yvalues, valid_xy_data_NHERF1$xvalues)
rvalue_NHERF1
valid_xy_data_NHERF1

coefs_NHERF1 <- coef(lm(xvalues~yvalues, data = valid_xy_data_NHERF1))
lmv_intercept_NHERF1 <- as.numeric(coefs_NHERF1[1])
lmv_yvalues_NHERF1 <- as.numeric(coefs_NHERF1[2])

valid_xy_data_SNX27 <- xy_data[which(xy_data$xvalues < 6.99 & xy_data$PDB == '3QDO'),]
rvalue_SNX27 <- cor(valid_xy_data_SNX27$yvalues, valid_xy_data_SNX27$xvalues)
rvalue_SNX27
valid_xy_data_SNX27

coefs_SNX27 <- coef(lm(xvalues~yvalues, data = valid_xy_data_SNX27))
lmv_intercept_SNX27 <- as.numeric(coefs_SNX27[1])
lmv_yvalues_SNX27 <- as.numeric(coefs_SNX27[2])

lmv_intercept
lmv_yvalues
lmv_intercept_NHERF1
lmv_yvalues_NHERF1
lmv_intercept_SNX27
lmv_yvalues_SNX27

# Set graph limits and the position for the correlation value

minx <- min(0.0, min(xy_data$xvalues) - 0.1)
miny <- min(0.0, min(xy_data$yvalues) - 0.1)
maxx <- max(1.0, max(xy_data$xvalues) + 0.1)
maxy <- max(1.0, max(xy_data$yvalues) + 0.1)
    """

    if min_predicted_ddg is not None:
        single_plot_commands += """
miny <- min(miny  - 0.2, %(min_predicted_ddg)f  - 0.2)
    """
    if max_predicted_ddg is not None:
        single_plot_commands += """
maxy <- max(maxy + 0.5, %(max_predicted_ddg)f  + 0.5)

miny <- -6
maxy <- 12.5

    """
    if min_experimental_ddg is not None:
        single_plot_commands += """
    minx <- min(minx, %(min_experimental_ddg)f)
        """
    if max_experimental_ddg is not None:
        single_plot_commands += """
    maxx <- max(maxx, %(max_experimental_ddg)f) + 0.2
        """

    single_plot_commands += """
xpos <- minx + 0.2
ypos <- maxy - 1
ypos_SNX27 <- ypos - 1
ypos_NHERF1 <- ypos_SNX27 - 1

lrt <- expression('R'^tst)

p <- qplot(main="", xvalues, yvalues, data=xy_data, xlab=xlabel, ylab=ylabel, shape = PDB, alpha = I(txtalpha)) +
        geom_point(aes(color = PDB), alpha = 0.6) +
        scale_colour_manual(name="", values = c("1G9O"="orange", "3QDO"="blue", "3"="red", "value3"="grey", "value2"="black")) +
        labs(title = "%(plot_title)s") +
        theme(plot.title = element_text(color = "#555555", size=rel(0.75))) +

        # Correlation fit lines (global + one per facet
        geom_abline(size = 0.125, color="black", intercept = lmv_intercept, slope = lmv_yvalues, alpha=0.2) +
        geom_abline(size = 0.125, color="orange", intercept = lmv_intercept_NHERF1, slope = lmv_yvalues_NHERF1, alpha=0.4) +
        geom_abline(size = 0.125, color="blue", intercept = lmv_intercept_SNX27, slope = lmv_yvalues_SNX27, alpha=0.4) +

        geom_abline(slope=1, intercept=0, linetype=3, size=0.25, alpha=0.4) + # add a diagonal (dotted)
        coord_cartesian(xlim = c(minx, maxx), ylim = c(miny, maxy)) + # set the graph limits

        geom_text(hjust = 0, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yvalues - xvalues) > 2 & xvalues <= 0), aes(xvalues, yvalues+0.35, label=Origin_of_peptide), check_overlap = TRUE) + # label outliers
        geom_text(hjust = 1, size=1.5, color="#000000", alpha=0.6, data=subset(xy_data, abs(yvalues - xvalues) > 2 & xvalues > 0), aes(xvalues, yvalues+0.35, label=Origin_of_peptide), check_overlap = TRUE) + # label outliers
        geom_text(hjust=0, size=2, colour="black", aes(x = xpos, y = ypos, label = sprintf("R == %%0.2f", round(rvalue, digits = 4))), parse = TRUE) +
        geom_text(hjust=0, size=2, colour="darkorange", aes(x = xpos, y = ypos_NHERF1, label = sprintf("R[NHERF] == %%0.2f", round(rvalue_NHERF1, digits = 4))), parse = TRUE) +
        geom_text(hjust=0, size=2, colour="blue", aes(x = xpos, y = ypos_SNX27, label = sprintf("R[SNX27] == %%0.2f", round(rvalue_SNX27, digits = 4))), parse = TRUE) +
        theme(legend.position = "none")
#       geom_text(hjust=0, size=2, colour="black", aes(xpos, ypos, fontface="plain", family = "sans", label=paste(sprintf("R = %%0.2f%%s", round(rvalue, digits = 4), lrt), expression('R'[3])  ))) # add correlation text; hjust=0 sets left-alignment

#geom_text(hjust=0, size=3, colour="black", aes(xpos, ypos, fontface="plain", family = "sans", label=sprintf("R = %%0.2f", round(rvalue, digits = 4)))) # add correlation text; hjust=0 sets left-alignment
#       geom_text(hjust=0, size=3, colour="black", aes(xpos, ypos, fontface="plain", family = "sans", label=sprintf("R = %%0.2f", round(rvalue, digits = 4)))) # add correlation text; hjust=0 sets left-alignment

# Plot graph
p
dev.off()
        """

    # Fill in the templates. The placeholders are resolved against the local variables of this function, so the
    # local names above must match the %(...)s names used in the templates.
    png_plot_commands = single_plot_commands % locals()
    boxplot_r_script = boxplot_r_script % locals()
    r_script_filename = "{0}.R".format(file_prefix)
    r_script_filepath = os.path.join(output_directory, r_script_filename)
    write_file(r_script_filepath, boxplot_r_script)

    # Run the R script from inside the output directory so that the relative file names in the script resolve there
    run_r_script(r_script_filename, cwd=output_directory)