Exemplo n.º 1
0
def calculate_matrix_svg(snplst,
                         pop,
                         request,
                         genome_build,
                         r2_d="r2",
                         collapseTranscript=True):

    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    api_mongo_addr = config['api']['api_mongo_addr']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Open SNP list file
    snps_raw = open(snplst).readlines()

    # Remove duplicate RS numbers
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        if snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    pops = pop.split("+")
    pop_dirs = []
    for pop_i in pops:
        if pop_i in [
                "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB",
                "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH",
                "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL",
                "PJL", "PUR", "STU", "TSI", "YRI"
        ]:
            pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")

    get_pops = "cat " + " ".join(pop_dirs)
    pop_list = [
        x.decode('utf-8') for x in subprocess.Popen(
            get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()
    ]

    ids = [i.strip() for i in pop_list]
    pop_ids = list(set(ids))

    # Connect to Mongo snp database
    if env == 'local':
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    client = MongoClient(
        'mongodb://' + mongo_username + ':' + mongo_password + '@' +
        mongo_host + '/admin', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        """Fetch the dbSNP document for a single variant id.

        Any leading/trailing "r" and "s" characters are stripped from
        ``rsid`` and the remainder is used as the ``id`` filter on the
        ``dbsnp`` collection. The result of ``find_one`` is round-tripped
        through ``json_util.dumps`` / ``json.loads`` so the caller receives
        plain JSON-compatible data (the caller compares it against ``None``
        to detect a missing variant).
        """
        variant_id = rsid.strip("rs")
        record = db.dbsnp.find_one({"id": variant_id})
        return json.loads(json_util.dumps(record))

    # Query genomic coordinates
    def get_rsnum(db, coord):
        """Find dbSNP records located at a genomic coordinate.

        Args:
            db: Database handle exposing a ``dbsnp`` collection.
            coord: Coordinate string shaped like ``chr<chrom>:<pos>``; the
                ``chr`` prefix is optional.

        Returns:
            A list with the JSON-sanitized documents whose ``chromosome``
            and build-specific position field equal the parsed coordinate.
            An empty list is returned, without querying, when ``coord``
            does not contain a ``:`` separator.
        """
        # Remove only a literal "chr" prefix. str.strip("chr") treats its
        # argument as a character set and trims both ends of the string.
        if coord.startswith("chr"):
            coord = coord[3:]

        parts = coord.split(":")
        if len(parts) < 2:
            # Not a <chrom>:<pos> coordinate: report "no match" instead of
            # raising IndexError on the missing position.
            return []
        chro, pos = parts[0], parts[1]

        # Sex chromosomes are queried in upper case.
        chromosome = chro.upper() if chro in ('x', 'y') else str(chro)
        position_field = genome_build_vars[genome_build]['position']
        query_results = db.dbsnp.find({
            "chromosome": chromosome,
            position_field: str(pos)
        })
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        """Swap coordinate-style entries for rsids where dbSNP knows one.

        Args:
            db: Database handle forwarded to ``get_rsnum``.
            snp_lst: List of token lists (one per input line); the first
                token is either an rsid or a genomic coordinate.

        Returns:
            A new list in input order. Entries whose first token starts
            with "rs" are kept as-is. Other entries are looked up by
            coordinate: on a match they become ``["rs<id>"]``, otherwise
            they are kept unchanged. Empty entries (blank input lines) are
            dropped.
        """
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            if not snp_raw_i:
                # A blank line splits into []; indexing it would raise
                # IndexError, so skip it.
                continue

            if snp_raw_i[0][0:2] == "rs":
                new_snp_lst.append(snp_raw_i)
                continue

            snp_info_lst = get_rsnum(db, snp_raw_i[0])
            if not snp_info_lst:
                # No record at this coordinate: keep the entry as given.
                new_snp_lst.append(snp_raw_i)
                continue

            if len(snp_info_lst) == 1:
                var_id = "rs" + snp_info_lst[0]['id']
            else:
                # Several records share the position: prefer the first one
                # whose id equals its ref_id, else fall back to the first
                # record returned.
                ref_variants = [
                    snp_info['id'] for snp_info in snp_info_lst
                    if snp_info['id'] == snp_info['ref_id']
                ]
                if ref_variants:
                    var_id = "rs" + ref_variants[0]
                else:
                    var_id = "rs" + snp_info_lst[0]['id']
            new_snp_lst.append([var_id])
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)

    # Find RS numbers in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    tabix_coords = ""
    for snp_i in snps:
        if len(snp_i) > 0:
            if len(snp_i[0]) > 2:
                if (snp_i[0][0:2] == "rs"
                        or snp_i[0][0:3] == "chr") and snp_i[0][-1].isdigit():
                    snp_coord = get_coords(db, snp_i[0])
                    if snp_coord != None and snp_coord[genome_build_vars[
                            genome_build]['position']] != "NA":
                        # check if variant is on chrY for genome build = GRCh38
                        if not (snp_coord['chromosome'] == "Y" and
                                (genome_build == "grch38"
                                 or genome_build == "grch38_high_coverage")):
                            rs_nums.append(snp_i[0])
                            snp_pos.append(snp_coord[
                                genome_build_vars[genome_build]['position']])
                            temp = [
                                snp_i[0], snp_coord['chromosome'],
                                snp_coord[genome_build_vars[genome_build]
                                          ['position']]
                            ]
                            snp_coords.append(temp)

    # Check max distance between SNPs
    distance_bp = []
    for i in range(len(snp_coords)):
        distance_bp.append(int(snp_coords[i][2]))

    # Sort coordinates and make tabix formatted coordinates
    snp_pos_int = [int(i) for i in snp_pos]
    snp_pos_int.sort()
    snp_coord_str = [
        genome_build_vars[genome_build]['1000G_chr_prefix'] +
        snp_coords[0][1] + ":" + str(i) + "-" + str(i) for i in snp_pos_int
    ]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        genome_build_vars[genome_build]['1000G_dir'],
        genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        """Return the allele pair in display form, rewriting indels.

        When both alleles are a single character they are returned
        unchanged. Otherwise the first character of each allele is dropped
        (presumably the shared VCF anchor base -- confirm against the VCF
        source) and an allele left empty by that is shown as "-".

        Args:
            a1: First allele string.
            a2: Second allele string.

        Returns:
            Tuple ``(a1_n, a2_n)`` of the adjusted allele strings.

        Raises:
            ValueError: If either allele is empty. Without this check no
                branch applied and the function failed on unbound locals.
        """
        if not a1 or not a2:
            raise ValueError("alleles must be non-empty: %r, %r" % (a1, a2))

        if len(a1) == 1 and len(a2) == 1:
            return (a1, a2)

        # At least one allele is longer than one character: trim the first
        # character from both and mark a now-empty allele with "-".
        return (a1[1:] or "-", a2[1:] or "-")

    # Import SNP VCF files
    tabix_snps = export_s3_keys + " cd {2}; tabix -fhD {0}{1} | grep -v -e END".format(
        vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir +
        genome_build_vars[genome_build]['1000G_dir'])
    vcf = [
        x.decode('utf-8') for x in subprocess.Popen(
            tabix_snps, shell=True, stdout=subprocess.PIPE).stdout.readlines()
    ]

    h = 0
    while vcf[h][0:2] == "##":
        h += 1

    head = vcf[h].strip().split()

    # Extract haplotypes
    index = []
    for i in range(9, len(head)):
        if head[i] in pop_ids:
            index.append(i)

    hap1 = [[]]
    for i in range(len(index) - 1):
        hap1.append([])
    hap2 = [[]]
    for i in range(len(index) - 1):
        hap2.append([])

    rsnum_lst = []
    allele_lst = []
    pos_lst = []

    for g in range(h + 1, len(vcf)):
        geno = vcf[g].strip().split()
        geno[0] = geno[0].lstrip('chr')
        if geno[1] not in snp_pos:
            continue

        if snp_pos.count(geno[1]) == 1:
            rs_query = rs_nums[snp_pos.index(geno[1])]

        else:
            pos_index = []
            for p in range(len(snp_pos)):
                if snp_pos[p] == geno[1]:
                    pos_index.append(p)
            for p in pos_index:
                if rs_nums[p] not in rsnum_lst:
                    rs_query = rs_nums[p]
                    break

        if rs_query in rsnum_lst:
            continue

        rs_1000g = geno[2]

        if rs_query == rs_1000g:
            rsnum = rs_1000g
        else:
            count = -2
            found = "false"
            while count <= 2 and count + g < len(vcf):
                geno_next = vcf[g + count].strip().split()
                geno_next[0] = geno_next[0].lstrip('chr')
                if len(geno_next) >= 3 and rs_query == geno_next[2]:
                    found = "true"
                    break
                count += 1

            if found == "false":
                indx = [i[0] for i in snps].index(rs_query)
                # snps[indx][0] = geno[2]
                # rsnum = geno[2]
                snps[indx][0] = rs_query
                rsnum = rs_query
            else:
                continue

        if "," not in geno[3] and "," not in geno[4]:
            a1, a2 = set_alleles(geno[3], geno[4])
            for i in range(len(index)):
                if geno[index[i]] == "0|0":
                    hap1[i].append(a1)
                    hap2[i].append(a1)
                elif geno[index[i]] == "0|1":
                    hap1[i].append(a1)
                    hap2[i].append(a2)
                elif geno[index[i]] == "1|0":
                    hap1[i].append(a2)
                    hap2[i].append(a1)
                elif geno[index[i]] == "1|1":
                    hap1[i].append(a2)
                    hap2[i].append(a2)
                elif geno[index[i]] == "0":
                    hap1[i].append(a1)
                    hap2[i].append(".")
                elif geno[index[i]] == "1":
                    hap1[i].append(a2)
                    hap2[i].append(".")
                else:
                    hap1[i].append(".")
                    hap2[i].append(".")

            rsnum_lst.append(rsnum)

            position = "chr" + geno[0] + ":" + geno[1] + "-" + geno[1]
            pos_lst.append(position)
            alleles = a1 + "/" + a2
            allele_lst.append(alleles)

    # Calculate Pairwise LD Statistics
    all_haps = hap1 + hap2
    ld_matrix = [[[None for v in range(2)] for i in range(len(all_haps[0]))]
                 for j in range(len(all_haps[0]))]

    for i in range(len(all_haps[0])):
        for j in range(i, len(all_haps[0])):
            hap = {}
            for k in range(len(all_haps)):
                # Extract haplotypes
                hap_k = all_haps[k][i] + all_haps[k][j]
                if hap_k in hap:
                    hap[hap_k] += 1
                else:
                    hap[hap_k] = 1

            # Remove Missing Haplotypes
            keys = list(hap.keys())
            for key in keys:
                if "." in key:
                    hap.pop(key, None)

            # Check all haplotypes are present
            if len(hap) != 4:
                snp_i_a = allele_lst[i].split("/")
                snp_j_a = allele_lst[j].split("/")
                haps = [
                    snp_i_a[0] + snp_j_a[0], snp_i_a[0] + snp_j_a[1],
                    snp_i_a[1] + snp_j_a[0], snp_i_a[1] + snp_j_a[1]
                ]
                for h in haps:
                    if h not in hap:
                        hap[h] = 0

            # Perform LD calculations
            A = hap[sorted(hap)[0]]
            B = hap[sorted(hap)[1]]
            C = hap[sorted(hap)[2]]
            D = hap[sorted(hap)[3]]
            tmax = max(A, B, C, D)
            delta = float(A * D - B * C)
            Ms = float((A + C) * (B + D) * (A + B) * (C + D))
            if Ms != 0:
                # D prime
                if delta < 0:
                    D_prime = round(
                        abs(delta / min((A + C) * (A + B), (B + D) * (C + D))),
                        3)
                else:
                    D_prime = round(
                        abs(delta / min((A + C) * (C + D), (A + B) * (B + D))),
                        3)

                # R2
                r2 = round((delta**2) / Ms, 3)

                # Find Correlated Alleles
                if str(r2) != "NA" and float(r2) > 0.1:
                    Ac = hap[sorted(hap)[0]]
                    Bc = hap[sorted(hap)[1]]
                    Cc = hap[sorted(hap)[2]]
                    Dc = hap[sorted(hap)[3]]

                    if ((Ac * Dc) / max((Bc * Cc), 0.01) > 1):
                        match = sorted(hap)[0][0] + "=" + sorted(
                            hap)[0][1] + "," + sorted(
                                hap)[3][0] + "=" + sorted(hap)[3][1]
                    else:
                        match = sorted(hap)[1][0] + "=" + sorted(
                            hap)[1][1] + "," + sorted(
                                hap)[2][0] + "=" + sorted(hap)[2][1]
                else:
                    match = "  =  ,  =  "
            else:
                D_prime = "NA"
                r2 = "NA"
                match = "  =  ,  =  "

            snp1 = rsnum_lst[i]
            snp2 = rsnum_lst[j]
            pos1 = pos_lst[i].split("-")[0]
            pos2 = pos_lst[j].split("-")[0]
            allele1 = allele_lst[i]
            allele2 = allele_lst[j]
            corr = match.split(",")[0].split("=")[1] + "=" + match.split(
                ",")[0].split("=")[0] + "," + match.split(",")[1].split(
                    "=")[1] + "=" + match.split(",")[1].split("=")[0]
            corr_f = match

            ld_matrix[i][j] = [
                snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2
            ]
            ld_matrix[j][i] = [
                snp2, snp1, allele2, allele1, corr_f, pos2, pos1, D_prime, r2
            ]

    # Generate Plot Variables
    out = [j for i in ld_matrix for j in i]
    xnames = []
    ynames = []
    xA = []
    yA = []
    corA = []
    xpos = []
    ypos = []
    D = []
    R = []
    box_color = []
    box_trans = []

    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    for i in range(len(out)):
        snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 = out[i]
        xnames.append(snp1)
        ynames.append(snp2)
        xA.append(allele1)
        yA.append(allele2)
        corA.append(corr)
        xpos.append(pos1)
        ypos.append(pos2)
        sqrti = math.floor(math.sqrt(len(out)))
        if sqrti == 0:
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(r2)
        elif i % sqrti < i // sqrti and r2 != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("blue")
            box_trans.append(abs(D_prime))
        elif i % sqrti > i // sqrti and D_prime != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(r2)
        elif i % sqrti == i // sqrti and D_prime != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("purple")
            box_trans.append(r2)
        else:
            D.append("NA")
            R.append("NA")
            box_color.append("gray")
            box_trans.append(0.1)
    # Import plotting modules
    from collections import OrderedDict
    from bokeh.embed import components, file_html
    from bokeh.layouts import gridplot
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
    from bokeh.resources import CDN
    from bokeh.io import export_svgs
    import svgutils.compose as sg
    from math import pi

    reset_output()

    # Aggregate Plotting Data
    x = []
    y = []
    w = []
    h = []
    coord_snps_plot = []
    snp_id_plot = []
    alleles_snp_plot = []
    for i in range(0, len(xpos), int(len(xpos)**0.5)):
        x.append(int(xpos[i].split(":")[1]) / 1000000.0)
        y.append(0.5)
        w.append(0.00003)
        h.append(1.06)
        coord_snps_plot.append(xpos[i])
        snp_id_plot.append(xnames[i])
        alleles_snp_plot.append(xA[i])

    buffer = (x[-1] - x[0]) * 0.025
    xr = Range1d(start=x[0] - buffer, end=x[-1] + buffer)
    yr = Range1d(start=-0.03, end=1.03)
    y2_ll = [-0.03] * len(x)
    y2_ul = [1.03] * len(x)

    yr_pos = Range1d(start=(x[-1] + buffer) * -1, end=(x[0] - buffer) * -1)
    yr0 = Range1d(start=0, end=1)
    yr2 = Range1d(start=0, end=3.8)
    yr3 = Range1d(start=0, end=1)

    spacing = (x[-1] - x[0] + buffer + buffer) / (len(x) * 1.0)
    x2 = []
    y0 = []
    y1 = []
    y2 = []
    y3 = []
    y4 = []
    for i in range(len(x)):
        x2.append(x[0] - buffer + spacing * (i + 0.5))
        y0.append(0)
        y1.append(0.20)
        y2.append(0.80)
        y3.append(1)
        y4.append(1.15)

    xname_pos = []
    for i in x2:
        for j in range(len(x2)):
            xname_pos.append(i)

    data = {
        'xname': xnames,
        'xname_pos': xname_pos,
        'yname': ynames,
        'xA': xA,
        'yA': yA,
        'xpos': xpos,
        'ypos': ypos,
        'R2': R,
        'Dp': D,
        'corA': corA,
        'box_color': box_color,
        'box_trans': box_trans
    }

    source = ColumnDataSource(data)

    threshold = 70
    if len(snps) < threshold:
        matrix_plot = figure(
            outline_line_color="white",
            min_border_top=0,
            min_border_bottom=2,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=list(reversed(rsnum_lst)),
            h_symmetry=False,
            v_symmetry=False,
            border_fill_color='white',
            x_axis_type=None,
            logo=None,
            tools="hover,undo,redo,reset,pan,box_zoom,previewsave",
            title=" ",
            plot_width=800,
            plot_height=700)

    else:
        matrix_plot = figure(
            outline_line_color="white",
            min_border_top=0,
            min_border_bottom=2,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=list(reversed(rsnum_lst)),
            h_symmetry=False,
            v_symmetry=False,
            border_fill_color='white',
            x_axis_type=None,
            y_axis_type=None,
            logo=None,
            tools="hover,undo,redo,reset,pan,box_zoom,previewsave",
            title=" ",
            plot_width=800,
            plot_height=700)

    matrix_plot.rect(x='xname_pos',
                     y='yname',
                     width=0.95 * spacing,
                     height=0.95,
                     source=source,
                     color="box_color",
                     alpha="box_trans",
                     line_color=None)

    matrix_plot.grid.grid_line_color = None
    matrix_plot.axis.axis_line_color = None
    matrix_plot.axis.major_tick_line_color = None
    if len(snps) < threshold:
        matrix_plot.axis.major_label_text_font_size = "8pt"
        matrix_plot.xaxis.major_label_orientation = "vertical"

    matrix_plot.axis.major_label_text_font_style = "normal"
    matrix_plot.xaxis.major_label_standoff = 0

    sup_2 = "\u00B2"

    hover = matrix_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Variant 1", " " + "@yname (@yA)"),
        ("Variant 2", " " + "@xname (@xA)"),
        ("D\'", " " + "@Dp"),
        ("R" + sup_2, " " + "@R2"),
        ("Correlated Alleles", " " + "@corA"),
    ])

    # Connecting and Rug Plots
    # Connector Plot
    if len(snps) < threshold:
        connector = figure(outline_line_color="white",
                           y_axis_type=None,
                           x_axis_type=None,
                           x_range=xr,
                           y_range=yr2,
                           border_fill_color='white',
                           title="",
                           min_border_left=100,
                           min_border_right=5,
                           min_border_top=0,
                           min_border_bottom=0,
                           h_symmetry=False,
                           v_symmetry=False,
                           plot_width=800,
                           plot_height=90,
                           tools="xpan,tap")
        connector.segment(x, y0, x, y1, color="black")
        connector.segment(x, y1, x2, y2, color="black")
        connector.segment(x2, y2, x2, y3, color="black")
        connector.text(x2,
                       y4,
                       text=snp_id_plot,
                       alpha=1,
                       angle=pi / 2,
                       text_font_size="8pt",
                       text_baseline="middle",
                       text_align="left")
    else:
        connector = figure(outline_line_color="white",
                           y_axis_type=None,
                           x_axis_type=None,
                           x_range=xr,
                           y_range=yr3,
                           border_fill_color='white',
                           title="",
                           min_border_left=100,
                           min_border_right=5,
                           min_border_top=0,
                           min_border_bottom=0,
                           h_symmetry=False,
                           v_symmetry=False,
                           plot_width=800,
                           plot_height=30,
                           tools="xpan,tap")
        connector.segment(x, y0, x, y1, color="black")
        connector.segment(x, y1, x2, y2, color="black")
        connector.segment(x2, y2, x2, y3, color="black")

    connector.yaxis.major_label_text_color = None
    connector.yaxis.minor_tick_line_alpha = 0  # Option does not work
    connector.yaxis.axis_label = " "
    connector.grid.grid_line_color = None
    connector.axis.axis_line_color = None
    connector.axis.major_tick_line_color = None
    connector.axis.minor_tick_line_color = None

    connector.toolbar_location = None

    data_rug = {
        'x': x,
        'y': y,
        'w': w,
        'h': h,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_rug = ColumnDataSource(data_rug)

    # Rug Plot
    rug = figure(x_range=xr,
                 y_range=yr,
                 y_axis_type=None,
                 title="",
                 min_border_top=1,
                 min_border_bottom=0,
                 min_border_left=100,
                 min_border_right=5,
                 h_symmetry=False,
                 v_symmetry=False,
                 plot_width=800,
                 plot_height=50,
                 tools="hover,xpan,tap")
    rug.rect(x='x',
             y='y',
             width='w',
             height='h',
             fill_color='red',
             dilate=True,
             line_color=None,
             fill_alpha=0.6,
             source=source_rug)

    hover = rug.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("SNP", "@snp_id_plot (@alleles_snp_plot)"),
        ("Coord", "@coord_snps_plot"),
    ])

    rug.toolbar_location = None

    if collapseTranscript == "false":
        # Gene Plot (All Transcripts)
        genes_file = tmp_dir + "genes_" + request + ".json"
        genes_raw = open(genes_file).readlines()

        genes_plot_start = []
        genes_plot_end = []
        genes_plot_y = []
        genes_plot_name = []
        exons_plot_x = []
        exons_plot_y = []
        exons_plot_w = []
        exons_plot_h = []
        exons_plot_name = []
        exons_plot_id = []
        exons_plot_exon = []
        message = ["Too many genes to plot."]
        lines = [0]
        gap = 80000
        tall = 0.75
        if genes_raw != None and len(genes_raw) > 0:
            for gene_raw_obj in genes_raw:
                gene_obj = json.loads(gene_raw_obj)
                bin = gene_obj["bin"]
                name_id = gene_obj["name"]
                chrom = gene_obj["chrom"]
                strand = gene_obj["strand"]
                txStart = gene_obj["txStart"]
                txEnd = gene_obj["txEnd"]
                cdsStart = gene_obj["cdsStart"]
                cdsEnd = gene_obj["cdsEnd"]
                exonCount = gene_obj["exonCount"]
                exonStarts = gene_obj["exonStarts"]
                exonEnds = gene_obj["exonEnds"]
                score = gene_obj["score"]
                name2 = gene_obj["name2"]
                cdsStartStat = gene_obj["cdsStartStat"]
                cdsEndStat = gene_obj["cdsEndStat"]
                exonFrames = gene_obj["exonFrames"]
                name = name2
                id = name_id
                e_start = exonStarts.split(",")
                e_end = exonEnds.split(",")

                # Determine Y Coordinate
                i = 0
                y_coord = None
                while y_coord == None:
                    if i > len(lines) - 1:
                        y_coord = i + 1
                        lines.append(int(txEnd))
                    elif int(txStart) > (gap + lines[i]):
                        y_coord = i + 1
                        lines[i] = int(txEnd)
                    else:
                        i += 1

                genes_plot_start.append(int(txStart) / 1000000.0)
                genes_plot_end.append(int(txEnd) / 1000000.0)
                genes_plot_y.append(y_coord)
                genes_plot_name.append(name + "  ")

                for i in range(len(e_start) - 1):
                    if strand == "+":
                        exon = i + 1
                    else:
                        exon = len(e_start) - 1 - i

                    width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                    x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                    exons_plot_x.append(x_coord)
                    exons_plot_y.append(y_coord)
                    exons_plot_w.append(width)
                    exons_plot_h.append(tall)
                    exons_plot_name.append(name)
                    exons_plot_id.append(id)
                    exons_plot_exon.append(exon)

        n_rows = len(lines)
        genes_plot_yn = [n_rows - w + 0.5 for w in genes_plot_y]
        exons_plot_yn = [n_rows - w + 0.5 for w in exons_plot_y]
        yr2 = Range1d(start=0, end=n_rows)

        data_gene_plot = {
            'exons_plot_x': exons_plot_x,
            'exons_plot_yn': exons_plot_yn,
            'exons_plot_w': exons_plot_w,
            'exons_plot_h': exons_plot_h,
            'exons_plot_name': exons_plot_name,
            'exons_plot_id': exons_plot_id,
            'exons_plot_exon': exons_plot_exon,
            'coord_snps_plot': coord_snps_plot,
            'snp_id_plot': snp_id_plot,
            'alleles_snp_plot': alleles_snp_plot
        }

        source_gene_plot = ColumnDataSource(data_gene_plot)

        max_genes = 40
        # if len(lines) < 3 or len(genes_raw) > max_genes:
        if len(lines) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=800,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        # if len(genes_raw) <= max_genes:
        gene_plot.segment(genes_plot_start,
                          genes_plot_yn,
                          genes_plot_end,
                          genes_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)
        gene_plot.rect(x='exons_plot_x',
                       y='exons_plot_yn',
                       width='exons_plot_w',
                       height='exons_plot_h',
                       source=source_gene_plot,
                       fill_color='grey',
                       line_color="grey")
        gene_plot.text(genes_plot_start,
                       genes_plot_yn,
                       text=genes_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

        # else:
        #     x_coord_text = x[0] + (x[-1] - x[0]) / 2.0
        #     gene_plot.text(x_coord_text, n_rows / 2.0, text=message, alpha=1,
        #                    text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

        gene_plot.xaxis.axis_label = "Chromosome " + \
            snp_coords[1][1] + " Coordinate (Mb)(" + genome_build_vars[genome_build]['title'] + ")"
        gene_plot.yaxis.axis_label = "Genes (All Transcripts)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # Gene Plot (Collapsed)
    else:
        genes_c_file = tmp_dir + "genes_c_" + request + ".json"
        genes_c_raw = open(genes_c_file).readlines()

        genes_c_plot_start = []
        genes_c_plot_end = []
        genes_c_plot_y = []
        genes_c_plot_name = []
        exons_c_plot_x = []
        exons_c_plot_y = []
        exons_c_plot_w = []
        exons_c_plot_h = []
        exons_c_plot_name = []
        exons_c_plot_id = []
        message_c = ["Too many genes to plot."]
        lines_c = [0]
        gap = 80000
        tall = 0.75
        if genes_c_raw != None and len(genes_c_raw) > 0:
            for gene_c_raw_obj in genes_c_raw:
                gene_c_obj = json.loads(gene_c_raw_obj)
                chrom = gene_c_obj["chrom"]
                txStart = gene_c_obj["txStart"]
                txEnd = gene_c_obj["txEnd"]
                exonStarts = gene_c_obj["exonStarts"]
                exonEnds = gene_c_obj["exonEnds"]
                name2 = gene_c_obj["name2"]
                transcripts = gene_c_obj["transcripts"]
                name = name2
                e_start = exonStarts.split(",")
                e_end = exonEnds.split(",")
                e_transcripts = transcripts.split(",")

                # Determine Y Coordinate
                i = 0
                y_coord = None
                while y_coord == None:
                    if i > len(lines_c) - 1:
                        y_coord = i + 1
                        lines_c.append(int(txEnd))
                    elif int(txStart) > (gap + lines_c[i]):
                        y_coord = i + 1
                        lines_c[i] = int(txEnd)
                    else:
                        i += 1

                genes_c_plot_start.append(int(txStart) / 1000000.0)
                genes_c_plot_end.append(int(txEnd) / 1000000.0)
                genes_c_plot_y.append(y_coord)
                genes_c_plot_name.append(name + "  ")

                # for i in range(len(e_start)):
                for i in range(len(e_start) - 1):
                    width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                    x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                    exons_c_plot_x.append(x_coord)
                    exons_c_plot_y.append(y_coord)
                    exons_c_plot_w.append(width)
                    exons_c_plot_h.append(tall)
                    exons_c_plot_name.append(name)
                    exons_c_plot_id.append(e_transcripts[i].replace("-", ","))

        n_rows_c = len(lines_c)
        genes_c_plot_yn = [n_rows_c - x + 0.5 for x in genes_c_plot_y]
        exons_c_plot_yn = [n_rows_c - x + 0.5 for x in exons_c_plot_y]
        yr2_c = Range1d(start=0, end=n_rows_c)

        data_gene_c_plot = {
            'exons_c_plot_x': exons_c_plot_x,
            'exons_c_plot_yn': exons_c_plot_yn,
            'exons_c_plot_w': exons_c_plot_w,
            'exons_c_plot_h': exons_c_plot_h,
            'exons_c_plot_name': exons_c_plot_name,
            'exons_c_plot_id': exons_c_plot_id
        }
        source_gene_c_plot = ColumnDataSource(data_gene_c_plot)
        max_genes_c = 40
        # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c:
        if len(lines_c) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines_c) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2_c,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=900,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        # if len(genes_c_raw) <= max_genes_c:
        gene_plot.segment(genes_c_plot_start,
                          genes_c_plot_yn,
                          genes_c_plot_end,
                          genes_c_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)
        gene_plot.rect(x='exons_c_plot_x',
                       y='exons_c_plot_yn',
                       width='exons_c_plot_w',
                       height='exons_c_plot_h',
                       source=source_gene_c_plot,
                       fill_color="grey",
                       line_color="grey")
        gene_plot.text(genes_c_plot_start,
                       genes_c_plot_yn,
                       text=genes_c_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_c_plot_name"),
            ("Transcript IDs", "@exons_c_plot_id"),
        ])

        # else:
        # 	x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0
        # 	gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1,
        # 				   text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

        gene_plot.xaxis.axis_label = "Chromosome " + snp_coords[1][
            1] + " Coordinate (Mb)(" + genome_build_vars[genome_build][
                'title'] + ")"
        gene_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    matrix_plot.output_backend = "svg"
    connector.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"
    export_svgs(matrix_plot,
                filename=tmp_dir + "matrix_plot_1_" + request + ".svg")
    export_svgs(connector,
                filename=tmp_dir + "connector_1_" + request + ".svg")
    export_svgs(rug, filename=tmp_dir + "rug_1_" + request + ".svg")
    export_svgs(gene_plot,
                filename=tmp_dir + "gene_plot_1_" + request + ".svg")

    # 1 pixel = 0.0264583333 cm
    svg_height = str(25.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(110.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs
    sg.Figure(
        "21.59cm", svg_height,
        sg.SVG(tmp_dir + "matrix_plot_1_" + request + ".svg"),
        sg.SVG(tmp_dir + "connector_1_" + request + ".svg").scale(.97).move(
            0, 700),
        sg.SVG(tmp_dir + "rug_1_" + request + ".svg").scale(.97).move(0, 790),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(.97).move(
            0, 840)).save(tmp_dir + "matrix_plot_" + request + ".svg")

    sg.Figure(
        "107.95cm", svg_height_scaled,
        sg.SVG(tmp_dir + "matrix_plot_1_" + request + ".svg").scale(5),
        sg.SVG(tmp_dir + "connector_1_" + request + ".svg").scale(4.85).move(
            0, 3500),
        sg.SVG(tmp_dir + "rug_1_" + request + ".svg").scale(4.85).move(
            0, 3930),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(4.85).move(
            0, 4160)).save(tmp_dir + "matrix_plot_scaled_" + request + ".svg")

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "matrix_plot_" +
                    request + ".svg " + tmp_dir + "matrix_plot_" + request +
                    ".pdf",
                    shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "matrix_plot_scaled_" + request + ".svg " + tmp_dir +
                    "matrix_plot_" + request + ".png",
                    shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "matrix_plot_scaled_" + request + ".svg " + tmp_dir +
                    "matrix_plot_" + request + ".jpeg",
                    shell=True)
    # Remove individual SVG files after they are combined
    subprocess.call("rm " + tmp_dir + "matrix_plot_1_" + request + ".svg",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "rug_1_" + request + ".svg", shell=True)
    subprocess.call("rm " + tmp_dir + "connector_1_" + request + ".svg",
                    shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + tmp_dir + "matrix_plot_scaled_" + request + ".svg",
                    shell=True)
    # Remove temporary file(s)
    subprocess.call("rm " + tmp_dir + "genes_*" + request + "*.json",
                    shell=True)

    reset_output()

    return None
Exemplo n.º 2
0
def calculate_matrix_svg(snplst, pop, request, r2_d="r2"):
    """Compute pairwise LD for a SNP list and export the matrix figure.

    Reads RS numbers from ``snplst``, looks up their coordinates in the SQLite
    SNP database named in ``config.yml``, pulls the matching records from the
    per-chromosome VCF with ``tabix``, computes D' and R2 for every SNP pair,
    and draws an LD matrix plus a gene track with bokeh. Both figures are
    exported as SVG, stacked with svgutils, and handed to
    ``phantomjs ./rasterize.js`` for PDF/PNG/JPEG conversion.

    Args:
        snplst: Path to a text file; the first whitespace-separated token of
            each line is treated as the variant identifier. Only tokens that
            start with "rs" and end in a digit are looked up.
        pop: Population codes joined by "+" (e.g. "CEU+YRI"). Codes outside
            the hard-coded list below are ignored.
        request: String embedded in every temporary/output file name under
            ``./tmp/``. It is also interpolated into shell command strings,
            so callers must pass a value free of shell metacharacters.
        r2_d: "r2" or "d"; selects which statistic sets the box transparency.
            Any other value falls back to "r2".

    Returns:
        None. The outputs are files: ``./tmp/matrix_plot_<request>.svg`` and
        the ``.pdf``/``.png``/``.jpeg`` targets passed to rasterize.js. The
        intermediate ``./tmp/genes_<request>.txt`` is left in place.
    """
    # Set data directories using config.yml
    with open('config.yml', 'r') as f:
        # safe_load: yaml.load() without a Loader is rejected by current
        # PyYAML and can construct arbitrary objects on older versions.
        config = yaml.safe_load(f)
    gene_dir = config['data']['gene_dir']
    snp_dir = config['data']['snp_dir']
    pop_dir = config['data']['pop_dir']
    vcf_dir = config['data']['vcf_dir']

    tmp_dir = "./tmp/"

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Open SNP list file (closed deterministically)
    with open(snplst) as snp_file:
        snps_raw = snp_file.readlines()

    # Remove duplicate RS numbers (order of first appearance is kept)
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        if snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    known_pops = ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW",
                  "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN",
                  "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK",
                  "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]
    pop_dirs = [pop_dir + pop_i + ".txt"
                for pop_i in pop.split("+") if pop_i in known_pops]

    get_pops = "cat " + " ".join(pop_dirs)
    # universal_newlines=True yields text on both Python 2 and 3, so the
    # sample ids can be compared with the (text) VCF header below.
    proc = subprocess.Popen(get_pops, shell=True, stdout=subprocess.PIPE,
                            universal_newlines=True)
    pop_list = proc.stdout.readlines()

    # Only membership is tested against this collection.
    pop_ids = set(sample.strip() for sample in pop_list)

    # Connect to snp database
    conn = sqlite3.connect(snp_dir)
    conn.text_factory = str
    cur = conn.cursor()

    def get_coords(rs):
        """Return the database row for an RS number, or None if absent."""
        rs_digits = rs.strip("rs")
        # The table is chosen by the last digit of the id; the caller only
        # passes ids whose last character is a digit.
        cur.execute("SELECT * FROM tbl_" + rs_digits[-1] + " WHERE id=?",
                    (rs_digits,))
        return cur.fetchone()

    # Find RS numbers in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    try:
        for snp_i in snps:
            if len(snp_i) == 0 or len(snp_i[0]) <= 2:
                continue
            rs_id = snp_i[0]
            if rs_id[0:2] == "rs" and rs_id[-1].isdigit():
                snp_coord = get_coords(rs_id)
                if snp_coord is not None:
                    rs_nums.append(rs_id)
                    snp_pos.append(snp_coord[2])
                    snp_coords.append([rs_id, snp_coord[1], snp_coord[2]])
    finally:
        # Close snp connection even if a lookup fails
        cur.close()
        conn.close()

    # All SNPs are treated as lying on the chromosome of the first hit; the
    # same value is reused for the VCF file, the gene query and axis label.
    chromosome = snp_coords[0][1]

    # Sort coordinates and make tabix formatted coordinates
    snp_pos_int = sorted(int(pos) for pos in snp_pos)
    snp_coord_str = [chromosome + ":" + str(pos) + "-" + str(pos)
                     for pos in snp_pos_int]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_file = vcf_dir + chromosome + \
        ".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"
    tabix_snps = "tabix -h {0}{1} | grep -v -e END".format(
        vcf_file, tabix_coords)
    proc = subprocess.Popen(tabix_snps, shell=True, stdout=subprocess.PIPE,
                            universal_newlines=True)

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        """Drop the shared leading base of indel alleles; "-" marks empty."""
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return(a1_n, a2_n)

    # Import SNP VCF files
    vcf = proc.stdout.readlines()

    # Skip the "##" meta lines; the next line is the column header.
    header_row = 0
    while vcf[header_row].startswith("##"):
        header_row += 1

    head = vcf[header_row].strip().split()

    # Extract haplotypes: columns 9+ are per-sample genotype columns
    index = [col for col in range(9, len(head)) if head[col] in pop_ids]

    # One list per selected sample (at least one list even with no samples)
    hap1 = [[] for _ in range(max(1, len(index)))]
    hap2 = [[] for _ in range(max(1, len(index)))]

    rsnum_lst = []
    allele_lst = []
    pos_lst = []

    first_data_row = header_row + 1
    for g in range(first_data_row, len(vcf)):
        geno = vcf[g].strip().split()
        if geno[1] not in snp_pos:
            continue

        # Pick the queried RS number for this position; when several query
        # ids share a position, take the first one not yet emitted.
        rs_query = None
        if snp_pos.count(geno[1]) == 1:
            rs_query = rs_nums[snp_pos.index(geno[1])]
        else:
            for p in range(len(snp_pos)):
                if snp_pos[p] == geno[1] and rs_nums[p] not in rsnum_lst:
                    rs_query = rs_nums[p]
                    break

        if rs_query is None or rs_query in rsnum_lst:
            continue

        rs_1000g = geno[2]

        if rs_query == rs_1000g:
            rsnum = rs_1000g
        else:
            # The VCF id differs from the query id. If the query id appears
            # on a record within two lines of this one, skip this record and
            # let that one be used. The window is clamped to data rows so
            # header lines are never indexed.
            found = False
            for k in range(max(first_data_row, g - 2), min(g + 3, len(vcf))):
                neighbour = vcf[k].strip().split()
                if len(neighbour) > 2 and rs_query == neighbour[2]:
                    found = True
                    break
            if found:
                continue
            # Otherwise keep the record under the queried id.
            rsnum = rs_query

        # Multi-allelic records (comma in REF or ALT) are skipped.
        if "," not in geno[3] and "," not in geno[4]:
            a1, a2 = set_alleles(geno[3], geno[4])
            calls = {
                "0|0": (a1, a1),
                "0|1": (a1, a2),
                "1|0": (a2, a1),
                "1|1": (a2, a2),
                "0": (a1, "."),
                "1": (a2, "."),
            }
            for s, col in enumerate(index):
                # Any other genotype string is recorded as missing.
                first, second = calls.get(geno[col], (".", "."))
                hap1[s].append(first)
                hap2[s].append(second)

            rsnum_lst.append(rsnum)

            position = "chr" + geno[0] + ":" + geno[1] + "-" + geno[1]
            pos_lst.append(position)
            alleles = a1 + "/" + a2
            allele_lst.append(alleles)

    # Calculate Pairwise LD Statistics
    all_haps = hap1 + hap2
    n_snps = len(all_haps[0])
    # Every cell is filled below (upper triangle plus its mirror).
    ld_matrix = [[None for _ in range(n_snps)] for _ in range(n_snps)]

    for i in range(n_snps):
        for j in range(i, n_snps):
            # Count two-locus haplotypes (allele at i + allele at j)
            hap = {}
            for k in range(len(all_haps)):
                hap_k = all_haps[k][i] + all_haps[k][j]
                if hap_k in hap:
                    hap[hap_k] += 1
                else:
                    hap[hap_k] = 1

            # Remove Missing Haplotypes. Iterate over a snapshot of the
            # keys: deleting while iterating a live view raises on Python 3.
            for key in list(hap):
                if "." in key:
                    hap.pop(key, None)

            # Check all haplotypes are present
            if len(hap) != 4:
                snp_i_a = allele_lst[i].split("/")
                snp_j_a = allele_lst[j].split("/")
                expected = [snp_i_a[0] + snp_j_a[0], snp_i_a[0] + snp_j_a[1],
                            snp_i_a[1] + snp_j_a[0], snp_i_a[1] + snp_j_a[1]]
                for hap_key in expected:
                    if hap_key not in hap:
                        hap[hap_key] = 0

            # Perform LD calculations on the 2x2 table, keys in sorted order
            hk = sorted(hap)
            A = hap[hk[0]]
            B = hap[hk[1]]
            C = hap[hk[2]]
            D = hap[hk[3]]
            tmax = max(A, B, C, D)
            delta = float(A * D - B * C)
            Ms = float((A + C) * (B + D) * (A + B) * (C + D))
            if Ms != 0:
                # D prime
                if delta < 0:
                    D_prime = round(
                        abs(delta / min((A + C) * (A + B), (B + D) * (C + D))), 3)
                else:
                    D_prime = round(
                        abs(delta / min((A + C) * (C + D), (A + B) * (B + D))), 3)

                # R2
                r2 = round((delta**2) / Ms, 3)

                # Find Correlated Alleles
                if r2 > 0.1:
                    N = A + B + C + D
                    # Expected Cell Counts
                    eA = (A + B) * (A + C) / N
                    eB = (B + A) * (B + D) / N
                    eC = (C + A) * (C + D) / N
                    eD = (D + C) * (D + B) / N

                    # Calculate Deltas
                    dA = (A - eA)**2
                    dB = (B - eB)**2
                    dC = (C - eC)**2
                    dD = (D - eD)**2
                    dmax = max(dA, dB, dC, dD)

                    cis = hk[0][0] + "=" + hk[0][1] + "," + \
                        hk[2][0] + "=" + hk[1][1]
                    trans = hk[0][0] + "=" + hk[1][1] + "," + \
                        hk[2][0] + "=" + hk[0][1]

                    if dA == dB == dC == dD:
                        # NOTE(review): tmax is a haplotype count while
                        # dA/dD are squared deviations; comparing them looks
                        # unintended (A/D may have been meant). Left as in
                        # the original - confirm before changing.
                        if tmax == dA or tmax == dD:
                            match = cis
                        else:
                            match = trans
                    elif dmax == dA or dmax == dD:
                        match = cis
                    else:
                        match = trans
                else:
                    match = "  =  ,  =  "
            else:
                D_prime = "NA"
                r2 = "NA"
                match = "  =  ,  =  "

            snp1 = rsnum_lst[i]
            snp2 = rsnum_lst[j]
            pos1 = pos_lst[i].split("-")[0]
            pos2 = pos_lst[j].split("-")[0]
            allele1 = allele_lst[i]
            allele2 = allele_lst[j]
            # corr is match with each "x=y" pair flipped to "y=x"
            match_parts = match.split(",")
            pair_a = match_parts[0].split("=")
            pair_b = match_parts[1].split("=")
            corr = pair_a[1] + "=" + pair_a[0] + "," + \
                pair_b[1] + "=" + pair_b[0]
            corr_f = match

            ld_matrix[i][j] = [snp1, snp2, allele1,
                               allele2, corr, pos1, pos2, D_prime, r2]
            ld_matrix[j][i] = [snp2, snp1, allele2,
                               allele1, corr_f, pos2, pos1, D_prime, r2]

    # Generate Plot Variables
    out = [cell for row in ld_matrix for cell in row]
    xnames = []
    ynames = []
    xA = []
    yA = []
    corA = []
    xpos = []
    ypos = []
    dprime_txt = []
    r2_txt = []
    box_color = []
    box_trans = []

    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    for cell in out:
        snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 = cell
        xnames.append(snp1)
        ynames.append(snp2)
        xA.append(allele1)
        yA.append(allele2)
        corA.append(corr)
        xpos.append(pos1)
        ypos.append(pos2)
        if r2_d == "r2" and r2 != "NA":
            dprime_txt.append(str(round(float(D_prime), 4)))
            r2_txt.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(r2)
        elif r2_d == "d" and D_prime != "NA":
            dprime_txt.append(str(round(float(D_prime), 4)))
            r2_txt.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(abs(D_prime))
        else:
            dprime_txt.append("NA")
            r2_txt.append("NA")
            box_color.append("blue")
            box_trans.append(0.1)

    # Import plotting modules (only the names used below)
    from collections import OrderedDict
    from bokeh.models import HoverTool, Range1d
    from bokeh.plotting import ColumnDataSource, figure, reset_output
    from bokeh.io import export_svgs
    import svgutils.compose as sg
    from math import pi

    reset_output()

    # Aggregate Plotting Data: one entry per SNP, taken from the first cell
    # of each matrix row (the flattened matrix has n*n cells).
    x = []
    y = []
    w = []
    h = []
    coord_snps_plot = []
    snp_id_plot = []
    alleles_snp_plot = []
    for i in range(0, len(xpos), int(len(xpos)**0.5)):
        x.append(int(xpos[i].split(":")[1]) / 1000000.0)
        y.append(0.5)
        w.append(0.00003)
        h.append(1.06)
        coord_snps_plot.append(xpos[i])
        snp_id_plot.append(xnames[i])
        alleles_snp_plot.append(xA[i])

    # Pad the x range by 2.5% of the span on each side
    buffer = (x[-1] - x[0]) * 0.025
    xr = Range1d(start=x[0] - buffer, end=x[-1] + buffer)
    yr = Range1d(start=-0.03, end=1.03)

    # Evenly spaced column centres for the matrix, one per SNP
    spacing = (x[-1] - x[0] + buffer + buffer) / (len(x) * 1.0)
    x2 = [x[0] - buffer + spacing * (k + 0.5) for k in range(len(x))]
    y0 = [0] * len(x)
    y1 = [0.20] * len(x)
    y2 = [0.80] * len(x)
    y3 = [1] * len(x)
    y4 = [1.15] * len(x)

    # Each column centre repeated once per row, matching the flattened matrix
    xname_pos = [centre for centre in x2 for _ in range(len(x2))]

    data = {
        'xname': xnames,
        'xname_pos': xname_pos,
        'yname': ynames,
        'xA': xA,
        'yA': yA,
        'xpos': xpos,
        'ypos': ypos,
        'R2': r2_txt,
        'Dp': dprime_txt,
        'corA': corA,
        'box_color': box_color,
        'box_trans': box_trans
    }

    source = ColumnDataSource(data)

    # Below this many input entries, SNP labels are drawn; above, hidden.
    threshold = 70
    show_labels = len(snps) < threshold

    matrix_kwargs = dict(
        outline_line_color="white", min_border_top=0, min_border_bottom=2,
        min_border_left=100, min_border_right=5,
        x_range=xr, y_range=list(reversed(rsnum_lst)),
        h_symmetry=False, v_symmetry=False, border_fill_color='white',
        x_axis_type=None, logo=None,
        tools="hover,undo,redo,reset,pan,box_zoom,previewsave", title=" ",
        plot_width=800, plot_height=700)
    if not show_labels:
        matrix_kwargs['y_axis_type'] = None
    matrix_plot = figure(**matrix_kwargs)

    matrix_plot.rect(x='xname_pos', y='yname', width=0.95 * spacing,
                     height=0.95, source=source,
                     color="box_color", alpha="box_trans", line_color=None)

    matrix_plot.grid.grid_line_color = None
    matrix_plot.axis.axis_line_color = None
    matrix_plot.axis.major_tick_line_color = None
    if show_labels:
        matrix_plot.axis.major_label_text_font_size = "8pt"
        matrix_plot.xaxis.major_label_orientation = "vertical"

    matrix_plot.axis.major_label_text_font_style = "normal"
    matrix_plot.xaxis.major_label_standoff = 0

    sup_2 = u"\u00B2"

    hover = matrix_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Variant 1", " @yname (@yA)"),
        ("Variant 2", " @xname (@xA)"),
        ("D'", " @Dp"),
        ("R" + sup_2, " @R2"),
        ("Correlated Alleles", " @corA"),
    ])

    # Connecting and Rug Plots
    # NOTE(review): the connector and rug figures are built here but this
    # function exports only matrix_plot and gene_plot further down.
    # Connector Plot
    connector = figure(
        outline_line_color="white", y_axis_type=None, x_axis_type=None,
        x_range=xr,
        y_range=(Range1d(start=0, end=3.8) if show_labels
                 else Range1d(start=0, end=1)),
        border_fill_color='white',
        title="", min_border_left=100, min_border_right=5,
        min_border_top=0, min_border_bottom=0,
        h_symmetry=False, v_symmetry=False,
        plot_width=800, plot_height=90 if show_labels else 30,
        tools="xpan,tap")
    connector.segment(x, y0, x, y1, color="black")
    connector.segment(x, y1, x2, y2, color="black")
    connector.segment(x2, y2, x2, y3, color="black")
    if show_labels:
        connector.text(x2, y4, text=snp_id_plot, alpha=1, angle=pi / 2,
                       text_font_size="8pt", text_baseline="middle",
                       text_align="left")

    connector.yaxis.major_label_text_color = None
    connector.yaxis.minor_tick_line_alpha = 0  # Option does not work
    connector.yaxis.axis_label = " "
    connector.grid.grid_line_color = None
    connector.axis.axis_line_color = None
    connector.axis.major_tick_line_color = None
    connector.axis.minor_tick_line_color = None

    connector.toolbar_location = None

    data_rug = {
        'x': x,
        'y': y,
        'w': w,
        'h': h,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_rug = ColumnDataSource(data_rug)

    # Rug Plot
    rug = figure(x_range=xr, y_range=yr, y_axis_type=None,
                 title="", min_border_top=1, min_border_bottom=0,
                 min_border_left=100, min_border_right=5,
                 h_symmetry=False, v_symmetry=False,
                 plot_width=800, plot_height=50, tools="hover,xpan,tap")
    rug.rect(x='x', y='y', width='w', height='h', fill_color='red',
             dilate=True, line_color=None, fill_alpha=0.6, source=source_rug)

    hover = rug.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("SNP", "@snp_id_plot (@alleles_snp_plot)"),
        ("Coord", "@coord_snps_plot"),
    ])

    rug.toolbar_location = None

    # Gene Plot: query the gene table for the padded plotting window
    genes_file = tmp_dir + "genes_" + request + ".txt"
    tabix_gene = "tabix -fh {0} {1}:{2}-{3} > {4}".format(
        gene_dir, chromosome, int((x[0] - buffer) * 1000000),
        int((x[-1] + buffer) * 1000000), genes_file)
    subprocess.call(tabix_gene, shell=True)
    with open(genes_file) as gene_handle:
        genes_raw = gene_handle.readlines()

    genes_plot_start = []
    genes_plot_end = []
    genes_plot_y = []
    genes_plot_name = []
    exons_plot_x = []
    exons_plot_y = []
    exons_plot_w = []
    exons_plot_h = []
    exons_plot_name = []
    exons_plot_id = []
    exons_plot_exon = []
    message = ["Too many genes to plot."]
    lines = [0]   # end coordinate of the last gene placed on each row
    gap = 80000   # minimum bp gap before a row can be reused
    tall = 0.75   # exon box height
    for gene_line in genes_raw:
        # Each line must split into exactly these 16 fields.
        (bin_, name_id, chrom, strand, txStart, txEnd, cdsStart, cdsEnd,
         exonCount, exonStarts, exonEnds, score, name2, cdsStartStat,
         cdsEndStat, exonFrames) = gene_line.strip().split()
        name = name2
        e_start = exonStarts.split(",")
        e_end = exonEnds.split(",")

        # Determine Y Coordinate: first row whose last gene ends more than
        # `gap` bp before this one starts, else open a new row.
        row = 0
        y_coord = None
        while y_coord is None:
            if row > len(lines) - 1:
                y_coord = row + 1
                lines.append(int(txEnd))
            elif int(txStart) > (gap + lines[row]):
                y_coord = row + 1
                lines[row] = int(txEnd)
            else:
                row += 1

        genes_plot_start.append(int(txStart) / 1000000.0)
        genes_plot_end.append(int(txEnd) / 1000000.0)
        genes_plot_y.append(y_coord)
        genes_plot_name.append(name + "  ")

        # The last element of the split lists is skipped (range stops at
        # len - 1); presumably the fields end with a trailing comma.
        for e in range(len(e_start) - 1):
            # Exons are numbered in transcription order
            if strand == "+":
                exon = e + 1
            else:
                exon = len(e_start) - 1 - e

            width = (int(e_end[e]) - int(e_start[e])) / 1000000.0
            x_coord = int(e_start[e]) / 1000000.0 + (width / 2)

            exons_plot_x.append(x_coord)
            exons_plot_y.append(y_coord)
            exons_plot_w.append(width)
            exons_plot_h.append(tall)
            exons_plot_name.append(name)
            exons_plot_id.append(name_id)
            exons_plot_exon.append(exon)

    # Flip rows so row 1 is drawn at the top
    n_rows = len(lines)
    genes_plot_yn = [n_rows - row_no + 0.5 for row_no in genes_plot_y]
    exons_plot_yn = [n_rows - row_no + 0.5 for row_no in exons_plot_y]
    yr2 = Range1d(start=0, end=n_rows)

    data_gene_plot = {
        'exons_plot_x': exons_plot_x,
        'exons_plot_yn': exons_plot_yn,
        'exons_plot_w': exons_plot_w,
        'exons_plot_h': exons_plot_h,
        'exons_plot_name': exons_plot_name,
        'exons_plot_id': exons_plot_id,
        'exons_plot_exon': exons_plot_exon,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_gene_plot = ColumnDataSource(data_gene_plot)

    max_genes = 40
    if len(lines) < 3 or len(genes_raw) > max_genes:
        plot_h_pix = 150
    else:
        plot_h_pix = 150 + (len(lines) - 2) * 50

    gene_plot = figure(
        min_border_top=2, min_border_bottom=0, min_border_left=100,
        min_border_right=5,
        x_range=xr, y_range=yr2, border_fill_color='white',
        title="", h_symmetry=False, v_symmetry=False, logo=None,
        plot_width=800, plot_height=plot_h_pix,
        tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

    if len(genes_raw) <= max_genes:
        gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end,
                          genes_plot_yn, color="black", alpha=1, line_width=2)
        gene_plot.rect(x='exons_plot_x', y='exons_plot_yn',
                       width='exons_plot_w', height='exons_plot_h',
                       source=source_gene_plot, fill_color='grey',
                       line_color="grey")
        gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name,
                       alpha=1, text_font_size="7pt", text_font_style="bold",
                       text_baseline="middle", text_align="right", angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

    else:
        # Too many genes: show a centred message instead of the track
        x_coord_text = x[0] + (x[-1] - x[0]) / 2.0
        gene_plot.text(x_coord_text, n_rows / 2.0, text=message, alpha=1,
                       text_font_size="12pt", text_font_style="bold",
                       text_baseline="middle", text_align="center", angle=0)

    gene_plot.xaxis.axis_label = "Chromosome " + \
        chromosome + " Coordinate (Mb)(GRCh37)"
    gene_plot.yaxis.axis_label = "Genes"
    gene_plot.ygrid.grid_line_color = None
    gene_plot.yaxis.axis_line_color = None
    gene_plot.yaxis.minor_tick_line_color = None
    gene_plot.yaxis.major_tick_line_color = None
    gene_plot.yaxis.major_label_text_color = None

    gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    matrix_plot.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"

    matrix_svg = tmp_dir + "matrix_plot_1_" + request + ".svg"
    gene_svg = tmp_dir + "gene_plot_1_" + request + ".svg"
    combined_svg = tmp_dir + "matrix_plot_" + request + ".svg"
    scaled_svg = tmp_dir + "matrix_plot_scaled_" + request + ".svg"

    export_svgs(matrix_plot, filename=matrix_svg)
    export_svgs(gene_plot, filename=gene_svg)

    # Concatenate svgs: matrix on top, gene track below
    sg.Figure("21.59cm", "27.94cm",
              sg.SVG(matrix_svg),
              sg.SVG(gene_svg).move(0, 720)
              ).save(combined_svg)

    # 5x copy used as the source for the raster formats
    sg.Figure("107.95cm", "139.70cm",
              sg.SVG(matrix_svg).scale(5),
              sg.SVG(gene_svg).scale(5).move(0, 3600)
              ).save(scaled_svg)

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + combined_svg + " " +
                    tmp_dir + "matrix_plot_" + request + ".pdf", shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + scaled_svg + " " +
                    tmp_dir + "matrix_plot_" + request + ".png", shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + scaled_svg + " " +
                    tmp_dir + "matrix_plot_" + request + ".jpeg", shell=True)
    # Remove individual SVG files after they are combined
    subprocess.call("rm " + matrix_svg, shell=True)
    subprocess.call("rm " + gene_svg, shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + scaled_svg, shell=True)

    reset_output()

    return None
Exemplo n.º 3
0
def calculate_assoc_svg(file, region, pop, request, myargs, myargsName, myargsOrigin):
    """Export the regional association plot of one request as SVG/PDF/PNG/JPEG.

    The function re-reads the association file, recomputes LD against the
    origin variant by running ``LDassoc_sub.py`` worker processes, builds the
    Bokeh association plot plus a gene track, and rasterizes the combined SVG
    with ``phantomjs ./rasterize.js``.

    Files read from ``./tmp/`` (per the inline comments they are produced by
    LDassoc.py): ``pops_<request>.txt``, ``snp_no_dups_<request>.vcf`` (only
    when no origin is given), ``recomb_<request>.txt`` and
    ``genes_<request>.txt`` or ``genes_c_<request>.txt``.

    Files written to ``./tmp/``: ``assoc_plot_<request>.svg``, ``.pdf``,
    ``.png`` and ``.jpeg``.  The temporary input files listed above are
    removed at the end of a successful run.

    Args:
        file: Path of the whitespace-delimited association file; its first
            line is the header.
        region: One of ``"variant"``, ``"gene"`` or ``"region"``.
        pop: Population label; only used in the plot title.
        request: Identifier embedded in every temporary/output file name.
        myargs: Dict of options. Keys read here: ``'chr'``, ``'bp'``,
            ``'pval'`` (column names in ``file``), ``'window'``, ``'start'``
            and ``'end'`` (``chr:pos`` strings, region mode), ``'dprime'``,
            ``'annotate'`` and ``'transcript'``.
        myargsName: Gene name for ``region == "gene"``, or the string
            ``"None"``.
        myargsOrigin: Origin variant as an rsID or ``chr:pos``, or the string
            ``"None"`` to pick the origin from the association data.

    Returns:
        Always ``None``.  Invalid input makes the function return early
        without producing any image.
    """

    # Set data directories using config.yml.
    # safe_load is used because yaml.load() without an explicit Loader is
    # deprecated and rejected by PyYAML >= 6; the config is plain data.
    with open('config.yml', 'r') as f:
        config = yaml.safe_load(f)
    gene_dir2 = config['data']['gene_dir2']
    vcf_dir = config['data']['vcf_dir']

    tmp_dir = "./tmp/"

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    def remove_origin_tmp_files():
        # Cleanup used when the requested origin variant turns out to be unusable
        subprocess.call("rm "+tmp_dir+"pops_"+request+".txt", shell=True)
        subprocess.call("rm "+tmp_dir+"*"+request+"*.vcf", shell=True)

    def read_vcf_header(vcf_path):
        # Return the split "#CHROM ..." column header line of a tabix-indexed VCF
        tabix_snp_h="tabix -H {0} | grep CHROM".format(vcf_path)
        proc_h=subprocess.Popen(tabix_snp_h, shell=True, stdout=subprocess.PIPE)
        return [x.decode('utf-8') for x in proc_h.stdout.readlines()][0].strip().split()

    def count_alleles(geno, head, sample_ids):
        # Tally phased alleles of one VCF record over the selected samples
        # (sample columns start at index 9)
        genotypes={"0":0, "1":0}
        for k in range(9,len(head)):
            if head[k] in sample_ids:
                for allele in geno[k].split("|"):
                    genotypes[allele]=genotypes.get(allele,0)+1
        return genotypes

    chrs=["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y"]

    has_origin = myargsOrigin!="None"

    # The --variant option cannot work without an origin variant
    if region=="variant" and not has_origin:
        return None

    if has_origin:
        # Find coordinates (GRCh37/hg19) for SNP RS number
        if myargsOrigin[0:2]=="rs":
            snp=myargsOrigin

            def get_coords_var(db, rsid):
                rsid = rsid.strip("rs")
                query_results = db.dbsnp151.find_one({"id": rsid})
                query_results_sanitized = json.loads(json_util.dumps(query_results))
                return query_results_sanitized

            # Connect to Mongo snp database
            # NOTE(review): username, password and port are not defined in this
            # function; presumably module-level globals - confirm.
            client = MongoClient('mongodb://'+username+':'+password+'@localhost/admin', port)
            try:
                # Find RS number in snp database
                var_coord=get_coords_var(client["LDLink"], snp)
            finally:
                client.close()

            if var_coord is None:
                return None

        elif myargsOrigin.split(":")[0].strip("chr") in chrs and len(myargsOrigin.split(":"))==2:
            snp=myargsOrigin
            origin_parts=myargsOrigin.split(":")
            # Same shape as the Mongo record so both origin formats are read alike
            # (a positional list here made the key lookups below raise TypeError)
            var_coord={'chromosome': origin_parts[0].strip("chr"), 'position': origin_parts[1]}

        else:
            return None

        chromosome = var_coord['chromosome']
        org_coord = var_coord['position']

    # Open Association Data
    header_list=[myargs['chr'], myargs['bp'], myargs['pval']]

    # Load input file
    with open(file) as fp:
        header = fp.readline().strip().split()
        first = fp.readline().strip().split()

    if len(header)!=len(first):
        return None

    # Check header
    for item in header_list:
        if item not in header:
            return None

    len_head=len(header)

    chr_index=header.index(myargs['chr'])
    pos_index=header.index(myargs['bp'])
    p_index=header.index(myargs['pval'])

    # Define window of interest around query SNP
    if myargs['window'] is None:
        if region=="variant":
            window=500000
        elif region=="gene":
            window=100000
        else:
            window=0
    else:
        window=myargs['window']

    if region=="variant":
        coord1=int(org_coord)-window
        if coord1<0:
            coord1=0
        coord2=int(org_coord)+window

    elif region=="gene":
        if myargsName=="None":
            return None

        # Connect to gene database and look the gene up by upper-cased name
        conn=sqlite3.connect(gene_dir2)
        conn.text_factory=str
        cur=conn.cursor()
        try:
            cur.execute("SELECT * FROM genes WHERE name=?", (myargsName.upper(),))
            gene_coord=cur.fetchone()
        finally:
            cur.close()
            conn.close()

        if gene_coord is None:
            return None

        # Define search coordinates
        coord1=int(gene_coord[2])-window
        if coord1<0:
            coord1=0
        coord2=int(gene_coord[3])+window

        # Run with --origin option
        if has_origin:
            if gene_coord[1]!=chromosome:
                return None
            if coord1>int(org_coord) or int(org_coord)>coord2:
                return None
        else:
            chromosome=gene_coord[1]

    elif region=="region":
        if myargs['start'] is None:
            return None
        if myargs['end'] is None:
            return None

        # Parse out chr and positions for --region option
        if len(myargs['start'].split(":"))!=2:
            return None
        if len(myargs['end'].split(":"))!=2:
            return None

        chr_s=myargs['start'].strip("chr").split(":")[0]
        chr_e=myargs['end'].strip("chr").split(":")[0]

        if chr_s not in chrs:
            return None
        if chr_e not in chrs:
            return None
        if chr_s!=chr_e:
            return None

        # Compare positions numerically; as strings "900" would sort after "1000"
        try:
            coord_s=int(myargs['start'].split(":")[1])
            coord_e=int(myargs['end'].split(":")[1])
        except ValueError:
            return None
        if coord_s>=coord_e:
            return None

        coord1=coord_s-window
        if coord1<0:
            coord1=0
        coord2=coord_e+window

        # Run with --origin option
        if has_origin:
            if chr_s!=chromosome:
                return None
            if coord1>int(org_coord) or int(org_coord)>coord2:
                return None
        else:
            chromosome=chr_s

    else:
        # Unknown region type: no search window can be derived
        return None

    # Generate coordinate list and P-value dictionary
    max_window=3000000
    if coord2-coord1>max_window:
        return None

    assoc_coords=[]
    a_pos=[]
    assoc_dict={}
    assoc_list=[]
    with open(file) as fp:
        for line in fp:
            col=line.strip().split()
            if len(col)!=len_head:
                continue
            if col[chr_index].strip("chr")!=chromosome:
                continue
            # Rows with a non-numeric position (including the header) are skipped
            try:
                pos_value=int(col[pos_index])
            except ValueError:
                continue
            if not (coord1<=pos_value<=coord2):
                continue
            try:
                p_value=float(col[p_index])
            except ValueError:
                continue
            coord_i=col[chr_index].strip("chr")+":"+col[pos_index]+"-"+col[pos_index]
            assoc_coords.append(coord_i)
            a_pos.append(col[pos_index])
            assoc_dict[coord_i]=[col[p_index]]
            assoc_list.append([coord_i,p_value])

    # Coordinate list checks
    if len(assoc_coords)==0:
        return None

    # Get population ids from population output file from LDassoc.py
    with open(tmp_dir+"pops_"+request+".txt") as pops_fp:
        pop_ids=set(pop_line.strip() for pop_line in pops_fp)

    vcf_file=vcf_dir+chromosome+".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"
    snp_vcf_path=tmp_dir+"snp_no_dups_"+request+".vcf"

    # Define LD origin coordinate
    if not has_origin:
        # Lowest P SNP phased genotypes come from the LDassoc.py output file
        head=read_vcf_header(vcf_file)
        with open(snp_vcf_path) as vcf_fp:
            vcf=vcf_fp.readlines()

        # NOTE(review): the VCF file is not regenerated per candidate in this
        # function, so every iteration evaluates the same record; the first
        # candidate is accepted when that record passes the checks, otherwise
        # the loop ends with snp set to the last candidate. Confirm intent.
        for var_p in sorted(assoc_list, key=operator.itemgetter(1)):
            snp="chr"+var_p[0].split("-")[0]

            # Check lowest P SNP is in the 1000G population and not monoallelic
            if len(vcf)==0:
                continue
            geno=vcf[0].strip().split()

            if "," in geno[3] or "," in geno[4]:
                continue

            genotypes=count_alleles(geno, head, pop_ids)
            if genotypes["0"]==0 or genotypes["1"]==0:
                continue

            org_coord=var_p[0].split("-")[1]
            break

    else:
        if chromosome+":"+org_coord+"-"+org_coord not in assoc_coords:
            return None

        # Extract query SNP phased genotypes
        head=read_vcf_header(vcf_file)

        tabix_snp="tabix {0} {1}:{2}-{2} | grep -v -e END > {3}".format(vcf_file, chromosome, org_coord, snp_vcf_path)
        subprocess.call(tabix_snp, shell=True)

        # Check query SNP is in the 1000G population, has the correct RS number, and not monoallelic
        with open(snp_vcf_path) as vcf_fp:
            vcf=vcf_fp.readlines()

        if len(vcf)==0:
            remove_origin_tmp_files()
            return None

        if len(vcf)>1:
            # Several records at this position: keep the one whose ID matches
            geno=[]
            for vcf_line in vcf:
                fields=vcf_line.strip().split()
                if fields[2]==snp:
                    geno=fields
            if geno==[]:
                remove_origin_tmp_files()
                return None
        else:
            geno=vcf[0].strip().split()

        if geno[2]!=snp and snp[0:2]=="rs":
            snp=geno[2]

        if "," in geno[3] or "," in geno[4]:
            remove_origin_tmp_files()
            return None

        genotypes=count_alleles(geno, head, pop_ids)
        if genotypes["0"]==0 or genotypes["1"]==0:
            remove_origin_tmp_files()
            return None

    # Calculate proxy LD statistics in parallel
    n_coords=len(assoc_coords)
    if n_coords<60:
        threads=1
    else:
        threads=4

    # Floor division keeps the block size an int (true division breaks slicing
    # on Python 3). Chunks are contiguous so no coordinate is skipped between
    # workers; the last worker also takes the remainder.
    block=n_coords//threads
    commands=[]
    for i in range(threads):
        chunk_start=block*i
        if i==threads-1:
            chunk=assoc_coords[chunk_start:]
        else:
            chunk=assoc_coords[chunk_start:block*(i+1)]
        command="python LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(chunk)+" "+request+" "+str(i)
        commands.append(command)

    processes=[subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) for command in commands]

    # collect output in parallel
    def get_output(process):
        return process.communicate()[0].splitlines()

    pool = Pool(len(processes))
    out_raw=pool.map(get_output, processes)
    pool.close()
    pool.join()

    # Aggregate output
    out_prox=[]
    for worker_lines in out_raw:
        for raw_line in worker_lines:
            col=raw_line.decode('utf-8').strip().split("\t")
            col[6]=int(col[6])
            col[7]=float(col[7])
            col[8]=float(col[8])
            # Index 14: absolute distance, index 15: association P-value
            col.append(abs(col[6]))
            pos_i_j=col[5].split(":")[1]
            coord_i_j=chromosome+":"+pos_i_j+"-"+pos_i_j
            if coord_i_j in assoc_dict:
                col.append(float(assoc_dict[coord_i_j][0]))
                out_prox.append(col)

    # Stable sorts: primary key P-value, ties ordered by absolute distance
    out_dist_sort=sorted(out_prox, key=operator.itemgetter(14))
    out_p_sort=sorted(out_dist_sort, key=operator.itemgetter(15), reverse=False)

    # Nothing to plot (the code below needs at least one LD row)
    if not out_p_sort:
        return None

    # Organize scatter plot data
    reds=["#FFCCCC","#FFCACA","#FFC8C8","#FFC6C6","#FFC4C4","#FFC2C2","#FFC0C0","#FFBEBE","#FFBCBC","#FFBABA","#FFB8B8","#FFB6B6","#FFB4B4","#FFB1B1","#FFAFAF","#FFADAD","#FFABAB","#FFA9A9","#FFA7A7","#FFA5A5","#FFA3A3","#FFA1A1","#FF9F9F","#FF9D9D","#FF9B9B","#FF9999","#FF9797","#FF9595","#FF9393","#FF9191","#FF8F8F","#FF8D8D","#FF8B8B","#FF8989","#FF8787","#FF8585","#FF8383","#FF8181","#FF7E7E","#FF7C7C","#FF7A7A","#FF7878","#FF7676","#FF7474","#FF7272","#FF7070","#FF6E6E","#FF6C6C","#FF6A6A","#FF6868","#FF6666","#FF6464","#FF6262","#FF6060","#FF5E5E","#FF5C5C","#FF5A5A","#FF5858","#FF5656","#FF5454","#FF5252","#FF5050","#FF4E4E","#FF4B4B","#FF4949","#FF4747","#FF4545","#FF4343","#FF4141","#FF3F3F","#FF3D3D","#FF3B3B","#FF3939","#FF3737","#FF3535","#FF3333","#FF3131","#FF2F2F","#FF2D2D","#FF2B2B","#FF2929","#FF2727","#FF2525","#FF2323","#FF2121","#FF1F1F","#FF1D1D","#FF1B1B","#FF1818","#FF1616","#FF1414","#FF1212","#FF1010","#FF0E0E","#FF0C0C","#FF0A0A","#FF0808","#FF0606","#FF0404","#FF0202","#FF0000"]

    q_rs=[]
    q_allele=[]
    q_coord=[]
    q_maf=[]
    p_rs=[]
    p_allele=[]
    p_coord=[]
    p_pos=[]
    p_maf=[]
    dist=[]
    d_prime=[]
    d_prime_round=[]
    r2=[]
    r2_round=[]
    corr_alleles=[]
    regdb=[]
    funct=[]
    color=[]
    alpha=[]
    size=[]
    p_val=[]
    neg_log_p=[]
    for row in out_p_sort:
        q_rs_i,q_allele_i,q_coord_i,p_rs_i,p_allele_i,p_coord_i,dist_i,d_prime_i,r2_i,corr_alleles_i,regdb_i,q_maf_i,p_maf_i,funct_i,dist_abs,p_val_i=row

        q_rs.append(q_rs_i)
        q_allele.append(q_allele_i)
        q_coord.append(float(q_coord_i.split(":")[1])/1000000)
        q_maf.append(str(round(float(q_maf_i),4)))
        if p_rs_i==".":
            p_rs_i=p_coord_i
        p_rs.append(p_rs_i)
        p_allele.append(p_allele_i)
        p_coord.append(float(p_coord_i.split(":")[1])/1000000)
        p_pos.append(p_coord_i.split(":")[1])
        p_maf.append(str(round(float(p_maf_i),4)))
        dist.append(str(round(dist_i/1000000.0,4)))
        d_prime.append(float(d_prime_i))
        d_prime_round.append(str(round(float(d_prime_i),4)))
        r2.append(float(r2_i))
        r2_round.append(str(round(float(r2_i),4)))
        corr_alleles.append(corr_alleles_i)

        # P-value
        p_val.append(p_val_i)
        neg_log_p.append(-log10(p_val_i))

        # Correct Missing Annotations
        if regdb_i==".":
            regdb_i=""
        regdb.append(regdb_i)
        if funct_i==".":
            funct_i=""
        if funct_i=="NA":
            funct_i="none"
        funct.append(funct_i)

        # Set Color: the origin variant is blue, others are shaded by LD
        if q_coord_i==p_coord_i:
            color_i="#0000FF"
            alpha_i=0.7
        else:
            if myargs['dprime']==True:
                color_i=reds[int(d_prime_i*100.0)]
                alpha_i=0.7
            elif myargs['dprime']==False:
                color_i=reds[int(r2_i*100.0)]
                alpha_i=0.7
        color.append(color_i)
        alpha.append(alpha_i)

        # Set Size
        size_i=9+float(p_maf_i)*14.0
        size.append(size_i)

    # Pull out SNPs from association file not found in 1000G
    p_plot_pos=[]
    p_plot_pval=[]
    p_plot_pos2=[]
    p_plot_pval2=[]
    p_plot_dist=[]
    # Query coordinate as reported in the last LD row
    index_var_pos=float(out_p_sort[-1][2].split(":")[1])/1000000
    plotted_positions=set(p_pos)
    for input_pos in a_pos:
        if input_pos not in plotted_positions:
            assoc_key=chromosome+":"+input_pos+"-"+input_pos
            p_plot_pos.append(float(input_pos)/1000000)
            p_plot_pval.append(-log10(float(assoc_dict[assoc_key][0])))
            p_plot_pos2.append("chr"+chromosome+":"+input_pos)
            p_plot_pval2.append(float(assoc_dict[assoc_key][0]))
            p_plot_dist.append(str(round(float(input_pos)/1000000-index_var_pos,4)))

    # Begin Bokeh Plotting
    from collections import OrderedDict
    from bokeh.models import HoverTool,LinearAxis,Range1d
    from bokeh.plotting import ColumnDataSource,figure,reset_output
    from bokeh.io import export_svgs
    import svgutils.compose as sg

    reset_output()

    data_p = {'p_plot_posX': p_plot_pos, 'p_plot_pvalY': p_plot_pval, 'p_plot_pos2': p_plot_pos2, 'p_plot_pval2': p_plot_pval2, 'p_plot_dist': p_plot_dist}
    source_p = ColumnDataSource(data_p)

    # Assoc Plot
    x=p_coord
    y=neg_log_p

    data = {'x': x, 'y': y, 'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'p_val': p_val, 'size': size, 'color': color, 'alpha': alpha}
    source = ColumnDataSource(data)

    whitespace=0.01
    xr=Range1d(start=coord1/1000000.0-whitespace, end=coord2/1000000.0+whitespace)
    yr=Range1d(start=-0.03, end=max(y)*1.03)
    sup_2="\u00B2"

    assoc_plot=figure(
                title="P-values and Regional LD for "+snp+" in "+pop,
                min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False,
                plot_width=900,
                plot_height=600,
                x_range=xr, y_range=yr,
                tools="tap,pan,box_zoom,wheel_zoom,box_select,undo,redo,reset,previewsave", logo=None,
                toolbar_location="above")

    assoc_plot.title.align="center"

    # Add recombination rate from LDassoc.py output file
    with open(tmp_dir+"recomb_"+request+".txt") as recomb_fp:
        recomb_raw=recomb_fp.readlines()
    recomb_x=[]
    recomb_y=[]
    for recomb_line in recomb_raw:
        recomb_chr,recomb_pos,rate=recomb_line.strip().split()
        recomb_x.append(int(recomb_pos)/1000000.0)
        # Rescale the rate (as a percentage) onto the -log10(P) axis
        recomb_y.append(float(rate)/100*max(y))

    assoc_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5)

    # Add genome-wide significance
    a = [coord1/1000000.0-whitespace,coord2/1000000.0+whitespace]
    b = [-log10(0.00000005),-log10(0.00000005)]
    assoc_plot.line(a, b, color="blue", alpha=0.5)

    # Variants absent from the LD results get a fixed size (the size formula at MAF 0.25)
    assoc_points_not1000G=assoc_plot.circle(x='p_plot_posX', y='p_plot_pvalY', size=9+float("0.25")*14.0, source=source_p, line_color="gray", fill_color="white")
    assoc_points=assoc_plot.circle(x='x', y='y', size='size', color='color', alpha='alpha', source=source)
    assoc_plot.add_tools(HoverTool(renderers=[assoc_points_not1000G], tooltips=OrderedDict([("Variant", "@p_plot_pos2"), ("P-value", "@p_plot_pval2"), ("Distance (Mb)", "@p_plot_dist")])))

    hover=HoverTool(renderers=[assoc_points])
    hover.tooltips=OrderedDict([
        ("Variant", "@prs @p_alle"),
        ("P-value", "@p_val"),
        ("Distance (Mb)", "@dist"),
        ("MAF", "@p_maf"),
        ("R"+sup_2+" ("+q_rs[0]+")", "@r"),
        ("D\' ("+q_rs[0]+")", "@d"),
        ("Correlated Alleles", "@alleles"),
        ("RegulomeDB", "@regdb"),
        ("Functional Class", "@funct"),
    ])

    assoc_plot.add_tools(hover)

    # Annotate RegulomeDB scores
    if myargs['annotate']==True:
        assoc_plot.text(x, y, text=regdb, alpha=1, text_font_size="7pt", text_baseline="middle", text_align="center", angle=0)

    assoc_plot.yaxis.axis_label="-log10 P-value"

    assoc_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)}
    assoc_plot.add_layout(LinearAxis(y_range_name="y2_axis", axis_label="Combined Recombination Rate (cM/Mb)"), "right")  ## Need to confirm units

    # Rug Plot (built and switched to the SVG backend, but not exported below)
    y2_ll=[-0.03]*len(x)
    y2_ul=[1.03]*len(x)
    yr_rug=Range1d(start=-0.03, end=1.03)

    data_rug = {'x': x, 'y': y, 'y2_ll': y2_ll, 'y2_ul': y2_ul,'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'p_val': p_val, 'size': size, 'color': color, 'alpha': alpha}
    source_rug = ColumnDataSource(data_rug)

    rug=figure(
            x_range=xr, y_range=yr_rug, border_fill_color='white', y_axis_type=None,
            title="", min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False,
            plot_width=900, plot_height=50, tools="xpan,tap,wheel_zoom", logo=None)

    rug.segment(x0='x', y0='y2_ll', x1='x', y1='y2_ul', source=source_rug, color='color', alpha='alpha', line_width=1)
    rug.toolbar_location=None

    gap=80000
    tall=0.75

    def assign_gene_row(row_ends, tx_start, tx_end):
        # Put the gene on the first row whose last gene ends more than `gap`
        # bases before tx_start (opening a new row if none fits); 1-based row
        for row, row_end in enumerate(row_ends):
            if tx_start>(gap+row_end):
                row_ends[row]=tx_end
                return row+1
        row_ends.append(tx_end)
        return len(row_ends)

    def build_gene_figure(row_ends, gene_starts, gene_ends, gene_rows, gene_names,
                          exon_data, exon_rows, key_prefix, tooltips, y_label):
        # Build the gene track figure shared by both transcript modes and
        # return it together with its pixel height
        n_rows=len(row_ends)
        # Flip rows so that row 1 is drawn at the top of the track
        gene_yn=[n_rows-r+0.5 for r in gene_rows]
        exon_data[key_prefix+'yn']=[n_rows-r+0.5 for r in exon_rows]
        exon_source=ColumnDataSource(exon_data)

        if n_rows < 3:
            height_pix = 150
        else:
            height_pix = 150 + (n_rows - 2) * 50

        fig = figure(min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5,
                     x_range=xr, y_range=Range1d(start=0, end=n_rows), border_fill_color='white',
                     title="", h_symmetry=False, v_symmetry=False, logo=None,
                     plot_width=900, plot_height=height_pix, tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        fig.segment(gene_starts, gene_yn, gene_ends,
                    gene_yn, color="black", alpha=1, line_width=2)
        fig.rect(x=key_prefix+'x', y=key_prefix+'yn', width=key_prefix+'w', height=key_prefix+'h',
                 source=exon_source, fill_color="grey", line_color="grey")
        fig.text(gene_starts, gene_yn, text=gene_names, alpha=1, text_font_size="7pt",
                 text_font_style="bold", text_baseline="middle", text_align="right", angle=0)
        gene_hover = fig.select(dict(type=HoverTool))
        gene_hover.tooltips = tooltips

        fig.xaxis.axis_label = "Chromosome " + chromosome + " Coordinate (Mb)(GRCh37)"
        fig.yaxis.axis_label = y_label
        fig.ygrid.grid_line_color = None
        fig.yaxis.axis_line_color = None
        fig.yaxis.minor_tick_line_color = None
        fig.yaxis.major_tick_line_color = None
        fig.yaxis.major_label_text_color = None

        fig.toolbar_location = "below"
        return fig, height_pix

    gene_starts=[]
    gene_ends=[]
    gene_rows=[]
    gene_names=[]
    exon_x=[]
    exon_rows=[]
    exon_w=[]
    exon_h=[]
    exon_names=[]
    exon_ids=[]
    row_ends=[0]

    # Gene Plot (All Transcripts)
    if myargs['transcript']==True:
        # Get genes from LDassoc.py output file
        with open(tmp_dir+"genes_"+request+".txt") as genes_fp:
            genes_raw=genes_fp.readlines()

        exon_numbers=[]
        for gene_line in genes_raw:
            bin_i,name_id,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames=gene_line.strip().split()
            e_start=exonStarts.split(",")
            e_end=exonEnds.split(",")

            # Determine Y Coordinate
            y_coord=assign_gene_row(row_ends, int(txStart), int(txEnd))

            gene_starts.append(int(txStart)/1000000.0)
            gene_ends.append(int(txEnd)/1000000.0)
            gene_rows.append(y_coord)
            gene_names.append(name2+"  ")

            # The last split element is skipped (range stops at len-1)
            n_exons=len(e_start)-1
            for e in range(n_exons):
                # Exons are numbered along the direction of transcription
                if strand=="+":
                    exon=e+1
                else:
                    exon=n_exons-e

                width=(int(e_end[e])-int(e_start[e]))/1000000.0
                x_coord=int(e_start[e])/1000000.0+(width/2)

                exon_x.append(x_coord)
                exon_rows.append(y_coord)
                exon_w.append(width)
                exon_h.append(tall)
                exon_names.append(name2)
                exon_ids.append(name_id)
                exon_numbers.append(exon)

        gene_fig, gene_fig_h_pix = build_gene_figure(
            row_ends, gene_starts, gene_ends, gene_rows, gene_names,
            {'exons_plot_x': exon_x, 'exons_plot_w': exon_w, 'exons_plot_h': exon_h,
             'exons_plot_name': exon_names, 'exons_plot_id': exon_ids, 'exons_plot_exon': exon_numbers},
            exon_rows, 'exons_plot_',
            OrderedDict([
                ("Gene", "@exons_plot_name"),
                ("Transcript ID", "@exons_plot_id"),
                ("Exon", "@exons_plot_exon"),
            ]),
            "Genes (All Transcripts)")

    # Gene Plot (Collapsed)
    else:
        # Get genes from LDassoc.py output file
        with open(tmp_dir+"genes_c_"+request+".txt") as genes_fp:
            genes_c_raw=genes_fp.readlines()

        for gene_line in genes_c_raw:
            chrom,txStart,txEnd,name,exonStarts,exonEnds,transcripts=gene_line.strip().split()
            e_start=exonStarts.split(",")
            e_end=exonEnds.split(",")
            e_transcripts=transcripts.split(",")

            # Determine Y Coordinate
            y_coord=assign_gene_row(row_ends, int(txStart), int(txEnd))

            gene_starts.append(int(txStart)/1000000.0)
            gene_ends.append(int(txEnd)/1000000.0)
            gene_rows.append(y_coord)
            gene_names.append(name+"  ")

            for e in range(len(e_start)):
                width=(int(e_end[e])-int(e_start[e]))/1000000.0
                x_coord=int(e_start[e])/1000000.0+(width/2)

                exon_x.append(x_coord)
                exon_rows.append(y_coord)
                exon_w.append(width)
                exon_h.append(tall)
                exon_names.append(name)
                exon_ids.append(e_transcripts[e].replace("-",","))

        gene_fig, gene_fig_h_pix = build_gene_figure(
            row_ends, gene_starts, gene_ends, gene_rows, gene_names,
            {'exons_c_plot_x': exon_x, 'exons_c_plot_w': exon_w, 'exons_c_plot_h': exon_h,
             'exons_c_plot_name': exon_names, 'exons_c_plot_id': exon_ids},
            exon_rows, 'exons_c_plot_',
            OrderedDict([
                ("Gene", "@exons_c_plot_name"),
                ("Transcript IDs", "@exons_c_plot_id"),
            ]),
            "Genes (Transcripts Collapsed)")

    # Change output backend to SVG temporarily for headless export
    assoc_plot.output_backend = "svg"
    rug.output_backend = "svg"
    gene_fig.output_backend = "svg"
    export_svgs(assoc_plot, filename=tmp_dir + "assoc_plot_1_" + request + ".svg")
    export_svgs(gene_fig, filename=tmp_dir + "gene_plot_1_" + request + ".svg")

    # 1 pixel = 0.0264583333 cm
    svg_height = str(20.00 + (0.0264583333 * gene_fig_h_pix)) + "cm"
    svg_height_scaled = str(100.00 + (0.1322916665 * gene_fig_h_pix)) + "cm"

    # Concatenate svgs
    sg.Figure("24.59cm", svg_height,
        sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg"),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move(-40, 630)
        ).save(tmp_dir + "assoc_plot_" + request + ".svg")

    # 5x scaled copy, used only as the source of the raster formats
    sg.Figure("122.95cm", svg_height_scaled,
        sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg").scale(5),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(5).move(-200, 3150)
        ).save(tmp_dir + "assoc_plot_scaled_" + request + ".svg")

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".pdf", shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_scaled_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".png", shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_scaled_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".jpeg", shell=True)
    # Remove individual SVG files after they are combined
    subprocess.call("rm " + tmp_dir + "assoc_plot_1_" + request + ".svg", shell=True)
    subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg", shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + tmp_dir + "assoc_plot_scaled_" + request + ".svg", shell=True)

    reset_output()

    # Remove temporary files
    subprocess.call("rm "+tmp_dir+"pops_"+request+".txt", shell=True)
    subprocess.call("rm "+tmp_dir+"*"+request+"*.vcf", shell=True)
    subprocess.call("rm "+tmp_dir+"genes_*"+request+"*.txt", shell=True)
    subprocess.call("rm "+tmp_dir+"recomb_"+request+".txt", shell=True)
    subprocess.call("rm "+tmp_dir+"assoc_args"+request+".json", shell=True)

    print("Bokeh high quality image export complete!")

    # Return plot output
    return None
Exemplo n.º 4
0
def calculate_proxy_svg(snp, pop, request, r2_d="r2"):
    """Export the LDproxy plot for one query variant as static image files.

    The function reads the ``pops_<request>.txt``, ``recomb_<request>.txt``
    and ``genes_<request>.txt`` files from ``./tmp/``, recomputes the proxy
    LD statistics by running ``LDproxy_sub.py`` over a 1 Mb window split
    across four worker processes, draws the proxy scatter plot and the gene
    track with Bokeh, and writes ``proxy_plot_<request>.svg``, ``.pdf``,
    ``.png`` and ``.jpeg`` into ``./tmp/``. The per-request input files are
    removed before returning.

    Args:
        snp: Query variant, either an rsID (``rs...``) or a genomic
            coordinate of the form ``chr<N>:<position>``; coordinates are
            mapped to an rsID through the ``dbsnp151`` collection.
        pop: Population label. Only used in the plot title.
        request: Token that identifies the per-request temp files. Passing
            ``False`` selects a token derived from the current time.
        r2_d: ``"r2"`` or ``"d"``; selects the LD statistic used for sorting
            and for the y axis. Any other value falls back to ``"r2"``.

    Returns:
        Always ``None``. The function returns early, after deleting the
        request's ``pops_`` and ``*.vcf`` temp files, when the variant is
        not found in dbSNP, has no (matching) VCF record, is multi-allelic,
        or is monoallelic in the selected samples.
    """

    # Set data directories using config.yml
    with open('config.yml', 'r') as f:
        # safe_load: the config only needs plain YAML types, whereas
        # yaml.load() without an explicit Loader can build arbitrary objects.
        config = yaml.safe_load(f)
    vcf_dir = config['data']['vcf_dir']

    tmp_dir = "./tmp/"

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    if request is False:
        request = str(time.strftime("%I%M%S"))

    # NOTE(review): `request` and `snp` are interpolated into shell command
    # strings throughout this function; callers must pass trusted values.

    def remove_query_files():
        # Delete the population list and every VCF written for this request.
        subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                        shell=True)
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)

    def rasterize(svg_name, out_name):
        # Convert an SVG in tmp_dir to the format implied by out_name.
        subprocess.call("phantomjs ./rasterize.js " + tmp_dir + svg_name +
                        " " + tmp_dir + out_name,
                        shell=True)

    # Connect to Mongo snp database
    # NOTE(review): username, password and port are not defined inside this
    # function; presumably they are module-level settings - confirm.
    client = MongoClient(
        'mongodb://' + username + ':' + password + '@localhost/admin', port)
    db = client["LDLink"]

    # Find coordinates (GRCh37/hg19) for SNP RS number
    def get_coords(db, rsid):
        rsid = rsid.strip("rs")
        query_results = db.dbsnp151.find_one({"id": rsid})
        # Round-trip through JSON to turn BSON types into plain Python ones.
        return json.loads(json_util.dumps(query_results))

    # Query genomic coordinates
    def get_rsnum(db, coord):
        chro, pos = coord.strip("chr").split(":")[:2]
        query_results = db.dbsnp151.find({
            "chromosome": chro.upper() if chro in ('x', 'y') else chro,
            "position": pos
        })
        return json.loads(json_util.dumps(query_results))

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coord_rsid(db, snp):
        if snp[0:2] == "rs":
            return snp
        snp_info_lst = get_rsnum(db, snp)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("snp_info_lst")
        print(snp_info_lst)
        if not snp_info_lst:
            return snp
        if len(snp_info_lst) > 1:
            # Several records share the position: prefer one that is its own
            # reference id, otherwise fall back to the first record.
            ref_variants = [
                info['id'] for info in snp_info_lst
                if info['id'] == info['ref_id']
            ]
            if ref_variants:
                return "rs" + ref_variants[0]
        return "rs" + snp_info_lst[0]['id']

    snp = replace_coord_rsid(db, snp)

    # Find RS number in snp database
    snp_coord = get_coords(db, snp)
    if snp_coord is None:
        # Unknown variant: bail out like the other "cannot plot" cases
        # instead of failing on snp_coord['chromosome'] below.
        remove_query_files()
        return None
    chrom = snp_coord['chromosome']

    # Get population ids from LDproxy.py tmp output files
    with open(tmp_dir + "pops_" + request + ".txt") as pop_file:
        pop_ids = set(line.strip() for line in pop_file)

    # Extract query SNP phased genotypes
    vcf_file = vcf_dir + chrom + \
        ".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"

    tabix_snp_h = "tabix -H {0} | grep CHROM".format(vcf_file)
    # Text mode so the header is str (not bytes) on both Python 2 and 3.
    proc_h = subprocess.Popen(tabix_snp_h,
                              shell=True,
                              stdout=subprocess.PIPE,
                              universal_newlines=True)
    head = proc_h.communicate()[0].splitlines()[0].strip().split()

    snp_vcf = tmp_dir + "snp_no_dups_" + request + ".vcf"
    tabix_snp = "tabix {0} {1}:{2}-{2} | grep -v -e END > {3}".format(
        vcf_file, chrom, snp_coord['position'], snp_vcf)
    subprocess.call(tabix_snp, shell=True)

    # Check SNP is in the 1000G population, has the correct RS number, and not
    # monoallelic
    with open(snp_vcf) as vcf_handle:
        vcf = vcf_handle.readlines()

    if len(vcf) == 0:
        remove_query_files()
        return None
    elif len(vcf) > 1:
        # Several records at this position: keep the last one whose ID
        # column matches the query rsID.
        geno = []
        for record in vcf:
            fields = record.strip().split()
            if fields[2] == snp:
                geno = fields
        if geno == []:
            remove_query_files()
            return None
    else:
        geno = vcf[0].strip().split()

    # From here on use the ID recorded in the VCF.
    snp = geno[2]

    # Multi-allelic variants are not plotted.
    if "," in geno[3] or "," in geno[4]:
        remove_query_files()
        return None

    # Sample columns start at index 9 of the VCF header line.
    sample_columns = [i for i in range(9, len(head)) if head[i] in pop_ids]

    genotypes = {"0": 0, "1": 0}
    for i in sample_columns:
        for allele in geno[i].split("|"):
            genotypes[allele] = genotypes.get(allele, 0) + 1

    if genotypes["0"] == 0 or genotypes["1"] == 0:
        remove_query_files()
        return None

    # Define window of interest around query SNP
    window = 500000
    coord1 = max(int(snp_coord['position']) - window, 0)
    coord2 = int(snp_coord['position']) + window

    # Calculate proxy LD statistics in parallel
    threads = 4
    # Floor division keeps the coordinates integral on Python 3 as well.
    block = (2 * window) // threads
    commands = []
    for i in range(threads):
        # The first chunk starts at coord1 and the last ends at coord2;
        # inner boundaries are offset by one so that chunks do not overlap.
        start = coord1 if i == 0 else coord1 + (block * i) + 1
        end = coord2 if i == threads - 1 else coord1 + (block * (i + 1))
        commands.append("python LDproxy_sub.py " + "True " + snp + " " +
                        chrom + " " + str(start) + " " + str(end) + " " +
                        request + " " + str(i))

    processes = [
        subprocess.Popen(command,
                         shell=True,
                         stdout=subprocess.PIPE,
                         universal_newlines=True) for command in commands
    ]

    # collect output in parallel
    def get_output(process):
        return process.communicate()[0].splitlines()

    if not hasattr(threading.current_thread(), "_children"):
        threading.current_thread()._children = weakref.WeakKeyDictionary()

    pool = Pool(len(processes))
    out_raw = pool.map(get_output, processes)
    pool.close()
    pool.join()

    # Aggregate output
    out_prox = []
    for worker_lines in out_raw:
        for line in worker_lines:
            col = line.strip().split("\t")
            col[6] = int(col[6])
            col[7] = float(col[7])
            col[8] = float(col[8])
            # Absolute distance is appended as the last column (index 14,
            # given the 14 columns unpacked further down).
            col.append(abs(col[6]))
            out_prox.append(col)

    # Sort output
    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    # Stable sorts: order by distance first, then by the chosen LD statistic
    # (column 8 = R2, column 7 = D'), so ties stay ordered by distance.
    out_dist_sort = sorted(out_prox, key=operator.itemgetter(14))
    ld_column = 8 if r2_d == "r2" else 7
    out_ld_sort = sorted(out_dist_sort,
                         key=operator.itemgetter(ld_column),
                         reverse=True)

    # Organize scatter plot data
    q_rs = []
    q_allele = []
    q_maf = []
    p_rs = []
    p_allele = []
    p_coord = []
    p_maf = []
    dist = []
    d_prime = []
    d_prime_round = []
    r2 = []
    r2_round = []
    corr_alleles = []
    regdb = []
    funct = []
    color = []
    size = []
    for rank, row in enumerate(out_ld_sort):
        (q_rs_i, q_allele_i, q_coord_i, p_rs_i, p_allele_i, p_coord_i, dist_i,
         d_prime_i, r2_i, corr_alleles_i, regdb_i, q_maf_i, p_maf_i, funct_i,
         dist_abs) = row

        if not float(r2_i) > 0.01:
            continue

        q_rs.append(q_rs_i)
        q_allele.append(q_allele_i)
        q_maf.append(str(round(float(q_maf_i), 4)))
        if p_rs_i == ".":
            p_rs_i = p_coord_i
        p_rs.append(p_rs_i)
        p_allele.append(p_allele_i)
        p_coord.append(float(p_coord_i.split(":")[1]) / 1000000)
        p_maf.append(str(round(float(p_maf_i), 4)))
        dist.append(str(round(dist_i / 1000000.0, 4)))
        d_prime.append(float(d_prime_i))
        d_prime_round.append(str(round(float(d_prime_i), 4)))
        r2.append(float(r2_i))
        r2_round.append(str(round(float(r2_i), 4)))
        corr_alleles.append(corr_alleles_i)

        # Correct Missing Annotations
        if regdb_i == ".":
            regdb_i = ""
        regdb.append(regdb_i)
        if funct_i == ".":
            funct_i = ""
        if funct_i == "NA":
            funct_i = "none"
        funct.append(funct_i)

        # Set Color: the top-ranked row is blue, rows with a functional
        # annotation are red, everything else is orange.
        if rank == 0:
            color_i = "blue"
        elif funct_i != "none" and funct_i != "":
            color_i = "red"
        else:
            color_i = "orange"
        color.append(color_i)

        # Set Size
        size.append(9 + float(p_maf_i) * 14.0)

    # Begin Bokeh Plotting
    from collections import OrderedDict
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, figure, reset_output
    from bokeh.io import export_svgs
    import svgutils.compose as sg

    reset_output()

    # Proxy Plot
    x = p_coord
    y = r2 if r2_d == "r2" else d_prime
    whitespace = 0.01
    xr = Range1d(start=coord1 / 1000000.0 - whitespace,
                 end=coord2 / 1000000.0 + whitespace)
    yr = Range1d(start=-0.03, end=1.03)
    sup_2 = u"\u00B2"

    proxy_plot = figure(
        title="Proxies for " + snp + " in " + pop,
        min_border_top=2,
        min_border_bottom=2,
        min_border_left=60,
        min_border_right=60,
        h_symmetry=False,
        v_symmetry=False,
        plot_width=900,
        plot_height=600,
        x_range=xr,
        y_range=yr,
        tools="hover,tap,pan,box_zoom,box_select,undo,redo,reset,previewsave",
        logo=None,
        toolbar_location="above")

    proxy_plot.title.align = "center"

    # Get recomb from LDproxy.py tmp output files
    recomb_x = []
    recomb_y = []
    with open(tmp_dir + "recomb_" + request + ".txt") as recomb_file:
        for line in recomb_file:
            _, pos, rate = line.strip().split()
            recomb_x.append(int(pos) / 1000000.0)
            recomb_y.append(float(rate) / 100.0)

    data = {
        'x': x,
        'y': y,
        'qrs': q_rs,
        'q_alle': q_allele,
        'q_maf': q_maf,
        'prs': p_rs,
        'p_alle': p_allele,
        'p_maf': p_maf,
        'dist': dist,
        'r': r2_round,
        'd': d_prime_round,
        'alleles': corr_alleles,
        'regdb': regdb,
        'funct': funct,
        'size': size,
        'color': color
    }
    source = ColumnDataSource(data)

    proxy_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5)

    proxy_plot.circle(x='x',
                      y='y',
                      size='size',
                      color='color',
                      alpha=0.5,
                      source=source)

    hover = proxy_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Query Variant", "@qrs @q_alle"),
        ("Proxy Variant", "@prs @p_alle"),
        ("Distance (Mb)", "@dist"),
        ("MAF (Query,Proxy)", "@q_maf,@p_maf"),
        ("R" + sup_2, "@r"),
        ("D\'", "@d"),
        ("Correlated Alleles", "@alleles"),
        ("RegulomeDB", "@regdb"),
        ("Functional Class", "@funct"),
    ])

    proxy_plot.text(x,
                    y,
                    text=regdb,
                    alpha=1,
                    text_font_size="7pt",
                    text_baseline="middle",
                    text_align="center",
                    angle=0)

    if r2_d == "r2":
        proxy_plot.yaxis.axis_label = "R" + sup_2
    else:
        proxy_plot.yaxis.axis_label = "D\'"

    proxy_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)}
    proxy_plot.add_layout(
        LinearAxis(y_range_name="y2_axis",
                   axis_label="Combined Recombination Rate (cM/Mb)"), "right")

    # Rug Plot
    # NOTE(review): the rug figure is configured below but never passed to
    # export_svgs, so it does not appear in the exported images.
    yr_rug = Range1d(start=-0.03, end=1.03)

    # Same columns as the scatter source plus the segment end points.
    data_rug = dict(data,
                    y2_ll=[-0.03] * len(x),
                    y2_ul=[1.03] * len(x))
    source_rug = ColumnDataSource(data_rug)

    rug = figure(x_range=xr,
                 y_range=yr_rug,
                 border_fill_color='white',
                 y_axis_type=None,
                 title="",
                 min_border_top=2,
                 min_border_bottom=2,
                 min_border_left=60,
                 min_border_right=60,
                 h_symmetry=False,
                 v_symmetry=False,
                 plot_width=900,
                 plot_height=50,
                 tools="xpan,tap",
                 logo=None)

    rug.segment(x0='x',
                y0='y2_ll',
                x1='x',
                y1='y2_ul',
                source=source_rug,
                color='color',
                alpha=0.5,
                line_width=1)
    rug.toolbar_location = None

    # Gene Plot
    # Get genes from LDproxy.py tmp output files
    with open(tmp_dir + "genes_" + request + ".txt") as genes_file:
        genes_raw = genes_file.readlines()

    genes_plot_start = []
    genes_plot_end = []
    genes_plot_y = []
    genes_plot_name = []
    exons_plot_x = []
    exons_plot_y = []
    exons_plot_w = []
    exons_plot_h = []
    exons_plot_name = []
    exons_plot_id = []
    exons_plot_exon = []
    # lines[k] holds the end coordinate of the last gene drawn on row k.
    lines = [0]
    gap = 80000
    tall = 0.75
    for gene_line in genes_raw:
        # Each line must have exactly 16 whitespace-separated columns.
        (_, name_id, _, strand, tx_start, tx_end, _, _, _, exon_starts,
         exon_ends, _, name2, _, _, _) = gene_line.strip().split()
        e_start = exon_starts.split(",")
        e_end = exon_ends.split(",")

        # Determine Y Coordinate: use the first row whose last gene ends at
        # least `gap` bases before this one starts, else open a new row.
        row = 0
        y_coord = None
        while y_coord is None:
            if row > len(lines) - 1:
                y_coord = row + 1
                lines.append(int(tx_end))
            elif int(tx_start) > (gap + lines[row]):
                y_coord = row + 1
                lines[row] = int(tx_end)
            else:
                row += 1

        genes_plot_start.append(int(tx_start) / 1000000.0)
        genes_plot_end.append(int(tx_end) / 1000000.0)
        genes_plot_y.append(y_coord)
        genes_plot_name.append(name2 + "  ")

        # The last element of the split exon lists is skipped.
        n_exons = len(e_start) - 1
        for k in range(n_exons):
            # Exons are numbered along the direction of the strand.
            exon = k + 1 if strand == "+" else n_exons - k

            width = (int(e_end[k]) - int(e_start[k])) / 1000000.0
            x_coord = int(e_start[k]) / 1000000.0 + (width / 2)

            exons_plot_x.append(x_coord)
            exons_plot_y.append(y_coord)
            exons_plot_w.append(width)
            exons_plot_h.append(tall)
            exons_plot_name.append(name2)
            exons_plot_id.append(name_id)
            exons_plot_exon.append(exon)

    n_rows = len(lines)
    genes_plot_yn = [n_rows - gene_y + 0.5 for gene_y in genes_plot_y]
    exons_plot_yn = [n_rows - exon_y + 0.5 for exon_y in exons_plot_y]
    yr2 = Range1d(start=0, end=n_rows)

    data_gene_plot = {
        'exons_plot_x': exons_plot_x,
        'exons_plot_yn': exons_plot_yn,
        'exons_plot_w': exons_plot_w,
        'exons_plot_h': exons_plot_h,
        'exons_plot_name': exons_plot_name,
        'exons_plot_id': exons_plot_id,
        'exons_plot_exon': exons_plot_exon
    }

    source_gene_plot = ColumnDataSource(data_gene_plot)

    # 150 px for up to two gene rows, plus 50 px for every further row.
    if n_rows < 3:
        plot_h_pix = 150
    else:
        plot_h_pix = 150 + (n_rows - 2) * 50

    gene_plot = figure(
        x_range=xr,
        y_range=yr2,
        border_fill_color='white',
        title="",
        min_border_top=2,
        min_border_bottom=2,
        min_border_left=60,
        min_border_right=60,
        h_symmetry=False,
        v_symmetry=False,
        plot_width=900,
        plot_height=plot_h_pix,
        tools="hover,tap,xpan,box_zoom,undo,redo,reset,previewsave",
        logo=None)

    gene_plot.segment(genes_plot_start,
                      genes_plot_yn,
                      genes_plot_end,
                      genes_plot_yn,
                      color="black",
                      alpha=1,
                      line_width=2)

    gene_plot.rect(x='exons_plot_x',
                   y='exons_plot_yn',
                   width='exons_plot_w',
                   height='exons_plot_h',
                   source=source_gene_plot,
                   fill_color="grey",
                   line_color="grey")
    gene_plot.xaxis.axis_label = "Chromosome " + chrom + \
        " Coordinate (Mb)(GRCh37)"
    gene_plot.yaxis.axis_label = "Genes"
    gene_plot.ygrid.grid_line_color = None
    gene_plot.yaxis.axis_line_color = None
    gene_plot.yaxis.minor_tick_line_color = None
    gene_plot.yaxis.major_tick_line_color = None
    gene_plot.yaxis.major_label_text_color = None

    hover = gene_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Gene", "@exons_plot_name"),
        ("ID", "@exons_plot_id"),
        ("Exon", "@exons_plot_exon"),
    ])

    gene_plot.text(genes_plot_start,
                   genes_plot_yn,
                   text=genes_plot_name,
                   alpha=1,
                   text_font_size="7pt",
                   text_font_style="bold",
                   text_baseline="middle",
                   text_align="right",
                   angle=0)

    gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    proxy_plot.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"

    proxy_svg = tmp_dir + "proxy_plot_1_" + request + ".svg"
    gene_svg = tmp_dir + "gene_plot_1_" + request + ".svg"
    export_svgs(proxy_plot, filename=proxy_svg)
    export_svgs(gene_plot, filename=gene_svg)

    # 1 pixel = 0.0264583333 cm
    svg_height = str(20.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(100.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs: the gene track is placed below the proxy plot.
    sg.Figure("24.59cm", svg_height, sg.SVG(proxy_svg),
              sg.SVG(gene_svg).move(0, 630)).save(tmp_dir + "proxy_plot_" +
                                                  request + ".svg")

    # Same layout at five times the size, used for the raster exports.
    sg.Figure("122.95cm", svg_height_scaled,
              sg.SVG(proxy_svg).scale(5),
              sg.SVG(gene_svg).scale(5).move(0, 3150)).save(
                  tmp_dir + "proxy_plot_scaled_" + request + ".svg")

    # Export to PDF
    rasterize("proxy_plot_" + request + ".svg",
              "proxy_plot_" + request + ".pdf")
    # Export to PNG
    rasterize("proxy_plot_scaled_" + request + ".svg",
              "proxy_plot_" + request + ".png")
    # Export to JPEG
    rasterize("proxy_plot_scaled_" + request + ".svg",
              "proxy_plot_" + request + ".jpeg")

    # Remove individual SVG files after they are combined
    subprocess.call("rm " + proxy_svg, shell=True)
    subprocess.call("rm " + gene_svg, shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + tmp_dir + "proxy_plot_scaled_" + request + ".svg",
                    shell=True)

    reset_output()

    # Remove temporary files
    remove_query_files()
    subprocess.call("rm " + tmp_dir + "genes_" + request + ".txt", shell=True)
    subprocess.call("rm " + tmp_dir + "recomb_" + request + ".txt", shell=True)

    # Return plot output
    return None
Exemplo n.º 5
0
def fsapt_analyze(lig_dir, mode, ene_type):
    """Average the FSAPT energy matrices of one ligand and render a report.

    Every ``FSAPT*out`` file in ``lig_dir`` is parsed with
    ``_get_ene_matrix``; the resulting frames are combined into element-wise
    mean and standard-deviation tables, which are written as CSV, plotted as
    an annotated matrix, combined with a 2D drawing of the ligand into one
    SVG (also converted to PDF with ``convert``), and finally passed to
    ``write_pymol_pdb``.

    Args:
        lig_dir: Directory holding the ``FSAPT*out`` files and
            ``frame0/fsapt.pdb``; its base name is used as the ligand name.
        mode: Analysis mode. ``'prolig'`` and ``'proliglig'`` trigger column
            relabelling; ``'prolig'`` selects the protein-ligand title and
            every other value the ligand-ligand title.
        ene_type: Energy component name, forwarded to ``_get_ene_matrix`` and
            ``plot_matrix`` and used in output file names.

    Raises:
        ValueError: If no output file in ``lig_dir`` yields an energy matrix.
    """
    lig_name = os.path.basename(os.path.abspath(lig_dir))
    matrix_dfs = []
    for outfile in glob('%s/FSAPT*out' % lig_dir):
        df = _get_ene_matrix(outfile, ene_type)
        if df is not None:
            matrix_dfs.append(df)
    if not matrix_dfs:
        # pd.concat would raise ValueError here as well; say which input
        # was empty instead of "No objects to concatenate".
        raise ValueError('no usable FSAPT*out energy matrix found in %s' %
                         lig_dir)

    all_df = pd.concat(matrix_dfs, axis=1)
    grouped = all_df.stack().groupby(level=[0, 1])
    mean_df = grouped.mean().unstack()
    std_df = grouped.std().unstack()

    if mode in ['prolig', 'proliglig']:
        # Keep (sort number, new label, old label) together for each column
        # so that a tie in the sort number can never pair a new label with
        # another column's data.
        relabelled = []
        for old_label in mean_df.columns:
            if old_label == 'Total':
                # Large sort number so the total is placed last.
                relabelled.append((100000, 'Total', old_label))
                continue
            labels = old_label.split('-')
            if len(labels) == 2:
                relabelled.append(
                    (float(labels[-1]), ''.join(labels), old_label))
            elif len(labels) == 3:
                # Columns named by a range are ordered by its midpoint.
                midpoint = 0.5 * (float(labels[-1]) + float(labels[-2]))
                relabelled.append(
                    (midpoint, '-'.join(labels[1:]), old_label))
            # Labels matching neither pattern get no relabelled column.
        relabelled.sort(key=lambda entry: entry[:2])

        new_mean_df = pd.DataFrame()
        new_std_df = pd.DataFrame()
        for _, new_label, old_label in relabelled:
            new_mean_df[new_label] = mean_df[old_label]
            new_std_df[new_label] = std_df[old_label]
        mean_df = new_mean_df
        std_df = new_std_df

    # Cell annotation: signed mean on the first line, "+/-std" on the second.
    mean_anno = mean_df.applymap(lambda x: '%+.2f\n' % x)
    std_anno = std_df.applymap(lambda x: r'+/-%.2f' % x)
    all_anno = mean_anno + std_anno

    matrix_svg = '%s/ene_matrix_%s.svg' % (lig_dir, ene_type)
    plot_matrix(mean_df, all_anno, matrix_svg, mode, ene_type)

    mean_df.to_csv('%s/ene_mean_%s_%s_%s.csv' %
                   (lig_dir, lig_name, mode, ene_type))
    std_df.to_csv('%s/ene_std_%s_%s_%s.csv' %
                  (lig_dir, lig_name, mode, ene_type))

    # Plot the ligand
    dpi = 96
    width = len(mean_df.columns) + 2
    height = 4

    ligmol = cs._RdkitMolBase.from_file('MD/%s/cmp_sybyl.mol2' % lig_name)
    ligmol._init_atominfo(reset=False)
    ligmol.charged_mol2file = 'MD/%s/cmp_sybyl.mol2' % lig_name
    ligmol.get_noh_mol()
    AllChem.Compute2DCoords(ligmol.noh_mol, canonOrient=True, bondLength=1.5)
    drawer = rdMolDraw2D.MolDraw2DSVG(width * dpi, height * dpi)
    opts = drawer.drawOptions()
    opts.additionalAtomLabelPadding = 0.1

    frag_dict, _ = fragment_mol(ligmol, 'L1')

    # Label each heavy atom with its fragment number, but only for fragments
    # that have a row in the energy table.
    for noha in ligmol.noh_mol.GetAtoms():
        noh_idx = noha.GetIdx()
        h_idx = ligmol.noh_to_h_atom_mapping[noh_idx]
        frag_label = str(frag_dict[h_idx]['resid'])
        if 'L1-%02d' % int(frag_label) not in mean_df.index:
            continue
        if noha.GetAtomicNum() == 6:
            opts.atomLabels[noh_idx] = '%02d' % int(frag_label)
        else:
            # Non-carbon atoms also show the element part of their Tripos
            # atom type.
            elem = ligmol.GetAtomWithIdx(h_idx).GetProp(
                '_TriposAtomType').split('.')[0]
            opts.atomLabels[noh_idx] = '%s/%02d' % (elem, int(frag_label))
    drawer.DrawMolecule(ligmol.noh_mol)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText().replace('svg:', '')
    struct_svg = '%s/lig_frag_%s.svg' % (lig_dir, ene_type)
    with open(struct_svg, 'w') as fh:
        fh.write(svg)

    # Consolidate the panels
    if mode == 'prolig':
        mat_title = 'Protein-Ligand %s Interaction' % ene_type.capitalize()
    else:
        mat_title = 'Ligand-Ligand %s Interaction' % ene_type.capitalize()

    # The structure panel sits below the matrix, one dpi-sized row per
    # table row.
    struct_offset = dpi * len(mean_df) + 20
    mat_title = sc.Panel(sc.Text(mat_title, size=24)).move(20, 20)
    mat_panel = sc.Panel(sc.SVG(matrix_svg).scale(1.4)).move(0, 20)
    struct_title = sc.Panel(sc.Text('Ligand %s' % lig_name,
                                    size=24)).move(20, struct_offset)
    struct_panel = sc.Panel(sc.SVG(struct_svg)).move(0, struct_offset)
    final_figure = sc.Figure(dpi * width,
                             dpi * (len(mean_df) + height) + 40, mat_panel,
                             mat_title, struct_panel, struct_title)
    final_name = '%s/%s_%s_%s' % (lig_dir, lig_name, mode, ene_type)
    final_figure.save('%s.svg' % final_name)
    os.system('convert -density 100 %s.svg %s.pdf' % (final_name, final_name))
    # The intermediate panels are no longer needed once combined.
    os.system('rm -f %s %s' % (matrix_svg, struct_svg))

    # Write pdb for pymol
    inpdb = '%s/frame0/fsapt.pdb' % lig_dir
    outpdb = '%s_pymol.pdb' % final_name
    write_pymol_pdb(inpdb, outpdb, mean_df)
Exemplo n.º 6
0
import svgutils.compose as cg
from tqdm import tqdm


# Tile the CPU icon into c-by-c grids and save one SVG per grid size.
for c in tqdm([1, 2, 4, 9, 18]):

    # Width and height of the figure in centimetres.
    wh = "{}cm".format(16 * c / 12)

    # One scaled copy of the icon for every cell of the grid.
    cells = []
    for _cell in range(c * c):
        cells.append(cg.SVG('img/cpu.svg').scale(3))

    grid = cg.Figure(wh, wh, *cells).tile(c, c)
    grid.save("img/cpugrids/cpu1-" + str(c) + ".svg")
Exemplo n.º 7
0
def put_list_of_figs_to_svg_fig(
        FIGS,
        fig_name="fig.svg",
        initial_guess=True,
        visualize=False,
        export_as_png=False,
        Props=None,
        figsize=None,
        fontsize=9,
        SCALING_FACTOR=1.34,  # needed to get the right cm size ...
        with_top_left_letter=False,
        transparent=True):
    """Combine a list of figures into one multi-panel SVG file.

    Args:
        FIGS: List of panels. A ``str`` entry is used as the path of an
            existing SVG file; any other entry must provide
            ``get_size_inches()`` and ``savefig()`` (presumably a matplotlib
            figure - confirm) and is first saved as ``<index>.svg`` in the
            system temp directory.
        fig_name: Path of the combined SVG that is written.
        visualize: If true, run ``open <fig_name>`` through ``os.system``
            after saving.
        Props: Optional layout dict with ``'XCOORD'`` and ``'YCOORD'``
            (panel positions) and optionally ``'LABELS'``,
            ``'XCOORD_LABELS'`` and ``'YCOORD_LABELS'``. When ``None``, the
            panels are laid out on a three-column grid and labelled
            ``A``, ``B``, ...
        fontsize: Font size of the panel labels.
        SCALING_FACTOR: Scale applied to the assembled figure before saving.
        transparent: Forwarded to ``savefig`` for non-string panels.

    NOTE(review): ``initial_guess``, ``export_as_png``, ``figsize`` and
    ``with_top_left_letter`` are not read in the part of the body visible
    here - confirm whether they are still needed.
    """

    # Default labels: one uppercase letter per panel.
    label = list(string.ascii_uppercase)[:len(FIGS)]

    # Size of each panel; panels given as file paths count as 1 x 1.
    SIZE = []
    for fig in FIGS:
        if type(fig) == str:
            SIZE.append([1., 1.])
        else:
            SIZE.append(fig.get_size_inches())

    # The largest panel defines the cell size of the default grid.
    width = np.max([s[0] for s in SIZE])
    height = np.max([s[1] for s in SIZE])

    if Props is None:
        LABELS, XCOORD, YCOORD = [], [], []

        # Default layout: three panels per row, cells spaced by 100 units
        # per unit of panel size.
        for i in range(len(FIGS)):
            LABELS.append(label[i])
            XCOORD.append((i % 3) * width * 100)
            YCOORD.append(int(i / 3) * height * 100)
        # Labels are placed at the same coordinates as their panels.
        XCOORD_LABELS,\
            YCOORD_LABELS = XCOORD, YCOORD

    else:
        # Caller-provided layout.
        XCOORD, YCOORD = Props['XCOORD'],\
                Props['YCOORD'],
        if 'LABELS' in Props:
            LABELS = Props['LABELS']
        else:
            # No labels given: use an empty label for every panel.
            LABELS = ['' for x in XCOORD]
        if 'XCOORD_LABELS' in Props:
            XCOORD_LABELS,\
                YCOORD_LABELS = Props['XCOORD_LABELS'],\
                                Props['YCOORD_LABELS']
        else:
            XCOORD_LABELS,\
                YCOORD_LABELS = XCOORD, YCOORD

    # Resolve every panel to an SVG file on disk.
    LOCATIONS = []
    for i in range(len(FIGS)):
        if type(FIGS[i]) is str:
            LOCATIONS.append(FIGS[i])
        else:
            LOCATIONS.append(os.path.join(gettempdir(), str(i) + '.svg'))
            FIGS[i].savefig(LOCATIONS[-1],
                            format='svg',
                            transparent=transparent)

    # One panel per figure, moved to its position ...
    PANELS = []
    for i in range(len(FIGS)):
        PANELS.append(sg.Panel(\
            sg.SVG(LOCATIONS[i]).move(XCOORD[i],YCOORD[i])))

    # ... followed by one bold text panel per label.
    for i in range(len(LABELS)):
        PANELS.append(sg.Panel(\
            sg.Text(LABELS[i], 15, 10,
                    size=fontsize, weight='bold').move(\
                                                       XCOORD_LABELS[i],YCOORD_LABELS[i]))\
        )

    # The canvas is fixed at 21cm x 29.7cm regardless of `figsize`.
    sg.Figure("21cm", "29.7cm", *PANELS).scale(SCALING_FACTOR).save(fig_name)
    # if figsize is None:
    #     sg.Figure("21cm", "29.7cm", *PANELS).save(fig_name)
    # else:
    #     sg.Figure(str(inch2cm(figsize[0]*A0_format['width'])[0])+"cm",\
    #               str(inch2cm(figsize[1]*A0_format['height'])[0])+"cm",\
    #               *PANELS).scale(SCALING_FACTOR).save(fig_name)

    if visualize:
        os.system('open ' + fig_name)  # works well with 'Gapplin' on OS-X