Example #1
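A minimal sketch of the imports this example relies on (the source page omits them); retrieveAWSCredentials, checkS3File, and the genome_build_vars lookup table are helpers from the surrounding LDlink codebase and are assumed here, not defined:

import json
import math
import os
import subprocess

import yaml
from bson import json_util
from pymongo import MongoClient
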
def calculate_matrix_svg(snplst,
                         pop,
                         request,
                         genome_build,
                         r2_d="r2",
                         collapseTranscript=True):

    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file, Loader=yaml.FullLoader)  # explicit Loader required by newer PyYAML
    env = config['env']
    api_mongo_addr = config['api']['api_mongo_addr']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Open SNP list file
    snps_raw = open(snplst).readlines()

    # Remove duplicate RS numbers
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        if snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    pops = pop.split("+")
    pop_dirs = []
    for pop_i in pops:
        if pop_i in [
                "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB",
                "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH",
                "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL",
                "PJL", "PUR", "STU", "TSI", "YRI"
        ]:
            pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")

    get_pops = "cat " + " ".join(pop_dirs)
    pop_list = [
        x.decode('utf-8') for x in subprocess.Popen(
            get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()
    ]

    ids = [i.strip() for i in pop_list]
    pop_ids = list(set(ids))

    # Connect to Mongo snp database
    if env == 'local':
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    client = MongoClient(
        'mongodb://' + mongo_username + ':' + mongo_password + '@' +
        mongo_host + '/admin', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Query genomic coordinates
    def get_rsnum(db, coord):
        temp_coord = coord.strip("chr").split(":")
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({
            "chromosome":
            chro.upper() if chro == 'x' or chro == 'y' else str(chro),
            genome_build_vars[genome_build]['position']:
            str(pos)
        })
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            if snp_raw_i[0][0:2] == "rs":
                new_snp_lst.append(snp_raw_i)
            else:
                snp_info_lst = get_rsnum(db, snp_raw_i[0])
                print("snp_info_lst")
                print(snp_info_lst)
                if snp_info_lst != None:
                    if len(snp_info_lst) > 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        ref_variants = []
                        for snp_info in snp_info_lst:
                            if snp_info['id'] == snp_info['ref_id']:
                                ref_variants.append(snp_info['id'])
                        if len(ref_variants) > 1:
                            var_id = "rs" + ref_variants[0]
                        elif len(ref_variants) == 0 and len(snp_info_lst) > 1:
                            var_id = "rs" + snp_info_lst[0]['id']
                        else:
                            var_id = "rs" + ref_variants[0]
                        new_snp_lst.append([var_id])
                    elif len(snp_info_lst) == 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        new_snp_lst.append([var_id])
                    else:
                        new_snp_lst.append(snp_raw_i)
                else:
                    new_snp_lst.append(snp_raw_i)
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)

    # Find RS numbers in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    tabix_coords = ""
    for snp_i in snps:
        if len(snp_i) > 0:
            if len(snp_i[0]) > 2:
                if (snp_i[0][0:2] == "rs"
                        or snp_i[0][0:3] == "chr") and snp_i[0][-1].isdigit():
                    snp_coord = get_coords(db, snp_i[0])
                    if snp_coord != None and snp_coord[genome_build_vars[
                            genome_build]['position']] != "NA":
                        # check if variant is on chrY for genome build = GRCh38
                        if not (snp_coord['chromosome'] == "Y" and
                                (genome_build == "grch38"
                                 or genome_build == "grch38_high_coverage")):
                            rs_nums.append(snp_i[0])
                            snp_pos.append(snp_coord[
                                genome_build_vars[genome_build]['position']])
                            temp = [
                                snp_i[0], snp_coord['chromosome'],
                                snp_coord[genome_build_vars[genome_build]
                                          ['position']]
                            ]
                            snp_coords.append(temp)

    # Check max distance between SNPs
    distance_bp = []
    for i in range(len(snp_coords)):
        distance_bp.append(int(snp_coords[i][2]))

    # Sort coordinates and make tabix formatted coordinates
    snp_pos_int = [int(i) for i in snp_pos]
    snp_pos_int.sort()
    snp_coord_str = [
        genome_build_vars[genome_build]['1000G_chr_prefix'] +
        snp_coords[0][1] + ":" + str(i) + "-" + str(i) for i in snp_pos_int
    ]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        genome_build_vars[genome_build]['1000G_dir'],
        genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return (a1_n, a2_n)

    # Import SNP VCF files
    tabix_snps = export_s3_keys + " cd {2}; tabix -fhD {0}{1} | grep -v -e END".format(
        vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir +
        genome_build_vars[genome_build]['1000G_dir'])
    vcf = [
        x.decode('utf-8') for x in subprocess.Popen(
            tabix_snps, shell=True, stdout=subprocess.PIPE).stdout.readlines()
    ]

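    # Skip the VCF meta-information ("##") lines to reach the header row that lists sample IDs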
    h = 0
    while vcf[h][0:2] == "##":
        h += 1

    head = vcf[h].strip().split()

    # Extract haplotypes
    index = []
    for i in range(9, len(head)):
        if head[i] in pop_ids:
            index.append(i)

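    # hap1/hap2 collect the two phased haplotypes for each selected sample, one allele appended per variant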
    hap1 = [[]]
    for i in range(len(index) - 1):
        hap1.append([])
    hap2 = [[]]
    for i in range(len(index) - 1):
        hap2.append([])

    rsnum_lst = []
    allele_lst = []
    pos_lst = []

    for g in range(h + 1, len(vcf)):
        geno = vcf[g].strip().split()
        geno[0] = geno[0].lstrip('chr')
        if geno[1] not in snp_pos:
            continue

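        # Map this VCF position back to a query rsID; if several query variants share
        # the position, take the first one that has not been processed yet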
        if snp_pos.count(geno[1]) == 1:
            rs_query = rs_nums[snp_pos.index(geno[1])]

        else:
            pos_index = []
            for p in range(len(snp_pos)):
                if snp_pos[p] == geno[1]:
                    pos_index.append(p)
            for p in pos_index:
                if rs_nums[p] not in rsnum_lst:
                    rs_query = rs_nums[p]
                    break

        if rs_query in rsnum_lst:
            continue

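        # Reconcile the query rsID with the ID in the 1000G record: if they differ, scan a
        # couple of neighboring records for a match (skipping this record if one is found),
        # otherwise fall back to the query rsID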
        rs_1000g = geno[2]

        if rs_query == rs_1000g:
            rsnum = rs_1000g
        else:
            count = -2
            found = "false"
            while count <= 2 and count + g < len(vcf):
                geno_next = vcf[g + count].strip().split()
                geno_next[0] = geno_next[0].lstrip('chr')
                if len(geno_next) >= 3 and rs_query == geno_next[2]:
                    found = "true"
                    break
                count += 1

            if found == "false":
                indx = [i[0] for i in snps].index(rs_query)
                # snps[indx][0] = geno[2]
                # rsnum = geno[2]
                snps[indx][0] = rs_query
                rsnum = rs_query
            else:
                continue

        if "," not in geno[3] and "," not in geno[4]:
            a1, a2 = set_alleles(geno[3], geno[4])
            for i in range(len(index)):
                if geno[index[i]] == "0|0":
                    hap1[i].append(a1)
                    hap2[i].append(a1)
                elif geno[index[i]] == "0|1":
                    hap1[i].append(a1)
                    hap2[i].append(a2)
                elif geno[index[i]] == "1|0":
                    hap1[i].append(a2)
                    hap2[i].append(a1)
                elif geno[index[i]] == "1|1":
                    hap1[i].append(a2)
                    hap2[i].append(a2)
                elif geno[index[i]] == "0":
                    hap1[i].append(a1)
                    hap2[i].append(".")
                elif geno[index[i]] == "1":
                    hap1[i].append(a2)
                    hap2[i].append(".")
                else:
                    hap1[i].append(".")
                    hap2[i].append(".")

            rsnum_lst.append(rsnum)

            position = "chr" + geno[0] + ":" + geno[1] + "-" + geno[1]
            pos_lst.append(position)
            alleles = a1 + "/" + a2
            allele_lst.append(alleles)

    # Calculate Pairwise LD Statistics
    all_haps = hap1 + hap2
    ld_matrix = [[[None for v in range(2)] for i in range(len(all_haps[0]))]
                 for j in range(len(all_haps[0]))]

    for i in range(len(all_haps[0])):
        for j in range(i, len(all_haps[0])):
            hap = {}
            for k in range(len(all_haps)):
                # Extract haplotypes
                hap_k = all_haps[k][i] + all_haps[k][j]
                if hap_k in hap:
                    hap[hap_k] += 1
                else:
                    hap[hap_k] = 1

            # Remove Missing Haplotypes
            keys = list(hap.keys())
            for key in keys:
                if "." in key:
                    hap.pop(key, None)

            # Check all haplotypes are present
            if len(hap) != 4:
                snp_i_a = allele_lst[i].split("/")
                snp_j_a = allele_lst[j].split("/")
                haps = [
                    snp_i_a[0] + snp_j_a[0], snp_i_a[0] + snp_j_a[1],
                    snp_i_a[1] + snp_j_a[0], snp_i_a[1] + snp_j_a[1]
                ]
                for h in haps:
                    if h not in hap:
                        hap[h] = 0

            # Perform LD calculations
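            # A-D are the four two-locus haplotype counts (keys in sorted order);
            # delta = A*D - B*C, r2 = delta^2 / ((A+C)(B+D)(A+B)(C+D)), and D' is
            # |delta| normalized by its maximum possible value given the margins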
            A = hap[sorted(hap)[0]]
            B = hap[sorted(hap)[1]]
            C = hap[sorted(hap)[2]]
            D = hap[sorted(hap)[3]]
            tmax = max(A, B, C, D)
            delta = float(A * D - B * C)
            Ms = float((A + C) * (B + D) * (A + B) * (C + D))
            if Ms != 0:
                # D prime
                if delta < 0:
                    D_prime = round(
                        abs(delta / min((A + C) * (A + B), (B + D) * (C + D))),
                        3)
                else:
                    D_prime = round(
                        abs(delta / min((A + C) * (C + D), (A + B) * (B + D))),
                        3)

                # R2
                r2 = round((delta**2) / Ms, 3)

                # Find Correlated Alleles
                if str(r2) != "NA" and float(r2) > 0.1:
                    Ac = hap[sorted(hap)[0]]
                    Bc = hap[sorted(hap)[1]]
                    Cc = hap[sorted(hap)[2]]
                    Dc = hap[sorted(hap)[3]]

                    if ((Ac * Dc) / max((Bc * Cc), 0.01) > 1):
                        match = sorted(hap)[0][0] + "=" + sorted(
                            hap)[0][1] + "," + sorted(
                                hap)[3][0] + "=" + sorted(hap)[3][1]
                    else:
                        match = sorted(hap)[1][0] + "=" + sorted(
                            hap)[1][1] + "," + sorted(
                                hap)[2][0] + "=" + sorted(hap)[2][1]
                else:
                    match = "  =  ,  =  "
            else:
                D_prime = "NA"
                r2 = "NA"
                match = "  =  ,  =  "

            snp1 = rsnum_lst[i]
            snp2 = rsnum_lst[j]
            pos1 = pos_lst[i].split("-")[0]
            pos2 = pos_lst[j].split("-")[0]
            allele1 = allele_lst[i]
            allele2 = allele_lst[j]
            corr = match.split(",")[0].split("=")[1] + "=" + match.split(
                ",")[0].split("=")[0] + "," + match.split(",")[1].split(
                    "=")[1] + "=" + match.split(",")[1].split("=")[0]
            corr_f = match

            ld_matrix[i][j] = [
                snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2
            ]
            ld_matrix[j][i] = [
                snp2, snp1, allele2, allele1, corr_f, pos2, pos1, D_prime, r2
            ]

    # Generate Plot Variables
    out = [j for i in ld_matrix for j in i]
    xnames = []
    ynames = []
    xA = []
    yA = []
    corA = []
    xpos = []
    ypos = []
    D = []
    R = []
    box_color = []
    box_trans = []

    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

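    # Color the cells of the flattened n x n matrix: the lower triangle (column < row)
    # shades blue by |D'|, the upper triangle shades red by R2, the diagonal is purple,
    # and cells with missing values are gray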
    for i in range(len(out)):
        snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 = out[i]
        xnames.append(snp1)
        ynames.append(snp2)
        xA.append(allele1)
        yA.append(allele2)
        corA.append(corr)
        xpos.append(pos1)
        ypos.append(pos2)
        sqrti = math.floor(math.sqrt(len(out)))
        if sqrti == 0:
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(r2)
        elif i % sqrti < i // sqrti and r2 != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("blue")
            box_trans.append(abs(D_prime))
        elif i % sqrti > i // sqrti and D_prime != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(r2)
        elif i % sqrti == i // sqrti and D_prime != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("purple")
            box_trans.append(r2)
        else:
            D.append("NA")
            R.append("NA")
            box_color.append("gray")
            box_trans.append(0.1)
    # Import plotting modules
    from collections import OrderedDict
    from bokeh.embed import components, file_html
    from bokeh.layouts import gridplot
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
    from bokeh.resources import CDN
    from bokeh.io import export_svgs
    import svgutils.compose as sg
    from math import pi

    reset_output()

    # Aggregate Plotting Data
    x = []
    y = []
    w = []
    h = []
    coord_snps_plot = []
    snp_id_plot = []
    alleles_snp_plot = []
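    # The flattened matrix data has n*n entries; stepping by n (= sqrt of the total)
    # keeps one entry per variant for the connector and rug plots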
    for i in range(0, len(xpos), int(len(xpos)**0.5)):
        x.append(int(xpos[i].split(":")[1]) / 1000000.0)
        y.append(0.5)
        w.append(0.00003)
        h.append(1.06)
        coord_snps_plot.append(xpos[i])
        snp_id_plot.append(xnames[i])
        alleles_snp_plot.append(xA[i])

    buffer = (x[-1] - x[0]) * 0.025
    xr = Range1d(start=x[0] - buffer, end=x[-1] + buffer)
    yr = Range1d(start=-0.03, end=1.03)
    y2_ll = [-0.03] * len(x)
    y2_ul = [1.03] * len(x)

    yr_pos = Range1d(start=(x[-1] + buffer) * -1, end=(x[0] - buffer) * -1)
    yr0 = Range1d(start=0, end=1)
    yr2 = Range1d(start=0, end=3.8)
    yr3 = Range1d(start=0, end=1)

    spacing = (x[-1] - x[0] + buffer + buffer) / (len(x) * 1.0)
    x2 = []
    y0 = []
    y1 = []
    y2 = []
    y3 = []
    y4 = []
    for i in range(len(x)):
        x2.append(x[0] - buffer + spacing * (i + 0.5))
        y0.append(0)
        y1.append(0.20)
        y2.append(0.80)
        y3.append(1)
        y4.append(1.15)

    xname_pos = []
    for i in x2:
        for j in range(len(x2)):
            xname_pos.append(i)

    data = {
        'xname': xnames,
        'xname_pos': xname_pos,
        'yname': ynames,
        'xA': xA,
        'yA': yA,
        'xpos': xpos,
        'ypos': ypos,
        'R2': R,
        'Dp': D,
        'corA': corA,
        'box_color': box_color,
        'box_trans': box_trans
    }

    source = ColumnDataSource(data)

    threshold = 70
    if len(snps) < threshold:
        matrix_plot = figure(
            outline_line_color="white",
            min_border_top=0,
            min_border_bottom=2,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=list(reversed(rsnum_lst)),
            h_symmetry=False,
            v_symmetry=False,
            border_fill_color='white',
            x_axis_type=None,
            logo=None,
            tools="hover,undo,redo,reset,pan,box_zoom,previewsave",
            title=" ",
            plot_width=800,
            plot_height=700)

    else:
        matrix_plot = figure(
            outline_line_color="white",
            min_border_top=0,
            min_border_bottom=2,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=list(reversed(rsnum_lst)),
            h_symmetry=False,
            v_symmetry=False,
            border_fill_color='white',
            x_axis_type=None,
            y_axis_type=None,
            logo=None,
            tools="hover,undo,redo,reset,pan,box_zoom,previewsave",
            title=" ",
            plot_width=800,
            plot_height=700)

    matrix_plot.rect(x='xname_pos',
                     y='yname',
                     width=0.95 * spacing,
                     height=0.95,
                     source=source,
                     color="box_color",
                     alpha="box_trans",
                     line_color=None)

    matrix_plot.grid.grid_line_color = None
    matrix_plot.axis.axis_line_color = None
    matrix_plot.axis.major_tick_line_color = None
    if len(snps) < threshold:
        matrix_plot.axis.major_label_text_font_size = "8pt"
        matrix_plot.xaxis.major_label_orientation = "vertical"

    matrix_plot.axis.major_label_text_font_style = "normal"
    matrix_plot.xaxis.major_label_standoff = 0

    sup_2 = "\u00B2"

    hover = matrix_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Variant 1", " " + "@yname (@yA)"),
        ("Variant 2", " " + "@xname (@xA)"),
        ("D\'", " " + "@Dp"),
        ("R" + sup_2, " " + "@R2"),
        ("Correlated Alleles", " " + "@corA"),
    ])

    # Connecting and Rug Plots
    # Connector Plot
    if len(snps) < threshold:
        connector = figure(outline_line_color="white",
                           y_axis_type=None,
                           x_axis_type=None,
                           x_range=xr,
                           y_range=yr2,
                           border_fill_color='white',
                           title="",
                           min_border_left=100,
                           min_border_right=5,
                           min_border_top=0,
                           min_border_bottom=0,
                           h_symmetry=False,
                           v_symmetry=False,
                           plot_width=800,
                           plot_height=90,
                           tools="xpan,tap")
        connector.segment(x, y0, x, y1, color="black")
        connector.segment(x, y1, x2, y2, color="black")
        connector.segment(x2, y2, x2, y3, color="black")
        connector.text(x2,
                       y4,
                       text=snp_id_plot,
                       alpha=1,
                       angle=pi / 2,
                       text_font_size="8pt",
                       text_baseline="middle",
                       text_align="left")
    else:
        connector = figure(outline_line_color="white",
                           y_axis_type=None,
                           x_axis_type=None,
                           x_range=xr,
                           y_range=yr3,
                           border_fill_color='white',
                           title="",
                           min_border_left=100,
                           min_border_right=5,
                           min_border_top=0,
                           min_border_bottom=0,
                           h_symmetry=False,
                           v_symmetry=False,
                           plot_width=800,
                           plot_height=30,
                           tools="xpan,tap")
        connector.segment(x, y0, x, y1, color="black")
        connector.segment(x, y1, x2, y2, color="black")
        connector.segment(x2, y2, x2, y3, color="black")

    connector.yaxis.major_label_text_color = None
    connector.yaxis.minor_tick_line_alpha = 0  # Option does not work
    connector.yaxis.axis_label = " "
    connector.grid.grid_line_color = None
    connector.axis.axis_line_color = None
    connector.axis.major_tick_line_color = None
    connector.axis.minor_tick_line_color = None

    connector.toolbar_location = None

    data_rug = {
        'x': x,
        'y': y,
        'w': w,
        'h': h,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_rug = ColumnDataSource(data_rug)

    # Rug Plot
    rug = figure(x_range=xr,
                 y_range=yr,
                 y_axis_type=None,
                 title="",
                 min_border_top=1,
                 min_border_bottom=0,
                 min_border_left=100,
                 min_border_right=5,
                 h_symmetry=False,
                 v_symmetry=False,
                 plot_width=800,
                 plot_height=50,
                 tools="hover,xpan,tap")
    rug.rect(x='x',
             y='y',
             width='w',
             height='h',
             fill_color='red',
             dilate=True,
             line_color=None,
             fill_alpha=0.6,
             source=source_rug)

    hover = rug.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("SNP", "@snp_id_plot (@alleles_snp_plot)"),
        ("Coord", "@coord_snps_plot"),
    ])

    rug.toolbar_location = None

    if collapseTranscript == "false":
        # Gene Plot (All Transcripts)
        genes_file = tmp_dir + "genes_" + request + ".json"
        genes_raw = open(genes_file).readlines()

        genes_plot_start = []
        genes_plot_end = []
        genes_plot_y = []
        genes_plot_name = []
        exons_plot_x = []
        exons_plot_y = []
        exons_plot_w = []
        exons_plot_h = []
        exons_plot_name = []
        exons_plot_id = []
        exons_plot_exon = []
        message = ["Too many genes to plot."]
        lines = [0]
        gap = 80000
        tall = 0.75
        if genes_raw != None and len(genes_raw) > 0:
            for gene_raw_obj in genes_raw:
                gene_obj = json.loads(gene_raw_obj)
                bin = gene_obj["bin"]
                name_id = gene_obj["name"]
                chrom = gene_obj["chrom"]
                strand = gene_obj["strand"]
                txStart = gene_obj["txStart"]
                txEnd = gene_obj["txEnd"]
                cdsStart = gene_obj["cdsStart"]
                cdsEnd = gene_obj["cdsEnd"]
                exonCount = gene_obj["exonCount"]
                exonStarts = gene_obj["exonStarts"]
                exonEnds = gene_obj["exonEnds"]
                score = gene_obj["score"]
                name2 = gene_obj["name2"]
                cdsStartStat = gene_obj["cdsStartStat"]
                cdsEndStat = gene_obj["cdsEndStat"]
                exonFrames = gene_obj["exonFrames"]
                name = name2
                id = name_id
                e_start = exonStarts.split(",")
                e_end = exonEnds.split(",")

                # Determine Y Coordinate
                i = 0
                y_coord = None
                while y_coord == None:
                    if i > len(lines) - 1:
                        y_coord = i + 1
                        lines.append(int(txEnd))
                    elif int(txStart) > (gap + lines[i]):
                        y_coord = i + 1
                        lines[i] = int(txEnd)
                    else:
                        i += 1

                genes_plot_start.append(int(txStart) / 1000000.0)
                genes_plot_end.append(int(txEnd) / 1000000.0)
                genes_plot_y.append(y_coord)
                genes_plot_name.append(name + "  ")

                for i in range(len(e_start) - 1):
                    if strand == "+":
                        exon = i + 1
                    else:
                        exon = len(e_start) - 1 - i

                    width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                    x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                    exons_plot_x.append(x_coord)
                    exons_plot_y.append(y_coord)
                    exons_plot_w.append(width)
                    exons_plot_h.append(tall)
                    exons_plot_name.append(name)
                    exons_plot_id.append(id)
                    exons_plot_exon.append(exon)

        n_rows = len(lines)
        genes_plot_yn = [n_rows - w + 0.5 for w in genes_plot_y]
        exons_plot_yn = [n_rows - w + 0.5 for w in exons_plot_y]
        yr2 = Range1d(start=0, end=n_rows)

        data_gene_plot = {
            'exons_plot_x': exons_plot_x,
            'exons_plot_yn': exons_plot_yn,
            'exons_plot_w': exons_plot_w,
            'exons_plot_h': exons_plot_h,
            'exons_plot_name': exons_plot_name,
            'exons_plot_id': exons_plot_id,
            'exons_plot_exon': exons_plot_exon,
            'coord_snps_plot': coord_snps_plot,
            'snp_id_plot': snp_id_plot,
            'alleles_snp_plot': alleles_snp_plot
        }

        source_gene_plot = ColumnDataSource(data_gene_plot)

        max_genes = 40
        # if len(lines) < 3 or len(genes_raw) > max_genes:
        if len(lines) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=800,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        # if len(genes_raw) <= max_genes:
        gene_plot.segment(genes_plot_start,
                          genes_plot_yn,
                          genes_plot_end,
                          genes_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)
        gene_plot.rect(x='exons_plot_x',
                       y='exons_plot_yn',
                       width='exons_plot_w',
                       height='exons_plot_h',
                       source=source_gene_plot,
                       fill_color='grey',
                       line_color="grey")
        gene_plot.text(genes_plot_start,
                       genes_plot_yn,
                       text=genes_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

        # else:
        #     x_coord_text = x[0] + (x[-1] - x[0]) / 2.0
        #     gene_plot.text(x_coord_text, n_rows / 2.0, text=message, alpha=1,
        #                    text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

        gene_plot.xaxis.axis_label = "Chromosome " + \
            snp_coords[1][1] + " Coordinate (Mb)(" + genome_build_vars[genome_build]['title'] + ")"
        gene_plot.yaxis.axis_label = "Genes (All Transcripts)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # Gene Plot (Collapsed)
    else:
        genes_c_file = tmp_dir + "genes_c_" + request + ".json"
        genes_c_raw = open(genes_c_file).readlines()

        genes_c_plot_start = []
        genes_c_plot_end = []
        genes_c_plot_y = []
        genes_c_plot_name = []
        exons_c_plot_x = []
        exons_c_plot_y = []
        exons_c_plot_w = []
        exons_c_plot_h = []
        exons_c_plot_name = []
        exons_c_plot_id = []
        message_c = ["Too many genes to plot."]
        lines_c = [0]
        gap = 80000
        tall = 0.75
        if genes_c_raw != None and len(genes_c_raw) > 0:
            for gene_c_raw_obj in genes_c_raw:
                gene_c_obj = json.loads(gene_c_raw_obj)
                chrom = gene_c_obj["chrom"]
                txStart = gene_c_obj["txStart"]
                txEnd = gene_c_obj["txEnd"]
                exonStarts = gene_c_obj["exonStarts"]
                exonEnds = gene_c_obj["exonEnds"]
                name2 = gene_c_obj["name2"]
                transcripts = gene_c_obj["transcripts"]
                name = name2
                e_start = exonStarts.split(",")
                e_end = exonEnds.split(",")
                e_transcripts = transcripts.split(",")

                # Determine Y Coordinate
                i = 0
                y_coord = None
                while y_coord == None:
                    if i > len(lines_c) - 1:
                        y_coord = i + 1
                        lines_c.append(int(txEnd))
                    elif int(txStart) > (gap + lines_c[i]):
                        y_coord = i + 1
                        lines_c[i] = int(txEnd)
                    else:
                        i += 1

                genes_c_plot_start.append(int(txStart) / 1000000.0)
                genes_c_plot_end.append(int(txEnd) / 1000000.0)
                genes_c_plot_y.append(y_coord)
                genes_c_plot_name.append(name + "  ")

                # for i in range(len(e_start)):
                for i in range(len(e_start) - 1):
                    width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                    x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                    exons_c_plot_x.append(x_coord)
                    exons_c_plot_y.append(y_coord)
                    exons_c_plot_w.append(width)
                    exons_c_plot_h.append(tall)
                    exons_c_plot_name.append(name)
                    exons_c_plot_id.append(e_transcripts[i].replace("-", ","))

        n_rows_c = len(lines_c)
        genes_c_plot_yn = [n_rows_c - x + 0.5 for x in genes_c_plot_y]
        exons_c_plot_yn = [n_rows_c - x + 0.5 for x in exons_c_plot_y]
        yr2_c = Range1d(start=0, end=n_rows_c)

        data_gene_c_plot = {
            'exons_c_plot_x': exons_c_plot_x,
            'exons_c_plot_yn': exons_c_plot_yn,
            'exons_c_plot_w': exons_c_plot_w,
            'exons_c_plot_h': exons_c_plot_h,
            'exons_c_plot_name': exons_c_plot_name,
            'exons_c_plot_id': exons_c_plot_id
        }
        source_gene_c_plot = ColumnDataSource(data_gene_c_plot)
        max_genes_c = 40
        # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c:
        if len(lines_c) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines_c) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2_c,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=900,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        # if len(genes_c_raw) <= max_genes_c:
        gene_plot.segment(genes_c_plot_start,
                          genes_c_plot_yn,
                          genes_c_plot_end,
                          genes_c_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)
        gene_plot.rect(x='exons_c_plot_x',
                       y='exons_c_plot_yn',
                       width='exons_c_plot_w',
                       height='exons_c_plot_h',
                       source=source_gene_c_plot,
                       fill_color="grey",
                       line_color="grey")
        gene_plot.text(genes_c_plot_start,
                       genes_c_plot_yn,
                       text=genes_c_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_c_plot_name"),
            ("Transcript IDs", "@exons_c_plot_id"),
        ])

        # else:
        # 	x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0
        # 	gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1,
        # 				   text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

        gene_plot.xaxis.axis_label = "Chromosome " + snp_coords[1][
            1] + " Coordinate (Mb)(" + genome_build_vars[genome_build][
                'title'] + ")"
        gene_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    matrix_plot.output_backend = "svg"
    connector.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"
    export_svgs(matrix_plot,
                filename=tmp_dir + "matrix_plot_1_" + request + ".svg")
    export_svgs(connector,
                filename=tmp_dir + "connector_1_" + request + ".svg")
    export_svgs(rug, filename=tmp_dir + "rug_1_" + request + ".svg")
    export_svgs(gene_plot,
                filename=tmp_dir + "gene_plot_1_" + request + ".svg")

    # 1 pixel = 0.0264583333 cm
    svg_height = str(25.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(110.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs
    sg.Figure(
        "21.59cm", svg_height,
        sg.SVG(tmp_dir + "matrix_plot_1_" + request + ".svg"),
        sg.SVG(tmp_dir + "connector_1_" + request + ".svg").scale(.97).move(
            0, 700),
        sg.SVG(tmp_dir + "rug_1_" + request + ".svg").scale(.97).move(0, 790),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(.97).move(
            0, 840)).save(tmp_dir + "matrix_plot_" + request + ".svg")

    sg.Figure(
        "107.95cm", svg_height_scaled,
        sg.SVG(tmp_dir + "matrix_plot_1_" + request + ".svg").scale(5),
        sg.SVG(tmp_dir + "connector_1_" + request + ".svg").scale(4.85).move(
            0, 3500),
        sg.SVG(tmp_dir + "rug_1_" + request + ".svg").scale(4.85).move(
            0, 3930),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(4.85).move(
            0, 4160)).save(tmp_dir + "matrix_plot_scaled_" + request + ".svg")

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "matrix_plot_" +
                    request + ".svg " + tmp_dir + "matrix_plot_" + request +
                    ".pdf",
                    shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "matrix_plot_scaled_" + request + ".svg " + tmp_dir +
                    "matrix_plot_" + request + ".png",
                    shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "matrix_plot_scaled_" + request + ".svg " + tmp_dir +
                    "matrix_plot_" + request + ".jpeg",
                    shell=True)
    # Remove individual SVG files after they are combined
    subprocess.call("rm " + tmp_dir + "matrix_plot_1_" + request + ".svg",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "rug_1_" + request + ".svg", shell=True)
    subprocess.call("rm " + tmp_dir + "connector_1_" + request + ".svg",
                    shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + tmp_dir + "matrix_plot_scaled_" + request + ".svg",
                    shell=True)
    # Remove temporary file(s)
    subprocess.call("rm " + tmp_dir + "genes_*" + request + "*.json",
                    shell=True)

    reset_output()

    return None
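
A hypothetical invocation of the function above, assuming a config.yml next to the script and a plain-text variant list; every name below is illustrative only:

# Hypothetical call: "snps.txt" holds one rsID (or chr:pos) per line, populations are
# 1000 Genomes codes joined with "+", and the request token only suffixes the temporary
# output file names.
calculate_matrix_svg("snps.txt",
                     "CEU+YRI",
                     "1234567",
                     "grch37",      # genome build key expected in genome_build_vars
                     r2_d="r2",
                     collapseTranscript=True)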
Example #2
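As in the first example, the imports are omitted on the source page; a likely set is sketched below. retrieveAWSCredentials, checkS3File, chunkWindow, and genome_build_vars are assumed to come from the LDlink codebase, and Pool is assumed to be the thread-backed multiprocessing.dummy.Pool (the threading._children workaround later in the function is typical for that variant):

import json
import operator
import os
import subprocess
import threading
import time
import weakref
from multiprocessing.dummy import Pool

import yaml
from bson import json_util
from pymongo import MongoClient
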
def calculate_proxy_svg(snp,
                        pop,
                        request,
                        genome_build,
                        r2_d="r2",
                        window=500000,
                        collapseTranscript=True):

    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file, Loader=yaml.FullLoader)  # explicit Loader required by newer PyYAML
    env = config['env']
    connect_external = config['database']['connect_external']
    api_mongo_addr = config['database']['api_mongo_addr']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']
    aws_info = config['aws']
    num_subprocesses = config['performance']['num_subprocesses']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    if request is False:
        request = str(time.strftime("%I%M%S"))

    # Create JSON output

    # Find coordinates (GRCh37/hg19) or (GRCh38/hg38) for SNP RS number

    # Connect to Mongo snp database
    if env == 'local' or connect_external:
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    client = MongoClient(
        'mongodb://' + mongo_username + ':' + mongo_password + '@' +
        mongo_host + '/admin', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Query genomic coordinates
    def get_rsnum(db, coord):
        temp_coord = coord.strip("chr").split(":")
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({
            "chromosome":
            chro.upper() if chro == 'x' or chro == 'y' else str(chro),
            genome_build_vars[genome_build]['position']:
            str(pos)
        })
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coord_rsid(db, snp):
        if snp[0:2] == "rs":
            return snp
        else:
            snp_info_lst = get_rsnum(db, snp)
            print("snp_info_lst")
            print(snp_info_lst)
            if snp_info_lst != None:
                if len(snp_info_lst) > 1:
                    var_id = "rs" + snp_info_lst[0]['id']
                    ref_variants = []
                    for snp_info in snp_info_lst:
                        if snp_info['id'] == snp_info['ref_id']:
                            ref_variants.append(snp_info['id'])
                    if len(ref_variants) > 1:
                        var_id = "rs" + ref_variants[0]
                    elif len(ref_variants) == 0 and len(snp_info_lst) > 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                    else:
                        var_id = "rs" + ref_variants[0]
                    return var_id
                elif len(snp_info_lst) == 1:
                    var_id = "rs" + snp_info_lst[0]['id']
                    return var_id
                else:
                    return snp
            else:
                return snp
        return snp

    snp = replace_coord_rsid(db, snp)

    # Find RS number in snp database
    snp_coord = get_coords(db, snp)

    # Get population ids from LDproxy.py tmp output files
    pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines()
    ids = []
    for i in range(len(pop_list)):
        ids.append(pop_list[i].strip())

    pop_ids = list(set(ids))

    # Extract query SNP phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        genome_build_vars[genome_build]['1000G_dir'],
        genome_build_vars[genome_build]['1000G_file'] %
        (snp_coord['chromosome']))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

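    # Pull the VCF header (to locate sample columns) and then the single query-variant
    # record from the S3-hosted 1000G VCF via tabix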
    tabix_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(
        vcf_query_snp_file, data_dir + genotypes_dir +
        genome_build_vars[genome_build]['1000G_dir'])
    head = [
        x.decode('utf-8')
        for x in subprocess.Popen(tabix_snp_h,
                                  shell=True,
                                  stdout=subprocess.PIPE).stdout.readlines()
    ][0].strip().split()

    tabix_snp = export_s3_keys + " cd {4}; tabix -D {0} {1}:{2}-{2} | grep -v -e END > {3}".format(
        vcf_query_snp_file, genome_build_vars[genome_build]['1000G_chr_prefix']
        + snp_coord['chromosome'],
        snp_coord[genome_build_vars[genome_build]['position']],
        tmp_dir + "snp_no_dups_" + request + ".vcf", data_dir + genotypes_dir +
        genome_build_vars[genome_build]['1000G_dir'])
    subprocess.call(tabix_snp, shell=True)

    # Check SNP is in the 1000G population, has the correct RS number, and not
    # monoallelic
    vcf = open(tmp_dir + "snp_no_dups_" + request + ".vcf").readlines()

    if len(vcf) == 0:
        subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                        shell=True)
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)
        return None
    elif len(vcf) > 1:
        geno = []
        for i in range(len(vcf)):
            # if vcf[i].strip().split()[2] == snp:
            geno = vcf[i].strip().split()
            geno[0] = geno[0].lstrip('chr')
        if geno == []:
            subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                            shell=True)
            subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf",
                            shell=True)
            return None
    else:
        geno = vcf[0].strip().split()
        geno[0] = geno[0].lstrip('chr')

    if geno[2] != snp and snp[0:2] == "rs" and "rs" in geno[2]:
        snp = geno[2]

    if "," in geno[3] or "," in geno[4]:
        subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                        shell=True)
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)
        return None

    index = []
    for i in range(9, len(head)):
        if head[i] in pop_ids:
            index.append(i)

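    # Count reference ("0") and alternate ("1") alleles across the selected samples;
    # a variant that is monoallelic in the chosen populations is dropped below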
    genotypes = {"0": 0, "1": 0}
    for i in index:
        sub_geno = geno[i].split("|")
        for j in sub_geno:
            if j in genotypes:
                genotypes[j] += 1
            else:
                genotypes[j] = 1

    if genotypes["0"] == 0 or genotypes["1"] == 0:
        subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                        shell=True)
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)
        return None

    # Define window of interest around query SNP
    # window = 500000
    coord1 = int(
        snp_coord[genome_build_vars[genome_build]['position']]) - window
    if coord1 < 0:
        coord1 = 0
    coord2 = int(
        snp_coord[genome_build_vars[genome_build]['position']]) + window

    # Calculate proxy LD statistics in parallel
    # threads = 4
    # block = (2 * window) // 4
    # block = (2 * window) // num_subprocesses
    windowChunkRanges = chunkWindow(
        int(snp_coord[genome_build_vars[genome_build]['position']]), window,
        num_subprocesses)

    commands = []
    # for i in range(num_subprocesses):
    #     if i == min(range(num_subprocesses)) and i == max(range(num_subprocesses)):
    #         command = "python3 LDproxy_sub.py " + "True " + snp + " " + \
    #             snp_coord['chromosome'] + " " + str(coord1) + " " + \
    #             str(coord2) + " " + request + " " + str(i)
    #     elif i == min(range(num_subprocesses)):
    #         command = "python3 LDproxy_sub.py " + "True " + snp + " " + \
    #             snp_coord['chromosome'] + " " + str(coord1) + " " + \
    #             str(coord1 + block) + " " + request + " " + str(i)
    #     elif i == max(range(num_subprocesses)):
    #         command = "python3 LDproxy_sub.py " + "True " + snp + " " + snp_coord['chromosome'] + " " + str(
    #             coord1 + (block * i) + 1) + " " + str(coord2) + " " + request + " " + str(i)
    #     else:
    #         command = "python3 LDproxy_sub.py " + "True " + snp + " " + snp_coord['chromosome'] + " " + str(coord1 + (
    #             block * i) + 1) + " " + str(coord1 + (block * (i + 1))) + " " + request + " " + str(i)
    #     commands.append(command)

    for subprocess_id in range(num_subprocesses):
        getWindowVariantsArgs = " ".join([
            "True",
            str(snp),
            str(snp_coord['chromosome']),
            str(windowChunkRanges[subprocess_id][0]),
            str(windowChunkRanges[subprocess_id][1]),
            str(request), genome_build,
            str(subprocess_id)
        ])
        commands.append("python3 LDproxy_sub.py " + getWindowVariantsArgs)

    processes = [
        subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        for command in commands
    ]

    # collect output in parallel
    def get_output(process):
        return process.communicate()[0].splitlines()

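    # Workaround for thread pools under some WSGI servers, where the main thread may
    # lack the _children attribute that Pool expects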
    if not hasattr(threading.current_thread(), "_children"):
        threading.current_thread()._children = weakref.WeakKeyDictionary()

    pool = Pool(len(processes))
    out_raw = pool.map(get_output, processes)
    pool.close()
    pool.join()

    # Aggregate output
    out_prox = []
    for i in range(len(out_raw)):
        for j in range(len(out_raw[i])):
            col = out_raw[i][j].decode('utf-8').strip().split("\t")
            col[6] = int(col[6])
            col[7] = float(col[7])
            col[8] = float(col[8])
            col.append(abs(int(col[6])))
            out_prox.append(col)

    # Sort output
    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    out_dist_sort = sorted(out_prox, key=operator.itemgetter(14))
    if r2_d == "r2":
        out_ld_sort = sorted(out_dist_sort,
                             key=operator.itemgetter(8),
                             reverse=True)
    else:
        out_ld_sort = sorted(out_dist_sort,
                             key=operator.itemgetter(7),
                             reverse=True)

    # Organize scatter plot data
    q_rs = []
    q_allele = []
    q_coord = []
    q_maf = []
    p_rs = []
    p_allele = []
    p_coord = []
    p_maf = []
    dist = []
    d_prime = []
    d_prime_round = []
    r2 = []
    r2_round = []
    corr_alleles = []
    regdb = []
    funct = []
    color = []
    size = []
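    # Each LDproxy_sub output line is tab-separated: query rs/allele/coord, proxy
    # rs/allele/coord, distance, D', R2, correlated alleles, RegulomeDB score,
    # query MAF, proxy MAF, functional class, plus the absolute distance appended above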
    for i in range(len(out_ld_sort)):
        q_rs_i, q_allele_i, q_coord_i, p_rs_i, p_allele_i, p_coord_i, dist_i, d_prime_i, r2_i, corr_alleles_i, regdb_i, q_maf_i, p_maf_i, funct_i, dist_abs = out_ld_sort[
            i]

        if float(r2_i) > 0.01:
            q_rs.append(q_rs_i)
            q_allele.append(q_allele_i)
            q_coord.append(float(q_coord_i.split(":")[1]) / 1000000)
            q_maf.append(str(round(float(q_maf_i), 4)))
            if p_rs_i == ".":
                p_rs_i = p_coord_i
            p_rs.append(p_rs_i)
            p_allele.append(p_allele_i)
            p_coord.append(float(p_coord_i.split(":")[1]) / 1000000)
            p_maf.append(str(round(float(p_maf_i), 4)))
            dist.append(str(round(dist_i / 1000000.0, 4)))
            d_prime.append(float(d_prime_i))
            d_prime_round.append(str(round(float(d_prime_i), 4)))
            r2.append(float(r2_i))
            r2_round.append(str(round(float(r2_i), 4)))
            corr_alleles.append(corr_alleles_i)

            # Correct Missing Annotations
            if regdb_i == ".":
                regdb_i = ""
            regdb.append(regdb_i)
            if funct_i == ".":
                funct_i = ""
            if funct_i == "NA":
                funct_i = "none"
            funct.append(funct_i)

            # Set Color
            if i == 0:
                color_i = "blue"
            elif funct_i != "none" and funct_i != "":
                color_i = "red"
            else:
                color_i = "orange"
            color.append(color_i)

            # Set Size
            size_i = 9 + float(p_maf_i) * 14.0
            size.append(size_i)

    # Begin Bokeh Plotting
    from collections import OrderedDict
    from bokeh.embed import components, file_html
    from bokeh.layouts import gridplot
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
    from bokeh.resources import CDN
    from bokeh.io import export_svgs
    import svgutils.compose as sg

    reset_output()

    # Proxy Plot
    x = p_coord
    if r2_d == "r2":
        y = r2
    else:
        y = d_prime
    whitespace = 0.01
    xr = Range1d(start=coord1 / 1000000.0 - whitespace,
                 end=coord2 / 1000000.0 + whitespace)
    yr = Range1d(start=-0.03, end=1.03)
    sup_2 = "\u00B2"

    proxy_plot = figure(
        title="Proxies for " + snp + " in " + pop,
        min_border_top=2,
        min_border_bottom=2,
        min_border_left=60,
        min_border_right=60,
        h_symmetry=False,
        v_symmetry=False,
        plot_width=900,
        plot_height=600,
        x_range=xr,
        y_range=yr,
        tools="hover,tap,pan,box_zoom,box_select,undo,redo,reset,previewsave",
        logo=None,
        toolbar_location="above")

    proxy_plot.title.align = "center"

    # Add recombination rate from LDproxy.py output file
    recomb_file = tmp_dir + "recomb_" + request + ".json"
    recomb_raw = open(recomb_file).readlines()

    recomb_x = []
    recomb_y = []

    for recomb_raw_obj in recomb_raw:
        recomb_obj = json.loads(recomb_raw_obj)
        recomb_x.append(
            int(recomb_obj[genome_build_vars[genome_build]['position']]) /
            1000000.0)
        recomb_y.append(float(recomb_obj['rate']) / 100.0)

    data = {
        'x': x,
        'y': y,
        'qrs': q_rs,
        'q_alle': q_allele,
        'q_maf': q_maf,
        'prs': p_rs,
        'p_alle': p_allele,
        'p_maf': p_maf,
        'dist': dist,
        'r': r2_round,
        'd': d_prime_round,
        'alleles': corr_alleles,
        'regdb': regdb,
        'funct': funct,
        'size': size,
        'color': color
    }
    source = ColumnDataSource(data)

    proxy_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5)

    proxy_plot.circle(x='x',
                      y='y',
                      size='size',
                      color='color',
                      alpha=0.5,
                      source=source)

    hover = proxy_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Query Variant", "@qrs @q_alle"),
        ("Proxy Variant", "@prs @p_alle"),
        ("Distance (Mb)", "@dist"),
        ("MAF (Query,Proxy)", "@q_maf,@p_maf"),
        ("R" + sup_2, "@r"),
        ("D\'", "@d"),
        ("Correlated Alleles", "@alleles"),
        ("RegulomeDB", "@regdb"),
        ("Functional Class", "@funct"),
    ])

    proxy_plot.text(x,
                    y,
                    text=regdb,
                    alpha=1,
                    text_font_size="7pt",
                    text_baseline="middle",
                    text_align="center",
                    angle=0)

    if r2_d == "r2":
        proxy_plot.yaxis.axis_label = "R" + sup_2
    else:
        proxy_plot.yaxis.axis_label = "D\'"

    proxy_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)}
    proxy_plot.add_layout(
        LinearAxis(y_range_name="y2_axis",
                   axis_label="Combined Recombination Rate (cM/Mb)"), "right")

    # Rug Plot
    y2_ll = [-0.03] * len(x)
    y2_ul = [1.03] * len(x)
    yr_rug = Range1d(start=-0.03, end=1.03)

    data_rug = {
        'x': x,
        'y': y,
        'y2_ll': y2_ll,
        'y2_ul': y2_ul,
        'qrs': q_rs,
        'q_alle': q_allele,
        'q_maf': q_maf,
        'prs': p_rs,
        'p_alle': p_allele,
        'p_maf': p_maf,
        'dist': dist,
        'r': r2_round,
        'd': d_prime_round,
        'alleles': corr_alleles,
        'regdb': regdb,
        'funct': funct,
        'size': size,
        'color': color
    }
    source_rug = ColumnDataSource(data_rug)

    rug = figure(x_range=xr,
                 y_range=yr_rug,
                 border_fill_color='white',
                 y_axis_type=None,
                 title="",
                 min_border_top=2,
                 min_border_bottom=2,
                 min_border_left=60,
                 min_border_right=60,
                 h_symmetry=False,
                 v_symmetry=False,
                 plot_width=900,
                 plot_height=50,
                 tools="xpan,tap",
                 logo=None)

    rug.segment(x0='x',
                y0='y2_ll',
                x1='x',
                y1='y2_ul',
                source=source_rug,
                color='color',
                alpha=0.5,
                line_width=1)
    rug.toolbar_location = None

    if collapseTranscript == "false":
        # Gene Plot (All Transcripts)
        genes_file = tmp_dir + "genes_" + request + ".json"
        genes_raw = open(genes_file).readlines()

        genes_plot_start = []
        genes_plot_end = []
        genes_plot_y = []
        genes_plot_name = []
        exons_plot_x = []
        exons_plot_y = []
        exons_plot_w = []
        exons_plot_h = []
        exons_plot_name = []
        exons_plot_id = []
        exons_plot_exon = []
        lines = [0]
        gap = 80000
        tall = 0.75
        if genes_raw != None and len(genes_raw) > 0:
            for gene_raw_obj in genes_raw:
                gene_obj = json.loads(gene_raw_obj)
                bin = gene_obj["bin"]
                name_id = gene_obj["name"]
                chrom = gene_obj["chrom"]
                strand = gene_obj["strand"]
                txStart = gene_obj["txStart"]
                txEnd = gene_obj["txEnd"]
                cdsStart = gene_obj["cdsStart"]
                cdsEnd = gene_obj["cdsEnd"]
                exonCount = gene_obj["exonCount"]
                exonStarts = gene_obj["exonStarts"]
                exonEnds = gene_obj["exonEnds"]
                score = gene_obj["score"]
                name2 = gene_obj["name2"]
                cdsStartStat = gene_obj["cdsStartStat"]
                cdsEndStat = gene_obj["cdsEndStat"]
                exonFrames = gene_obj["exonFrames"]
                name = name2
                id = name_id
                e_start = exonStarts.split(",")
                e_end = exonEnds.split(",")

                # Determine Y Coordinate
                i = 0
                y_coord = None
                while y_coord == None:
                    if i > len(lines) - 1:
                        y_coord = i + 1
                        lines.append(int(txEnd))
                    elif int(txStart) > (gap + lines[i]):
                        y_coord = i + 1
                        lines[i] = int(txEnd)
                    else:
                        i += 1

                genes_plot_start.append(int(txStart) / 1000000.0)
                genes_plot_end.append(int(txEnd) / 1000000.0)
                genes_plot_y.append(y_coord)
                genes_plot_name.append(name + "  ")

                for i in range(len(e_start) - 1):
                    if strand == "+":
                        exon = i + 1
                    else:
                        exon = len(e_start) - 1 - i

                    width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                    x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                    exons_plot_x.append(x_coord)
                    exons_plot_y.append(y_coord)
                    exons_plot_w.append(width)
                    exons_plot_h.append(tall)
                    exons_plot_name.append(name)
                    exons_plot_id.append(id)
                    exons_plot_exon.append(exon)

        n_rows = len(lines)
        genes_plot_yn = [n_rows - x + 0.5 for x in genes_plot_y]
        exons_plot_yn = [n_rows - x + 0.5 for x in exons_plot_y]
        yr2 = Range1d(start=0, end=n_rows)

        data_gene_plot = {
            'exons_plot_x': exons_plot_x,
            'exons_plot_yn': exons_plot_yn,
            'exons_plot_w': exons_plot_w,
            'exons_plot_h': exons_plot_h,
            'exons_plot_name': exons_plot_name,
            'exons_plot_id': exons_plot_id,
            'exons_plot_exon': exons_plot_exon
        }

        source_gene_plot = ColumnDataSource(data_gene_plot)

        if len(lines) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines) - 2) * 50

        gene_plot = figure(
            x_range=xr,
            y_range=yr2,
            border_fill_color='white',
            title="",
            min_border_top=2,
            min_border_bottom=2,
            min_border_left=60,
            min_border_right=60,
            h_symmetry=False,
            v_symmetry=False,
            plot_width=900,
            plot_height=plot_h_pix,
            tools="hover,tap,xpan,box_zoom,undo,redo,reset,previewsave",
            logo=None)

        gene_plot.segment(genes_plot_start,
                          genes_plot_yn,
                          genes_plot_end,
                          genes_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)

        gene_plot.rect(x='exons_plot_x',
                       y='exons_plot_yn',
                       width='exons_plot_w',
                       height='exons_plot_h',
                       source=source_gene_plot,
                       fill_color="grey",
                       line_color="grey")
        gene_plot.xaxis.axis_label = "Chromosome " + snp_coord[
            'chromosome'] + " Coordinate (Mb)(" + genome_build_vars[
                genome_build]['title'] + ")"
        gene_plot.yaxis.axis_label = "Genes (All Transcripts)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

        gene_plot.text(genes_plot_start,
                       genes_plot_yn,
                       text=genes_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)

        gene_plot.toolbar_location = "below"

    # Gene Plot (Collapsed)
    else:
        genes_c_file = tmp_dir + "genes_c_" + request + ".json"
        genes_c_raw = open(genes_c_file).readlines()

        genes_c_plot_start = []
        genes_c_plot_end = []
        genes_c_plot_y = []
        genes_c_plot_name = []
        exons_c_plot_x = []
        exons_c_plot_y = []
        exons_c_plot_w = []
        exons_c_plot_h = []
        exons_c_plot_name = []
        exons_c_plot_id = []
        message_c = ["Too many genes to plot."]
        lines_c = [0]
        gap = 80000
        tall = 0.75
        if genes_c_raw != None and len(genes_c_raw) > 0:
            for gene_c_raw_obj in genes_c_raw:
                gene_c_obj = json.loads(gene_c_raw_obj)
                chrom = gene_c_obj["chrom"]
                txStart = gene_c_obj["txStart"]
                txEnd = gene_c_obj["txEnd"]
                exonStarts = gene_c_obj["exonStarts"]
                exonEnds = gene_c_obj["exonEnds"]
                name2 = gene_c_obj["name2"]
                transcripts = gene_c_obj["transcripts"]
                name = name2
                e_start = exonStarts.split(",")
                e_end = exonEnds.split(",")
                e_transcripts = transcripts.split(",")

                # Determine Y Coordinate
                i = 0
                y_coord = None
                while y_coord == None:
                    if i > len(lines_c) - 1:
                        y_coord = i + 1
                        lines_c.append(int(txEnd))
                    elif int(txStart) > (gap + lines_c[i]):
                        y_coord = i + 1
                        lines_c[i] = int(txEnd)
                    else:
                        i += 1

                genes_c_plot_start.append(int(txStart) / 1000000.0)
                genes_c_plot_end.append(int(txEnd) / 1000000.0)
                genes_c_plot_y.append(y_coord)
                genes_c_plot_name.append(name + "  ")

                # for i in range(len(e_start)):
                for i in range(len(e_start) - 1):
                    width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                    x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                    exons_c_plot_x.append(x_coord)
                    exons_c_plot_y.append(y_coord)
                    exons_c_plot_w.append(width)
                    exons_c_plot_h.append(tall)
                    exons_c_plot_name.append(name)
                    exons_c_plot_id.append(e_transcripts[i].replace("-", ","))

        n_rows_c = len(lines_c)
        genes_c_plot_yn = [n_rows_c - x + 0.5 for x in genes_c_plot_y]
        exons_c_plot_yn = [n_rows_c - x + 0.5 for x in exons_c_plot_y]
        yr2_c = Range1d(start=0, end=n_rows_c)

        data_gene_c_plot = {
            'exons_c_plot_x': exons_c_plot_x,
            'exons_c_plot_yn': exons_c_plot_yn,
            'exons_c_plot_w': exons_c_plot_w,
            'exons_c_plot_h': exons_c_plot_h,
            'exons_c_plot_name': exons_c_plot_name,
            'exons_c_plot_id': exons_c_plot_id
        }
        source_gene_c_plot = ColumnDataSource(data_gene_c_plot)
        max_genes_c = 40
        # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c:
        if len(lines_c) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines_c) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2_c,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=900,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        # if len(genes_c_raw) <= max_genes_c:
        gene_plot.segment(genes_c_plot_start,
                          genes_c_plot_yn,
                          genes_c_plot_end,
                          genes_c_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)
        gene_plot.rect(x='exons_c_plot_x',
                       y='exons_c_plot_yn',
                       width='exons_c_plot_w',
                       height='exons_c_plot_h',
                       source=source_gene_c_plot,
                       fill_color="grey",
                       line_color="grey")
        gene_plot.text(genes_c_plot_start,
                       genes_c_plot_yn,
                       text=genes_c_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_c_plot_name"),
            ("Transcript IDs", "@exons_c_plot_id"),
        ])

        # else:
        # 	x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0
        # 	gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1,
        # 				   text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

        gene_plot.xaxis.axis_label = "Chromosome " + snp_coord[
            'chromosome'] + " Coordinate (Mb)(" + genome_build_vars[
                genome_build]['title'] + ")"
        gene_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    proxy_plot.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"
    export_svgs(proxy_plot,
                filename=tmp_dir + "proxy_plot_1_" + request + ".svg")
    export_svgs(gene_plot,
                filename=tmp_dir + "gene_plot_1_" + request + ".svg")

    # 1 pixel = 0.0264583333 cm
    svg_height = str(20.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(100.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs
    sg.Figure("24.59cm", svg_height,
              sg.SVG(tmp_dir + "proxy_plot_1_" + request + ".svg"),
              sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move(
                  0, 630)).save(tmp_dir + "proxy_plot_" + request + ".svg")

    sg.Figure(
        "122.95cm", svg_height_scaled,
        sg.SVG(tmp_dir + "proxy_plot_1_" + request + ".svg").scale(5),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(5).move(
            0, 3150)).save(tmp_dir + "proxy_plot_scaled_" + request + ".svg")

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "proxy_plot_" +
                    request + ".svg " + tmp_dir + "proxy_plot_" + request +
                    ".pdf",
                    shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "proxy_plot_scaled_" + request + ".svg " + tmp_dir +
                    "proxy_plot_" + request + ".png",
                    shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "proxy_plot_scaled_" + request + ".svg " + tmp_dir +
                    "proxy_plot_" + request + ".jpeg",
                    shell=True)
    # Remove individual SVG files after they are combined
    subprocess.call("rm " + tmp_dir + "proxy_plot_1_" + request + ".svg",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg",
                    shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + tmp_dir + "proxy_plot_scaled_" + request + ".svg",
                    shell=True)

    reset_output()

    # Remove temporary files
    subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True)
    subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)
    subprocess.call("rm " + tmp_dir + "genes_*" + request + "*.json",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "recomb_" + request + ".txt", shell=True)

    # Return plot output
    return None
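
# Illustrative sketch (not part of the original listing): the "Determine Y
# Coordinate" loops in the gene tracks above place each transcript on the
# first row whose right-most end lies more than `gap` bp to the left of the
# transcript's start, opening a new row otherwise. A stand-alone version of
# that greedy row packing, under the same 80 kb gap assumption:
def pack_rows(intervals, gap=80000):
    """intervals: list of (txStart, txEnd) ints; returns a 1-based row per interval."""
    row_ends = [0]  # right-most txEnd currently occupying each row
    rows = []
    for tx_start, tx_end in intervals:
        i = 0
        row = None
        while row is None:
            if i > len(row_ends) - 1:  # no existing row fits, open a new one
                row = i + 1
                row_ends.append(tx_end)
            elif tx_start > gap + row_ends[i]:  # row i has enough clearance
                row = i + 1
                row_ends[i] = tx_end
            else:
                i += 1
        rows.append(row)
    return rows

# pack_rows([(1000000, 1050000), (1060000, 1400000), (1200000, 1300000)]) -> [1, 2, 1]
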
Example #3
0
# Get population ids
pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines()
ids = []
for i in range(len(pop_list)):
    ids.append(pop_list[i].strip())

pop_ids = list(set(ids))

# Get VCF region
vcf_filePath = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir,
                               genome_build_vars[genome_build]['1000G_dir'],
                               genome_build_vars[genome_build]["1000G_file"] %
                               (chr))
vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)


# Define function to correct indel alleles
def set_alleles(a1, a2):
    if len(a1) == 1 and len(a2) == 1:
        a1_n = a1
        a2_n = a2
    elif len(a1) == 1 and len(a2) > 1:
        a1_n = "-"
        a2_n = a2[1:]
    elif len(a1) > 1 and len(a2) == 1:
        a1_n = a1[1:]
        a2_n = "-"
    elif len(a1) > 1 and len(a2) > 1:
        a1_n = a1[1:]
        a2_n = a2[1:]
    return (a1_n, a2_n)
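
# Minimal usage sketch (not from the original listing): set_alleles() trims the
# shared leading base of VCF-style indel alleles so deletions/insertions are
# shown as "-" versus the remaining bases, while simple SNP alleles pass
# through unchanged.
print(set_alleles("A", "G"))    # ('A', 'G')  : biallelic SNP, unchanged
print(set_alleles("A", "AT"))   # ('-', 'T')  : 1 bp insertion
print(set_alleles("ATT", "A"))  # ('TT', '-') : 2 bp deletion
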
Example #4
0
def get_ld_stats(variantPair, pop_ids):
    # parse ld pair array parameter input
    snp1 = variantPair[0]
    snp1_coord = {
        "chromosome": variantPair[1],
        genome_build_vars[genome_build]['position']: variantPair[2]
    }
    snp2 = variantPair[3]
    snp2_coord = {
        "chromosome": variantPair[4],
        genome_build_vars[genome_build]['position']: variantPair[5]
    }

    # errors/warnings encountered
    output = {"error": [], "warning": []}
    # Extract 1000 Genomes phased genotypes
    # SNP1
    vcf_filePath1 = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        genome_build_vars[genome_build]['1000G_dir'],
        genome_build_vars[genome_build]['1000G_file'] %
        snp1_coord['chromosome'])
    vcf_query_snp_file1 = "s3://%s/%s" % (config['aws']['bucket'],
                                          vcf_filePath1)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath1)

    tabix_snp1_offset = export_s3_keys + " cd {3}; tabix -D {0} {1}:{2}-{2} | grep -v -e END".format(
        vcf_query_snp_file1,
        genome_build_vars[genome_build]['1000G_chr_prefix'] +
        snp1_coord['chromosome'],
        snp1_coord[genome_build_vars[genome_build]['position']], data_dir +
        genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
    vcf1_offset = [
        x.decode('utf-8')
        for x in subprocess.Popen(tabix_snp1_offset,
                                  shell=True,
                                  stdout=subprocess.PIPE).stdout.readlines()
    ]

    # SNP2
    vcf_filePath2 = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        genome_build_vars[genome_build]['1000G_dir'],
        genome_build_vars[genome_build]['1000G_file'] %
        snp2_coord['chromosome'])
    vcf_query_snp_file2 = "s3://%s/%s" % (config['aws']['bucket'],
                                          vcf_filePath2)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath2)

    tabix_snp2_offset = export_s3_keys + " cd {3}; tabix -D {0} {1}:{2}-{2} | grep -v -e END".format(
        vcf_query_snp_file2,
        genome_build_vars[genome_build]['1000G_chr_prefix'] +
        snp2_coord['chromosome'],
        snp2_coord[genome_build_vars[genome_build]['position']], data_dir +
        genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
    vcf2_offset = [
        x.decode('utf-8')
        for x in subprocess.Popen(tabix_snp2_offset,
                                  shell=True,
                                  stdout=subprocess.PIPE).stdout.readlines()
    ]

    vcf1_pos = snp1_coord[genome_build_vars[genome_build]['position']]
    vcf2_pos = snp2_coord[genome_build_vars[genome_build]['position']]
    vcf1 = vcf1_offset
    vcf2 = vcf2_offset

    # SNP1
    if len(vcf1) == 0:
        output["error"].append(snp1 + " is not in 1000G reference panel.")
        return {
            "r2": "NA",
            "D_prime": "NA",
            "p": "NA",
            "alleles": "NA",
            "output": output
        }
    elif len(vcf1) > 1:
        geno1 = []
        for i in range(len(vcf1)):
            if vcf1[i].strip().split()[2] == snp1:
                geno1 = vcf1[i].strip().split()
                geno1[0] = geno1[0].lstrip('chr')
        if geno1 == []:
            output["error"].append(snp1 + " is not in 1000G reference panel.")
            return {
                "r2": "NA",
                "D_prime": "NA",
                "p": "NA",
                "alleles": "NA",
                "output": output
            }
    else:
        geno1 = vcf1[0].strip().split()
        geno1[0] = geno1[0].lstrip('chr')
    if geno1[2] != snp1 and snp1[0:2] == "rs" and "rs" in geno1[2]:
        output["warning"].append(
            "Genomic position for query variant1 (" + snp1 +
            ") does not match RS number at 1000G position (chr" + geno1[0] +
            ":" + geno1[1] + " = " + geno1[2] + ")")
        snp1 = geno1[2]
    if "," in geno1[3] or "," in geno1[4]:
        output["error"].append(snp1 + " is not a biallelic variant.")
        return {
            "r2": "NA",
            "D_prime": "NA",
            "p": "NA",
            "alleles": "NA",
            "output": output
        }
    if len(geno1[3]) == 1 and len(geno1[4]) == 1:
        snp1_a1 = geno1[3]
        snp1_a2 = geno1[4]
    elif len(geno1[3]) == 1 and len(geno1[4]) > 1:
        snp1_a1 = "-"
        snp1_a2 = geno1[4][1:]
    elif len(geno1[3]) > 1 and len(geno1[4]) == 1:
        snp1_a1 = geno1[3][1:]
        snp1_a2 = "-"
    elif len(geno1[3]) > 1 and len(geno1[4]) > 1:
        snp1_a1 = geno1[3][1:]
        snp1_a2 = geno1[4][1:]
    allele1 = {
        "0|0": [snp1_a1, snp1_a1],
        "0|1": [snp1_a1, snp1_a2],
        "1|0": [snp1_a2, snp1_a1],
        "1|1": [snp1_a2, snp1_a2],
        "0": [snp1_a1, "."],
        "1": [snp1_a2, "."],
        "./.": [".", "."],
        ".": [".", "."]
    }
    # SNP2
    if len(vcf2) == 0:
        output["error"].append(snp2 + " is not in 1000G reference panel.")
        return {
            "r2": "NA",
            "D_prime": "NA",
            "p": "NA",
            "alleles": "NA",
            "output": output
        }
    elif len(vcf2) > 1:
        geno2 = []
        for i in range(len(vcf2)):
            if vcf2[i].strip().split()[2] == snp2:
                geno2 = vcf2[i].strip().split()
                geno2[0] = geno2[0].lstrip('chr')
        if geno2 == []:
            output["error"].append(snp2 + " is not in 1000G reference panel.")
            return {
                "r2": "NA",
                "D_prime": "NA",
                "p": "NA",
                "alleles": "NA",
                "output": output
            }
    else:
        geno2 = vcf2[0].strip().split()
        geno2[0] = geno2[0].lstrip('chr')
    if geno2[2] != snp2 and snp2[0:2] == "rs" and "rs" in geno2[2]:
        output["warning"].append(
            "Genomic position for query variant2 (" + snp2 +
            ") does not match RS number at 1000G position (chr" + geno2[0] +
            ":" + geno2[1] + " = " + geno2[2] + ")")
        snp2 = geno2[2]
    if "," in geno2[3] or "," in geno2[4]:
        output["error"].append(snp2 + " is not a biallelic variant.")
        return {
            "r2": "NA",
            "D_prime": "NA",
            "p": "NA",
            "alleles": "NA",
            "output": output
        }
    if len(geno2[3]) == 1 and len(geno2[4]) == 1:
        snp2_a1 = geno2[3]
        snp2_a2 = geno2[4]
    elif len(geno2[3]) == 1 and len(geno2[4]) > 1:
        snp2_a1 = "-"
        snp2_a2 = geno2[4][1:]
    elif len(geno2[3]) > 1 and len(geno2[4]) == 1:
        snp2_a1 = geno2[3][1:]
        snp2_a2 = "-"
    elif len(geno2[3]) > 1 and len(geno2[4]) > 1:
        snp2_a1 = geno2[3][1:]
        snp2_a2 = geno2[4][1:]
    allele2 = {
        "0|0": [snp2_a1, snp2_a1],
        "0|1": [snp2_a1, snp2_a2],
        "1|0": [snp2_a2, snp2_a1],
        "1|1": [snp2_a2, snp2_a2],
        "0": [snp2_a1, "."],
        "1": [snp2_a2, "."],
        "./.": [".", "."],
        ".": [".", "."]
    }

    if geno1[1] != vcf1_pos:
        output["error"].append(
            "VCF File does not match variant coordinates for SNP1.")
        return {
            "r2": "NA",
            "D_prime": "NA",
            "p": "NA",
            "alleles": "NA",
            "output": output
        }
    if geno2[1] != vcf2_pos:
        output["error"].append(
            "VCF File does not match variant coordinates for SNP2.")
        return {
            "r2": "NA",
            "D_prime": "NA",
            "p": "NA",
            "alleles": "NA",
            "output": output
        }

    # Get headers
    tabix_snp1_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(
        vcf_query_snp_file1, data_dir + genotypes_dir +
        genome_build_vars[genome_build]['1000G_dir'])
    head1 = [
        x.decode('utf-8')
        for x in subprocess.Popen(tabix_snp1_h,
                                  shell=True,
                                  stdout=subprocess.PIPE).stdout.readlines()
    ][0].strip().split()
    tabix_snp2_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(
        vcf_query_snp_file2, data_dir + genotypes_dir +
        genome_build_vars[genome_build]['1000G_dir'])
    head2 = [
        x.decode('utf-8')
        for x in subprocess.Popen(tabix_snp2_h,
                                  shell=True,
                                  stdout=subprocess.PIPE).stdout.readlines()
    ][0].strip().split()

    # Combine phased genotypes
    geno = {}
    for i in range(9, len(head1)):
        geno[head1[i]] = [allele1[geno1[i]], ".."]
    for i in range(9, len(head2)):
        if head2[i] in geno:
            geno[head2[i]][1] = allele2[geno2[i]]

    # Extract haplotypes
    hap = {}
    for ind in pop_ids:
        if ind in geno:
            hap1 = geno[ind][0][0] + "_" + geno[ind][1][0]
            hap2 = geno[ind][0][1] + "_" + geno[ind][1][1]
            if hap1 in hap:
                hap[hap1] += 1
            else:
                hap[hap1] = 1
            if hap2 in hap:
                hap[hap2] += 1
            else:
                hap[hap2] = 1

    # Remove missing haplotypes
    keys = list(hap.keys())
    for key in keys:
        if "." in key:
            hap.pop(key, None)
    # Check all haplotypes are present
    if len(hap) != 4:
        snp1_a = [snp1_a1, snp1_a2]
        snp2_a = [snp2_a1, snp2_a2]
        haps = [
            snp1_a[0] + "_" + snp2_a[0], snp1_a[0] + "_" + snp2_a[1],
            snp1_a[1] + "_" + snp2_a[0], snp1_a[1] + "_" + snp2_a[1]
        ]
        for i in haps:
            if i not in hap:
                hap[i] = 0

    # Sort haplotypes
    A = hap[sorted(hap)[0]]
    B = hap[sorted(hap)[1]]
    C = hap[sorted(hap)[2]]
    D = hap[sorted(hap)[3]]
    N = A + B + C + D
    # tmax = max(A, B, C, D)

    hap1 = sorted(hap, key=hap.get, reverse=True)[0]
    hap2 = sorted(hap, key=hap.get, reverse=True)[1]
    # hap3 = sorted(hap, key=hap.get, reverse=True)[2]
    # hap4 = sorted(hap, key=hap.get, reverse=True)[3]

    delta = float(A * D - B * C)
    Ms = float((A + C) * (B + D) * (A + B) * (C + D))
    # print("Ms=", Ms)
    if Ms != 0:
        # D prime
        if delta < 0:
            D_prime = abs(delta / min((A + C) * (A + B), (B + D) * (C + D)))
        else:
            D_prime = abs(delta / min((A + C) * (C + D), (A + B) * (B + D)))
        # R2
        r2 = (delta**2) / Ms
    else:
        output["error"].append("Variant MAF is 0.0, variant removed.")
        return {"r2": "NA", "D_prime": "NA", "alleles": "NA", "output": output}

    allele1 = str(sorted(hap)[0].split("_")[1])
    allele1_freq = str(round(float(A + C) /
                             N, 3)) if N > float(A + C) else "NA"

    allele2 = str(sorted(hap)[1].split("_")[1])
    allele2_freq = str(round(float(B + D) /
                             N, 3)) if N > float(B + D) else "NA"

    alleles = ", ".join(
        ["=".join([allele1, allele1_freq]), "=".join([allele2, allele2_freq])])

    return {"r2": r2, "D_prime": D_prime, "alleles": alleles, "output": output}
Example #5
0
def calculate_hap(snplst, pop, request, web, genome_build):
    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    dbsnp_version = config['data']['dbsnp_version']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    population_samples_dir = config['data']['population_samples_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']

    # Create JSON output
    output = {}

    # Validate genome build param
    if genome_build not in genome_build_vars['vars']:
        output[
            "error"] = "Invalid genome build. Please specify either " + ", ".join(
                genome_build_vars['vars']) + ". " + str(
                    output["warning"] if "warning" in output else "")
        return (json.dumps(output, sort_keys=True, indent=2))

    # Open Inputted SNPs list file
    snps_raw = open(snplst).readlines()
    if len(snps_raw) > 30:
        output["error"] = "Maximum variant list is 30 RS numbers or coordinates. Your list contains " + \
            str(len(snps_raw))+" entries. " + str(output["warning"] if "warning" in output else "")
        return (json.dumps(output, sort_keys=True, indent=2))

    # Remove duplicate RS numbers and cast to lower case
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.lower().strip().split()
        if snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    pops = pop.split("+")
    pop_dirs = []
    for pop_i in pops:
        if pop_i in [
                "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB",
                "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH",
                "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL",
                "PJL", "PUR", "STU", "TSI", "YRI"
        ]:
            pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")
        else:
            output[
                "error"] = pop_i + " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI. " + str(
                    output["warning"] if "warning" in output else "")
            return (json.dumps(output, sort_keys=True, indent=2))

    get_pops = "cat " + " ".join(pop_dirs)
    pop_list = [
        x.decode('utf-8') for x in subprocess.Popen(
            get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()
    ]

    ids = [i.strip() for i in pop_list]
    pop_ids = list(set(ids))

    db = connectMongoDBReadOnly(web)

    def get_coords(db, rsid):
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Query genomic coordinates
    def get_rsnum(db, coord):
        temp_coord = coord.strip("chr").split(":")
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({
            "chromosome":
            chro.upper() if chro == 'x' or chro == 'y' else str(chro),
            genome_build_vars[genome_build]['position']:
            str(pos)
        })
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            if snp_raw_i[0][0:2] == "rs":
                new_snp_lst.append(snp_raw_i)
            else:
                snp_info_lst = get_rsnum(db, snp_raw_i[0])
                # print("snp_info_lst")
                # print(snp_info_lst)
                if snp_info_lst != None:
                    if len(snp_info_lst) > 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        ref_variants = []
                        for snp_info in snp_info_lst:
                            if snp_info['id'] == snp_info['ref_id']:
                                ref_variants.append(snp_info['id'])
                        if len(ref_variants) > 1:
                            var_id = "rs" + ref_variants[0]
                            if "warning" in output:
                                output["warning"] = output["warning"] + \
                                "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp_raw_i[0] + ". "
                            else:
                                output["warning"] = "Multiple rsIDs (" + ", ".join(
                                    ["rs" + ref_id for ref_id in ref_variants]
                                ) + ") map to genomic coordinates " + snp_raw_i[
                                    0] + ". "
                        elif len(ref_variants) == 0 and len(snp_info_lst) > 1:
                            var_id = "rs" + snp_info_lst[0]['id']
                            if "warning" in output:
                                output["warning"] = output["warning"] + \
                                "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp_raw_i[0] + ". "
                            else:
                                output["warning"] = "Multiple rsIDs (" + ", ".join(
                                    ["rs" + ref_id for ref_id in ref_variants]
                                ) + ") map to genomic coordinates " + snp_raw_i[
                                    0] + ". "
                        else:
                            var_id = "rs" + ref_variants[0]
                        new_snp_lst.append([var_id])
                    elif len(snp_info_lst) == 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        new_snp_lst.append([var_id])
                    else:
                        new_snp_lst.append(snp_raw_i)
                else:
                    new_snp_lst.append(snp_raw_i)
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)
    # print("Input SNPs (replace genomic coords with RSIDs)", str(snps))
    # Find RS numbers and genomic coords in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    warn = []
    tabix_coords = ""
    for snp_i in snps:
        if len(snp_i) > 0:  # skip empty input lines
            if len(snp_i[0]) > 2:  # each variant string must be longer than 2 characters
                # Check that the input starts with "rs" or "chr" and ends with a digit
                if (snp_i[0][0:2] == "rs"
                        or snp_i[0][0:3] == "chr") and snp_i[0][-1].isdigit():
                    snp_coord = get_coords(db, snp_i[0])
                    if snp_coord != None and snp_coord[genome_build_vars[
                            genome_build]['position']] != "NA":
                        # check if variant is on chrY for genome build = GRCh38
                        if snp_coord['chromosome'] == "Y" and (
                                genome_build == "grch38"
                                or genome_build == "grch38_high_coverage"):
                            if "warning" in output:
                                output["warning"] = output["warning"] + \
                                    "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord['id'] + " = chr" + snp_coord['chromosome'] + ":" + snp_coord[genome_build_vars[genome_build]['position']] + "). "
                            else:
                                output[
                                    "warning"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord[
                                        'id'] + " = chr" + snp_coord[
                                            'chromosome'] + ":" + snp_coord[
                                                genome_build_vars[genome_build]
                                                ['position']] + "). "
                            warn.append(snp_i[0])
                        else:
                            rs_nums.append(snp_i[0])
                            snp_pos.append(snp_coord[
                                genome_build_vars[genome_build]['position']])
                            temp = [
                                snp_i[0], snp_coord['chromosome'],
                                snp_coord[genome_build_vars[genome_build]
                                          ['position']]
                            ]
                            snp_coords.append(temp)
                    else:
                        warn.append(snp_i[0])
                else:
                    warn.append(snp_i[0])
            else:
                warn.append(snp_i[0])

    if warn != []:
        if "warning" in output:
            output["warning"] = output["warning"] + \
                "The following RS number(s) or coordinate(s) inputs have warnings: " + ", ".join(warn) + ". "
        else:
            output[
                "warning"] = "The following RS number(s) or coordinate(s) inputs have warnings: " + ", ".join(
                    warn) + ". "

    if len(rs_nums) == 0:
        output[
            "error"] = "Input variant list does not contain any valid RS numbers or coordinates. " + str(
                output["warning"] if "warning" in output else "")
        return (json.dumps(output, sort_keys=True, indent=2))

    # Check SNPs are all on the same chromosome
    for i in range(len(snp_coords)):
        if snp_coords[0][1] != snp_coords[i][1]:
            output["error"] = "Not all input variants are on the same chromosome: "+snp_coords[i-1][0]+"=chr" + \
                str(snp_coords[i-1][1])+":"+str(snp_coords[i-1][2])+", "+snp_coords[i][0] + \
                "=chr"+str(snp_coords[i][1])+":"+str(snp_coords[i][2])+". " + str(output["warning"] if "warning" in output else "")
            return (json.dumps(output, sort_keys=True, indent=2))

    # Check max distance between SNPs
    distance_bp = []
    for i in range(len(snp_coords)):
        distance_bp.append(int(snp_coords[i][2]))
    distance_max = max(distance_bp) - min(distance_bp)
    if distance_max > 1000000:
        if "warning" in output:
            output["warning"] = output["warning"] + \
                "Switch rate errors become more common as distance between query variants increases (Query range = "+str(
                    distance_max)+" bp). "
        else:
            output[
                "warning"] = "Switch rate errors become more common as distance between query variants increases (Query range = " + str(
                    distance_max) + " bp). "

    # Sort coordinates and make tabix formatted coordinates
    snp_pos_int = [int(i) for i in snp_pos]
    snp_pos_int.sort()
    # keep track of rs and snp position after sort
    rs_snp_pos = []
    for i in snp_pos_int:
        rs_snp_pos.append(snp_pos.index(str(i)))

    snp_coord_str = [
        genome_build_vars[genome_build]['1000G_chr_prefix'] +
        snp_coords[0][1] + ":" + str(i) + "-" + str(i) for i in snp_pos_int
    ]
    tabix_coords = " " + " ".join(snp_coord_str)
    #print("tabix_coords", tabix_coords)
    # # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        genome_build_vars[genome_build]['1000G_dir'],
        genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)
    vcf, h = retrieveTabix1000GData(
        vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir +
        genome_build_vars[genome_build]['1000G_dir'])

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return (a1_n, a2_n)

    # Make sure there are genotype data in VCF file
    #if vcf[-1][0:6] == "#CHROM":
    #    output["error"] = "No query variants were found in 1000G VCF file. " + str(output["warning"] if "warning" in output else "")
    #    return(json.dumps(output, sort_keys=True, indent=2))

    head = vcf[h].strip().split()

    # Extract haplotypes
    index = []
    for i in range(9, len(head)):
        if head[i] in pop_ids:
            index.append(i)

    hap1 = [[]]
    for i in range(len(index) - 1):
        hap1.append([])
    hap2 = [[]]
    for i in range(len(index) - 1):
        hap2.append([])

    # parse vcf
    snp_dict, missing_snp = parse_vcf(vcf[h + 1:], snp_coords, True)
    # throw error if no data is returned from 1000G
    if len(missing_snp.split()) == len(snp_pos):
        output[
            "error"] = "Input variant list does not contain any valid RS numbers or coordinates. " + str(
                output["warning"] if "warning" in output else "")
        return (json.dumps(output, sort_keys=True, indent=2))

    if len(missing_snp) > 0:
        output["warning"] = "Query variant " + str(
            missing_snp) + " is missing from 1000G (" + genome_build_vars[
                genome_build]['title'] + ") data. " + str(
                    output["warning"] if "warning" in output else "")

    rsnum_lst = []
    allele_lst = []
    pos_lst = []

    for s_key in snp_dict:
        # parse snp_key such as chr7:pos_rs4
        snp_keys = s_key.split("_")
        snp_key = snp_keys[0].split(':')[1]
        rs_input = snp_keys[1]
        geno_list = snp_dict[s_key]
        g = -1
        for geno in geno_list:
            g = g + 1
            geno = geno.strip().split()
            geno[0] = geno[0].lstrip('chr')
            # if 1000G position does not match dbSNP position for variant, use dbSNP position
            if geno[1] != snp_key:
                mismatch_msg = "Genomic position ("+geno[1]+") in 1000G data does not match dbSNP" + \
                        dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant " +\
                        rs_input + ". "
                if "warning" in output:
                    output["warning"] = output["warning"] + mismatch_msg
                else:
                    output["warning"] = mismatch_msg
                # fall back to the dbSNP search coordinate for downstream output
                geno[1] = snp_key

            if "," not in geno[3] and "," not in geno[4]:
                a1, a2 = set_alleles(geno[3], geno[4])
                count0 = 0
                count1 = 0
                #print(geno)
                for i in range(len(index)):
                    if geno[index[i]] == "0|0":
                        hap1[i].append(a1)
                        hap2[i].append(a1)
                        count0 += 2
                    elif geno[index[i]] == "0|1":
                        hap1[i].append(a1)
                        hap2[i].append(a2)
                        count0 += 1
                        count1 += 1
                    elif geno[index[i]] == "1|0":
                        hap1[i].append(a2)
                        hap2[i].append(a1)
                        count0 += 1
                        count1 += 1
                    elif geno[index[i]] == "1|1":
                        hap1[i].append(a2)
                        hap2[i].append(a2)
                        count1 += 2
                    elif geno[index[i]] == "0":
                        hap1[i].append(a1)
                        hap2[i].append(".")
                        count0 += 1
                    elif geno[index[i]] == "1":
                        hap1[i].append(a2)
                        hap2[i].append(".")
                        count1 += 1
                    else:
                        hap1[i].append(".")
                        hap2[i].append(".")
                rsnum_lst.append(rs_input)
                position = "chr" + geno[0] + ":" + geno[1]
                pos_lst.append(position)
                f0 = round(float(count0) / (count0 + count1), 4)
                f1 = round(float(count1) / (count0 + count1), 4)
                if f0 >= f1:
                    alleles = a1+"="+str(round(f0, 3))+", " + \
                        a2+"="+str(round(f1, 3))
                else:
                    alleles = a2+"="+str(round(f1, 3))+", " + \
                        a1+"="+str(round(f0, 3))
                allele_lst.append(alleles)

    haps = {}
    for i in range(len(index)):
        h1 = "_".join(hap1[i])
        h2 = "_".join(hap2[i])
        if h1 in haps:
            haps[h1] += 1
        else:
            haps[h1] = 1

        if h2 in haps:
            haps[h2] += 1
        else:
            haps[h2] = 1

    # Remove Missing Haplotypes
    keys = list(haps.keys())
    for key in keys:
        if "." in key:
            haps.pop(key, None)

    # Sort results
    results = []
    for hap in haps:
        temp = [hap, haps[hap]]
        results.append(temp)

    total_haps = sum(haps.values())

    results_sort1 = sorted(results, key=operator.itemgetter(0))
    results_sort2 = sorted(results_sort1,
                           key=operator.itemgetter(1),
                           reverse=True)

    # Generate JSON output
    digits = len(str(len(results_sort2)))
    haps_out = {}
    for i in range(len(results_sort2)):
        hap_info = {}
        hap_info["Haplotype"] = results_sort2[i][0]
        hap_info["Count"] = results_sort2[i][1]
        hap_info["Frequency"] = round(
            float(results_sort2[i][1]) / total_haps, 4)
        haps_out["haplotype_" + (digits - len(str(i + 1))) * "0" +
                 str(i + 1)] = hap_info
    output["haplotypes"] = haps_out

    digits = len(str(len(rsnum_lst)))
    snps_out = {}
    for i in range(len(rsnum_lst)):
        snp_info = {}
        snp_info["RS"] = rsnum_lst[i]
        snp_info["Alleles"] = allele_lst[i]
        snp_info["Coord"] = pos_lst[i]
        snps_out["snp_" + (digits - len(str(i + 1))) * "0" +
                 str(i + 1)] = snp_info
    output["snps"] = snps_out

    # Create SNP File
    snp_out = open(tmp_dir + "snps_" + request + ".txt", "w")
    print("RS_Number\tPosition (" +
          genome_build_vars[genome_build]['title_hg'] + ")\tAllele Frequency",
          file=snp_out)
    for k in sorted(output["snps"].keys()):
        rs_k = output["snps"][k]["RS"]
        coord_k = output["snps"][k]["Coord"]
        alleles_k0 = output["snps"][k]["Alleles"].strip(" ").split(",")
        alleles_k1 = alleles_k0[0]+"0"*(7-len(str(alleles_k0[0]))) + \
            ","+alleles_k0[1]+"0"*(8-len(str(alleles_k0[1])))
        temp_k = [rs_k, coord_k, alleles_k1]
        print("\t".join(temp_k), file=snp_out)
    snp_out.close()

    # Create Haplotype File
    hap_out = open(tmp_dir + "haplotypes_" + request + ".txt", "w")
    print("Haplotype\tCount\tFrequency", file=hap_out)
    for k in sorted(output["haplotypes"].keys()):
        hap_k = output["haplotypes"][k]["Haplotype"]
        count_k = str(output["haplotypes"][k]["Count"])
        freq_k = str(output["haplotypes"][k]["Frequency"])
        temp_k = [hap_k, count_k, freq_k]
        print("\t".join(temp_k), file=hap_out)
    hap_out.close()

    # Return JSON output
    return (json.dumps(output, sort_keys=True, indent=2))
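
# Hypothetical caller sketch (file name, request id and genome build string are
# assumptions, not from the listing): calculate_hap() returns a JSON string
# whose "haplotypes" and "snps" entries use zero-padded keys such as
# "haplotype_01" so that plain string sorting preserves frequency/input order.
import json  # already imported by the original module

result = json.loads(calculate_hap("snps_example.txt", "CEU+YRI", "req123", True, "grch37"))
if "error" in result:
    print(result["error"])
else:
    for key in sorted(result["haplotypes"]):
        hap = result["haplotypes"][key]
        print(hap["Haplotype"], hap["Count"], hap["Frequency"])
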
Example #6
0
def calculate_pair(snp_pairs, pop, web, genome_build, request):

    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    connect_external = config['database']['connect_external']
    api_mongo_addr = config['database']['api_mongo_addr']
    dbsnp_version = config['data']['dbsnp_version']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Create JSON output
    output_list = []

    snp_pair_limit = 10
    
    # Return an error if the number of SNP pairs exceeds the limit
    if len(snp_pairs) > snp_pair_limit:
        error_out = [{
            "error": "Maximum SNP pair list is " + str(snp_pair_limit) + " pairs. Your list contains " + str(len(snp_pairs)) + " pairs."
        }]
        return(json.dumps(error_out, sort_keys=True, indent=2))

    # Validate genome build param
    # print("genome_build " + genome_build)
    if genome_build not in genome_build_vars['vars']:
        error_out = [{
            "error": "Invalid genome build. Please specify either " + ", ".join(genome_build_vars['vars']) + "."
        }]
        return(json.dumps(error_out, sort_keys=True, indent=2))

    # Select desired ancestral populations
    pops = pop.split("+")
    pop_dirs = []
    for pop_i in pops:
        if pop_i in ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]:
            pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")
        else:
            error_out = [{
                "error": pop_i + " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI."
            }]
            return(json.dumps(error_out, sort_keys=True, indent=2))

    get_pops = "cat " + " ".join(pop_dirs)
    pop_list = [x.decode('utf-8') for x in subprocess.Popen(get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()]

    ids = [i.strip() for i in pop_list]
    pop_ids = list(set(ids))

    # Connect to Mongo snp database
    if env == 'local' or connect_external:
        mongo_host = api_mongo_addr
    else: 
        mongo_host = 'localhost'
    if web:
        client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
    else:
        if env == 'local' or connect_external:
            client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
        else:
            client = MongoClient('localhost', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coord_rsid(db, snp):
        if snp[0:2] == "rs":
            return snp
        else:
            snp_info_lst = get_rsnum(db, snp, genome_build)
            print("snp_info_lst")
            print(snp_info_lst)
            if snp_info_lst != None:
                if len(snp_info_lst) > 1:
                    var_id = "rs" + snp_info_lst[0]['id']
                    ref_variants = []
                    for snp_info in snp_info_lst:
                        if snp_info['id'] == snp_info['ref_id']:
                            ref_variants.append(snp_info['id'])
                    if len(ref_variants) > 1:
                        var_id = "rs" + ref_variants[0]
                        if "warning" in output:
                            output["warning"] = output["warning"] + \
                            "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp + ". "
                        else:
                            output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp + ". "
                    elif len(ref_variants) == 0 and len(snp_info_lst) > 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        if "warning" in output:
                            output["warning"] = output["warning"] + \
                            "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp + ". "
                        else:
                            output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp + ". "
                    else:
                        var_id = "rs" + ref_variants[0]
                    return var_id
                elif len(snp_info_lst) == 1:
                    var_id = "rs" + snp_info_lst[0]['id']
                    return var_id
                else:
                    return snp
            else:
                return snp
        return snp

    if len(snp_pairs) < 1:
        output = {}
        output["error"] = "Missing at least 1 SNP pair input. " + str(output["warning"] if "warning" in output else "")
        output_list.append(output)

    for pair in snp_pairs:
        output = {}
        output["pair"] = pair

        if len(pair) < 2 or len(pair) > 2 or len(pair[0]) < 3 or len(pair[1]) < 3:
            output["error"] = "Missing or additional SNPs in pair. " + str(output["warning"] if "warning" in output else "")
            output_list.append(output)
            continue

        # trim any whitespace
        snp1 = pair[0].lower().strip()
        snp2 = pair[1].lower().strip()

        # Find RS numbers in snp database
        # SNP1
        if re.compile(r'rs\d+', re.IGNORECASE).match(snp1) is None and re.compile(r'chr\d+:\d+', re.IGNORECASE).match(snp1) is None and re.compile(r'chr[X|Y]:\d+', re.IGNORECASE).match(snp1) is None:
            output["error"] = snp1 + " is not a valid SNP. " + str(output["warning"] if "warning" in output else "")
            output_list.append(output)
            continue
        snp1 = replace_coord_rsid(db, snp1)
        snp1_coord = get_coords(db, snp1)
        if snp1_coord == None or snp1_coord[genome_build_vars[genome_build]['position']] == "NA":
            output["error"] = snp1 + " is not in dbSNP build " + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + "). " + str(output["warning"] if "warning" in output else "")
            output_list.append(output)
            continue

        # SNP2
        if re.compile(r'rs\d+', re.IGNORECASE).match(snp2) is None and re.compile(r'chr\d+:\d+', re.IGNORECASE).match(snp2) is None and re.compile(r'chr[X|Y]:\d+', re.IGNORECASE).match(snp2) is None:
            output["error"] = snp1 + " is not a valid SNP. " + str(output["warning"] if "warning" in output else "")
            output_list.append(output)
            continue
        snp2 = replace_coord_rsid(db, snp2)
        snp2_coord = get_coords(db, snp2)
        if snp2_coord == None or snp2_coord[genome_build_vars[genome_build]['position']] == "NA":
            output["error"] = snp2 + " is not in dbSNP build " + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + "). " + str(output["warning"] if "warning" in output else "")
            output_list.append(output)
            continue

        # Check if SNPs are on the same chromosome
        if snp1_coord['chromosome'] != snp2_coord['chromosome']:
            if "warning" in output:
                output["warning"] = output["warning"] + snp1 + " and " + snp2 + " are on different chromosomes. "
            else:
                output["warning"] = snp1 + " and " + snp2 + " are on different chromosomes. "

        # Check if input SNPs are on chromosome Y while genome build == grch38
        # SNP1
        if snp1_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"):
            output["error"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp1_coord['id'] + " - chr" + snp1_coord['chromosome'] + ":" + snp1_coord[genome_build_vars[genome_build]['position']] + "). " + str(output["warning"] if "warning" in output else "")
            output_list.append(output) 
            continue

        # SNP2
        if snp2_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"):
            output["error"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp2_coord['id'] + " - chr" + snp2_coord['chromosome'] + ":" + snp2_coord[genome_build_vars[genome_build]['position']] + "). " + str(output["warning"] if "warning" in output else "")
            output_list.append(output)
            continue

        # Extract 1000 Genomes phased genotypes

        # SNP1
        vcf_filePath1 = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % snp1_coord['chromosome'])
        vcf_file1 = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath1)

        checkS3File(aws_info, config['aws']['bucket'], vcf_filePath1)

        tabix_snp1_offset = export_s3_keys + " cd {3}; tabix -D {0} {1}:{2}-{2} | grep -v -e END".format(
            vcf_file1, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp1_coord['chromosome'], snp1_coord[genome_build_vars[genome_build]['position']], data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
        vcf1_offset = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp1_offset, shell=True, stdout=subprocess.PIPE).stdout.readlines()]

        # SNP2
        vcf_filePath2 = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % snp2_coord['chromosome'])
        vcf_file2 = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath2)

        checkS3File(aws_info, config['aws']['bucket'], vcf_filePath2)

        tabix_snp2_offset = export_s3_keys + " cd {3}; tabix -D {0} {1}:{2}-{2} | grep -v -e END".format(
            vcf_file2, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp2_coord['chromosome'], snp2_coord[genome_build_vars[genome_build]['position']], data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
        vcf2_offset = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp2_offset, shell=True, stdout=subprocess.PIPE).stdout.readlines()]

        vcf1_pos = snp1_coord[genome_build_vars[genome_build]['position']]
        vcf2_pos = snp2_coord[genome_build_vars[genome_build]['position']]
        vcf1 = vcf1_offset
        vcf2 = vcf2_offset

        # Import SNP VCF files

        # SNP1
        if len(vcf1) == 0:
            output["error"] = snp1 + " is not in 1000G reference panel. " + str(output["warning"] if "warning" in output else "")
            output_list.append(output)
            continue

        elif len(vcf1) > 1:
            geno1 = []
            for i in range(len(vcf1)):
                # Keep the first tabix record whose chromosome and position match the query variant
                record = vcf1[i].strip().split()
                record[0] = record[0].lstrip('chr')
                if record[0] == snp1_coord['chromosome'] and record[1] == snp1_coord[genome_build_vars[genome_build]['position']]:
                    geno1 = record
                    break
            if geno1 == []:
                output["error"] = snp1 + " is not in 1000G reference panel. " + str(output["warning"] if "warning" in output else "")
                output_list.append(output)
                continue

        else:
            geno1 = vcf1[0].strip().split()
            geno1[0] = geno1[0].lstrip('chr')

        if geno1[2] != snp1 and snp1[0:2] == "rs" and "rs" in geno1[2]:
            if "warning" in output:
                output["warning"] = output["warning"] + \
                    "Genomic position for query variant1 (" + snp1 + \
                    ") does not match RS number at 1000G position (chr" + \
                    geno1[0]+":"+geno1[1]+" = "+geno1[2]+"). "
            else:
                output["warning"] = "Genomic position for query variant1 (" + snp1 + \
                    ") does not match RS number at 1000G position (chr" + \
                    geno1[0]+":"+geno1[1]+" = "+geno1[2]+"). "
            snp1 = geno1[2]

        if "," in geno1[3] or "," in geno1[4]:
            output["error"] = snp1 + " is not a biallelic variant. " + str(output["warning"] if "warning" in output else "")
            output_list.append(output)
            continue

        if len(geno1[3]) == 1 and len(geno1[4]) == 1:
            snp1_a1 = geno1[3]
            snp1_a2 = geno1[4]
        elif len(geno1[3]) == 1 and len(geno1[4]) > 1:
            snp1_a1 = "-"
            snp1_a2 = geno1[4][1:]
        elif len(geno1[3]) > 1 and len(geno1[4]) == 1:
            snp1_a1 = geno1[3][1:]
            snp1_a2 = "-"
        elif len(geno1[3]) > 1 and len(geno1[4]) > 1:
            snp1_a1 = geno1[3][1:]
            snp1_a2 = geno1[4][1:]

        allele1 = {"0|0": [snp1_a1, snp1_a1], "0|1": [snp1_a1, snp1_a2], "1|0": [snp1_a2, snp1_a1], "1|1": [
            snp1_a2, snp1_a2], "0": [snp1_a1, "."], "1": [snp1_a2, "."], "./.": [".", "."], ".": [".", "."]}
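        # allele1 (and allele2 below) map phased VCF genotype codes to allele pairs
        # (e.g. "0|1" -> [ref, alt]); haploid ("0"/"1") and missing ("./.", ".") calls are handled explicitly.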

        # SNP2
        if len(vcf2) == 0:
            output["error"] = snp2 + " is not in 1000G reference panel. " + str(output["warning"] if "warning" in output else "")
            output_list.append(output)
            continue

        elif len(vcf2) > 1:
            geno2 = []
            for i in range(len(vcf2)):
                # Keep the first tabix record whose chromosome and position match the query variant
                record = vcf2[i].strip().split()
                record[0] = record[0].lstrip('chr')
                if record[0] == snp2_coord['chromosome'] and record[1] == snp2_coord[genome_build_vars[genome_build]['position']]:
                    geno2 = record
                    break
            if geno2 == []:
                output["error"] = snp2 + " is not in 1000G reference panel. " + str(output["warning"] if "warning" in output else "")
                output_list.append(output)
                continue

        else:
            geno2 = vcf2[0].strip().split()
            geno2[0] = geno2[0].lstrip('chr')

        if geno2[2] != snp2 and snp2[0:2] == "rs" and "rs" in geno2[2]:
            if "warning" in output:
                output["warning"] = output["warning"] + \
                    "Genomic position for query variant2 (" + snp2 + \
                    ") does not match RS number at 1000G position (chr" + \
                    geno2[0]+":"+geno2[1]+" = "+geno2[2]+"). "
            else:
                output["warning"] = "Genomic position for query variant2 (" + snp2 + \
                    ") does not match RS number at 1000G position (chr" + \
                    geno2[0]+":"+geno2[1]+" = "+geno2[2]+"). "
            snp2 = geno2[2]

        if "," in geno2[3] or "," in geno2[4]:
            output["error"] = snp2 + " is not a biallelic variant. " + str(output["warning"] if "warning" in output else "")
            output_list.append(output)
            continue

        if len(geno2[3]) == 1 and len(geno2[4]) == 1:
            snp2_a1 = geno2[3]
            snp2_a2 = geno2[4]
        elif len(geno2[3]) == 1 and len(geno2[4]) > 1:
            snp2_a1 = "-"
            snp2_a2 = geno2[4][1:]
        elif len(geno2[3]) > 1 and len(geno2[4]) == 1:
            snp2_a1 = geno2[3][1:]
            snp2_a2 = "-"
        elif len(geno2[3]) > 1 and len(geno2[4]) > 1:
            snp2_a1 = geno2[3][1:]
            snp2_a2 = geno2[4][1:]

        allele2 = {"0|0": [snp2_a1, snp2_a1], "0|1": [snp2_a1, snp2_a2], "1|0": [snp2_a2, snp2_a1], "1|1": [
            snp2_a2, snp2_a2], "0": [snp2_a1, "."], "1": [snp2_a2, "."], "./.": [".", "."], ".": [".", "."]}

        if geno1[1] != vcf1_pos:
            if "warning" in output:
                output["warning"] = output["warning"] + "VCF File does not match variant coordinates for SNP1. "
            else:
                output["warning"] = "VCF File does not match variant coordinates for SNP1. "
            geno1[1] = vcf1_pos

        if geno2[1] != vcf2_pos:
            if "warning" in output:
                output["warning"] = output["warning"] + "VCF File does not match variant coordinates for SNP2. "
            else:
                output["warning"] = "VCF File does not match variant coordinates for SNP2. "
            geno2[1] = vcf2_pos

        # Get headers
        tabix_snp1_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(vcf_file1, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
        head1 = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp1_h, shell=True, stdout=subprocess.PIPE).stdout.readlines()][0].strip().split()

        tabix_snp2_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(vcf_file2, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
        head2 = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp2_h, shell=True, stdout=subprocess.PIPE).stdout.readlines()][0].strip().split()

        # Combine phased genotypes
        geno = {}
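        # geno maps each sample ID to [SNP1 allele pair, SNP2 allele pair]; the ".." placeholder
        # below is replaced with SNP2 alleles for samples shared by both VCF headers.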
        for i in range(9, len(head1)):
            geno[head1[i]] = [allele1[geno1[i]], ".."]

        for i in range(9, len(head2)):
            if head2[i] in geno:
                geno[head2[i]][1] = allele2[geno2[i]]

        # Extract haplotypes
        hap = {}
        for ind in pop_ids:
            if ind in geno:
                hap1 = geno[ind][0][0] + "_" + geno[ind][1][0]
                hap2 = geno[ind][0][1] + "_" + geno[ind][1][1]
                if hap1 in hap:
                    hap[hap1] += 1
                else:
                    hap[hap1] = 1

                if hap2 in hap:
                    hap[hap2] += 1
                else:
                    hap[hap2] = 1

        # Remove missing haplotypes
        keys = list(hap.keys())
        for key in keys:
            if "." in key:
                hap.pop(key, None)

        # Check all haplotypes are present
        if len(hap) != 4:
            snp1_a = [snp1_a1, snp1_a2]
            snp2_a = [snp2_a1, snp2_a2]
            haps = [snp1_a[0] + "_" + snp2_a[0], snp1_a[0] + "_" + snp2_a[1],
                    snp1_a[1] + "_" + snp2_a[0], snp1_a[1] + "_" + snp2_a[1]]
            for i in haps:
                if i not in hap:
                    hap[i] = 0

        # Sort haplotypes
        A = hap[sorted(hap)[0]]
        B = hap[sorted(hap)[1]]
        C = hap[sorted(hap)[2]]
        D = hap[sorted(hap)[3]]
        N = A + B + C + D
        tmax = max(A, B, C, D)

        hap1 = sorted(hap, key=hap.get, reverse=True)[0]
        hap2 = sorted(hap, key=hap.get, reverse=True)[1]
        hap3 = sorted(hap, key=hap.get, reverse=True)[2]
        hap4 = sorted(hap, key=hap.get, reverse=True)[3]
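        # A-D are the four haplotype counts in alphabetical order of haplotype name (the 2x2
        # table cells written out below); hap1-hap4 are the same haplotypes ranked by descending count.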

        delta = float(A * D - B * C)
        Ms = float((A + C) * (B + D) * (A + B) * (C + D))
        if Ms != 0:

            # D prime
            if delta < 0:
                D_prime = abs(delta / min((A + C) * (A + B), (B + D) * (C + D)))
            else:
                D_prime = abs(delta / min((A + C) * (C + D), (A + B) * (B + D)))

            # R2
            r2 = (delta**2) / Ms

            # P-value
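            # Two-sided p-value: for a 1-df chi-square statistic x, P(X > x) = 2 * (1 - Phi(sqrt(x))),
            # with the standard normal CDF Phi expressed via math.erf.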
            num = (A + B + C + D) * (A * D - B * C)**2
            denom = Ms
            chisq = num / denom
            p = 2 * (1 - (0.5 * (1 + math.erf(chisq**0.5 / 2**0.5))))

        else:
            D_prime = "NA"
            r2 = "NA"
            chisq = "NA"
            p = "NA"

        # Find Correlated Alleles
        if str(r2) != "NA" and float(r2) > 0.1:
            Ac = hap[sorted(hap)[0]]
            Bc = hap[sorted(hap)[1]]
            Cc = hap[sorted(hap)[2]]
            Dc = hap[sorted(hap)[3]]

            if ((Ac*Dc) / max((Bc*Cc), 0.01) > 1):
                corr1 = snp1 + "(" + sorted(hap)[0].split("_")[0] + ") allele is correlated with " + snp2 + "(" + sorted(hap)[0].split("_")[1] + ") allele"
                corr2 = snp1 + "(" + sorted(hap)[3].split("_")[0] + ") allele is correlated with " + snp2 + "(" + sorted(hap)[3].split("_")[1] + ") allele"
                corr_alleles = [corr1, corr2]
            else:
                corr1 = snp1 + "(" + sorted(hap)[1].split("_")[0] + ") allele is correlated with " + snp2 + "(" + sorted(hap)[1].split("_")[1] + ") allele"
                corr2 = snp1 + "(" + sorted(hap)[2].split("_")[0] + ") allele is correlated with " + snp2 + "(" + sorted(hap)[2].split("_")[1] + ") allele"
                corr_alleles = [corr1, corr2]
        else:
            corr_alleles = [snp1 + " and " + snp2 + " are in linkage equilibrium"]
            

        # Create JSON output
        snp_1 = {}
        snp_1["rsnum"] = snp1
        snp_1["coord"] = "chr" + snp1_coord['chromosome'] + ":" + \
            vcf1_pos

        snp_1_allele_1 = {}
        snp_1_allele_1["allele"] = sorted(hap)[0].split("_")[0]
        snp_1_allele_1["count"] = str(A + B)
        snp_1_allele_1["frequency"] = str(round(float(A + B) / N, 3))
        snp_1["allele_1"] = snp_1_allele_1

        snp_1_allele_2 = {}
        snp_1_allele_2["allele"] = sorted(hap)[2].split("_")[0]
        snp_1_allele_2["count"] = str(C + D)
        snp_1_allele_2["frequency"] = str(round(float(C + D) / N, 3))
        snp_1["allele_2"] = snp_1_allele_2
        output["snp1"] = snp_1

        snp_2 = {}
        snp_2["rsnum"] = snp2
        snp_2["coord"] = "chr" + snp2_coord['chromosome'] + ":" + \
            vcf2_pos

        snp_2_allele_1 = {}
        snp_2_allele_1["allele"] = sorted(hap)[0].split("_")[1]
        snp_2_allele_1["count"] = str(A + C)
        snp_2_allele_1["frequency"] = str(round(float(A + C) / N, 3))
        snp_2["allele_1"] = snp_2_allele_1

        snp_2_allele_2 = {}
        snp_2_allele_2["allele"] = sorted(hap)[1].split("_")[1]
        snp_2_allele_2["count"] = str(B + D)
        snp_2_allele_2["frequency"] = str(round(float(B + D) / N, 3))
        snp_2["allele_2"] = snp_2_allele_2
        output["snp2"] = snp_2

        two_by_two = {}
        cells = {}
        cells["c11"] = str(A)
        cells["c12"] = str(B)
        cells["c21"] = str(C)
        cells["c22"] = str(D)
        two_by_two["cells"] = cells
        two_by_two["total"] = str(N)
        output["two_by_two"] = two_by_two

        haplotypes = {}
        hap_1 = {}
        hap_1["alleles"] = hap1
        hap_1["count"] = str(hap[hap1])
        hap_1["frequency"] = str(round(float(hap[hap1]) / N, 3))
        haplotypes["hap1"] = hap_1

        hap_2 = {}
        hap_2["alleles"] = hap2
        hap_2["count"] = str(hap[hap2])
        hap_2["frequency"] = str(round(float(hap[hap2]) / N, 3))
        haplotypes["hap2"] = hap_2

        hap_3 = {}
        hap_3["alleles"] = hap3
        hap_3["count"] = str(hap[hap3])
        hap_3["frequency"] = str(round(float(hap[hap3]) / N, 3))
        haplotypes["hap3"] = hap_3

        hap_4 = {}
        hap_4["alleles"] = hap4
        hap_4["count"] = str(hap[hap4])
        hap_4["frequency"] = str(round(float(hap[hap4]) / N, 3))
        haplotypes["hap4"] = hap_4
        output["haplotypes"] = haplotypes

        statistics = {}
        if Ms != 0:
            statistics["d_prime"] = str(round(D_prime, 4))
            statistics["r2"] = str(round(r2, 4))
            statistics["chisq"] = str(round(chisq, 4))
            if p >= 0.0001:
                statistics["p"] = str(round(p, 4))
            else:
                statistics["p"] = "<0.0001"
        else:
            statistics["d_prime"] = D_prime
            statistics["r2"] = r2
            statistics["chisq"] = chisq
            statistics["p"] = p

        output["statistics"] = statistics
        output["corr_alleles"] = corr_alleles
        output["request"] = request
        output_list.append(output)

    ### OUTPUT ERROR IF ONLY SINGLE SNP PAIR ###
    if len(snp_pairs) == 1 and len(output_list) == 1 and "error" in output_list[0]:
        return(json.dumps(output_list, sort_keys=True, indent=2))

    # Generate output file only for single SNP pair inputs
    if len(snp_pairs) == 1 and len(output_list) == 1:
        ldpair_out = open(tmp_dir + "LDpair_" + request + ".txt", "w")
        print("Query SNPs:", file=ldpair_out)
        print(output_list[0]["snp1"]["rsnum"] + \
            " (" + output_list[0]["snp1"]["coord"] + ")", file=ldpair_out)
        print(output_list[0]["snp2"]["rsnum"] + \
            " (" + output_list[0]["snp2"]["coord"] + ")", file=ldpair_out)
        print("", file=ldpair_out)
        print(pop + " Haplotypes:", file=ldpair_out)
        print(" " * 15 + output_list[0]["snp2"]["rsnum"], file=ldpair_out)
        print(" " * 15 + \
            output_list[0]["snp2"]["allele_1"]["allele"] + " " * \
            7 + output_list[0]["snp2"]["allele_2"]["allele"], file=ldpair_out)
        print(" " * 13 + "-" * 17, file=ldpair_out)
        print(" " * 11 + output_list[0]["snp1"]["allele_1"]["allele"] + " | " + output_list[0]["two_by_two"]["cells"]["c11"] + " " * (5 - len(output["two_by_two"]["cells"]["c11"])) + " | " + output["two_by_two"]["cells"]["c12"] + " " * (
            5 - len(output_list[0]["two_by_two"]["cells"]["c12"])) + " | " + output_list[0]["snp1"]["allele_1"]["count"] + " " * (5 - len(output["snp1"]["allele_1"]["count"])) + " (" + output["snp1"]["allele_1"]["frequency"] + ")", file=ldpair_out)
        print(output_list[0]["snp1"]["rsnum"] + " " * \
            (10 - len(output_list[0]["snp1"]["rsnum"])) + " " * 3 + "-" * 17, file=ldpair_out)
        print(" " * 11 + output_list[0]["snp1"]["allele_2"]["allele"] + " | " + output_list[0]["two_by_two"]["cells"]["c21"] + " " * (5 - len(output["two_by_two"]["cells"]["c21"])) + " | " + output["two_by_two"]["cells"]["c22"] + " " * (
            5 - len(output_list[0]["two_by_two"]["cells"]["c22"])) + " | " + output_list[0]["snp1"]["allele_2"]["count"] + " " * (5 - len(output["snp1"]["allele_2"]["count"])) + " (" + output["snp1"]["allele_2"]["frequency"] + ")", file=ldpair_out)
        print(" " * 13 + "-" * 17, file=ldpair_out)
        print(" " * 15 + output_list[0]["snp2"]["allele_1"]["count"] + " " * (5 - len(output_list[0]["snp2"]["allele_1"]["count"])) + " " * 3 + output["snp2"]["allele_2"]["count"] + " " * (
            5 - len(output_list[0]["snp2"]["allele_2"]["count"])) + " " * 3 + output_list[0]["two_by_two"]["total"], file=ldpair_out)
        print(" " * 14 + "(" + output_list[0]["snp2"]["allele_1"]["frequency"] + ")" + " " * (5 - len(output_list[0]["snp2"]["allele_1"]["frequency"])) + \
            " (" + output_list[0]["snp2"]["allele_2"]["frequency"] + ")" + \
            " " * (5 - len(output_list[0]["snp2"]["allele_2"]["frequency"])), file=ldpair_out)
        print("", file=ldpair_out)
        print("          " + output_list[0]["haplotypes"]["hap1"]["alleles"] + ": " + \
            output_list[0]["haplotypes"]["hap1"]["count"] + \
            " (" + output_list[0]["haplotypes"]["hap1"]["frequency"] + ")", file=ldpair_out)
        print("          " + output_list[0]["haplotypes"]["hap2"]["alleles"] + ": " + \
            output_list[0]["haplotypes"]["hap2"]["count"] + \
            " (" + output_list[0]["haplotypes"]["hap2"]["frequency"] + ")", file=ldpair_out)
        print("          " + output_list[0]["haplotypes"]["hap3"]["alleles"] + ": " + \
            output_list[0]["haplotypes"]["hap3"]["count"] + \
            " (" + output_list[0]["haplotypes"]["hap3"]["frequency"] + ")", file=ldpair_out)
        print("          " + output_list[0]["haplotypes"]["hap4"]["alleles"] + ": " + \
            output["haplotypes"]["hap4"]["count"] + \
            " (" + output["haplotypes"]["hap4"]["frequency"] + ")", file=ldpair_out)
        print("", file=ldpair_out)
        print("          D': " + output_list[0]["statistics"]["d_prime"], file=ldpair_out)
        print("          R2: " + output_list[0]["statistics"]["r2"], file=ldpair_out)
        print("      Chi-sq: " + output_list[0]["statistics"]["chisq"], file=ldpair_out)
        print("     p-value: " + output_list[0]["statistics"]["p"], file=ldpair_out)
        print("", file=ldpair_out)
        if len(output_list[0]["corr_alleles"]) == 2:
            print(output_list[0]["corr_alleles"][0], file=ldpair_out)
            print(output_list[0]["corr_alleles"][1], file=ldpair_out)
        else:
            print(output_list[0]["corr_alleles"][0], file=ldpair_out)

        if "warning" in output_list[0]:
            print("WARNING: " + output_list[0]["warning"] + "!", file=ldpair_out)
        ldpair_out.close()

    # Return output
    return(json.dumps(output_list, sort_keys=True, indent=2))
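
# Illustrative sketch (not part of the original LDlink code): how the pairwise LD statistics
# above are derived from a 2x2 haplotype count table. The function name and signature are
# assumptions for illustration only; A, B, C, D mirror the haplotype-count variables used above.
def ld_stats_from_haplotype_counts(A, B, C, D):
    """Return (D_prime, r2, chisq, p) for the four haplotype counts A, B, C, D."""
    import math
    delta = float(A * D - B * C)
    Ms = float((A + C) * (B + D) * (A + B) * (C + D))
    if Ms == 0:
        return "NA", "NA", "NA", "NA"
    # D' normalizes delta by its maximum attainable magnitude given the allele frequencies
    if delta < 0:
        D_prime = abs(delta / min((A + C) * (A + B), (B + D) * (C + D)))
    else:
        D_prime = abs(delta / min((A + C) * (C + D), (A + B) * (B + D)))
    r2 = (delta ** 2) / Ms
    chisq = (A + B + C + D) * delta ** 2 / Ms
    # Two-sided p-value for a 1-df chi-square via the standard normal CDF
    p = 2 * (1 - (0.5 * (1 + math.erf(chisq ** 0.5 / 2 ** 0.5))))
    return D_prime, r2, chisq, p

# Example: ld_stats_from_haplotype_counts(474, 21, 27, 486) gives D' of roughly 0.92 and r2 of roughly 0.82.
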
Example #7
0
def calculate_proxy(snp, pop, request, web, genome_build, r2_d="r2", window=500000, collapseTranscript=True):

    # trim any whitespace
    snp = snp.lower().strip()

    start_time = time.time()

    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    api_mongo_addr = config['api']['api_mongo_addr']
    dbsnp_version = config['data']['dbsnp_version']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    population_samples_dir = config['data']['population_samples_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']
    num_subprocesses = config['performance']['num_subprocesses']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    if request is False:
        request = str(time.strftime("%I%M%S"))

    # Create JSON output
    out_json = open(tmp_dir + 'proxy' + request + ".json", "w")
    output = {}

    # Validate genome build param
    if genome_build not in genome_build_vars['vars']:
        output["error"] = "Invalid genome build. Please specify either " + ", ".join(genome_build_vars['vars']) + "."
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        return("", "")

    if window < 0 or window > 1000000:
        output["error"] = "Window value must be a number between 0 and 1,000,000."
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        return("", "")

    # Connect to Mongo snp database
    if env == 'local':
        mongo_host = api_mongo_addr
    else: 
        mongo_host = 'localhost'
    if web:
        client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
    else:
        if env == 'local':
            client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
        else:
            client = MongoClient('localhost', mongo_port)
    db = client["LDLink"]

    def get_coords(rsid):
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Query genomic coordinates
    def get_rsnum(coord):
        temp_coord = coord.strip("chr").split(":")
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({"chromosome": chro.upper() if chro == 'x' or chro == 'y' else str(chro), genome_build_vars[genome_build]['position']: str(pos)})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coord_rsid(snp):
        if snp[0:2] == "rs":
            return snp
        else:
            snp_info_lst = get_rsnum(snp)
            print("snp_info_lst")
            print(snp_info_lst)
            if snp_info_lst != None:
                if len(snp_info_lst) > 1:
                    var_id = "rs" + snp_info_lst[0]['id']
                    ref_variants = []
                    for snp_info in snp_info_lst:
                        if snp_info['id'] == snp_info['ref_id']:
                            ref_variants.append(snp_info['id'])
                    if len(ref_variants) > 1:
                        var_id = "rs" + ref_variants[0]
                        if "warning" in output:
                            output["warning"] = output["warning"] + \
                            ". Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp
                        else:
                            output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp
                    elif len(ref_variants) == 0 and len(snp_info_lst) > 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        if "warning" in output:
                            output["warning"] = output["warning"] + \
                            ". Multiple rsIDs (" + ", ".join(["rs" + snp_info['id'] for snp_info in snp_info_lst]) + ") map to genomic coordinates " + snp
                        else:
                            output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + snp_info['id'] for snp_info in snp_info_lst]) + ") map to genomic coordinates " + snp
                    else:
                        var_id = "rs" + ref_variants[0]
                    return var_id
                elif len(snp_info_lst) == 1:
                    var_id = "rs" + snp_info_lst[0]['id']
                    return var_id
                else:
                    return snp
            else:
                return snp
        return snp

    snp = replace_coord_rsid(snp)

    # Find RS number in snp database
    snp_coord = get_coords(snp)

    if snp_coord == None or snp_coord[genome_build_vars[genome_build]['position']] == "NA":
        output["error"] = snp + " is not in dbSNP " + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ")."
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        return("", "")

    # check if variant is on chrY for genome build = GRCh38
    if snp_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"):
        output["error"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord['id'] + " = chr" + snp_coord['chromosome'] + ":" + snp_coord[genome_build_vars[genome_build]['position']] + ")"
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        return("", "")

    # Select desired ancestral populations
    pops = pop.split("+")
    pop_dirs = []
    for pop_i in pops:
        if pop_i in ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]:
            pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")
        else:
            output["error"] = pop_i + " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI."
            json_output = json.dumps(output, sort_keys=True, indent=2)
            print(json_output, file=out_json)
            out_json.close()
            return("", "")

    get_pops = "cat " + " ".join(pop_dirs) + " > " + \
        tmp_dir + "pops_" + request + ".txt"
    subprocess.call(get_pops, shell=True)

    # Get population ids
    pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines()
    ids = []
    for i in range(len(pop_list)):
        ids.append(pop_list[i].strip())

    pop_ids = list(set(ids))

    # Extract query SNP phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]["1000G_dir"], genome_build_vars[genome_build]["1000G_file"] % (snp_coord['chromosome']))
    vcf_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    tabix_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(vcf_file, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
    head = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp_h, shell=True, stdout=subprocess.PIPE).stdout.readlines()][0].strip().split()

    tabix_snp = export_s3_keys + " cd {4}; tabix -D {0} {1}:{2}-{2} | grep -v -e END > {3}".format(
        vcf_file, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coord['chromosome'], snp_coord[genome_build_vars[genome_build]['position']], tmp_dir + "snp_no_dups_" + request + ".vcf", data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
    subprocess.call(tabix_snp, shell=True)

    # Check SNP is in the 1000G population, has the correct RS number, and not
    # monoallelic
    vcf = open(tmp_dir + "snp_no_dups_" + request + ".vcf").readlines()

    if len(vcf) == 0:
        output["error"] = snp + " is not in 1000G reference panel."
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        subprocess.call("rm " + tmp_dir + "pops_" +
                        request + ".txt", shell=True)
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)
        return("", "")

    elif len(vcf) > 1:
        geno = []
        for i in range(len(vcf)):
            if vcf[i].strip().split()[2] == snp:
                geno = vcf[i].strip().split()
                geno[0] = geno[0].lstrip('chr')
        if geno == []:
            output["error"] = snp + " is not in 1000G reference panel."
            json_output = json.dumps(output, sort_keys=True, indent=2)
            print(json_output, file=out_json)
            out_json.close()
            subprocess.call("rm " + tmp_dir + "pops_" +
                            request + ".txt", shell=True)
            subprocess.call("rm " + tmp_dir + "*" +
                            request + "*.vcf", shell=True)
            return("", "")

    else:
        geno = vcf[0].strip().split()
        geno[0] = geno[0].lstrip('chr')

    if geno[2] != snp and snp[0:2] == "rs" and "rs" in geno[2]:
        output["warning"] = "Genomic position for query variant (" + snp + \
            ") does not match RS number at 1000G position (chr" + \
            geno[0]+":"+geno[1]+" = "+geno[2]+")"
        snp = geno[2]

    if "," in geno[3] or "," in geno[4]:
        output["error"] = snp + " is not a biallelic variant."
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        subprocess.call("rm " + tmp_dir + "pops_" +
                        request + ".txt", shell=True)
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)
        return("", "")

    index = []
    for i in range(9, len(head)):
        if head[i] in pop_ids:
            index.append(i)

    genotypes = {"0": 0, "1": 0}
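    # Count reference ("0") and alternate ("1") allele calls for the query variant across the
    # selected population samples; monoallelic variants are rejected below.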
    for i in index:
        sub_geno = geno[i].split("|")
        for j in sub_geno:
            if j in genotypes:
                genotypes[j] += 1
            else:
                genotypes[j] = 1

    if genotypes["0"] == 0 or genotypes["1"] == 0:
        output["error"] = snp + \
            " is monoallelic in the " + pop + " population."
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        subprocess.call("rm " + tmp_dir + "pops_" +
                        request + ".txt", shell=True)
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)
        return("", "")

    # Define window of interest around query SNP
    # window = 500000
    coord1 = int(snp_coord[genome_build_vars[genome_build]['position']]) - window
    if coord1 < 0:
        coord1 = 0
    coord2 = int(snp_coord[genome_build_vars[genome_build]['position']]) + window
    print("")

    # Calculate proxy LD statistics in parallel
    # threads = 4
    # block = (2 * window) // 4
    # block = (2 * window) // num_subprocesses

    windowChunkRanges = chunkWindow(int(snp_coord[genome_build_vars[genome_build]['position']]), window, num_subprocesses)
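    # chunkWindow (defined elsewhere in this module) is expected to return num_subprocesses
    # contiguous (start, end) ranges that together cover the +/- window region around the query position.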

    commands = []

    for subprocess_id in range(num_subprocesses):
        getWindowVariantsArgs = " ".join([str(web), str(snp), str(snp_coord['chromosome']), str(windowChunkRanges[subprocess_id][0]), str(windowChunkRanges[subprocess_id][1]), str(request), genome_build, str(subprocess_id)])
        commands.append("python3 LDproxy_sub.py " + getWindowVariantsArgs)

    processes = [subprocess.Popen(
        command, shell=True, stdout=subprocess.PIPE) for command in commands]

    # collect output in parallel
    def get_output(process):
        return process.communicate()[0].splitlines()

    if not hasattr(threading.current_thread(), "_children"):
        threading.current_thread()._children = weakref.WeakKeyDictionary()
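    # The check above works around a known Python quirk: thread-backed pools expect the current
    # thread to have a "_children" attribute, which may be missing in some server/WSGI contexts.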

    pool = Pool(len(processes))
    out_raw = pool.map(get_output, processes)
    pool.close()
    pool.join()

    # Aggregate output
    out_prox = []
    for i in range(len(out_raw)):
        for j in range(len(out_raw[i])):
            col = out_raw[i][j].decode('utf-8').strip().split("\t")
            col[6] = int(col[6])
            col[7] = float(col[7])
            col[8] = float(col[8])
            col.append(abs(int(col[6])))
            out_prox.append(col)

    # Sort output
    if r2_d not in ["r2", "d"]:
        if "warning" in output:
            output["warning"] = output["warning"] + ". " + r2_d + \
                " is not an acceptable value for r2_d (r2 or d required). r2 is used by default"
        else:
            output["warning"] = r2_d + \
                " is not an acceptable value for r2_d (r2 or d required). r2 is used by default"
        r2_d = "r2"

    out_dist_sort = sorted(out_prox, key=operator.itemgetter(14))

    if r2_d == "r2":
        out_ld_sort = sorted(
            out_dist_sort, key=operator.itemgetter(8), reverse=True)
    else:
        out_ld_sort = sorted(
            out_dist_sort, key=operator.itemgetter(7), reverse=True)

    # Populate JSON and text output
    outfile = open(tmp_dir + "proxy" + request + ".txt", "w")
    header = ["RS_Number", "Coord", "Alleles", "MAF", "Distance",
              "Dprime", "R2", "Correlated_Alleles", "RegulomeDB", "Function"]
    print("\t".join(header), file=outfile)

    ucsc_track = {}
    ucsc_track["header"] = ["chr", "pos", "rsid", "stat"]

    query_snp = {}
    query_snp["RS"] = out_ld_sort[0][3]
    query_snp["Alleles"] = out_ld_sort[0][1]
    query_snp["Coord"] = out_ld_sort[0][2]
    query_snp["Dist"] = out_ld_sort[0][6]
    query_snp["Dprime"] = str(round(float(out_ld_sort[0][7]), 4))
    query_snp["R2"] = str(round(float(out_ld_sort[0][8]), 4))
    query_snp["Corr_Alleles"] = out_ld_sort[0][9]
    query_snp["RegulomeDB"] = out_ld_sort[0][10]
    query_snp["MAF"] = str(round(float(out_ld_sort[0][11]), 4))
    query_snp["Function"] = out_ld_sort[0][13]

    output["query_snp"] = query_snp

    temp = [query_snp["RS"], query_snp["Coord"], query_snp["Alleles"], query_snp["MAF"], str(query_snp["Dist"]), str(
            query_snp["Dprime"]), str(query_snp["R2"]), query_snp["Corr_Alleles"], query_snp["RegulomeDB"], query_snp["Function"]]
    print("\t".join(temp), file=outfile)

    chr, pos = query_snp["Coord"].split(':')
    if r2_d == "r2":
        temp2 = [chr, pos, query_snp["RS"], query_snp["R2"]]
    else:
        temp2 = [chr, pos, query_snp["RS"], query_snp["Dprime"]]

    ucsc_track["query_snp"] = temp2

    ucsc_track["0.8-1.0"] = []
    ucsc_track["0.6-0.8"] = []
    ucsc_track["0.4-0.6"] = []
    ucsc_track["0.2-0.4"] = []
    ucsc_track["0.0-0.2"] = []

    proxies = {}
    rows = []
    digits = len(str(len(out_ld_sort)))
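    # digits is used to zero-pad proxy keys (e.g. "proxy_0001") so they sort lexicographically.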

    for i in range(1, len(out_ld_sort)):
        if float(out_ld_sort[i][8]) > 0.01 and out_ld_sort[i][3] != snp:
            proxy_info = {}
            row = []
            proxy_info["RS"] = out_ld_sort[i][3]
            proxy_info["Alleles"] = out_ld_sort[i][4]
            proxy_info["Coord"] = out_ld_sort[i][5]
            proxy_info["Dist"] = out_ld_sort[i][6]
            proxy_info["Dprime"] = str(round(float(out_ld_sort[i][7]), 4))
            proxy_info["R2"] = str(round(float(out_ld_sort[i][8]), 4))
            proxy_info["Corr_Alleles"] = out_ld_sort[i][9]
            proxy_info["RegulomeDB"] = out_ld_sort[i][10]
            proxy_info["MAF"] = str(round(float(out_ld_sort[i][12]), 4))
            proxy_info["Function"] = out_ld_sort[i][13]
            proxies["proxy_" + (digits - len(str(i))) *
                    "0" + str(i)] = proxy_info
            chr, pos = proxy_info["Coord"].split(':')

            # Adding a row for the Data Table
            row.append(proxy_info["RS"])
            row.append(chr)
            row.append(pos)
            row.append(proxy_info["Alleles"])
            row.append(str(round(float(proxy_info["MAF"]), 4)))
            row.append(abs(proxy_info["Dist"]))
            row.append(str(round(float(proxy_info["Dprime"]), 4)))
            row.append(str(round(float(proxy_info["R2"]), 4)))
            row.append(proxy_info["Corr_Alleles"])
            row.append(proxy_info["RegulomeDB"])
            row.append("HaploReg link")
            row.append(proxy_info["Function"])
            rows.append(row)

            temp = [proxy_info["RS"], proxy_info["Coord"], proxy_info["Alleles"], proxy_info["MAF"], str(proxy_info["Dist"]), str(
                    proxy_info["Dprime"]), str(proxy_info["R2"]), proxy_info["Corr_Alleles"], proxy_info["RegulomeDB"], proxy_info["Function"]]
            print("\t".join(temp), file=outfile)

            chr, pos = proxy_info["Coord"].split(':')
            if r2_d == "r2":
                temp2 = [chr, pos, proxy_info["RS"],
                         round(float(out_ld_sort[i][8]), 4)]
            else:
                temp2 = [chr, pos, proxy_info["RS"],
                         round(float(out_ld_sort[i][7]), 4)]

            if 0.8 < temp2[3] <= 1.0:
                ucsc_track["0.8-1.0"].append(temp2)
            elif 0.6 < temp2[3] <= 0.8:
                ucsc_track["0.6-0.8"].append(temp2)
            elif 0.4 < temp2[3] <= 0.6:
                ucsc_track["0.4-0.6"].append(temp2)
            elif 0.2 < temp2[3] <= 0.4:
                ucsc_track["0.2-0.4"].append(temp2)
            else:
                ucsc_track["0.0-0.2"].append(temp2)

    track = open(tmp_dir + "track" + request + ".txt", "w")
    print("browser position chr" + \
        str(snp_coord['chromosome']) + ":" + str(coord1) + "-" + str(coord2), file=track)
    print("", file=track)

    if r2_d == "r2":
        print("track type=bedGraph name=\"R2 Plot\" description=\"Plot of R2 values\" color=50,50,50 visibility=full alwaysZero=on graphType=bar maxHeightPixels=60", file=track)
    else:
        print("track type=bedGraph name=\"D Prime Plot\" description=\"Plot of D prime values\" color=50,50,50 visibility=full alwaysZero=on graphType=bar maxHeightPixels=60", file=track)

    print("\t".join(
        [str(ucsc_track["query_snp"][i]) for i in [0, 1, 1, 3]]), file=track)
    if len(ucsc_track["0.8-1.0"]) > 0:
        for var in ucsc_track["0.8-1.0"]:
            print("\t".join([str(var[i]) for i in [0, 1, 1, 3]]), file=track)
    if len(ucsc_track["0.6-0.8"]) > 0:
        for var in ucsc_track["0.6-0.8"]:
            print("\t".join([str(var[i]) for i in [0, 1, 1, 3]]), file=track)
    if len(ucsc_track["0.4-0.6"]) > 0:
        for var in ucsc_track["0.4-0.6"]:
            print("\t".join([str(var[i]) for i in [0, 1, 1, 3]]), file=track)
    if len(ucsc_track["0.2-0.4"]) > 0:
        for var in ucsc_track["0.2-0.4"]:
            print("\t".join([str(var[i]) for i in [0, 1, 1, 3]]), file=track)
    if len(ucsc_track["0.0-0.2"]) > 0:
        for var in ucsc_track["0.0-0.2"]:
            print("\t".join([str(var[i]) for i in [0, 1, 1, 3]]), file=track)
    print("", file=track)

    print("track type=bed name=\"" + snp + \
        "\" description=\"Query Variant: " + snp + "\" color=108,108,255", file=track)
    print("\t".join([ucsc_track["query_snp"][i]
                               for i in [0, 1, 1, 2]]), file=track)
    print("", file=track)

    if len(ucsc_track["0.8-1.0"]) > 0:
        if r2_d == "r2":
            print("track type=bed name=\"0.8<R2<=1.0\" description=\"Proxy Variants with 0.8<R2<=1.0\" color=198,129,0", file=track)
        else:
            print("track type=bed name=\"0.8<D'<=1.0\" description=\"Proxy Variants with 0.8<D'<=1.0\" color=198,129,0", file=track)
        for var in ucsc_track["0.8-1.0"]:
            print("\t".join([var[i] for i in [0, 1, 1, 2]]), file=track)
        print("", file=track)

    if len(ucsc_track["0.6-0.8"]) > 0:
        if r2_d == "r2":
            print("track type=bed name=\"0.6<R2<=0.8\" description=\"Proxy Variants with 0.6<R2<=0.8\" color=198,129,0", file=track)
        else:
            print("track type=bed name=\"0.6<D'<=0.8\" description=\"Proxy Variants with 0.6<D'<=0.8\" color=198,129,0", file=track)
        for var in ucsc_track["0.6-0.8"]:
            print("\t".join([var[i] for i in [0, 1, 1, 2]]), file=track)
        print("", file=track)

    if len(ucsc_track["0.4-0.6"]) > 0:
        if r2_d == "r2":
            print("track type=bed name=\"0.4<R2<=0.6\" description=\"Proxy Variants with 0.4<R2<=0.6\" color=198,129,0", file=track)
        else:
            print("track type=bed name=\"0.4<D'<=0.6\" description=\"Proxy Variants with 0.4<D'<=0.6\" color=198,129,0", file=track)
        for var in ucsc_track["0.4-0.6"]:
            print("\t".join([var[i] for i in [0, 1, 1, 2]]), file=track)
        print("", file=track)

    if len(ucsc_track["0.2-0.4"]) > 0:
        if r2_d == "r2":
            print("track type=bed name=\"0.2<R2<=0.4\" description=\"Proxy Variants with 0.2<R2<=0.4\" color=198,129,0", file=track)
        else:
            print("track type=bed name=\"0.2<D'<=0.4\" description=\"Proxy Variants with 0.2<D'<=0.4\" color=198,129,0", file=track)
        for var in ucsc_track["0.2-0.4"]:
            print("\t".join([var[i] for i in [0, 1, 1, 2]]), file=track)
        print("", file=track)

    if len(ucsc_track["0.0-0.2"]) > 0:
        if r2_d == "r2":
            print("track type=bed name=\"0.0<R2<=0.2\" description=\"Proxy Variants with 0.0<R2<=0.2\" color=198,129,0", file=track)
        else:
            print("track type=bed name=\"0.0<D'<=0.2\" description=\"Proxy Variants with 0.0<D'<=0.2\" color=198,129,0", file=track)
        for var in ucsc_track["0.0-0.2"]:
            print("\t".join([var[i] for i in [0, 1, 1, 2]]), file=track)
        print("", file=track)

    output["aaData"] = rows
    output["proxy_snps"] = proxies

    # Output JSON and text file
    json_output = json.dumps(output, sort_keys=True, indent=2)
    print(json_output, file=out_json)
    out_json.close()

    outfile.close()
    track.close()

    out_script = ""
    out_div = ""
    
    if web:
        # Organize scatter plot data
        q_rs = []
        q_allele = []
        q_coord = []
        q_maf = []
        p_rs = []
        p_allele = []
        p_coord = []
        p_maf = []
        dist = []
        d_prime = []
        d_prime_round = []
        r2 = []
        r2_round = []
        corr_alleles = []
        regdb = []
        funct = []
        color = []
        size = []
        for i in range(len(out_ld_sort)):
            q_rs_i, q_allele_i, q_coord_i, p_rs_i, p_allele_i, p_coord_i, dist_i, d_prime_i, r2_i, corr_alleles_i, regdb_i, q_maf_i, p_maf_i, funct_i, dist_abs = out_ld_sort[
                i]

            if float(r2_i) > 0.01:
                q_rs.append(q_rs_i)
                q_allele.append(q_allele_i)
                q_coord.append(float(q_coord_i.split(":")[1]) / 1000000)
                q_maf.append(str(round(float(q_maf_i), 4)))
                if p_rs_i == ".":
                    p_rs_i = p_coord_i
                p_rs.append(p_rs_i)
                p_allele.append(p_allele_i)
                p_coord.append(float(p_coord_i.split(":")[1]) / 1000000)
                p_maf.append(str(round(float(p_maf_i), 4)))
                dist.append(str(round(dist_i / 1000000.0, 4)))
                d_prime.append(float(d_prime_i))
                d_prime_round.append(str(round(float(d_prime_i), 4)))
                r2.append(float(r2_i))
                r2_round.append(str(round(float(r2_i), 4)))
                corr_alleles.append(corr_alleles_i)

                # Correct Missing Annotations
                if regdb_i == ".":
                    regdb_i = ""
                regdb.append(regdb_i)
                if funct_i == ".":
                    funct_i = ""
                if funct_i == "NA":
                    funct_i = "none"
                funct.append(funct_i)

                # Set Color
                if i == 0:
                    color_i = "blue"
                elif funct_i != "none" and funct_i != "":
                    color_i = "red"
                else:
                    color_i = "orange"
                color.append(color_i)

                # Set Size
                size_i = 9 + float(p_maf_i) * 14.0
                size.append(size_i)

        # Begin Bokeh Plotting
        from collections import OrderedDict
        from bokeh.embed import components, file_html
        from bokeh.layouts import gridplot
        from bokeh.models import HoverTool, LinearAxis, Range1d
        from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
        from bokeh.resources import CDN

        reset_output()

        # Proxy Plot
        x = p_coord
        if r2_d == "r2":
            y = r2
        else:
            y = d_prime
        whitespace = 0.01
        xr = Range1d(start=coord1 / 1000000.0 - whitespace,
                    end=coord2 / 1000000.0 + whitespace)
        yr = Range1d(start=-0.03, end=1.03)
        sup_2 = "\u00B2"

        proxy_plot = figure(
            title="Proxies for " + snp + " in " + pop,
            min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False,
            plot_width=900,
            plot_height=600,
            x_range=xr, y_range=yr,
            tools="hover,tap,pan,box_zoom,box_select,undo,redo,reset,previewsave", logo=None,
            toolbar_location="above")

        proxy_plot.title.align = "center"

        # Add recombination rate
        recomb_file = tmp_dir + "recomb_" + request + ".json"
        recomb_json = getRecomb(db, recomb_file, snp_coord['chromosome'], coord1 - whitespace, coord2 + whitespace, genome_build)

        recomb_x = []
        recomb_y = []

        for recomb_obj in recomb_json:
            recomb_x.append(int(recomb_obj[genome_build_vars[genome_build]['position']]) / 1000000.0)
            recomb_y.append(float(recomb_obj['rate']) / 100.0)

        data = {
            'x': x,
            'y': y,
            'qrs': q_rs,
            'q_alle': q_allele,
            'q_maf': q_maf,
            'prs': p_rs,
            'p_alle': p_allele,
            'p_maf': p_maf,
            'dist': dist,
            'r': r2_round,
            'd': d_prime_round,
            'alleles': corr_alleles,
            'regdb': regdb,
            'funct': funct,
            'size': size,
            'color': color
        }
        source = ColumnDataSource(data)

        proxy_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5)

        proxy_plot.circle(x='x', y='y', size='size',
                        color='color', alpha=0.5, source=source)

        hover = proxy_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Query Variant", "@qrs @q_alle"),
            ("Proxy Variant", "@prs @p_alle"),
            ("Distance (Mb)", "@dist"),
            ("MAF (Query,Proxy)", "@q_maf,@p_maf"),
            ("R" + sup_2, "@r"),
            ("D\'", "@d"),
            ("Correlated Alleles", "@alleles"),
            ("RegulomeDB", "@regdb"),
            ("Functional Class", "@funct"),
        ])

        proxy_plot.text(x, y, text=regdb, alpha=1, text_font_size="7pt",
                        text_baseline="middle", text_align="center", angle=0)

        if r2_d == "r2":
            proxy_plot.yaxis.axis_label = "R" + sup_2
        else:
            proxy_plot.yaxis.axis_label = "D\'"

        proxy_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)}
        proxy_plot.add_layout(LinearAxis(y_range_name="y2_axis",
                                        axis_label="Combined Recombination Rate (cM/Mb)"), "right")

        # Rug Plot
        y2_ll = [-0.03] * len(x)
        y2_ul = [1.03] * len(x)
        yr_rug = Range1d(start=-0.03, end=1.03)

        data_rug = {
            'x': x,
            'y': y,
            'y2_ll': y2_ll,
            'y2_ul': y2_ul,
            'qrs': q_rs,
            'q_alle': q_allele,
            'q_maf': q_maf,
            'prs': p_rs,
            'p_alle': p_allele,
            'p_maf': p_maf,
            'dist': dist,
            'r': r2_round,
            'd': d_prime_round,
            'alleles': corr_alleles,
            'regdb': regdb,
            'funct': funct,
            'size': size,
            'color': color
        }
        source_rug = ColumnDataSource(data_rug)

        rug = figure(
            x_range=xr, y_range=yr_rug, border_fill_color='white', y_axis_type=None,
            title="", min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False,
            plot_width=900, plot_height=50, tools="xpan,tap", logo=None)

        rug.segment(x0='x', y0='y2_ll', x1='x', y1='y2_ul', source=source_rug,
                    color='color', alpha=0.5, line_width=1)
        rug.toolbar_location = None
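        # NOTE: collapseTranscript is compared to the string "false" below; when invoked through
        # the web API it typically arrives as a string ("true"/"false") rather than a Python bool.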

        if collapseTranscript == "false":
            # Gene Plot (All Transcripts)
            genes_file = tmp_dir + "genes_" + request + ".json"
            genes_json = getRefGene(db, genes_file, snp_coord['chromosome'], int(coord1), int(coord2), genome_build, False)

            genes_plot_start = []
            genes_plot_end = []
            genes_plot_y = []
            genes_plot_name = []
            exons_plot_x = []
            exons_plot_y = []
            exons_plot_w = []
            exons_plot_h = []
            exons_plot_name = []
            exons_plot_id = []
            exons_plot_exon = []
            lines = [0]
            gap = 80000
            tall = 0.75
            if genes_json != None and len(genes_json) > 0:
                for gene_obj in genes_json:
                    bin = gene_obj["bin"]
                    name_id = gene_obj["name"]
                    chrom = gene_obj["chrom"]
                    strand = gene_obj["strand"]
                    txStart = gene_obj["txStart"]
                    txEnd = gene_obj["txEnd"]
                    cdsStart = gene_obj["cdsStart"]
                    cdsEnd = gene_obj["cdsEnd"]
                    exonCount = gene_obj["exonCount"]
                    exonStarts = gene_obj["exonStarts"]
                    exonEnds = gene_obj["exonEnds"]
                    score = gene_obj["score"]
                    name2 = gene_obj["name2"]
                    cdsStartStat = gene_obj["cdsStartStat"]
                    cdsEndStat = gene_obj["cdsEndStat"] 
                    exonFrames = gene_obj["exonFrames"]
                    name = name2
                    id = name_id
                    e_start = exonStarts.split(",")
                    e_end = exonEnds.split(",")

                    # Determine Y Coordinate
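                    # Greedy row packing: place each transcript on the first row whose previous
                    # gene ends at least `gap` bp before this transcript's txStart.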
                    i = 0
                    y_coord = None
                    while y_coord == None:
                        if i > len(lines) - 1:
                            y_coord = i + 1
                            lines.append(int(txEnd))
                        elif int(txStart) > (gap + lines[i]):
                            y_coord = i + 1
                            lines[i] = int(txEnd)
                        else:
                            i += 1

                    genes_plot_start.append(int(txStart) / 1000000.0)
                    genes_plot_end.append(int(txEnd) / 1000000.0)
                    genes_plot_y.append(y_coord)
                    genes_plot_name.append(name + "  ")

                    for i in range(len(e_start) - 1):
                        if strand == "+":
                            exon = i + 1
                        else:
                            exon = len(e_start) - 1 - i

                        width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                        x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                        exons_plot_x.append(x_coord)
                        exons_plot_y.append(y_coord)
                        exons_plot_w.append(width)
                        exons_plot_h.append(tall)
                        exons_plot_name.append(name)
                        exons_plot_id.append(id)
                        exons_plot_exon.append(exon)

            n_rows = len(lines)
            genes_plot_yn = [n_rows - x + 0.5 for x in genes_plot_y]
            exons_plot_yn = [n_rows - x + 0.5 for x in exons_plot_y]
            yr2 = Range1d(start=0, end=n_rows)

            data_gene_plot = {
                'exons_plot_x': exons_plot_x,
                'exons_plot_yn': exons_plot_yn,
                'exons_plot_w': exons_plot_w,
                'exons_plot_h': exons_plot_h,
                'exons_plot_name': exons_plot_name,
                'exons_plot_id': exons_plot_id,
                'exons_plot_exon': exons_plot_exon
            }

            source_gene_plot = ColumnDataSource(data_gene_plot)

            if len(lines) < 3:
                plot_h_pix = 250
            else:
                plot_h_pix = 250 + (len(lines) - 2) * 50

            gene_plot = figure(
                x_range=xr, y_range=yr2, border_fill_color='white',
                title="", min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False,
                plot_width=900, plot_height=plot_h_pix, tools="hover,tap,xpan,box_zoom,undo,redo,reset,previewsave", logo=None)

            gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end,
                            genes_plot_yn, color="black", alpha=1, line_width=2)

            gene_plot.rect(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w', height='exons_plot_h',
                        source=source_gene_plot, fill_color="grey", line_color="grey")
            gene_plot.xaxis.axis_label = "Chromosome " + snp_coord['chromosome'] + " Coordinate (Mb)(" + genome_build_vars[genome_build]['title'] + ")"
            gene_plot.yaxis.axis_label = "Genes (All Transcripts)"
            gene_plot.ygrid.grid_line_color = None
            gene_plot.yaxis.axis_line_color = None
            gene_plot.yaxis.minor_tick_line_color = None
            gene_plot.yaxis.major_tick_line_color = None
            gene_plot.yaxis.major_label_text_color = None

            hover = gene_plot.select(dict(type=HoverTool))
            hover.tooltips = OrderedDict([
                ("Gene", "@exons_plot_name"),
                ("ID", "@exons_plot_id"),
                ("Exon", "@exons_plot_exon"),
            ])

            gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name, alpha=1, text_font_size="7pt",
                        text_font_style="bold", text_baseline="middle", text_align="right", angle=0)

            gene_plot.toolbar_location = "below"

            # Combine plots into a grid
            out_grid = gridplot(proxy_plot, rug, gene_plot, ncols=1,
                                toolbar_options=dict(logo=None))
        # Gene Plot (Collapsed)                        
        else:
            genes_c_file = tmp_dir + "genes_c_" + request + ".json"
            genes_c_json = getRefGene(db, genes_c_file, snp_coord['chromosome'], int(coord1), int(coord2), genome_build, True)

            genes_c_plot_start = []
            genes_c_plot_end = []
            genes_c_plot_y = []
            genes_c_plot_name = []
            exons_c_plot_x = []
            exons_c_plot_y = []
            exons_c_plot_w = []
            exons_c_plot_h = []
            exons_c_plot_name = []
            exons_c_plot_id = []
            message_c = ["Too many genes to plot."]
            lines_c = [0]
            gap = 80000
            tall = 0.75
            if genes_c_json != None and len(genes_c_json) > 0:
                for gene_c_obj in genes_c_json:
                    chrom = gene_c_obj["chrom"]
                    txStart = gene_c_obj["txStart"]
                    txEnd = gene_c_obj["txEnd"]
                    exonStarts = gene_c_obj["exonStarts"]
                    exonEnds = gene_c_obj["exonEnds"]
                    name2 = gene_c_obj["name2"]
                    transcripts = gene_c_obj["transcripts"]
                    name = name2
                    e_start = exonStarts.split(",")
                    e_end = exonEnds.split(",")
                    e_transcripts = transcripts.split(",")

                    # Determine Y Coordinate
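                    # (Greedy row packing: place the transcript on the first row whose
                    #  rightmost txEnd lies more than `gap` bp left of this txStart;
                    #  otherwise open a new row. lines_c[i] holds the right edge of row i.)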
                    i = 0
                    y_coord = None
                    while y_coord is None:
                        if i > len(lines_c) - 1:
                            y_coord = i + 1
                            lines_c.append(int(txEnd))
                        elif int(txStart) > (gap + lines_c[i]):
                            y_coord = i + 1
                            lines_c[i] = int(txEnd)
                        else:
                            i += 1

                    genes_c_plot_start.append(int(txStart)/1000000.0)
                    genes_c_plot_end.append(int(txEnd)/1000000.0)
                    genes_c_plot_y.append(y_coord)
                    genes_c_plot_name.append(name+"  ")

                    # for i in range(len(e_start)):
                    for i in range(len(e_start)-1):
                        width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                        x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                        exons_c_plot_x.append(x_coord)
                        exons_c_plot_y.append(y_coord)
                        exons_c_plot_w.append(width)
                        exons_c_plot_h.append(tall)
                        exons_c_plot_name.append(name)
                        exons_c_plot_id.append(e_transcripts[i].replace("-",","))


            n_rows_c = len(lines_c)
            genes_c_plot_yn = [n_rows_c - x + 0.5 for x in genes_c_plot_y]
            exons_c_plot_yn = [n_rows_c - x + 0.5 for x in exons_c_plot_y]
            yr2_c = Range1d(start=0, end=n_rows_c)

            data_gene_c_plot = {
                'exons_c_plot_x': exons_c_plot_x,
                'exons_c_plot_yn': exons_c_plot_yn,
                'exons_c_plot_w': exons_c_plot_w,
                'exons_c_plot_h': exons_c_plot_h,
                'exons_c_plot_name': exons_c_plot_name,
                'exons_c_plot_id': exons_c_plot_id
            }
            source_gene_c_plot = ColumnDataSource(data_gene_c_plot)

            max_genes_c = 40
            # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c:
            if len(lines_c) < 3:
                plot_c_h_pix = 250
            else:
                plot_c_h_pix = 250 + (len(lines_c) - 2) * 50

            gene_c_plot = figure(min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5,
                            x_range=xr, y_range=yr2_c, border_fill_color='white',
                            title="", h_symmetry=False, v_symmetry=False, logo=None,
                            plot_width=900, plot_height=plot_c_h_pix, tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

            # if len(genes_c_raw) <= max_genes_c:
            gene_c_plot.segment(genes_c_plot_start, genes_c_plot_yn, genes_c_plot_end,
                                genes_c_plot_yn, color="black", alpha=1, line_width=2)
            gene_c_plot.rect(x='exons_c_plot_x', y='exons_c_plot_yn', width='exons_c_plot_w', height='exons_c_plot_h',
                            source=source_gene_c_plot, fill_color="grey", line_color="grey")
            gene_c_plot.text(genes_c_plot_start, genes_c_plot_yn, text=genes_c_plot_name, alpha=1, text_font_size="7pt",
                            text_font_style="bold", text_baseline="middle", text_align="right", angle=0)
            hover = gene_c_plot.select(dict(type=HoverTool))
            hover.tooltips = OrderedDict([
                ("Gene", "@exons_c_plot_name"),
                ("Transcript IDs", "@exons_c_plot_id"),
            ])

            # else:
            # 	x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0
            # 	gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1,
            # 				   text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

            gene_c_plot.xaxis.axis_label = "Chromosome " + snp_coord['chromosome'] + " Coordinate (Mb)(" + genome_build_vars[genome_build]['title'] + ")"
            gene_c_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)"
            gene_c_plot.ygrid.grid_line_color = None
            gene_c_plot.yaxis.axis_line_color = None
            gene_c_plot.yaxis.minor_tick_line_color = None
            gene_c_plot.yaxis.major_tick_line_color = None
            gene_c_plot.yaxis.major_label_text_color = None

            gene_c_plot.toolbar_location = "below"
            
            out_grid = gridplot(proxy_plot, rug, gene_c_plot,
                        ncols=1, toolbar_options=dict(logo=None))

        # Generate high quality images only if accessed via web instance
        
        # Launch a subprocess for high quality image exports
        command = "python3 LDproxy_plot_sub.py " + snp + " " + pop + " " + request + " " + genome_build + " " + r2_d + " " + str(window) + " " + collapseTranscript
        subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)

        ###########################
        # Html output for testing #
        ###########################
        #html=file_html(out_grid, CDN, "Test Plot")
        # out_html=open("LDproxy.html","w")
        #print >> out_html, html
        # out_html.close()

        out_script, out_div = components(out_grid, CDN)
        reset_output()

        # Print run time statistics
        pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines()
        print("\nNumber of Individuals: " + str(len(pop_list)))

        print("SNPs in Region: " + str(len(out_prox)))

        duration = time.time() - start_time
        print("Run time: " + str(duration) + " seconds\n")

    # Return plot output
    return(out_script, out_div)
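
Both gene-track branches above lay transcripts out with the same greedy row packing. Below is a minimal, self-contained sketch of just that layout step; pack_rows is an illustrative name and the coordinates are invented, not taken from LDlink:

def pack_rows(transcripts, gap=80000):
    """Assign each (txStart, txEnd) to the first row that has been clear for at least `gap` bp."""
    rows = [0]          # right edge (txEnd) of each occupied row; row 0 starts as a sentinel
    assigned = []
    for tx_start, tx_end in transcripts:
        i = 0
        y = None
        while y is None:
            if i > len(rows) - 1:            # no free row: open a new one
                y = i + 1
                rows.append(tx_end)
            elif tx_start > gap + rows[i]:   # row i has been clear long enough
                y = i + 1
                rows[i] = tx_end
            else:
                i += 1
        assigned.append(y)
    return assigned

# Invented coordinates: the second transcript overlaps the first, the third starts well past it
print(pack_rows([(1_000_000, 1_050_000), (1_040_000, 1_120_000), (1_300_000, 1_350_000)]))
# -> [1, 2, 1]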
Example #8
0
def calculate_clip(snplst, pop, request, web, genome_build, r2_threshold=0.1, maf_threshold=0.01):

    max_list = 5000

    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    api_mongo_addr = config['api']['api_mongo_addr']
    dbsnp_version = config['data']['dbsnp_version']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Create JSON output
    out_json = open(tmp_dir+"clip"+request+".json", "w")
    output = {}

    # Validate genome build param
    print("genome_build " + genome_build)
    if genome_build not in genome_build_vars['vars']:
        output["error"] = "Invalid genome build. Please specify either " + ", ".join(genome_build_vars['vars']) + "."
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        return("", "", "")

    # Open SNP list file
    snps_raw = open(snplst).readlines()
    if len(snps_raw) > max_list:
        output["error"] = "Maximum SNP list is " + \
            str(max_list)+" RS numbers. Your list contains " + \
            str(len(snps_raw))+" entries."
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        return("", "", "")

    # Remove duplicate RS numbers
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        if snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    pops = pop.split("+")
    pop_dirs = []
    for pop_i in pops:
        if pop_i in ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]:
            pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")
        else:
            output["error"] = pop_i+" is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI."
            json_output = json.dumps(output, sort_keys=True, indent=2)
            print(json_output, file=out_json)
            out_json.close()
            return("", "", "")

    get_pops = "cat " + " ".join(pop_dirs)
    pop_list = [x.decode('utf-8') for x in subprocess.Popen(get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()]

    ids = [i.strip() for i in pop_list]
    pop_ids = list(set(ids))

    # Connect to Mongo snp database
    if env == 'local':
        mongo_host = api_mongo_addr
    else: 
        mongo_host = 'localhost'
    if web:
        client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
    else:
        if env == 'local':
            client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
        else:
            client = MongoClient('localhost', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            if snp_raw_i[0][0:2] == "rs":
                new_snp_lst.append(snp_raw_i)
            else:
                snp_info_lst = get_rsnum(db, snp_raw_i[0], genome_build)
                print("snp_info_lst")
                print(snp_info_lst)
                if snp_info_lst != None:
                    if len(snp_info_lst) > 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        ref_variants = []
                        for snp_info in snp_info_lst:
                            if snp_info['id'] == snp_info['ref_id']:
                                ref_variants.append(snp_info['id'])
                        if len(ref_variants) > 1:
                            var_id = "rs" + ref_variants[0]
                            if "warning" in output:
                                output["warning"] = output["warning"] + \
                                ". Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp_raw_i[0]
                            else:
                                output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp_raw_i[0]
                        elif len(ref_variants) == 0 and len(snp_info_lst) > 1:
                            var_id = "rs" + snp_info_lst[0]['id']
                            if "warning" in output:
                                output["warning"] = output["warning"] + \
                                ". Multiple rsIDs (" + ", ".join(["rs" + snp_info['id'] for snp_info in snp_info_lst]) + ") map to genomic coordinates " + snp_raw_i[0]
                            else:
                                output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + snp_info['id'] for snp_info in snp_info_lst]) + ") map to genomic coordinates " + snp_raw_i[0]
                        else:
                            var_id = "rs" + ref_variants[0]
                        new_snp_lst.append([var_id])
                    elif len(snp_info_lst) == 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        new_snp_lst.append([var_id])
                    else:
                        new_snp_lst.append(snp_raw_i)
                else:
                    new_snp_lst.append(snp_raw_i)
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)

    # Find RS numbers in snp database
    details = collections.OrderedDict()
    rs_nums = []
    snp_pos = []
    snp_coords = []
    warn = []
    tabix_coords = ""
    for snp_i in snps:
        if len(snp_i) > 0:
            if len(snp_i[0]) > 2:
                if (snp_i[0][0:2] == "rs" or snp_i[0][0:3] == "chr") and snp_i[0][-1].isdigit():
                    snp_coord = get_coords(db, snp_i[0])
                    if snp_coord != None and snp_coord[genome_build_vars[genome_build]['position']] != "NA":
                        # check if variant is on chrY for genome build = GRCh38
                        if snp_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"):
                            if "warning" in output:
                                output["warning"] = output["warning"] + \
                                    ". " + "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord['id'] + " = chr" + snp_coord['chromosome'] + ":" + snp_coord[genome_build_vars[genome_build]['position']] + ")"
                            else:
                                output["warning"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord['id'] + " = chr" + snp_coord['chromosome'] + ":" + snp_coord[genome_build_vars[genome_build]['position']] + ")"
                            warn.append(snp_i[0])
                            details[snp_i[0]] = ["NA", "NA", "Chromosome Y variants are unavailable for GRCh38, only available for GRCh37."]
                        else:
                            rs_nums.append(snp_i[0])
                            snp_pos.append(snp_coord[genome_build_vars[genome_build]['position']])
                            temp = [snp_i[0], snp_coord['chromosome'], snp_coord[genome_build_vars[genome_build]['position']]]
                            snp_coords.append(temp)
                    else:
                        warn.append(snp_i[0])
                        details[snp_i[0]] = ["NA", "NA", "Variant not found in dbSNP" + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + "), variant removed."]
                else:
                    warn.append(snp_i[0])
                    details[snp_i[0]] = ["NA", "NA",
                                         "Not a RS number, query removed."]
            else:
                warn.append(snp_i[0])
                details[snp_i[0]] = ["NA", "NA",
                                     "Not a RS number, query removed."]
        else:
            output["error"] = "Input list of RS numbers is empty"
            json_output = json.dumps(output, sort_keys=True, indent=2)
            print(json_output, file=out_json)
            out_json.close()
            return("", "", "")

    if warn != []:
        if "warning" in output:
            output["warning"] = output["warning"] + \
                ". The following RS number(s) or coordinate(s) inputs have warnings: " + ", ".join(warn)
        else:
            output["warning"] = "The following RS number(s) or coordinate(s) inputs have warnings: " + ", ".join(warn)

    if len(rs_nums) == 0:
        output["error"] = "Input SNP list does not contain any valid RS numbers or coordinates. " + output["warning"]
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        return("", "", "")

    # Check SNPs are all on the same chromosome
    for i in range(len(snp_coords)):
        if snp_coords[0][1] != snp_coords[i][1]:
            output["error"] = "Not all input variants are on the same chromosome: "+snp_coords[i-1][0]+"=chr" + \
                str(snp_coords[i-1][1])+":"+str(snp_coords[i-1][2])+", "+snp_coords[i][0] + \
                "=chr"+str(snp_coords[i][1])+":"+str(snp_coords[i][2])+"."
            json_output = json.dumps(output, sort_keys=True, indent=2)
            print(json_output, file=out_json)
            out_json.close()
            return("", "", "")

    # Make tabix formatted coordinates
    snp_coord_str = [genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coords[0][1]+":"+i+"-"+i for i in snp_pos]
    tabix_coords = " "+" ".join(snp_coord_str)
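    # e.g. with an empty 1000G chromosome prefix and two (hypothetical) chr7 positions
    # 24958727 and 24962419, tabix_coords becomes " 7:24958727-24958727 7:24962419-24962419"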

    # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    vcf = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])

    # Make MAF function
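    # (counts reference and alternate alleles across phased genotypes; haploid calls
    #  like "0"/"1" contribute a single allele)
    # e.g. genos = ["0|0", "0|1", "1|1"] -> zeros = 3, ones = 3, f0 = f1 = maf = 0.5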
    def calc_maf(genos):
        vals = {"0|0": 0, "0|1": 0, "1|0": 0, "1|1": 0, "0": 0, "1": 0}
        for i in range(len(genos)):
            if genos[i] in vals:
                vals[genos[i]] += 1

        zeros = vals["0|0"]*2+vals["0|1"]+vals["1|0"]+vals["0"]
        ones = vals["1|1"]*2+vals["0|1"]+vals["1|0"]+vals["1"]
        total = zeros+ones

        f0 = zeros*1.0/total
        f1 = ones*1.0/total
        maf = min(f0, f1)

        return f0, f1, maf

    # Define function to correct indel alleles
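    # e.g. set_alleles("A", "AT") -> ("-", "T") and set_alleles("AT", "A") -> ("T", "-"):
    # the shared leading base is dropped so indels display as "-" versus the indel sequence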
    def set_alleles(a1, a2):
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return(a1_n, a2_n)

    # Make R2 function
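    # (tabulates the four two-locus haplotype counts A="0-0", B="0-1", C="1-0", D="1-1"
    #  from phased genotypes; r2 = (A*D - B*C)**2 / ((A+C)*(B+D)*(A+B)*(C+D)),
    #  or None when a marginal count is zero)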
    def calc_r2(var1, var2):
        hap_vals = {"0|0-0|0": 0, "0|0-0|1": 0, "0|0-1|0": 0, "0|0-1|1": 0, "0|1-0|0": 0, "0|1-0|1": 0, "0|1-1|0": 0, "0|1-1|1": 0, "1|0-0|0": 0,
                    "1|0-0|1": 0, "1|0-1|0": 0, "1|0-1|1": 0, "1|1-0|0": 0, "1|1-0|1": 0, "1|1-1|0": 0, "1|1-1|1": 0, "0-0": 0, "0-1": 0, "1-0": 0, "1-1": 0}
        for i in range(len(var1)):
            ind_geno = var1[i]+"-"+var2[i]
            if ind_geno in hap_vals:
                hap_vals[ind_geno] += 1

        A = hap_vals["0|0-0|0"]*2+hap_vals["0|0-0|1"]+hap_vals["0|0-1|0"]+hap_vals["0|1-0|0"] + \
            hap_vals["0|1-0|1"]+hap_vals["1|0-0|0"] + \
            hap_vals["1|0-1|0"]+hap_vals["0-0"]
        B = hap_vals["0|0-0|1"]+hap_vals["0|0-1|0"]+hap_vals["0|0-1|1"]*2+hap_vals["0|1-1|0"] + \
            hap_vals["0|1-1|1"]+hap_vals["1|0-0|1"] + \
            hap_vals["1|0-1|1"]+hap_vals["0-1"]
        C = hap_vals["0|1-0|0"]+hap_vals["0|1-1|0"]+hap_vals["1|0-0|0"]+hap_vals["1|0-0|1"] + \
            hap_vals["1|1-0|0"]*2+hap_vals["1|1-0|1"] + \
            hap_vals["1|1-1|0"]+hap_vals["1-0"]
        D = hap_vals["0|1-0|1"]+hap_vals["0|1-1|1"]+hap_vals["1|0-1|0"]+hap_vals["1|0-1|1"] + \
            hap_vals["1|1-0|1"]+hap_vals["1|1-1|0"] + \
            hap_vals["1|1-1|1"]*2+hap_vals["1-1"]

        delta = float(A*D-B*C)
        Ms = float((A+C)*(B+D)*(A+B)*(C+D))
        if Ms != 0:
            r2 = (delta**2)/Ms
        else:
            r2 = None

        return(r2)

    # Import SNP VCF file
    hap_dict = {}
    h = 0
    while vcf[h][0:2] == "##":
        h += 1

    head = vcf[h].strip().split()

    # Extract population specific haplotypes
    pop_index = []
    for i in range(9, len(head)):
        if head[i] in pop_ids:
            pop_index.append(i)

    rsnum_lst = []

    for g in range(h+1, len(vcf)):
        geno = vcf[g].strip().split()
        geno[0] = geno[0].lstrip('chr')
        if geno[1] not in snp_pos:
            if "warning" in output:
                output["warning"] = output["warning"]+". Genomic position ("+geno[1]+") in VCF file does not match db" + \
                    dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant"
            else:
                output["warning"] = "Genomic position ("+geno[1]+") in VCF file does not match db" + \
                    dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant"
            continue

        if snp_pos.count(geno[1]) == 1:
            rs_query = rs_nums[snp_pos.index(geno[1])]

        else:
            pos_index = []
            for p in range(len(snp_pos)):
                if snp_pos[p] == geno[1]:
                    pos_index.append(p)
            for p in pos_index:
                if rs_nums[p] not in rsnum_lst:
                    rs_query = rs_nums[p]
                    break

        if rs_query in rsnum_lst:
            continue

        rs_1000g = geno[2]

        if rs_query == rs_1000g:
            rsnum = rs_1000g
        else:
            count = -2
            found = "false"
            while count <= 2 and count+g < len(vcf):
                geno_next = vcf[g+count].strip().split()
                geno_next[0] = geno_next[0].lstrip('chr')
                if len(geno_next) >= 3 and rs_query == geno_next[2]:
                    found = "true"
                    break
                count += 1

            if found == "false":
                if "rs" in rs_1000g:
                    if "warning" in output:
                        output["warning"] = output["warning"] + \
                            ". Genomic position for query variant ("+rs_query + \
                            ") does not match RS number at 1000G position (chr" + \
                            geno[0]+":"+geno[1]+" = "+rs_1000g+")"
                    else:
                        output["warning"] = "Genomic position for query variant ("+rs_query + \
                            ") does not match RS number at 1000G position (chr" + \
                            geno[0]+":"+geno[1]+" = "+rs_1000g+")"

                indx = [i[0] for i in snps].index(rs_query)
                # snps[indx][0]=geno[2]
                # rsnum=geno[2]
                snps[indx][0] = rs_query
                rsnum = rs_query
                # try:
                # 	indx=[i[0] for i in snps].index(rs_query)
                # 	snps[indx][0]=geno[2]
                # 	rsnum=geno[2]
                # except ValueError:
                # 	print("List does not contain value:")
                # 	print "#####"
                # 	print "variable rs_query " + rs_query
                # 	print "variable snps " + str(snps)
                # 	print "#####"
            else:
                continue

        details[rsnum] = ["chr"+geno[0]+":"+geno[1]]

        if "," not in geno[3] and "," not in geno[4]:
            temp_genos = []
            for i in range(len(pop_index)):
                temp_genos.append(geno[pop_index[i]])
            f0, f1, maf = calc_maf(temp_genos)
            a0, a1 = set_alleles(geno[3], geno[4])
            details[rsnum].append(
                a0+"="+str(round(f0, 3))+", "+a1+"="+str(round(f1, 3)))
            if maf_threshold <= maf:
                hap_dict[rsnum] = [temp_genos]
                rsnum_lst.append(rsnum)
            else:
                details[rsnum].append(
                    "Variant MAF is "+str(round(maf, 4))+", variant removed.")
        else:
            details[rsnum].append(geno[3]+"=NA, "+geno[4]+"=NA")
            details[rsnum].append("Variant is not biallelic, variant removed.")

    for i in rs_nums:
        if i not in rsnum_lst:
            if i not in details:
                index_i = rs_nums.index(i)
                details[i] = ["chr"+snp_coords[index_i][1]+":"+snp_coords[index_i][2]+"-" +
                              snp_coords[index_i][2], "NA", "Variant not in 1000G VCF file, variant removed."]

    # Thin the SNPs
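    # Greedy LD pruning: keep the first remaining variant, drop every later variant
    # whose r2 with it reaches r2_threshold, then repeat from the next survivor.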
    # sup_2=u"\u00B2"
    sup_2 = "2"
    i = 0
    while i < len(rsnum_lst):
        details[rsnum_lst[i]].append("Variant kept.")
        remove_list = []
        for j in range(i+1, len(rsnum_lst)):
            r2 = calc_r2(hap_dict[rsnum_lst[i]][0], hap_dict[rsnum_lst[j]][0])
            if r2 is not None and r2_threshold <= r2:
                snp = rsnum_lst[j]
                details[snp].append("Variant in LD with "+rsnum_lst[i] +
                                    " (R"+sup_2+"="+str(round(r2, 4))+"), variant removed.")
                remove_list.append(snp)
        for snp in remove_list:
            rsnum_lst.remove(snp)
        i += 1

    # Return output
    json_output = json.dumps(output, sort_keys=True, indent=2)
    print(json_output, file=out_json)
    out_json.close()
    return(snps, rsnum_lst, details)
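
A minimal, self-contained sketch of the LD-thinning step in calculate_clip, using invented variant names and phased genotypes; toy_r2 and thin are illustrative helpers that mirror the arithmetic of calc_r2 and the pruning loop above, without the VCF/1000G plumbing:

def toy_r2(var1, var2):
    # Count the four phased two-locus haplotypes: "00", "01", "10", "11"
    counts = {"00": 0, "01": 0, "10": 0, "11": 0}
    for g1, g2 in zip(var1, var2):
        for hap in (g1[0] + g2[0], g1[2] + g2[2]):   # left and right chromosomes
            counts[hap] += 1
    A, B, C, D = counts["00"], counts["01"], counts["10"], counts["11"]
    Ms = (A + C) * (B + D) * (A + B) * (C + D)
    return (A * D - B * C) ** 2 / Ms if Ms else None

def thin(rsnums, genos, r2_threshold=0.1):
    # Keep the first survivor, drop later variants in LD with it, then advance
    kept = list(rsnums)
    i = 0
    while i < len(kept):
        kept = kept[:i + 1] + [
            s for s in kept[i + 1:]
            if (toy_r2(genos[kept[i]], genos[s]) or 0) < r2_threshold
        ]
        i += 1
    return kept

genos = {
    "rsA": ["0|0", "0|1", "1|1", "1|0", "0|0", "1|1"],
    "rsB": ["0|0", "0|1", "1|1", "1|0", "0|0", "1|1"],   # identical to rsA -> r2 = 1, pruned
    "rsC": ["0|1", "1|0", "0|1", "1|0", "0|1", "1|0"],   # balanced against rsA -> r2 = 0, kept
}
print(thin(["rsA", "rsB", "rsC"], genos))   # ['rsA', 'rsC']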
Example #9
0
def calculate_pop(snp1, snp2, pop, r2_d, web, genome_build, request=None):

    # trim any whitespace
    snp1 = snp1.lower().strip()
    snp2 = snp2.lower().strip() 

    snp1_input = snp1
    snp2_input = snp2

    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    api_mongo_addr = config['api']['api_mongo_addr']
    dbsnp_version = config['data']['dbsnp_version']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Create JSON output
    output = {}

    # Validate genome build param
    print("genome_build " + genome_build)
    if genome_build not in genome_build_vars['vars']:
        output["error"] = "Invalid genome build. Please specify either " + ", ".join(genome_build_vars['vars']) + "."
        return(json.dumps(output, sort_keys=True, indent=2))

    # Connect to Mongo snp database
    if env == 'local':
        mongo_host = api_mongo_addr
    else: 
        mongo_host = 'localhost'
    if web:
        client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
    else:
        if env == 'local':
            client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
        else:
            client = MongoClient('localhost', mongo_port)
    db = client["LDLink"]

    def get_chrom_coords(db, rsid):
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coord_rsid(db, snp):
        if snp[0:2] == "rs":
            return snp
        else:
            snp_info_lst = get_rsnum(db, snp, genome_build)
            # print "snp_info_lst"
            # print snp_info_lst
            if snp_info_lst != None:
                if len(snp_info_lst) > 1:
                    var_id = "rs" + snp_info_lst[0]['id']
                    ref_variants = []
                    for snp_info in snp_info_lst:
                        if snp_info['id'] == snp_info['ref_id']:
                            ref_variants.append(snp_info['id'])
                    if len(ref_variants) > 1:
                        var_id = "rs" + ref_variants[0]
                        if "warning" in output:
                            output["warning"] = output["warning"] + \
                            ". Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp
                        else:
                            output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp
                    elif len(ref_variants) == 0 and len(snp_info_lst) > 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        if "warning" in output:
                            output["warning"] = output["warning"] + \
                            ". Multiple rsIDs (" + ", ".join(["rs" + snp_info['id'] for snp_info in snp_info_lst]) + ") map to genomic coordinates " + snp
                        else:
                            output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + snp_info['id'] for snp_info in snp_info_lst]) + ") map to genomic coordinates " + snp
                    else:
                        var_id = "rs" + ref_variants[0]
                    return var_id
                elif len(snp_info_lst) == 1:
                    var_id = "rs" + snp_info_lst[0]['id']
                    return var_id
                else:
                    return snp
            else:
                return snp
        return snp

    snp1 = replace_coord_rsid(db, snp1)
    snp2 = replace_coord_rsid(db, snp2)

    snp1_ldpair = snp1
    snp2_ldpair = snp2
    
    snp1_coord = get_chrom_coords(db, snp1)
    snp2_coord = get_chrom_coords(db, snp2)

    # Check if RS numbers are in snp database
    # SNP1
    if snp1_coord == None or snp1_coord[genome_build_vars[genome_build]['position']] == "NA":
        output["error"] = snp1 + " is not in dbSNP build " + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ")."
        if web:
            output = json.dumps(output, sort_keys=True, indent=2)
        return output
    # SNP2
    if snp2_coord == None or snp2_coord[genome_build_vars[genome_build]['position']] == "NA":
        output["error"] = snp2 + " is not in dbSNP build " + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ")."
        if web:
            output = json.dumps(output, sort_keys=True, indent=2)
        return output
    # Check if SNPs are on the same chromosome
    if snp1_coord['chromosome'] != snp2_coord['chromosome']:
        output["warning"] = snp1 + " and " + \
            snp2 + " are on different chromosomes"

    # Check if input SNPs are on chromosome Y while genome build == grch38
    # SNP1
    if snp1_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"):
        output["error"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp1_coord['id'] + " - chr" + snp1_coord['chromosome'] + ":" + snp1_coord[genome_build_vars[genome_build]['position']] + ")"
        return(json.dumps(output, sort_keys=True, indent=2))

    # SNP2
    if snp2_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"):
        output["error"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp2_coord['id'] + " - chr" + snp2_coord['chromosome'] + ":" + snp2_coord[genome_build_vars[genome_build]['position']] + ")"
        return(json.dumps(output, sort_keys=True, indent=2))

    # create indexes for population order
    pop_order = {
        "ALL": 1,
        "AFR": 2,
        "YRI": 3,
        "LWK": 4,
        "GWD": 5,
        "MSL": 6,
        "ESN": 7,
        "ASW": 8,
        "ACB": 9,
        "AMR": 10,
        "MXL": 11,
        "PUR": 12,
        "CLM": 13,
        "PEL": 14,
        "EAS": 15,
        "CHB": 16,
        "JPT": 17,
        "CHS": 18,
        "CDX": 19,
        "KHV": 20,
        "EUR": 21,
        "CEU": 22,
        "TSI": 23,
        "FIN": 24,
        "GBR": 25,
        "IBS": 26,
        "SAS": 27,
        "GIH": 28,
        "PJL": 29,
        "BEB": 30,
        "STU": 31,
        "ITU": 32
    }

    pop_groups = {
        "ALL": ["ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"],
        "AFR": ["YRI", "LWK", "GWD", "MSL", "ESN", "ASW", "ACB"],
        "AMR": ["MXL", "PUR", "CLM", "PEL"],
        "EAS": ["CHB", "JPT", "CHS", "CDX", "KHV"],
        "EUR": ["CEU", "TSI", "FIN", "GBR" , "IBS"],
        "SAS": ["GIH", "PJL", "BEB", "STU" , "ITU"]
    }

    # empty list for paths to population data
    pop_dirs = []
    pop_split = pop.split("+")
    
    # display superpopulation and all subpopulations
    if "ALL" in pop_split:
        # pop_split.remove("ALL")
        pop_split = pop_split + pop_groups["ALL"] + list(pop_groups.keys())
        pop_split = list(set(pop_split)) # unique elements
    else:
        if "AFR" in pop_split:
            # pop_split.remove("AFR")
            pop_split = pop_split + pop_groups["AFR"]
            pop_split = list(set(pop_split)) # unique elements
        if "AMR" in pop_split:
            # pop_split.remove("AMR")
            pop_split = pop_split + pop_groups["AMR"]
            pop_split = list(set(pop_split)) # unique elements
        if "EAS" in pop_split:
            # pop_split.remove("EAS")
            pop_split = pop_split + pop_groups["EAS"]
            pop_split = list(set(pop_split)) # unique elements
        if "EUR" in pop_split:
            # pop_split.remove("EUR")
            pop_split = pop_split + pop_groups["EUR"]
            pop_split = list(set(pop_split)) # unique elements
        if "SAS" in pop_split:
            # pop_split.remove("SAS")
            pop_split = pop_split + pop_groups["SAS"]
            pop_split = list(set(pop_split)) # unique elements
    
    for pop_i in pop_split:
        if pop_i in ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]:
            pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")
        else:
            output["error"] = pop_i + " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI."
            if web:
                output = json.dumps(output, sort_keys=True, indent=2)
            return output
           
    # make an empty dictionary of sample IDs for each requested population
    ID_dict = {k: [] for k in pop_split}
    adds = ["CHROM", "POS", "ID", "REF", "ALT"]
    
    for pop_i in pop_split:        
        with open(data_dir + population_samples_dir + pop_i + ".txt", "r") as f:
            # print pop_dir + pop_i + ".txt"
            for line in f:
                cleanedLine = line.strip()
                if cleanedLine: # is not empty
                    ID_dict[pop_i].append(cleanedLine)
            for entry in adds:
                ID_dict[pop_i].append(entry)
    
    # Extract 1000 Genomes phased genotypes
    # SNP1
    vcf_filePath1 = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % snp1_coord['chromosome'])
    vcf_rs1 = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath1)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath1)

    rs1_test = export_s3_keys + " cd {3}; tabix -D {0} {1}:{2}-{2} | grep -v -e END".format(vcf_rs1, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp1_coord['chromosome'], snp1_coord[genome_build_vars[genome_build]['position']], data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) 
    vcf1 = [x.decode('utf-8') for x in subprocess.Popen(rs1_test, shell=True, stdout=subprocess.PIPE).stdout.readlines()]

    vcf_filePath2 = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % snp2_coord['chromosome'])
    vcf_rs2 = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath2)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath2)

    # need to add | grep -v -e END ???
    rs2_test = export_s3_keys + " cd {3}; tabix -D {0} {1}:{2}-{2} | grep -v -e END".format(vcf_rs2, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp2_coord['chromosome'], snp2_coord[genome_build_vars[genome_build]['position']], data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
    vcf2 = [x.decode('utf-8') for x in subprocess.Popen(rs2_test, shell=True, stdout=subprocess.PIPE).stdout.readlines()]

    # Check if SNPs are in 1000G reference panel
    # SNP1
    if len(vcf1) == 0:
        output["error"] = snp1 + " is not in 1000G reference panel."
        if web:
            output = json.dumps(output, sort_keys=True, indent=2)
        return output
    elif len(vcf1) > 1:
        geno1 = []
        for i in range(len(vcf1)):
            if vcf1[i].strip().split()[2] == snp1:
                geno1 = vcf1[i].strip().split()
                geno1[0] = geno1[0].lstrip('chr')
        if geno1 == []:
            output["error"] = snp1 + " is not in 1000G reference panel."
            if web:
                output = json.dumps(output, sort_keys=True, indent=2)
            return output
    else:
        geno1 = vcf1[0].strip().split()
        geno1[0] = geno1[0].lstrip('chr')

    if geno1[2] != snp1 and snp1[0:2] == "rs" and "rs" in geno1[2]:
        if "warning" in output:
            output["warning"] = output["warning"] + \
                ". Genomic position for query variant1 (" + snp1 + \
                ") does not match RS number at 1000G position (chr" + \
                geno1[0]+":"+geno1[1]+" = "+geno1[2]+")"
        else:
            output["warning"] = "Genomic position for query variant1 (" + snp1 + \
                ") does not match RS number at 1000G position (chr" + \
                geno1[0]+":"+geno1[1]+" = "+geno1[2]+")"
        snp1 = geno1[2]

    if "," in geno1[3] or "," in geno1[4]:
        output["error"] = snp1 + " is not a biallelic variant."
        return(json.dumps(output, sort_keys=True, indent=2))

    # SNP2
    if len(vcf2) == 0:
        output["error"] = snp2 + " is not in 1000G reference panel."
        if web:
            output = json.dumps(output, sort_keys=True, indent=2)
        return output
    elif len(vcf2) > 1:
        geno2 = []
        for i in range(len(vcf2)):
            if vcf2[i].strip().split()[2] == snp2:
                geno2 = vcf2[i].strip().split()
                geno2[0] = geno2[0].lstrip('chr')
        if geno2 == []:
            output["error"] = snp2 + " is not in 1000G reference panel."
            if web:
                output = json.dumps(output, sort_keys=True, indent=2)
            return output
    else:
        geno2 = vcf2[0].strip().split()
        geno2[0] = geno2[0].lstrip('chr')

    if geno2[2] != snp2 and snp2[0:2] == "rs" and "rs" in geno2[2]:
        if "warning" in output:
            output["warning"] = output["warning"] + \
                ". Genomic position for query variant2 (" + snp2 + \
                ") does not match RS number at 1000G position (chr" + \
                geno2[0]+":"+geno2[1]+" = "+geno2[2]+")"
        else:
            output["warning"] = "Genomic position for query variant2 (" + snp2 + \
                ") does not match RS number at 1000G position (chr" + \
                geno2[0]+":"+geno2[1]+" = "+geno2[2]+")"
        snp2 = geno2[2]

    if "," in geno2[3] or "," in geno2[4]:
        output["error"] = snp2 + " is not a biallelic variant."
        return(json.dumps(output, sort_keys=True, indent=2))

    # vcf1 = vcf1[0].strip().split()
    # vcf2 = vcf2[0].strip().split()

    # Get headers
    tabix_snp1_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(vcf_rs1, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
    head1 = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp1_h, shell=True, stdout=subprocess.PIPE).stdout.readlines()][0].strip().split()

    tabix_snp2_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(vcf_rs2, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
    head2 = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp2_h, shell=True, stdout=subprocess.PIPE).stdout.readlines()][0].strip().split()

    rs1_dict = dict(list(zip(head1, geno1)))
    rs2_dict = dict(list(zip(head2, geno2)))
    
    if "<" in rs1_dict["REF"]:
        if "warning" in output:
            output["warning"] = output["warning"] + \
                "." + snp1 + "is a CNV marker. " 
        else:
            output["warning"] = snp1 + "is a CNV marker. " 
            
    if "<" in rs2_dict["REF"]:
        if "warning" in output:
            output["warning"] = output["warning"] + \
                "." + snp2 + "is a CNV marker. " 
        else:
            output["warning"] = snp2 + "is a CNV marker. " 
    
    geno_ind = {
        "rs1" : {k: [] for k in pop_split},
        "rs2" : {k: [] for k in pop_split} 
    }
    
    #SNP1
    for colname in rs1_dict:       
        for key in ID_dict:
            if (colname in ID_dict[key]) and (colname not in adds):
                geno_ind["rs1"][key].append(rs1_dict[colname] + "|." if len(rs1_dict[colname]) == 1 else rs1_dict[colname])
    
    #SNP2            
    for colname in rs2_dict:       
        for key in ID_dict:
            if (colname in ID_dict[key]) and (colname not in adds):
                geno_ind["rs2"][key].append(rs2_dict[colname] + "|." if len(rs2_dict[colname]) == 1 else rs2_dict[colname])
    
    # population frequency dictionary to fill in
    pop_freqs = {
        "ref_freq_snp1": {},
        "ref_freq_snp2": {},
        "alt_freq_snp1": {},
        "alt_freq_snp2": {},
        "total_alleles": {}
    }
    
    for key in geno_ind["rs1"]:
        pop_freqs["total_alleles"][key] = float(2*geno_ind["rs1"][key].count("0|0") + 2*geno_ind["rs1"][key].count("0|1") +  2*geno_ind["rs1"][key].count("1|1") + 2* geno_ind["rs1"][key].count("1|0") + 2* geno_ind["rs1"][key].count("0|.") + 2* geno_ind["rs1"][key].count("1|."))
        if (pop_freqs["total_alleles"][key] > 0):
            pop_freqs["ref_freq_snp1"][key] = round(((2*geno_ind["rs1"][key].count("0|0") + geno_ind["rs1"][key].count("0|1") + geno_ind["rs1"][key].count("1|0") + geno_ind["rs1"][key].count("1|.") + geno_ind["rs1"][key].count("0|."))/ float(pop_freqs["total_alleles"][key])) *100, 2)
            pop_freqs["ref_freq_snp2"][key] = round(((2*geno_ind["rs2"][key].count("0|0") + geno_ind["rs2"][key].count("0|1") + geno_ind["rs2"][key].count("1|0") + geno_ind["rs2"][key].count("1|.") + geno_ind["rs2"][key].count("0|."))/ float(pop_freqs["total_alleles"][key])) *100, 2)
            pop_freqs["alt_freq_snp1"][key] = round(((2*geno_ind["rs1"][key].count("1|1") + geno_ind["rs1"][key].count("0|1") + geno_ind["rs1"][key].count("1|0") + geno_ind["rs1"][key].count("1|.") + geno_ind["rs1"][key].count("0|."))/ float(pop_freqs["total_alleles"][key])) *100, 2)
            pop_freqs["alt_freq_snp2"][key] = round(((2*geno_ind["rs2"][key].count("1|1") + geno_ind["rs2"][key].count("0|1") + geno_ind["rs2"][key].count("1|0") + geno_ind["rs2"][key].count("1|.") + geno_ind["rs2"][key].count("0|."))/ float(pop_freqs["total_alleles"][key])) *100, 2)
        else :
            output["error"] = "Insufficient haplotype data for " + snp1 + " and " + snp2 + " in 1000G reference panel."
            if web:
                output = json.dumps(output, sort_keys=True, indent=2)
            return output
        
    #get sample size for each population
    sample_size_dict = {}  
     
    for key in ID_dict:
        sample_size_dict[key] = len(ID_dict[key])- len(adds)
        
    # Combine phased genotype
    # Extract haplotypes
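    # hap[pop] counts two-locus haplotypes keyed "<snp1 allele>_<snp2 allele>" for each
    # phased chromosome; "." marks a missing allele (those haplotypes are dropped below)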
    hap = {k: {"0_0": 0, "0_1": 0, "1_0": 0, "1_1": 0, "0_.": 0, "1_.": 0, "._.": 0, "._0": 0, "._1": 0} for k in pop_split}
    
    for pop in geno_ind["rs1"]:
        if len(geno_ind["rs1"][pop]) == len(geno_ind["rs2"][pop]):
            geno_ind_range = len(geno_ind["rs1"][pop])
        elif len(geno_ind["rs1"][pop]) < len(geno_ind["rs2"][pop]):
            geno_ind_range = len(geno_ind["rs1"][pop])
        else:
            geno_ind_range = len(geno_ind["rs2"][pop])
        for ind in range(geno_ind_range):
            # if len(geno_ind["rs1"][pop][ind]) == 3:
            hap1 = geno_ind["rs1"][pop][ind][0] + "_" + geno_ind["rs2"][pop][ind][0]
            hap2 = geno_ind["rs1"][pop][ind][2] + "_" + geno_ind["rs2"][pop][ind][2]
            if hap1 in hap[pop]:
                hap[pop][hap1] += 1           
                hap[pop][hap2] += 1

    # Remove missing haplotypes
    pops = list(hap.keys())
    for pop in pops:
        keys = list(hap[pop].keys())
        for key in keys:
            if "." in key:
                hap[pop].pop(key, None)
        
    # Sort haplotypes
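    # For each population the sorted haplotype counts give A="0_0", B="0_1", C="1_0",
    # D="1_1"; delta = A*D - B*C, D' = |delta| / Dmax (Dmax depends on the sign of delta),
    # r2 = delta**2 / Ms, and chisq = N * delta**2 / Ms is converted to a 1-df
    # chi-square p-value via the normal CDF (math.erf).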
    matrix_values = {k : {"A": "", "B": "", "C": "", "D": "", "N": "", "delta" : "", "Ms" : "" , "D_prime":"", "r2":""} for k in pop_split}
    for pop in hap:
        matrix_values[pop]["A"] = hap[pop][sorted(hap[pop])[0]]
        matrix_values[pop]["B"] = hap[pop][sorted(hap[pop])[1]]
        matrix_values[pop]["C"] = hap[pop][sorted(hap[pop])[2]]
        matrix_values[pop]["D"] = hap[pop][sorted(hap[pop])[3]]
        matrix_values[pop]["N"] = matrix_values[pop]["A"] + matrix_values[pop]["B"] + matrix_values[pop]["C"] + matrix_values[pop]["D"]
        matrix_values[pop]["delta"] = float(matrix_values[pop]["A"] * matrix_values[pop]["D"] - matrix_values[pop]["B"] * matrix_values[pop]["C"])
        matrix_values[pop]["Ms"] = float((matrix_values[pop]["A"] + matrix_values[pop]["C"]) * (matrix_values[pop]["B"] + matrix_values[pop]["D"]) * (matrix_values[pop]["A"] + matrix_values[pop]["B"]) * (matrix_values[pop]["C"] + matrix_values[pop]["D"]))
        if matrix_values[pop]["Ms"] != 0:
            # D prime
            if matrix_values[pop]["delta"] < 0:
                matrix_values[pop]["D_prime"] = abs(matrix_values[pop]["delta"] / min((matrix_values[pop]["A"] + matrix_values[pop]["C"]) * (matrix_values[pop]["A"] + matrix_values[pop]["B"]), (matrix_values[pop]["B"] + matrix_values[pop]["D"]) * (matrix_values[pop]["C"] + matrix_values[pop]["D"])))
            else:
                matrix_values[pop]["D_prime"] = abs(matrix_values[pop]["delta"] / min((matrix_values[pop]["A"] + matrix_values[pop]["C"]) * (matrix_values[pop]["C"] + matrix_values[pop]["D"]), (matrix_values[pop]["A"] + matrix_values[pop]["B"]) * (matrix_values[pop]["B"] + matrix_values[pop]["D"])))
            # R2
            matrix_values[pop]["r2"]= (matrix_values[pop]["delta"]**2) / matrix_values[pop]["Ms"]
            num = (matrix_values[pop]["A"] + matrix_values[pop]["B"] + matrix_values[pop]["C"] + matrix_values[pop]["D"]) * (matrix_values[pop]["A"] * matrix_values[pop]["D"] - matrix_values[pop]["B"] * matrix_values[pop]["C"])**2
            denom = matrix_values[pop]["Ms"]
            matrix_values[pop]["chisq"] = num / denom
            matrix_values[pop]["p"] = 2 * (1 - (0.5 * (1 + math.erf(matrix_values[pop]["chisq"] **0.5 / 2**0.5))))
        else:
            matrix_values[pop]["D_prime"] = "NA"
            matrix_values[pop]["r2"] = "NA"
            matrix_values[pop]["chisq"] = "NA"
            matrix_values[pop]["p"] = "NA"
    
    for pops in sample_size_dict:    
        output[pops] = {
            'Population': pops , 
            'N': sample_size_dict[pops], \
            # rs1_dict["ID"] + ' Allele Freq': {
            #     rs1_dict["REF"] : str(pop_freqs["ref_freq_snp1"][pops]) + "%", \
            #     rs1_dict["ALT"] : str(pop_freqs["alt_freq_snp1"][pops]) + "%"
            # }, \
            # rs2_dict["ID"] + ' Allele Freq': {
            #     rs2_dict["REF"] : str(pop_freqs["ref_freq_snp2"][pops]) + "%", \
            #     rs2_dict["ALT"] : str(pop_freqs["alt_freq_snp2"][pops]) + "%"
            # }, 
            'rs#1 Allele Freq': {
                rs1_dict["REF"] : str(pop_freqs["ref_freq_snp1"][pops]) + "%", \
                rs1_dict["ALT"] : str(pop_freqs["alt_freq_snp1"][pops]) + "%"
            }, \
            'rs#2 Allele Freq': {
                rs2_dict["REF"] : str(pop_freqs["ref_freq_snp2"][pops]) + "%", \
                rs2_dict["ALT"] : str(pop_freqs["alt_freq_snp2"][pops]) + "%"
            }, 
            "D'" : matrix_values[pops]["D_prime"] if isinstance(matrix_values[pops]["D_prime"], str) else round(float(matrix_values[pops]["D_prime"]), 4), \
            "R2" : matrix_values[pops]["r2"] if isinstance(matrix_values[pops]["r2"], str) else round(float(matrix_values[pops]["r2"]), 4), \
            "chisq" : matrix_values[pops]["chisq"] if isinstance(matrix_values[pops]["chisq"], str) else round(float(matrix_values[pops]["chisq"]), 4), \
            "p" : matrix_values[pops]["p"] if isinstance(matrix_values[pops]["p"], str) else round(float(matrix_values[pops]["p"]), 4)
        }
    
    # print json.dumps(output)

    location_data = {
        "ALL" : {
            "location": "All Populations"
        },
        "AFR" : {
            "location": "African"
        },
        "AMR" : {
            "location": "Ad Mixed American"
        },
        "EAS" : {
            "location": "East Asian"
        },
        "EUR" : {
            "location": "European"
        },
        "SAS" : {
            "location": "South Asian"
        },
        "YRI": {
            "location": "Yoruba in Ibadan, Nigeria",
            "superpopulation": "AFR",
            "latitude": 7.40026,
            "longitude": 3.910742
        },
        "LWK": {
            "location": "Luhya in Webuye, Kenya",
            "superpopulation": "AFR",
            "latitude": 0.59738,
            "longitude": 34.777227
        },
        "GWD": {
            "location": "Gambian in Western Divisions in the Gambia",
            "superpopulation": "AFR",
            "latitude": 13.474133,
            "longitude": -16.394272
        },
        "MSL": {
            "location": "Mende in Sierra Leone",
            "superpopulation": "AFR",
            "latitude": 8.176076,
            "longitude": -11.040253
        },
        "ESN": {
            "location": "Esan in Nigeria",
            "superpopulation": "AFR",
            "latitude": 6.687988,
            "longitude": 6.212868
        },
        "ASW": {
            "location": "Americans of African Ancestry in SW USA",
            "superpopulation": "AFR",
            "latitude": 35.310647,
            "longitude": -107.975885
        },
        "ACB": {
            "location": "African Caribbeans in Barbados",
            "superpopulation": "AFR",
            "latitude": 13.172483,
            "longitude": -59.552779
        },
        "MXL": {
            "location": "Mexican Ancestry from Los Angeles USA",
            "superpopulation": "AMR",
            "latitude": 34.113837,
            "longitude": -118.440427
        },
        "PUR": {
            "location": "Puerto Ricans from Puerto Rico",
            "superpopulation": "AMR",
            "latitude": 18.234429,
            "longitude": -66.418775
        },
        "CLM": {
            "location": "Colombians from Medellin, Colombia",
            "superpopulation": "AMR",
            "latitude": 6.252089,
            "longitude": -75.594652
        },
        "PEL": {
            "location": "Peruvians from Lima, Peru",
            "superpopulation": "AMR",
            "latitude": -12.046543,
            "longitude": -77.046155
        },
        "CHB": {
            "location": "Han Chinese in Beijing, China",
            "superpopulation": "EAS",
            "latitude": 39.906802,
            "longitude": 116.407323
        },
        "JPT": {
            "location": "Japanese in Tokyo, Japan",
            "superpopulation": "EAS",
            "latitude": 35.709444,
            "longitude": 139.731815
        },
        "CHS": {
            "location": "Southern Han Chinese",
            "superpopulation": "EAS",
            "latitude": 24.719998,
            "longitude": 113.043464
        },
        "CDX": {
            "location": "Chinese Dai in Xishuangbanna, China",
            "superpopulation": "EAS",
            "latitude": 22.008264,
            "longitude": 100.796045
        },
        "KHV": {
            "location": "Kinh in Ho Chi Minh City, Vietnam",
            "superpopulation": "EAS",
            "latitude": 10.812236,
            "longitude": 106.633978
        },
        "CEU": {
            "location": "Utah Residents (CEPH) with Northern and Western European Ancestry",
            "superpopulation": "EUR",
            "latitude": 39.250493,
            "longitude": -111.631295
        },
        "TSI": {
            "location": "Toscani in Italia",
            "superpopulation": "EUR",
            "latitude": 43.444187,
            "longitude": 11.117199
        },
        "FIN": {
            "location": "Finnish in Finland",
            "superpopulation": "EUR",
            "latitude": 63.112,
            "longitude": 26.770837
        },
        "GBR": {
            "location": "British in England and Scotland",
            "superpopulation": "EUR",
            "latitude": 54.55902,
            "longitude": -2.143222
        },
        "IBS": {
            "location": "Iberian Population in Spain",
            "superpopulation": "EUR",
            "latitude": 40.482057,
            "longitude": -4.088383
        },
        "GIH": {
            "location": "Gujarati Indian from Houston, Texas",
            "superpopulation": "SAS",
            "latitude": 29.760619,
            "longitude": -95.361356
        },
        "PJL": {
            "location": "Punjabi from Lahore, Pakistan",
            "superpopulation": "SAS",
            "latitude": 31.515188,
            "longitude": 74.357703
        },
        "BEB": {
            "location": "Bengali from Bangladesh",
            "superpopulation": "SAS",
            "latitude": 24.013458,
            "longitude": 90.233561
        },
        "STU": {
            "location": "Sri Lankan Tamil from the UK",
            "superpopulation": "SAS",
            "latitude": 7.595905,
            "longitude": 80.843382
        },
        "ITU": {
            "location": "Indian Telugu from the UK",
            "superpopulation": "SAS",
            "latitude": 15.489823,
            "longitude": 78.487081
        }
    }

    # Reshape output data for the frontend (formerly only when accessed via a web instance)
    # if web:
    output_table = { 
        "inputs": {
            "rs1": snp1_input,
            "rs2": snp2_input,
            "LD": r2_d
        },
        "aaData": [],
        "locations": {
            "rs1_rs2_LD_map": [],
            "rs1_map": [],
            "rs2_map": []
        }
    }
    table_data = []
    rs1_map_data = []
    rs2_map_data = []
    rs1_rs2_LD_map_data = []
    # print(list(output.keys()))
    # populate table data
    for key in list(output.keys()):
        if key in list(pop_order.keys()):
            # print key, "parse for table"
            key_order = pop_order[key]
            key_pop = output[key]['Population']
            key_N = output[key]['N']
            # key_rs1_allele_freq = ", ".join([allele + ": " + output[key]['rs#1 Allele Freq'][allele] + "%" for allele in output[key]['rs#1 Allele Freq']])
            key_rs1_allele_freq = rs1_dict["REF"] + ": " + output[key]['rs#1 Allele Freq'][rs1_dict["REF"]] + ", " + rs1_dict["ALT"] + ": " + output[key]['rs#1 Allele Freq'][rs1_dict["ALT"]]
            # key_rs2_allele_freq = ", ".join([allele + ": " + output[key]['rs#2 Allele Freq'][allele] + "%" for allele in output[key]['rs#2 Allele Freq']])
            key_rs2_allele_freq = rs2_dict["REF"] + ": " + output[key]['rs#2 Allele Freq'][rs2_dict["REF"]] + ", " + rs2_dict["ALT"] + ": " + output[key]['rs#2 Allele Freq'][rs2_dict["ALT"]]
            key_D_prime = output[key]["D'"]
            key_R_2 = output[key]['R2']
            # set up data for ldpair link
            ldpair_pops = [key]
            key_chisq = output[key]['chisq']
            key_p = output[key]['p']
            if key in list(pop_groups.keys()):
                ldpair_pops = pop_groups[key]
            ldpair_data = [snp1_ldpair, snp2_ldpair, "%2B".join(ldpair_pops)]
            table_data.append([key_order, key_pop, key_N, key_rs1_allele_freq, key_rs2_allele_freq, key_R_2, key_D_prime, ldpair_data, key_chisq, key_p])
            # populate map data
            if key not in list(pop_groups.keys()):
                rs1_rs2_LD_map_data.append([key, location_data[key]["location"], location_data[key]["superpopulation"], location_data[key]["latitude"], location_data[key]["longitude"], key_rs1_allele_freq, key_rs2_allele_freq, key_R_2, key_D_prime])
                rs1_map_data.append([key, location_data[key]["location"], location_data[key]["superpopulation"], location_data[key]["latitude"], location_data[key]["longitude"], key_rs1_allele_freq])
                rs2_map_data.append([key, location_data[key]["location"], location_data[key]["superpopulation"], location_data[key]["latitude"], location_data[key]["longitude"], key_rs2_allele_freq])
    # Add map data
    output_table["locations"]["rs1_rs2_LD_map"] = rs1_rs2_LD_map_data
    output_table["locations"]["rs1_map"] = rs1_map_data
    output_table["locations"]["rs2_map"] = rs2_map_data
    def getKeyOrder(element):
        return element[0]
    table_data.sort(key=getKeyOrder)
    # Add table data with the numeric sort key stripped from each row
    output_table["aaData"] = [xs[1:] for xs in table_data]
    # Add final row link to LDpair
    # ldpair_pops = []
    # for pop in output.keys():
    #     if pop not in pop_groups.keys() and len(pop) == 3:
    #         ldpair_pops.append(pop)
    # ldpair_data = [snp1_input, snp2_input, "%2B".join(ldpair_pops)]
    # output_table["aaData"].append(["LDpair", ldpair_data, ldpair_data, ldpair_data, ldpair_data, ldpair_data])
    if "warning" in output:
        output_table["warning"] = output["warning"]
    if "error" in output:
        output_table["error"] = output["error"]
    # Generate output file
    with open(tmp_dir + "LDpop_" + request + ".txt", "w") as ldpop_out:
        ldpop_out.write("\t".join(["Population", "Abbrev", "N", output_table["inputs"]["rs1"] + " Allele Freq", output_table["inputs"]["rs2"] + " Allele Freq", "R2", "D\'", "Chisq", "P"]) + "\n")
        # print("output_table", output_table)
        # print('output_table["aaData"]', output_table["aaData"])
        for row in output_table["aaData"]:
            ldpop_out.write(str(location_data[row[0]]["location"] + "\t" + row[0]) + "\t" + str(row[1]) + "\t" + str(row[2]) + "\t" + str(row[3]) + "\t" + str(row[4]) + "\t" + str(row[5]) + "\t" + str(row[7]) + "\t" + str(row[8]) + "\n")
        if "error" in output_table:
            ldpop_out.write("\n")
            ldpop_out.write(output_table["error"])
        if "warning" in output_table:
            ldpop_out.write("\n")
            ldpop_out.write(output_table["warning"])

    # Manipulate output data for frontend only if accessed via Web instance
    # if web:
    output = json.dumps(output_table, sort_keys=True, indent=2)
        
    return output
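
A minimal, self-contained sketch of the table-assembly pattern above, using invented population stats and a hypothetical build_output_table helper: each row carries a numeric sort key in position 0, the rows are sorted on that key, and the key is stripped before the rows become "aaData".

import json

def build_output_table(stats_by_pop, pop_order):
    table_data = []
    for abbrev, stats in stats_by_pop.items():
        # position 0 holds the sort key; the remaining fields are the visible columns
        table_data.append([pop_order[abbrev], abbrev, stats["N"], stats["R2"], stats["D'"]])
    table_data.sort(key=lambda row: row[0])
    return {"aaData": [row[1:] for row in table_data]}

demo_stats = {
    "YRI": {"N": 108, "R2": 0.12, "D'": 0.95},
    "CEU": {"N": 99, "R2": 0.42, "D'": 0.81},
}
print(json.dumps(build_output_table(demo_stats, {"CEU": 1, "YRI": 2}), indent=2))
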
Example #10
def get_query_variant(snp_coord, pop_ids, request, genome_build):

    export_s3_keys = retrieveAWSCredentials()

    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        genome_build_vars[genome_build]['1000G_dir'],
        genome_build_vars[genome_build]['1000G_file'] % (snp_coord[1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    queryVariantWarnings = []
    # Extract query SNP phased genotypes

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    tabix_query_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(
        vcf_query_snp_file, data_dir + genotypes_dir +
        genome_build_vars[genome_build]['1000G_dir'])
    # print("tabix_query_snp_h", tabix_query_snp_h)
    head = [
        x.decode('utf-8')
        for x in subprocess.Popen(tabix_query_snp_h,
                                  shell=True,
                                  stdout=subprocess.PIPE).stdout.readlines()
    ][0].strip().split()

    tabix_query_snp = export_s3_keys + " cd {4}; tabix -D {0} {1}:{2}-{2} | grep -v -e END > {3}".format(
        vcf_query_snp_file,
        genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coord[1],
        snp_coord[2], tmp_dir + "snp_no_dups_" + request + ".vcf", data_dir +
        genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
    # print("tabix_query_snp", tabix_query_snp)
    subprocess.call(tabix_query_snp, shell=True)
    tabix_query_snp_out = open(tmp_dir + "snp_no_dups_" + request +
                               ".vcf").readlines()

    # Validate error
    if len(tabix_query_snp_out) == 0:
        # print("ERROR", "len(tabix_query_snp_out) == 0")
        # handle error: snp + " is not in 1000G reference panel."
        queryVariantWarnings.append(
            [snp_coord[0], "NA", "Variant is not in 1000G reference panel."])
        subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                        shell=True)
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)
        return (None, queryVariantWarnings)
    elif len(tabix_query_snp_out) > 1:
        geno = []
        for i in range(len(tabix_query_snp_out)):
            if tabix_query_snp_out[i].strip().split()[2] == snp_coord[0]:
                geno = tabix_query_snp_out[i].strip().split()
                geno[0] = geno[0].lstrip('chr')
        if geno == []:
            # print("ERROR", "geno == []")
            # handle error: snp + " is not in 1000G reference panel."
            queryVariantWarnings.append([
                snp_coord[0], "NA", "Variant is not in 1000G reference panel."
            ])
            subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                            shell=True)
            subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf",
                            shell=True)
            return (None, queryVariantWarnings)
    else:
        geno = tabix_query_snp_out[0].strip().split()
        geno[0] = geno[0].lstrip('chr')

    if geno[2] != snp_coord[0] and "rs" in geno[2]:
        queryVariantWarnings.append([
            snp_coord[0], "NA",
            "Genomic position does not match RS number at 1000G position (chr"
            + geno[0] + ":" + geno[1] + " = " + geno[2] + ")."
        ])
        # snp = geno[2]

    if "," in geno[3] or "," in geno[4]:
        # print('handle error: snp + " is not a biallelic variant."')
        queryVariantWarnings.append(
            [snp_coord[0], "NA", "Variant is not biallelic."])

    index = []
    for i in range(9, len(head)):
        if head[i] in pop_ids:
            index.append(i)

    genotypes = {"0": 0, "1": 0}
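    # Tally allele codes across the phased genotype fields (e.g. "0|1") of the
    # selected samples; any unexpected code is added to the dict as it appears.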
    for i in index:
        sub_geno = geno[i].split("|")
        for j in sub_geno:
            if j in genotypes:
                genotypes[j] += 1
            else:
                genotypes[j] = 1

    if genotypes["0"] == 0 or genotypes["1"] == 0:
        # print('handle error: snp + " is monoallelic in the " + pop + " population."')
        queryVariantWarnings.append([
            snp_coord[0], "NA",
            "Variant is monoallelic in the chosen population(s)."
        ])

    return (geno, queryVariantWarnings)
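
A standalone sketch, using made-up genotype strings, of the allele tally above: phased VCF genotype fields such as "0|1" are split on "|" and their allele codes counted; if either "0" or "1" is never observed, the variant is monoallelic in the selected samples.

def is_monoallelic(genotype_fields):
    counts = {"0": 0, "1": 0}
    for field in genotype_fields:
        for allele in field.split("|"):
            counts[allele] = counts.get(allele, 0) + 1
    return counts["0"] == 0 or counts["1"] == 0

print(is_monoallelic(["0|0", "0|1", "1|1"]))  # False: both alleles observed
print(is_monoallelic(["0|0", "0|0"]))         # True: only the reference allele observed
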
Example #11
def calculate_assoc_svg(file, region, pop, request, genome_build, myargs,
                        myargsName, myargsOrigin):

    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    api_mongo_addr = config['api']['api_mongo_addr']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']
    num_subprocesses = config['performance']['num_subprocesses']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    chrs = [
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
        "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"
    ]

    # Define parameters for --variant option
    if region == "variant":
        if myargsOrigin == "None":
            return None

    if myargsOrigin != "None":
        # Find coordinates (GRCh37/hg19) or (GRCh38/hg38) for SNP RS number
        if myargsOrigin[0:2] == "rs":
            snp = myargsOrigin

            # Connect to Mongo snp database
            if env == 'local':
                mongo_host = api_mongo_addr
            else:
                mongo_host = 'localhost'
            client = MongoClient(
                'mongodb://' + mongo_username + ':' + mongo_password + '@' +
                mongo_host + '/admin', mongo_port)
            db = client["LDLink"]

            def get_coords_var(db, rsid):
                rsid = rsid.strip("rs")
                query_results = db.dbsnp.find_one({"id": rsid})
                query_results_sanitized = json.loads(
                    json_util.dumps(query_results))
                return query_results_sanitized

            # Find RS number in snp database
            var_coord = get_coords_var(db, snp)

            if var_coord == None:
                return None

        elif myargsOrigin.split(":")[0].strip("chr") in chrs and len(
                myargsOrigin.split(":")) == 2:
            snp = myargsOrigin
            #var_coord=[None,myargsOrigin.split(":")[0].strip("chr"),myargsOrigin.split(":")[1]]
            var_coord = {
                'chromosome': myargsOrigin.split(":")[0].strip("chr"),
                genome_build_vars[genome_build]['position']: myargsOrigin.split(":")[1]
            }
        else:
            return None

        chromosome = var_coord['chromosome']
        org_coord = var_coord[genome_build_vars[genome_build]['position']]

    # Open Association Data
    header_list = []
    header_list.append(myargs['chr'])
    header_list.append(myargs['bp'])
    header_list.append(myargs['pval'])

    # Load input file
    with open(file) as fp:
        header = fp.readline().strip().split()
        first = fp.readline().strip().split()

    if len(header) != len(first):
        return None

    # Check header
    for item in header_list:
        if item not in header:
            return None

    len_head = len(header)

    chr_index = header.index(myargs['chr'])
    pos_index = header.index(myargs['bp'])
    p_index = header.index(myargs['pval'])

    # Define window of interest around query SNP
    if myargs['window'] == None:
        if region == "variant":
            window = 500000
        elif region == "gene":
            window = 100000
        else:
            window = 0
    else:
        window = myargs['window']

    if region == "variant":
        coord1 = int(org_coord) - window
        if coord1 < 0:
            coord1 = 0
        coord2 = int(org_coord) + window

    elif region == "gene":
        if myargsName == "None":
            return None

        def get_coords_gene(gene_raw, db):
            gene = gene_raw.upper()
            mongoResult = db.genes_name_coords.find_one({"name": gene})

            #format mongo output
            if mongoResult != None:
                geneResult = [
                    mongoResult["name"],
                    mongoResult[genome_build_vars[genome_build]['chromosome']],
                    mongoResult[genome_build_vars[genome_build]['gene_begin']],
                    mongoResult[genome_build_vars[genome_build]['gene_end']]
                ]
                return geneResult
            else:
                return None

        # Look up gene coordinates in the gene database
        gene_coord = get_coords_gene(myargsName, db)

        if gene_coord == None or gene_coord[2] == 'NA' or gene_coord == 'NA':
            return None

        # Define search coordinates
        coord1 = int(gene_coord[2]) - window
        if coord1 < 0:
            coord1 = 0
        coord2 = int(gene_coord[3]) + window

        # Run with --origin option
        if myargsOrigin != "None":
            if gene_coord[1] != chromosome:
                return None

            if coord1 > int(org_coord) or int(org_coord) > coord2:
                return None

        else:
            chromosome = gene_coord[1]

    elif region == "region":
        if myargs['start'] == None:
            return None

        if myargs['end'] == None:
            return None

        # Parse out chr and positions for --region option
        if len(myargs['start'].split(":")) != 2:
            return None

        if len(myargs['end'].split(":")) != 2:
            return None

        chr_s = myargs['start'].strip("chr").split(":")[0]
        coord_s = myargs['start'].split(":")[1]
        chr_e = myargs['end'].strip("chr").split(":")[0]
        coord_e = myargs['end'].split(":")[1]

        if chr_s not in chrs:
            return None

        if chr_e not in chrs:
            return None

        if chr_s != chr_e:
            return None

        if int(coord_s) >= int(coord_e):
            return None

        coord1 = int(coord_s) - window
        if coord1 < 0:
            coord1 = 0
        coord2 = int(coord_e) + window

        # Run with --origin option
        if myargsOrigin != "None":
            if chr_s != chromosome:
                return None

            if coord1 > int(org_coord) or int(org_coord) > coord2:
                return None

        else:
            chromosome = chr_s

    # Generate coordinate list and P-value dictionary
    max_window = 3000000
    if coord2 - coord1 > max_window:
        return None

    assoc_coords = []
    a_pos = []
    assoc_dict = {}
    assoc_list = []
    with open(file) as fp:
        for line in fp:
            col = line.strip().split()
            if len(col) == len_head:
                if col[chr_index].strip("chr") == chromosome:
                    try:
                        int(col[pos_index])
                    except ValueError:
                        continue
                    else:
                        if coord1 <= int(col[pos_index]) <= coord2:
                            try:
                                float(col[p_index])
                            except ValueError:
                                continue
                            else:
                                coord_i = genome_build_vars[genome_build][
                                    '1000G_chr_prefix'] + col[chr_index].strip(
                                        "chr") + ":" + col[
                                            pos_index] + "-" + col[pos_index]
                                assoc_coords.append(coord_i)
                                a_pos.append(col[pos_index])
                                assoc_dict[coord_i] = [col[p_index]]
                                assoc_list.append(
                                    [coord_i, float(col[p_index])])

    # Coordinate list checks
    if len(assoc_coords) == 0:
        return None

    # Get population ids from population output file from LDassoc.py
    pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines()
    ids = []
    for i in range(len(pop_list)):
        ids.append(pop_list[i].strip())

    pop_ids = list(set(ids))

    # Define LD origin coordinate
    try:
        org_coord
    except NameError:
        for var_p in sorted(assoc_list, key=operator.itemgetter(1)):
            snp = "chr" + var_p[0].split("-")[0]

            # Extract lowest P SNP phased genotypes
            vcf_filePath = "%s/%s%s/%s" % (
                config['aws']['data_subfolder'], genotypes_dir,
                genome_build_vars[genome_build]["1000G_dir"],
                genome_build_vars[genome_build]["1000G_file"] % (chromosome))
            vcf_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

            checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

            tabix_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(
                vcf_file, data_dir + genotypes_dir +
                genome_build_vars[genome_build]['1000G_dir'])
            head = [
                x.decode('utf-8') for x in subprocess.Popen(
                    tabix_snp_h, shell=True,
                    stdout=subprocess.PIPE).stdout.readlines()
            ][0].strip().split()

            # Check lowest P SNP is in the 1000G population and not monoallelic from LDassoc.py output file
            vcf = open(tmp_dir + "snp_no_dups_" + request + ".vcf").readlines()

            if len(vcf) == 0:
                continue
            elif len(vcf) > 1:
                geno = vcf[0].strip().split()
                geno[0] = geno[0].lstrip('chr')
            else:
                geno = vcf[0].strip().split()
                geno[0] = geno[0].lstrip('chr')

            if "," in geno[3] or "," in geno[4]:
                continue

            index = []
            for i in range(9, len(head)):
                if head[i] in pop_ids:
                    index.append(i)

            genotypes = {"0": 0, "1": 0}
            for i in index:
                sub_geno = geno[i].split("|")
                for j in sub_geno:
                    if j in genotypes:
                        genotypes[j] += 1
                    else:
                        genotypes[j] = 1

            if genotypes["0"] == 0 or genotypes["1"] == 0:
                continue

            org_coord = var_p[0].split("-")[1]
            break

    else:
        if genome_build_vars[genome_build][
                '1000G_chr_prefix'] + chromosome + ":" + org_coord + "-" + org_coord not in assoc_coords:
            return None

        # Extract query SNP phased genotypes
        vcf_filePath = "%s/%s%s/%s" % (
            config['aws']['data_subfolder'], genotypes_dir,
            genome_build_vars[genome_build]["1000G_dir"],
            genome_build_vars[genome_build]["1000G_file"] % (chromosome))
        vcf_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

        checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

        tabix_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(
            vcf_file, data_dir + genotypes_dir +
            genome_build_vars[genome_build]['1000G_dir'])
        head = [
            x.decode('utf-8') for x in
            subprocess.Popen(tabix_snp_h, shell=True,
                             stdout=subprocess.PIPE).stdout.readlines()
        ][0].strip().split()

        # Check query SNP is in the 1000G population, has the correct RS number, and not monoallelic
        vcf = open(tmp_dir + "snp_no_dups_" + request + ".vcf").readlines()

        if len(vcf) == 0:
            subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                            shell=True)
            subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf",
                            shell=True)
            return None

        elif len(vcf) > 1:
            geno = []
            for i in range(len(vcf)):
                if vcf[i].strip().split()[2] == snp:
                    geno = vcf[i].strip().split()
                    geno[0] = geno[0].lstrip('chr')
            if geno == []:
                subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                                shell=True)
                subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf",
                                shell=True)
                return None

        else:
            geno = vcf[0].strip().split()
            geno[0] = geno[0].lstrip('chr')

        if geno[2] != snp and snp[0:2] == "rs" and "rs" in geno[2]:
            snp = geno[2]

        if "," in geno[3] or "," in geno[4]:
            subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                            shell=True)
            subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf",
                            shell=True)
            return None

        index = []
        for i in range(9, len(head)):
            if head[i] in pop_ids:
                index.append(i)

        genotypes = {"0": 0, "1": 0}
        for i in index:
            sub_geno = geno[i].split("|")
            for j in sub_geno:
                if j in genotypes:
                    genotypes[j] += 1
                else:
                    genotypes[j] = 1

        if genotypes["0"] == 0 or genotypes["1"] == 0:
            subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                            shell=True)
            subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf",
                            shell=True)
            return None

    # Calculate proxy LD statistics in parallel
    if len(assoc_coords) < 60:
        num_subprocesses = 1
    # else:
    #     threads=4

    assoc_coords_subset_chunks = np.array_split(assoc_coords, num_subprocesses)

    # block=len(assoc_coords) // num_subprocesses
    commands = []
    # for i in range(num_subprocesses):
    #     if i==min(range(num_subprocesses)) and i==max(range(num_subprocesses)):
    #         command="python3 LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords)+" "+request+" "+str(i)
    #     elif i==min(range(num_subprocesses)):
    #         command="python3 LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords[:block])+" "+request+" "+str(i)
    #     elif i==max(range(num_subprocesses)):
    #         command="python3 LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords[(block*i)+1:])+" "+request+" "+str(i)
    #     else:
    #         command="python3 LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords[(block*i)+1:block*(i+1)])+" "+request+" "+str(i)
    #     commands.append(command)

    for subprocess_id in range(num_subprocesses):
        subprocessArgs = " ".join([
            str(snp),
            str(chromosome),
            str("_".join(assoc_coords_subset_chunks[subprocess_id])),
            str(request),
            str(genome_build),
            str(subprocess_id)
        ])
        commands.append("python3 LDassoc_sub.py " + subprocessArgs)

    processes = [
        subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        for command in commands
    ]

    # collect output in parallel
    def get_output(process):
        return process.communicate()[0].splitlines()

    pool = Pool(len(processes))
    out_raw = pool.map(get_output, processes)
    pool.close()
    pool.join()

    # Aggregate output
    out_prox = []
    for i in range(len(out_raw)):
        for j in range(len(out_raw[i])):
            col = out_raw[i][j].decode('utf-8').strip().split("\t")
            col[6] = int(col[6])
            col[7] = float(col[7])
            col[8] = float(col[8])
            col.append(abs(int(col[6])))
            pos_i_j = col[5].split(":")[1]
            coord_i_j = genome_build_vars[genome_build][
                '1000G_chr_prefix'] + chromosome + ":" + pos_i_j + "-" + pos_i_j
            if coord_i_j in assoc_dict:
                col.append(float(assoc_dict[coord_i_j][0]))
                out_prox.append(col)

    out_dist_sort = sorted(out_prox, key=operator.itemgetter(14))
    out_p_sort = sorted(out_dist_sort,
                        key=operator.itemgetter(15),
                        reverse=False)
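    # Both sorts are stable, so proxies tied on P-value (index 15) remain ordered
    # by absolute distance from the index variant (index 14).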

    # Organize scatter plot data
    q_rs = []
    q_allele = []
    q_coord = []
    q_maf = []
    p_rs = []
    p_allele = []
    p_coord = []
    p_pos = []
    p_maf = []
    dist = []
    d_prime = []
    d_prime_round = []
    r2 = []
    r2_round = []
    corr_alleles = []
    regdb = []
    funct = []
    color = []
    alpha = []
    size = []
    p_val = []
    neg_log_p = []
    for i in range(len(out_p_sort)):
        q_rs_i, q_allele_i, q_coord_i, p_rs_i, p_allele_i, p_coord_i, dist_i, d_prime_i, r2_i, corr_alleles_i, regdb_i, q_maf_i, p_maf_i, funct_i, dist_abs, p_val_i = out_p_sort[
            i]

        q_rs.append(q_rs_i)
        q_allele.append(q_allele_i)
        q_coord.append(float(q_coord_i.split(":")[1]) / 1000000)
        q_maf.append(str(round(float(q_maf_i), 4)))
        if p_rs_i == ".":
            p_rs_i = p_coord_i
        p_rs.append(p_rs_i)
        p_allele.append(p_allele_i)
        p_coord.append(float(p_coord_i.split(":")[1]) / 1000000)
        p_pos.append(p_coord_i.split(":")[1])
        p_maf.append(str(round(float(p_maf_i), 4)))
        dist.append(str(round(dist_i / 1000000.0, 4)))
        d_prime.append(float(d_prime_i))
        d_prime_round.append(str(round(float(d_prime_i), 4)))
        r2.append(float(r2_i))
        r2_round.append(str(round(float(r2_i), 4)))
        corr_alleles.append(corr_alleles_i)

        # P-value
        p_val.append(p_val_i)
        neg_log_p.append(-log10(p_val_i))

        # Correct Missing Annotations
        if regdb_i == ".":
            regdb_i = ""
        regdb.append(regdb_i)
        if funct_i == ".":
            funct_i = ""
        if funct_i == "NA":
            funct_i = "none"
        funct.append(funct_i)

        # Set Color
        reds = [
            "#FFCCCC", "#FFCACA", "#FFC8C8", "#FFC6C6", "#FFC4C4", "#FFC2C2",
            "#FFC0C0", "#FFBEBE", "#FFBCBC", "#FFBABA", "#FFB8B8", "#FFB6B6",
            "#FFB4B4", "#FFB1B1", "#FFAFAF", "#FFADAD", "#FFABAB", "#FFA9A9",
            "#FFA7A7", "#FFA5A5", "#FFA3A3", "#FFA1A1", "#FF9F9F", "#FF9D9D",
            "#FF9B9B", "#FF9999", "#FF9797", "#FF9595", "#FF9393", "#FF9191",
            "#FF8F8F", "#FF8D8D", "#FF8B8B", "#FF8989", "#FF8787", "#FF8585",
            "#FF8383", "#FF8181", "#FF7E7E", "#FF7C7C", "#FF7A7A", "#FF7878",
            "#FF7676", "#FF7474", "#FF7272", "#FF7070", "#FF6E6E", "#FF6C6C",
            "#FF6A6A", "#FF6868", "#FF6666", "#FF6464", "#FF6262", "#FF6060",
            "#FF5E5E", "#FF5C5C", "#FF5A5A", "#FF5858", "#FF5656", "#FF5454",
            "#FF5252", "#FF5050", "#FF4E4E", "#FF4B4B", "#FF4949", "#FF4747",
            "#FF4545", "#FF4343", "#FF4141", "#FF3F3F", "#FF3D3D", "#FF3B3B",
            "#FF3939", "#FF3737", "#FF3535", "#FF3333", "#FF3131", "#FF2F2F",
            "#FF2D2D", "#FF2B2B", "#FF2929", "#FF2727", "#FF2525", "#FF2323",
            "#FF2121", "#FF1F1F", "#FF1D1D", "#FF1B1B", "#FF1818", "#FF1616",
            "#FF1414", "#FF1212", "#FF1010", "#FF0E0E", "#FF0C0C", "#FF0A0A",
            "#FF0808", "#FF0606", "#FF0404", "#FF0202", "#FF0000"
        ]
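        # reds holds 101 shades, so an r2 or D' value x in [0, 1] maps cleanly to
        # palette index int(x * 100.0).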
        if q_coord_i == p_coord_i:
            color_i = "#0000FF"
            alpha_i = 0.7
        else:
            if myargs['dprime'] == True:
                color_i = reds[int(d_prime_i * 100.0)]
                alpha_i = 0.7
            elif myargs['dprime'] == False:
                color_i = reds[int(r2_i * 100.0)]
                alpha_i = 0.7
        color.append(color_i)
        alpha.append(alpha_i)

        # Set Size
        size_i = 9 + float(p_maf_i) * 14.0
        size.append(size_i)

    # Pull out SNPs from association file not found in 1000G
    p_plot_pos = []
    p_plot_pval = []
    p_plot_pos2 = []
    p_plot_pval2 = []
    p_plot_dist = []
    index_var_pos = float(q_coord_i.split(":")[1]) / 1000000
    for input_pos in a_pos:
        if input_pos not in p_pos:
            p_plot_pos.append(float(input_pos) / 1000000)
            p_plot_pval.append(-log10(
                float(assoc_dict[chromosome + ":" + input_pos + "-" +
                                 input_pos][0])))
            p_plot_pos2.append("chr" + chromosome + ":" + input_pos)
            p_plot_pval2.append(
                float(assoc_dict[chromosome + ":" + input_pos + "-" +
                                 input_pos][0]))
            p_plot_dist.append(
                str(round(float(input_pos) / 1000000 - index_var_pos, 4)))

    # Begin Bokeh Plotting
    from collections import OrderedDict
    from bokeh.embed import components, file_html
    from bokeh.layouts import gridplot
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
    from bokeh.resources import CDN
    from bokeh.io import export_svgs
    import svgutils.compose as sg

    reset_output()

    data_p = {
        'p_plot_posX': p_plot_pos,
        'p_plot_pvalY': p_plot_pval,
        'p_plot_pos2': p_plot_pos2,
        'p_plot_pval2': p_plot_pval2,
        'p_plot_dist': p_plot_dist
    }
    source_p = ColumnDataSource(data_p)

    # Assoc Plot
    x = p_coord
    y = neg_log_p

    data = {
        'x': x,
        'y': y,
        'qrs': q_rs,
        'q_alle': q_allele,
        'q_maf': q_maf,
        'prs': p_rs,
        'p_alle': p_allele,
        'p_maf': p_maf,
        'dist': dist,
        'r': r2_round,
        'd': d_prime_round,
        'alleles': corr_alleles,
        'regdb': regdb,
        'funct': funct,
        'p_val': p_val,
        'size': size,
        'color': color,
        'alpha': alpha
    }
    source = ColumnDataSource(data)

    whitespace = 0.01
    xr = Range1d(start=coord1 / 1000000.0 - whitespace,
                 end=coord2 / 1000000.0 + whitespace)
    yr = Range1d(start=-0.03, end=max(y) * 1.03)
    sup_2 = "\u00B2"

    assoc_plot = figure(
        title="P-values and Regional LD for " + snp + " in " + pop,
        min_border_top=2,
        min_border_bottom=2,
        min_border_left=60,
        min_border_right=60,
        h_symmetry=False,
        v_symmetry=False,
        plot_width=900,
        plot_height=600,
        x_range=xr,
        y_range=yr,
        tools=
        "tap,pan,box_zoom,wheel_zoom,box_select,undo,redo,reset,previewsave",
        logo=None,
        toolbar_location="above")

    assoc_plot.title.align = "center"

    # Add recombination rate from LDassoc.py output file
    recomb_file = tmp_dir + "recomb_" + request + ".json"
    recomb_raw = open(recomb_file).readlines()

    recomb_x = []
    recomb_y = []

    for recomb_raw_obj in recomb_raw:
        recomb_obj = json.loads(recomb_raw_obj)
        recomb_x.append(
            int(recomb_obj[genome_build_vars[genome_build]['position']]) /
            1000000.0)
        recomb_y.append(float(recomb_obj['rate']) / 100 * max(y))

    assoc_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5)

    # Add genome-wide significance
    a = [coord1 / 1000000.0 - whitespace, coord2 / 1000000.0 + whitespace]
    b = [-log10(0.00000005), -log10(0.00000005)]
    assoc_plot.line(a, b, color="blue", alpha=0.5)

    assoc_points_not1000G = assoc_plot.circle(x='p_plot_posX',
                                              y='p_plot_pvalY',
                                              size=9 + float("0.25") * 14.0,
                                              source=source_p,
                                              line_color="gray",
                                              fill_color="white")
    assoc_points = assoc_plot.circle(x='x',
                                     y='y',
                                     size='size',
                                     color='color',
                                     alpha='alpha',
                                     source=source)
    assoc_plot.add_tools(
        HoverTool(renderers=[assoc_points_not1000G],
                  tooltips=OrderedDict([("Variant", "@p_plot_pos2"),
                                        ("P-value", "@p_plot_pval2"),
                                        ("Distance (Mb)", "@p_plot_dist")])))

    hover = HoverTool(renderers=[assoc_points])
    hover.tooltips = OrderedDict([
        ("Variant", "@prs @p_alle"),
        ("P-value", "@p_val"),
        ("Distance (Mb)", "@dist"),
        ("MAF", "@p_maf"),
        ("R" + sup_2 + " (" + q_rs[0] + ")", "@r"),
        ("D\' (" + q_rs[0] + ")", "@d"),
        ("Correlated Alleles", "@alleles"),
        ("RegulomeDB", "@regdb"),
        ("Functional Class", "@funct"),
    ])

    assoc_plot.add_tools(hover)

    # Annotate RegulomeDB scores
    if myargs['annotate'] == True:
        assoc_plot.text(x,
                        y,
                        text=regdb,
                        alpha=1,
                        text_font_size="7pt",
                        text_baseline="middle",
                        text_align="center",
                        angle=0)

    assoc_plot.yaxis.axis_label = "-log10 P-value"

    assoc_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)}
    assoc_plot.add_layout(
        LinearAxis(y_range_name="y2_axis",
                   axis_label="Combined Recombination Rate (cM/Mb)"),
        "right")  ## Need to confirm units

    # Rug Plot
    y2_ll = [-0.03] * len(x)
    y2_ul = [1.03] * len(x)
    yr_rug = Range1d(start=-0.03, end=1.03)

    data_rug = {
        'x': x,
        'y': y,
        'y2_ll': y2_ll,
        'y2_ul': y2_ul,
        'qrs': q_rs,
        'q_alle': q_allele,
        'q_maf': q_maf,
        'prs': p_rs,
        'p_alle': p_allele,
        'p_maf': p_maf,
        'dist': dist,
        'r': r2_round,
        'd': d_prime_round,
        'alleles': corr_alleles,
        'regdb': regdb,
        'funct': funct,
        'p_val': p_val,
        'size': size,
        'color': color,
        'alpha': alpha
    }
    source_rug = ColumnDataSource(data_rug)

    rug = figure(x_range=xr,
                 y_range=yr_rug,
                 border_fill_color='white',
                 y_axis_type=None,
                 title="",
                 min_border_top=2,
                 min_border_bottom=2,
                 min_border_left=60,
                 min_border_right=60,
                 h_symmetry=False,
                 v_symmetry=False,
                 plot_width=900,
                 plot_height=50,
                 tools="xpan,tap,wheel_zoom",
                 logo=None)

    rug.segment(x0='x',
                y0='y2_ll',
                x1='x',
                y1='y2_ul',
                source=source_rug,
                color='color',
                alpha='alpha',
                line_width=1)
    rug.toolbar_location = None

    # Gene Plot (All Transcripts)
    if myargs['transcript'] == True:
        # Get genes from LDassoc.py output file
        genes_file = tmp_dir + "genes_" + request + ".json"
        genes_raw = open(genes_file).readlines()

        genes_plot_start = []
        genes_plot_end = []
        genes_plot_y = []
        genes_plot_name = []
        exons_plot_x = []
        exons_plot_y = []
        exons_plot_w = []
        exons_plot_h = []
        exons_plot_name = []
        exons_plot_id = []
        exons_plot_exon = []
        message = ["Too many genes to plot."]
        lines = [0]
        gap = 80000
        tall = 0.75
        if genes_raw != None and len(genes_raw) > 0:
            for gene_raw_obj in genes_raw:
                gene_obj = json.loads(gene_raw_obj)
                bin = gene_obj["bin"]
                name_id = gene_obj["name"]
                chrom = gene_obj["chrom"]
                strand = gene_obj["strand"]
                txStart = gene_obj["txStart"]
                txEnd = gene_obj["txEnd"]
                cdsStart = gene_obj["cdsStart"]
                cdsEnd = gene_obj["cdsEnd"]
                exonCount = gene_obj["exonCount"]
                exonStarts = gene_obj["exonStarts"]
                exonEnds = gene_obj["exonEnds"]
                score = gene_obj["score"]
                name2 = gene_obj["name2"]
                cdsStartStat = gene_obj["cdsStartStat"]
                cdsEndStat = gene_obj["cdsEndStat"]
                exonFrames = gene_obj["exonFrames"]
                name = name2
                id = name_id
                e_start = exonStarts.split(",")
                e_end = exonEnds.split(",")

                # Determine Y Coordinate
                i = 0
                y_coord = None
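                # Greedy row packing: reuse the first row whose last transcript ends
                # more than `gap` bp before this txStart; otherwise open a new row.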
                while y_coord == None:
                    if i > len(lines) - 1:
                        y_coord = i + 1
                        lines.append(int(txEnd))
                    elif int(txStart) > (gap + lines[i]):
                        y_coord = i + 1
                        lines[i] = int(txEnd)
                    else:
                        i += 1

                genes_plot_start.append(int(txStart) / 1000000.0)
                genes_plot_end.append(int(txEnd) / 1000000.0)
                genes_plot_y.append(y_coord)
                genes_plot_name.append(name + "  ")

                for i in range(len(e_start) - 1):
                    if strand == "+":
                        exon = i + 1
                    else:
                        exon = len(e_start) - 1 - i

                    width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                    x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                    exons_plot_x.append(x_coord)
                    exons_plot_y.append(y_coord)
                    exons_plot_w.append(width)
                    exons_plot_h.append(tall)
                    exons_plot_name.append(name)
                    exons_plot_id.append(id)
                    exons_plot_exon.append(exon)

        n_rows = len(lines)
        genes_plot_yn = [n_rows - x + 0.5 for x in genes_plot_y]
        exons_plot_yn = [n_rows - x + 0.5 for x in exons_plot_y]
        yr2 = Range1d(start=0, end=n_rows)

        data_gene_plot = {
            'exons_plot_x': exons_plot_x,
            'exons_plot_yn': exons_plot_yn,
            'exons_plot_w': exons_plot_w,
            'exons_plot_h': exons_plot_h,
            'exons_plot_name': exons_plot_name,
            'exons_plot_id': exons_plot_id,
            'exons_plot_exon': exons_plot_exon
        }
        source_gene_plot = ColumnDataSource(data_gene_plot)

        max_genes = 40
        # if len(lines) < 3 or len(genes_raw) > max_genes:
        if len(lines) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=900,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        # if len(genes_raw) <= max_genes:
        gene_plot.segment(genes_plot_start,
                          genes_plot_yn,
                          genes_plot_end,
                          genes_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)
        gene_plot.rect(x='exons_plot_x',
                       y='exons_plot_yn',
                       width='exons_plot_w',
                       height='exons_plot_h',
                       source=source_gene_plot,
                       fill_color="grey",
                       line_color="grey")
        gene_plot.text(genes_plot_start,
                       genes_plot_yn,
                       text=genes_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("Transcript ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

        # else:
        #     x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0
        #     gene_plot.text(x_coord_text, n_rows / 2.0, text=message, alpha=1,
        #                     text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

        gene_plot.xaxis.axis_label = "Chromosome " + chromosome + " Coordinate (Mb)(" + genome_build_vars[
            genome_build]['title'] + ")"
        gene_plot.yaxis.axis_label = "Genes (All Transcripts)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

        # Change output backend to SVG temporarily for headless export
        assoc_plot.output_backend = "svg"
        rug.output_backend = "svg"
        gene_plot.output_backend = "svg"
        export_svgs(assoc_plot,
                    filename=tmp_dir + "assoc_plot_1_" + request + ".svg")
        export_svgs(gene_plot,
                    filename=tmp_dir + "gene_plot_1_" + request + ".svg")

        # 1 pixel = 0.0264583333 cm
        svg_height = str(20.00 + (0.0264583333 * plot_h_pix)) + "cm"
        svg_height_scaled = str(100.00 + (0.1322916665 * plot_h_pix)) + "cm"
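        # The unscaled SVG below becomes the PDF; the 5x-scaled SVG is rasterized
        # to PNG and JPEG, then removed.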

        # Concatenate svgs
        sg.Figure(
            "24.59cm", svg_height,
            sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg"),
            sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move(
                -40, 630)).save(tmp_dir + "assoc_plot_" + request + ".svg")

        sg.Figure(
            "122.95cm", svg_height_scaled,
            sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg").scale(5),
            sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(5).move(
                -200,
                3150)).save(tmp_dir + "assoc_plot_scaled_" + request + ".svg")

        # Export to PDF
        subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_" +
                        request + ".svg " + tmp_dir + "assoc_plot_" + request +
                        ".pdf",
                        shell=True)
        # Export to PNG
        subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                        "assoc_plot_scaled_" + request + ".svg " + tmp_dir +
                        "assoc_plot_" + request + ".png",
                        shell=True)
        # Export to JPEG
        subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                        "assoc_plot_scaled_" + request + ".svg " + tmp_dir +
                        "assoc_plot_" + request + ".jpeg",
                        shell=True)
        # Remove individual SVG files after they are combined
        subprocess.call("rm " + tmp_dir + "assoc_plot_1_" + request + ".svg",
                        shell=True)
        subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg",
                        shell=True)
        # Remove scaled SVG file after it is converted to png and jpeg
        subprocess.call("rm " + tmp_dir + "assoc_plot_scaled_" + request +
                        ".svg",
                        shell=True)

    # Gene Plot (Collapsed)
    else:
        # Get genes from LDassoc.py output file
        genes_c_file = tmp_dir + "genes_c_" + request + ".json"
        genes_c_raw = open(genes_c_file).readlines()

        genes_c_plot_start = []
        genes_c_plot_end = []
        genes_c_plot_y = []
        genes_c_plot_name = []
        exons_c_plot_x = []
        exons_c_plot_y = []
        exons_c_plot_w = []
        exons_c_plot_h = []
        exons_c_plot_name = []
        exons_c_plot_id = []
        message_c = ["Too many genes to plot."]
        lines_c = [0]
        gap = 80000
        tall = 0.75
        if genes_c_raw != None and len(genes_c_raw) > 0:
            for gene_raw_obj in genes_c_raw:
                gene_c_obj = json.loads(gene_raw_obj)
                chrom = gene_c_obj["chrom"]
                txStart = gene_c_obj["txStart"]
                txEnd = gene_c_obj["txEnd"]
                exonStarts = gene_c_obj["exonStarts"]
                exonEnds = gene_c_obj["exonEnds"]
                name2 = gene_c_obj["name2"]
                transcripts = gene_c_obj["transcripts"]
                name = name2
                e_start = exonStarts.split(",")
                e_end = exonEnds.split(",")
                e_transcripts = transcripts.split(",")

                # Determine Y Coordinate
                i = 0
                y_coord = None
                while y_coord == None:
                    if i > len(lines_c) - 1:
                        y_coord = i + 1
                        lines_c.append(int(txEnd))
                    elif int(txStart) > (gap + lines_c[i]):
                        y_coord = i + 1
                        lines_c[i] = int(txEnd)
                    else:
                        i += 1

                genes_c_plot_start.append(int(txStart) / 1000000.0)
                genes_c_plot_end.append(int(txEnd) / 1000000.0)
                genes_c_plot_y.append(y_coord)
                genes_c_plot_name.append(name + "  ")

                # for i in range(len(e_start)):
                for i in range(len(e_start) - 1):
                    width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                    x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                    exons_c_plot_x.append(x_coord)
                    exons_c_plot_y.append(y_coord)
                    exons_c_plot_w.append(width)
                    exons_c_plot_h.append(tall)
                    exons_c_plot_name.append(name)
                    exons_c_plot_id.append(e_transcripts[i].replace("-", ","))

        n_rows_c = len(lines_c)
        genes_c_plot_yn = [n_rows_c - x + 0.5 for x in genes_c_plot_y]
        exons_c_plot_yn = [n_rows_c - x + 0.5 for x in exons_c_plot_y]
        yr2_c = Range1d(start=0, end=n_rows_c)

        data_gene_c_plot = {
            'exons_c_plot_x': exons_c_plot_x,
            'exons_c_plot_yn': exons_c_plot_yn,
            'exons_c_plot_w': exons_c_plot_w,
            'exons_c_plot_h': exons_c_plot_h,
            'exons_c_plot_name': exons_c_plot_name,
            'exons_c_plot_id': exons_c_plot_id
        }
        source_gene_c_plot = ColumnDataSource(data_gene_c_plot)

        max_genes_c = 40
        # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c:
        if len(lines_c) < 3:
            plot_c_h_pix = 250
        else:
            plot_c_h_pix = 250 + (len(lines_c) - 2) * 50

        gene_c_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2_c,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=900,
            plot_height=plot_c_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        # if len(genes_c_raw) <= max_genes_c:
        gene_c_plot.segment(genes_c_plot_start,
                            genes_c_plot_yn,
                            genes_c_plot_end,
                            genes_c_plot_yn,
                            color="black",
                            alpha=1,
                            line_width=2)
        gene_c_plot.rect(x='exons_c_plot_x',
                         y='exons_c_plot_yn',
                         width='exons_c_plot_w',
                         height='exons_c_plot_h',
                         source=source_gene_c_plot,
                         fill_color="grey",
                         line_color="grey")
        gene_c_plot.text(genes_c_plot_start,
                         genes_c_plot_yn,
                         text=genes_c_plot_name,
                         alpha=1,
                         text_font_size="7pt",
                         text_font_style="bold",
                         text_baseline="middle",
                         text_align="right",
                         angle=0)
        hover = gene_c_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_c_plot_name"),
            ("Transcript IDs", "@exons_c_plot_id"),
        ])

        # else:
        #     x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0
        #     gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1,
        #                     text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

        gene_c_plot.xaxis.axis_label = "Chromosome " + chromosome + " Coordinate (Mb)(" + genome_build_vars[
            genome_build]['title'] + ")"
        gene_c_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)"
        gene_c_plot.ygrid.grid_line_color = None
        gene_c_plot.yaxis.axis_line_color = None
        gene_c_plot.yaxis.minor_tick_line_color = None
        gene_c_plot.yaxis.major_tick_line_color = None
        gene_c_plot.yaxis.major_label_text_color = None

        gene_c_plot.toolbar_location = "below"

        # Change output backend to SVG temporarily for headless export
        assoc_plot.output_backend = "svg"
        rug.output_backend = "svg"
        gene_c_plot.output_backend = "svg"
        export_svgs(assoc_plot,
                    filename=tmp_dir + "assoc_plot_1_" + request + ".svg")
        export_svgs(gene_c_plot,
                    filename=tmp_dir + "gene_plot_1_" + request + ".svg")

        # 1 pixel = 0.0264583333 cm
        svg_height = str(20.00 + (0.0264583333 * plot_c_h_pix)) + "cm"
        svg_height_scaled = str(100.00 + (0.1322916665 * plot_c_h_pix)) + "cm"

        # Concatenate svgs
        sg.Figure(
            "24.59cm", svg_height,
            sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg"),
            sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move(
                -40, 630)).save(tmp_dir + "assoc_plot_" + request + ".svg")

        sg.Figure(
            "122.95cm", svg_height_scaled,
            sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg").scale(5),
            sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(5).move(
                -200,
                3150)).save(tmp_dir + "assoc_plot_scaled_" + request + ".svg")

        # Export to PDF
        subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_" +
                        request + ".svg " + tmp_dir + "assoc_plot_" + request +
                        ".pdf",
                        shell=True)
        # Export to PNG
        subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                        "assoc_plot_scaled_" + request + ".svg " + tmp_dir +
                        "assoc_plot_" + request + ".png",
                        shell=True)
        # Export to JPEG
        subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                        "assoc_plot_scaled_" + request + ".svg " + tmp_dir +
                        "assoc_plot_" + request + ".jpeg",
                        shell=True)
        # Remove individual SVG files after they are combined
        subprocess.call("rm " + tmp_dir + "assoc_plot_1_" + request + ".svg",
                        shell=True)
        subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg",
                        shell=True)
        # Remove scaled SVG file after it is converted to png and jpeg
        subprocess.call("rm " + tmp_dir + "assoc_plot_scaled_" + request +
                        ".svg",
                        shell=True)

    reset_output()

    # Remove temporary files
    subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True)
    subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)
    subprocess.call("rm " + tmp_dir + "genes_*" + request + "*.json",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "recomb_" + request + ".json",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "assoc_args" + request + ".json",
                    shell=True)

    print("Bokeh high quality image export complete!")

    # Return plot output
    return None
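
A self-contained sketch of the subprocess fan-out used above, with toy echo commands standing in for LDassoc_sub.py; the thread-based multiprocessing.dummy Pool import is an assumption here, since the module's imports are not shown. The coordinate list is split into chunks, one shell command is launched per chunk, and the pool collects each process's stdout in parallel.

import subprocess

import numpy as np
from multiprocessing.dummy import Pool  # assumed: thread pool, so a lambda worker is fine

def run_chunks(items, num_subprocesses=2):
    chunks = np.array_split(items, num_subprocesses)
    commands = ["echo " + " ".join(chunk) for chunk in chunks]
    processes = [subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
                 for cmd in commands]
    with Pool(len(processes)) as pool:
        # communicate() waits for each process and returns its captured stdout
        out_raw = pool.map(lambda p: p.communicate()[0].splitlines(), processes)
    return out_raw

print(run_chunks(["1:100-100", "1:200-200", "1:300-300", "1:400-400"]))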