示例#1
0
def annotate_all_bin_bed(wig_f, out_f, chrom):
    """Label every bin of a WIG file as Promoter, Enhancer, or Weak.

    The two cutoffs come from ``wsu.get_intensity_percentile`` called with
    0.97 (promoter) and 0.90 (enhancer). After skipping two header lines,
    each tab-separated data line is read as ``start<TAB>signal`` and one
    BED-style line ``chr<chrom> start end label`` is written per bin.
    Reading stops at the first line that contains no tab (including EOF).

    Args:
        wig_f: Path of the WIG file to read.
        out_f: Path of the output file; it is overwritten.
        chrom: Chromosome name as a string, without the "chr" prefix.
    """
    #Context managers close both files even if parsing fails part-way.
    with open(wig_f, "r") as wig, open(out_f, "w") as out:

        #We choose the 97th and the 90th percentiles for promoter and enhancer, respectively.
        #NOTE(review): both percentile calls and the loop below share one
        #handle; this presumably relies on the helper rewinding the file - confirm.
        promoter_cutoff = wsu.get_intensity_percentile(0.97, wig, 0)
        enhancer_cutoff = wsu.get_intensity_percentile(0.90, wig, 0)

        #Skip the first two lines, which are the header.
        wig.readline()
        wig.readline()

        fields = wig.readline().split("\t")
        while len(fields) > 1:

            #Compute the starting and ending positions.
            start = fields[0]
            end = str(int(start) + BIN_SIZE)

            #Get the annotation.
            rpkm = float(fields[1])
            if rpkm >= promoter_cutoff:
                label = "Promoter"
            elif rpkm >= enhancer_cutoff:
                label = "Enhancer"
            else:
                label = "Weak"

            #Write the whole record at once so no partial line is left behind.
            out.write("chr" + chrom + "\t" + start + "\t" + end + "\t" +
                      label + "\n")

            #Get the next line.
            fields = wig.readline().split("\t")
def save_significant(percentages, magnitudes, wig_name, out_name, chrom, cell,
                     min_val):
    """Write one labelled line per magnitude, or "Unknown" when none qualifies.

    For column ``j`` of ``percentages``, row ``i`` is significant when its
    label is not "Polycomb", its value exceeds 0.5, and the largest value in
    the other rows of that column is at most half of it. Each significant
    row writes ``label<TAB>cell_chrom_magnitude<TAB>magnitude``; a column
    with no significant row writes a single "Unknown" line.

    Args:
        percentages: 2-D numpy array indexed as ``[label, magnitude]``; rows
            follow the order Promoter, Enhancer, Polycomb, Weak.
        magnitudes: Sequence of magnitude names (strings), one per column.
        wig_name: Path of the WIG file passed to the percentile helper.
        out_name: Path of the output file; it is overwritten.
        chrom: Chromosome name (string).
        cell: Cell type name (string).
        min_val: Value forwarded to ``wsu.get_intensity_percentile``.
    """
    #The percentile is not needed for the output written below. The call is
    #retained because the helper is not visible here and may validate the
    #file or print - TODO confirm and drop if it has no side effects.
    #The previously computed (and unused) 5.0 / intensity scale was removed,
    #so a zero percentile can no longer abort the function.
    with open(wig_name, 'r') as wig:
        wsu.get_intensity_percentile(0.995, wig, min_val)

    labels = ["Promoter", "Enhancer", "Polycomb", "Weak"]
    row_index = np.arange(percentages.shape[0])

    #Print significant magnitudes with labels.
    with open(out_name, 'w') as out:
        for j in range(percentages.shape[1]):
            record_tail = ("\t" + cell + "_" + chrom + "_" + magnitudes[j] +
                           "\t" + magnitudes[j] + "\n")
            was_significant = False
            for i in range(percentages.shape[0]):
                other_percents = percentages[row_index != i, j]
                if (labels[i] != "Polycomb" and percentages[i, j] > 0.5
                        and np.max(other_percents) <= percentages[i, j] / 2):
                    out.write(labels[i] + record_tail)
                    was_significant = True
            if not was_significant:
                out.write("Unknown" + record_tail)
示例#3
0
def save_significant(percentages, shape_names, shapes, wig_name, out_name,
                     chrom, cell, min_val, cutoff):
    """Append labelled, rescaled shapes and their percentages to output files.

    Every shape (a comma-separated string of numbers) is multiplied by
    ``5.0 / intensity``, where ``intensity`` comes from
    ``wsu.get_intensity_percentile(0.995, ...)``. Row ``i`` of column ``j``
    is significant when its label is not "Polycomb", its percentage exceeds
    ``cutoff``, and it is strictly greater than every other row in that
    column. Shapes with no significant row are written as "Unknown".

    Files written:
        ``out_name`` (append): label, ``cell_chrom_shape-name``, scaled shape.
        ``out_name + "_percents"`` (append): same, with the percentages.
        ``out_name + "_" + cell + "/" + chrom + "percentages"`` (overwrite):
            one comma-separated line of percentages per shape.

    Args:
        percentages: 2-D numpy array indexed as ``[label, shape]``; rows
            follow the order Promoter, Enhancer, Polycomb, Weak.
        shape_names: Sequence of shape names (strings), one per column.
        shapes: Sequence of comma-separated numeric strings, one per column.
        wig_name: Path of the WIG file passed to the percentile helper.
        out_name: Base path for the output files.
        chrom: Chromosome name (string).
        cell: Cell type name (string).
        min_val: Value forwarded to ``wsu.get_intensity_percentile``.
        cutoff: Minimum percentage a label needs to be considered.
    """
    #Get threshold to use in printing.
    with open(wig_name, 'r') as wig:
        intensity = wsu.get_intensity_percentile(0.995, wig, min_val)
    print("\n")

    #Guard against a zero percentile, using the same fallback that
    #match_shapes_cutoff applies before computing its scale.
    if intensity == 0:
        intensity = 0.1
    scale = 5.0 / intensity
    perc_string = percentages.astype(str)

    labels = ["Promoter", "Enhancer", "Polycomb", "Weak"]
    row_index = np.arange(percentages.shape[0])

    #All three outputs are closed on exit, including the "_percents" file
    #that was previously left open.
    with open(out_name, 'a') as out, \
            open(out_name + "_percents", 'a') as perc_out, \
            open(out_name + "_" + cell + "/" + chrom + "percentages",
                 'w') as percentage_out:

        #Print significant shapes with labels.
        for j in range(percentages.shape[1]):
            scaled_clust = np.asarray(
                [float(v) for v in shapes[j].split(",")]) * scale
            joined = ','.join([str(v) for v in scaled_clust])
            perc_joined = ','.join([str(p) for p in percentages[:, j]])
            name_field = "\t" + cell + "_" + chrom + "_" + shape_names[j] + "\t"

            #A newline is appended unless the first newline in the shape
            #text is already its final character.
            needs_newline = joined.find('\n') != len(joined) - 1

            significant = [
                labels[i] for i in range(percentages.shape[0])
                if labels[i] != "Polycomb" and percentages[i, j] > cutoff
                and np.max(percentages[row_index != i, j]) < percentages[i, j]
            ]

            #Fall back to a single "Unknown" record when nothing qualified.
            for label in significant or ["Unknown"]:
                out.write(label + name_field + joined)
                perc_out.write(label + name_field + perc_joined)
                if needs_newline:
                    perc_out.write("\n")
                    out.write("\n")

            #Print all percentages to a file to use later.
            percentage_out.write(','.join(perc_string[:, j]) + "\n")
示例#4
0
def main():
    """Compute shape/annotation percentages and save the significant shapes.

    Command-line arguments:
        1: tab-delimited BED file (loaded as strings).
        2: file with one shape per line.
        3: output path.
        4: WIG file path.
        5: chromosome name.
        6: cell type name.
        7: annotation file path.
        8: minimum value forwarded to the percentile helper (float).
    """
    #Read in the bed file and shape data.
    bed = np.genfromtxt(sys.argv[1], delimiter='\t', dtype=str)
    output = sys.argv[3]
    wig = sys.argv[4]
    chromosome = sys.argv[5]
    cell = sys.argv[6]
    anno_file = sys.argv[7]
    min_val = float(sys.argv[8])

    #Column indices into the BED array: shape annotations are in column 3
    #and biological annotations are in column 8.
    shape_col = 3
    bio_col = 8
    shape_start = 1
    shape_end = 2
    bio_start = 6
    bio_end = 7
    bio_len = 9

    #Get list of shapes; the file is closed as soon as it has been read.
    with open(sys.argv[2], 'r') as shape_file:
        shapes = shape_file.readlines()
    unique_clusts = sorted(set(bed[:, shape_col]), key=float)

    #Get percentage of each shape and each chromHMM annotation and each chromHMM annotation per shape.
    with open(wig, "r") as wig_file:
        threshold = wsu.get_intensity_percentile(0.75, wig_file, min_val)

    #The results of these two helpers are not used below; the calls are
    #retained because the helpers are not visible here - TODO confirm
    #whether they can be dropped.
    get_shape_percentages(shape_col, shape_start, shape_end, unique_clusts,
                          bed)
    get_anno_percentages(bio_col, shape_start, shape_end, bio_len, bed)

    total_percent_all, _ = get_all_percentage_pairs(
        shape_col, bio_col, shape_start, shape_end, bio_start, bio_end,
        bio_len, unique_clusts, bed, threshold, anno_file, shapes)

    #Print all shapes with significant annotations, along with their annotations.
    #NOTE(review): no cutoff argument is passed here; confirm this matches
    #the save_significant defined alongside this script.
    save_significant(total_percent_all, unique_clusts, shapes, wig, output,
                     chromosome, cell, min_val)
示例#5
0
def get_all_precision_or_recall(bed, sig, wig, chrom):
    """Compute per-annotation precision and recall for one chromosome.

    Args:
        bed: Forwarded to ``ppr.get_labels_and_ground_truth``.
        sig: Forwarded to ``ppr.get_labels_and_ground_truth``.
        wig: Path of the WIG file; used for the threshold and forwarded.
        chrom: Chromosome identifier, returned unchanged.

    Returns:
        ``[chrom, precision, recall]`` where the two dicts map the index of
        each annotation in (Promoter, Enhancer, Weak) to its score. Both
        dicts are empty when there are no predictions.
    """
    #Close the WIG handle once the threshold is known instead of leaking it.
    with open(wig, 'r') as wig_file:
        threshold = wsu.get_intensity_percentile(0.75, wig_file, 0)
    annotations = ["Promoter", "Enhancer", "Weak"]

    #Get actual annotation and ground truth for all annotations.
    pred, gt = ppr.get_labels_and_ground_truth(bed, sig, wig, annotations,
                                               threshold)

    #Get precision and recall for each type.
    precision = {}
    recall = {}
    if len(pred) > 0:
        for i in range(len(annotations)):
            precision[i] = precision_score(gt[:, i], pred[:, i])
            recall[i] = recall_score(gt[:, i], pred[:, i])

    return [chrom, precision, recall]
示例#6
0
def main():
    """Shift all regions using a threshold derived from a WIG file.

    Command-line arguments:
        1: input file path.
        2: output path.
        3: bin size (int).
        4: region size (int).
        5: WIG file path.
        6: "fine" flag forwarded to the percentile helper.
        7: minimum intensity (float).
    """
    file_path = sys.argv[1]
    output_path = sys.argv[2]
    bin_size = int(sys.argv[3])
    region_size = int(sys.argv[4])
    percentile = 0.995

    #bool() of any non-empty string is True, so "False" or "0" used to turn
    #the flag on. Only explicit negatives disable it now; every other
    #non-empty value still enables it, as before.
    fine = sys.argv[6].strip().lower() not in ("", "0", "false", "f", "no",
                                               "n", "off")
    minimum_intensity = float(sys.argv[7])

    #Close the WIG handle as soon as the threshold has been computed.
    with open(sys.argv[5], 'r') as wig_file:
        threshold = wsu.get_intensity_percentile(percentile, wig_file,
                                                 minimum_intensity, fine)
    print(str(threshold))

    shiftRegions(file_path, output_path, bin_size, region_size, threshold,
                 minimum_intensity)

    #Print message to user.
    print("Shifting complete for all windows.")
示例#7
0
def get_all_precision_and_recall(bed, sig, wig, chrom):
    """Compute precision, recall, and false-positive rate per annotation.

    Args:
        bed: Forwarded to ``get_labels_and_ground_truth``.
        sig: Forwarded to ``get_labels_and_ground_truth``.
        wig: Path of the WIG file; used for the threshold and forwarded.
        chrom: Not used by this function.

    Returns:
        ``[precision, recall, n_rows, threshold, pred, gt, fpr]`` where the
        three dicts map the index of each annotation in
        (Promoter, Enhancer, Weak) to its score and ``n_rows`` is
        ``pred.shape[0]``.

    Raises:
        ZeroDivisionError: If a column has no ground-truth negatives, so
            that false positives plus true negatives is zero.
    """
    #Close the WIG handle once the threshold is known instead of leaking it.
    #NOTE(review): BIN_SIZE is passed in the position other call sites use
    #for a minimum intensity value - confirm this is intended.
    with open(wig, 'r') as wig_file:
        threshold = wsu.get_intensity_percentile(0.75, wig_file, BIN_SIZE)
    annotations = ["Promoter", "Enhancer", "Weak"]

    #Get actual annotation and ground truth for all annotations.
    pred, gt = get_labels_and_ground_truth(bed, sig, wig, annotations,
                                           threshold)

    #Get precision, recall and false-positive rate for each type.
    precision = {}
    recall = {}
    fpr = {}
    for i in range(len(annotations)):
        pred_col = pred[:, i]
        gt_col = gt[:, i]
        precision[i] = precision_score(gt_col, pred_col)
        recall[i] = recall_score(gt_col, pred_col)
        fp = np.count_nonzero((pred_col == 1) & (gt_col == 0))
        tn = np.count_nonzero((pred_col == 0) & (gt_col == 0))
        fpr[i] = fp / (fp + tn)
    return [precision, recall, pred.shape[0], threshold, pred, gt, fpr]
def main():
    """Compute magnitude/annotation percentages and save significant ones.

    Command-line arguments:
        1: tab-delimited BED file (loaded as strings).
        2: output path.
        3: WIG file path.
        4: chromosome name.
        5: cell type name.
        6: annotation file path.
        7: minimum value forwarded to the percentile helper (float).
    """
    #Read in the bed file and shape data.
    bed = np.genfromtxt(sys.argv[1], delimiter='\t', dtype=str)
    output = sys.argv[2]
    wig = sys.argv[3]
    chromosome = sys.argv[4]
    cell = sys.argv[5]
    anno_file = sys.argv[6]
    min_val = float(sys.argv[7])

    #Column indices into the BED array: magnitude annotations are in
    #column 3 and biological annotations are in column 7.
    magnitude_col = 3
    bio_col = 7
    magnitude_start = 1
    magnitude_end = 2
    bio_start = 5
    bio_end = 6
    bio_len = 8

    #Get list of magnitudes, ordered by numeric value.
    unique_magnitudes = sorted(set(bed[:, magnitude_col]), key=float)
    print(unique_magnitudes)

    #Get percentage of each chromHMM annotation per magnitude. The context
    #manager closes the WIG file even if the helper raises.
    with open(wig, "r") as wig_file:
        threshold = wsu.get_intensity_percentile(0.75, wig_file, min_val)
    total_percent_all, _ = get_all_percentage_pairs(
        magnitude_col, bio_col, magnitude_start, magnitude_end, bio_start,
        bio_end, bio_len, unique_magnitudes, bed, threshold, anno_file)

    #Print all magnitudes with significant annotations, along with their annotations.
    save_significant(total_percent_all, unique_magnitudes, wig, output,
                     chromosome, cell, min_val)
def _false_positive_rate(pred_col, gt_col):
    """Return FP / (FP + TN) for one column of 0/1 predictions and truth."""
    fp = np.count_nonzero((pred_col == 1) & (gt_col == 0))
    tn = np.count_nonzero((pred_col == 0) & (gt_col == 0))
    return fp / (fp + tn)


def _precision_recall_fpr(pred, gt, n_columns):
    """Score the first ``n_columns`` columns of ``pred`` against ``gt``.

    Returns three dicts (precision, recall, false-positive rate), each keyed
    by column index.
    """
    precision = {}
    recall = {}
    fpr = {}
    for i in range(n_columns):
        precision[i] = precision_score(gt[:, i], pred[:, i])
        recall[i] = recall_score(gt[:, i], pred[:, i])
        fpr[i] = _false_positive_rate(pred[:, i], gt[:, i])
    return precision, recall, fpr


def get_all_precision_and_recall(bed, sig, tss_bed, tss_sig, or_bed, or_sig,
                                 and_bed, and_sig, rpkm_bed, rpkm_sig, wig,
                                 chrom, cell):
    """Compute precision, recall, and FPR for several annotation sources.

    The same scoring is applied to the base annotations, the OR-combined,
    AND-combined, and RPKM annotations. TSS-based promoters are scored on
    their first column only and stored under the key ``"tss"`` in the base
    ``precision``, ``recall`` and ``fpr`` dicts.

    Args:
        bed, sig: Base inputs for ``get_labels_and_ground_truth``.
        tss_bed, tss_sig: Inputs for ``get_tss_labels_and_ground_truth``.
        or_bed, or_sig: Inputs for the OR-combined annotations.
        and_bed, and_sig: Inputs for the AND-combined annotations.
        rpkm_bed, rpkm_sig: Inputs for the RPKM annotations.
        wig: Path of the WIG file; used for the threshold and forwarded.
        chrom: Not used by this function.
        cell: Not used by this function.

    Returns:
        A list, in this order: precision, recall, precision_or, recall_or,
        precision_and, recall_and, precision_rpkm, recall_rpkm,
        ``pred.shape[0]``, threshold, pred, gt, pred_and, gt_and, fpr,
        fpr_or, fpr_and, fpr_rpkm.

    Raises:
        ZeroDivisionError: If any scored column has no ground-truth
            negatives, so that false positives plus true negatives is zero.
    """
    #Close the WIG handle once the threshold is known instead of leaking it.
    with open(wig, 'r') as wig_file:
        threshold = wsu.get_intensity_percentile(0.75, wig_file, 0)
    annotations = ["Promoter", "Enhancer", "Weak"]
    length = len(annotations)

    #Get precision and recall for each type.
    pred, gt = get_labels_and_ground_truth(bed, sig, wig, annotations,
                                           threshold)
    precision, recall, fpr = _precision_recall_fpr(pred, gt, length)

    #Get precision and recall for TSS-based promoters.
    pred_promoter, gt_promoter = get_tss_labels_and_ground_truth(
        tss_bed, tss_sig, wig, ["Promoter", "Not_Promoter"], threshold)
    precision["tss"] = precision_score(gt_promoter[:, 0], pred_promoter[:, 0])
    recall["tss"] = recall_score(gt_promoter[:, 0], pred_promoter[:, 0])
    fpr["tss"] = _false_positive_rate(pred_promoter[:, 0], gt_promoter[:, 0])

    #Get precision and recall for combined annotations (OR).
    pred_or, gt_or = get_labels_and_ground_truth(or_bed, or_sig, wig,
                                                 annotations, threshold)
    precision_or, recall_or, fpr_or = _precision_recall_fpr(
        pred_or, gt_or, length)

    #Get precision and recall for combined annotations (AND).
    pred_and, gt_and = get_labels_and_ground_truth(and_bed, and_sig, wig,
                                                   annotations, threshold)
    precision_and, recall_and, fpr_and = _precision_recall_fpr(
        pred_and, gt_and, length)

    #Get precision and recall for RPKM annotations.
    pred_rpkm, gt_rpkm = get_labels_and_ground_truth(rpkm_bed, rpkm_sig, wig,
                                                     annotations, threshold)
    precision_rpkm, recall_rpkm, fpr_rpkm = _precision_recall_fpr(
        pred_rpkm, gt_rpkm, length)

    return [
        precision, recall, precision_or, recall_or, precision_and, recall_and,
        precision_rpkm, recall_rpkm, pred.shape[0], threshold, pred, gt,
        pred_and, gt_and, fpr, fpr_or, fpr_and, fpr_rpkm
    ]
示例#10
0
def match_shapes_cutoff(in_file_name, shape_file_name, out_dir, wig_name,
                        cutoff):
    """Match each input region to its nearest shape and write BED records.

    Input signals are multiplied by ``5.0 / intensity``, where ``intensity``
    comes from ``wsu.get_intensity_percentile(0.995, ...)`` (0.1 is used
    when that value is 0). Each region is passed to ``match_region``; when
    the returned cross-correlation is at least ``cutoff`` and a match was
    found, the region receives the matched shape's annotation and name,
    otherwise "Unknown".

    Both output files are created (and truncated) even when no matching is
    performed, which happens when the shape file holds fewer than two shapes.

    Args:
        in_file_name: Path of the comma-separated region file; the first
            three fields are chromosome, start and end, the rest the signal.
        shape_file_name: Path of the tab-separated shape file with name,
            annotation and comma-separated shape values on each line.
        out_dir: Path of the annotation output file; the shape-name output
            is written to ``out_dir + "clust"``.
        wig_name: Path of the WIG file passed to the percentile helper.
        cutoff: Minimum cross-correlation for a match to be accepted.
    """
    #Get threshold to use in printing.
    with open(wig_name, 'r') as wig:
        intensity = wsu.get_intensity_percentile(0.995, wig, 0)
    if intensity == 0:
        intensity = 0.1
    print("\n")
    scale = 5.0 / intensity

    #Open input, shape and output files; all are closed on exit, including
    #when parsing or matching raises.
    with open(in_file_name, "r") as in_file, \
            open(shape_file_name, "r") as shape_file, \
            open(out_dir, "w") as out_file, \
            open(out_dir + "clust", "w") as out_clust:

        #Read in shape data.
        shapes = []
        shape_anno = []
        shape_name = []
        for shape_line in shape_file:
            split_tabs = shape_line.split("\t")
            shape_anno.append(split_tabs[1])
            shape_name.append(split_tabs[0])
            shapes.append([float(v) for v in split_tabs[2].split(",")])

        if len(shapes) <= 1:
            print("Only one shape. No annotation performed.")
            return

        #Read in each line in the file and map it.
        for region_line in in_file:

            #Get the chromosome and position info to print to the output file.
            #Get the input signal to match with the shapes.
            split_line = region_line.split(",")
            labels = split_line[0:3]
            signal = [float(v) for v in split_line[3:]]
            signal_scaled = np.asarray(signal) * scale

            #Match the data to the nearest shape and obtain the match and the ambiguity metric.
            match, ambig, crosscorr, out_str = match_region(
                signal_scaled, shapes, shape_anno)

            #If the cross-correlation is within the threshold, assign the label as the correct label for the region.
            #Otherwise, assign it as unknown.
            anno_label = "Unknown"
            anno_name = "Unknown"
            if crosscorr >= cutoff and match != -1:
                anno_label = shape_anno[match]
                anno_name = shape_name[match]

            #Print match to BED file. Format is:
            #chrom  start   end shape_num 1 - ambiguity
            #Score is the opposite of the ambiguity metric.
            prefix = ("chr" + labels[0] + "\t" + labels[1] + "\t" +
                      labels[2] + "\t")
            suffix = "\t" + str(1 - ambig) + "\t" + out_str + "\n"
            out_file.write(prefix + anno_label + suffix)
            out_clust.write(prefix + anno_name + suffix)

        #Print a message to the user.
        print("Files done")