Example No. 1
def get_all_percentage_pairs(anno, chrom_hmm_anno, start, end, chrom_hmm_start,
                             chrom_hmm_end, chrom_hmm_len, shapes, bed, thresh,
                             signals_path, shape_str):

    #Set up percentage matrix.
    sum_matrix = np.zeros((2, len(shapes)))
    cumulative_vec = np.zeros(len(shapes))

    #Loop through bed file to compute percentage for each region.
    current_start = -1
    current_end = -1
    current_clust = "none"
    prev_start = -1
    prev_end = -1
    sigs = open(signals_path, "r")
    junk = 0

    for j in range(0, 2):  #Loop through BED file twice.
        next_signal = sigs.readline().split(",")
        if j == 1:
            next_signal = sigs.readline().split(",")
        for i in range(0, bed.shape[0]):

            #Get the previous data, if applicable.
            if i > 0:
                prev_start = int(bed[i - 1, start])
                prev_end = int(bed[i - 1, end])
                prev_clust = bed[i - 1, anno]

            #Get the next element data.
            next_line = bed[i, :]
            current_start = int(next_line[start])
            current_end = int(next_line[end])
            current_clust = next_line[anno]
            a = next_line[chrom_hmm_anno]
            idx = shapes.index(current_clust)

            clust_sig = [float(v) for v in shape_str[idx].split(",")]
            #Get the signal data.
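            #Advance up to two lines in the signal file so that its start field
            #(next_signal[1]) catches up with the current region's start; the signal
            #is only used when the two starts match exactly.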
            if (prev_start >= int(next_signal[1]) or current_start > int(
                    next_signal[1])) and current_start > prev_start:
                next_signal = sigs.readline().split(",")
            if current_start > int(next_signal[1]):
                next_signal = sigs.readline().split(",")
            if int(next_signal[1]) == current_start:
                #Add to the existing percentages.
                region = [float(v) for v in next_signal[3:]]
                count_clust = wsu.count_above(thresh, "", clust_sig, 0,
                                              len(clust_sig) * BIN_SIZE, 0, 0,
                                              BIN_SIZE)
                count_a = wsu.count_above(thresh, a, region, current_start,
                                          current_end,
                                          int(next_line[chrom_hmm_start]),
                                          int(next_line[chrom_hmm_end]),
                                          BIN_SIZE)

                if a == "AE" or a == "OE":
                    sum_matrix[0, idx] += int(next_line[chrom_hmm_len]) if (
                        count_clust == 0) else count_a
                elif a != "0" and a != "AP" and a != "OP" and a != "GE" and a != "TS":
                    sum_matrix[1, idx] += int(next_line[chrom_hmm_len]) if (
                        count_clust == 0) else count_a

                #Add to the total sum if the current start and end are not equal to the previous ones.
                #if(prev_start != current_start and prev_start != "-1"):
            cumulative_vec[idx] = np.sum(sum_matrix[:, idx])

    #Get the set of percentages.
    cumulative_matrix = np.tile(cumulative_vec, (2, 1))
    return [sum_matrix / cumulative_matrix, np.sum(sum_matrix, 0)]
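A hedged aside on the helper used above: wsu.count_above and BIN_SIZE are defined elsewhere in the project and are not shown in these examples. The sketch below only illustrates what such a helper might do, inferred from the call sites (count the bases, in BIN_SIZE steps, whose signal exceeds the threshold within the overlap of the signal region and the annotated region); the name count_above_sketch, the bin width, and the toy data are assumptions, not the project's implementation.

BIN_SIZE = 50  #Assumed bin width in base pairs.

def count_above_sketch(threshold, _anno, signal, sig_start, sig_end,
                       anno_start, anno_end, bin_size):
    #Clip the annotated interval to the interval covered by the signal vector.
    overlap_start = max(sig_start, anno_start)
    overlap_end = min(sig_end, anno_end)
    count = 0
    for pos in range(overlap_start, overlap_end, bin_size):
        bin_idx = (pos - sig_start) // bin_size
        if 0 <= bin_idx < len(signal) and signal[bin_idx] > threshold:
            count += bin_size
    return count

#Toy check: a 10-bin region starting at base 1000; two of the overlapping bins exceed 4.0.
sig = [0.0, 5.0, 6.0, 1.0, 0.0, 7.0, 0.0, 0.0, 0.0, 0.0]
print(count_above_sketch(4.0, "", sig, 1000, 1500, 1100, 1400, BIN_SIZE))  #Prints 100.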
Example No. 2
def get_labels_and_ground_truth(bed_file, sig_file, wig, annotations,
                                threshold):

    #Set up percentage matrix.
    vec_pred = list()
    vec_gt = list()
    final_stack_pred = np.empty((0, 0))
    final_stack_gt = np.empty((0, 0))

    #Get scores and labels for each bed file.
    bed = np.genfromtxt(bed_file, delimiter='\t', dtype=str)
    sigf = open(sig_file, "r")

    #Loop through bed file to compute percentage for each region.
    current_start = -1
    current_end = -1
    prev_start = -1
    prev_end = -1
    sig_i = 0
    #Do not move forward if the first line is blank in the sig file.
    try:
        sig = [float(s) for s in sigf.readline().split(",")]
        sum_vec = np.zeros(3)

        #Keep track of regions with no ChromHMM annotations.
        #These regions will not be used in the analysis.
        not_annotated_count = 0
        count_in_region = 0
        for i in range(0, bed.shape[0]):

            #Get the next element data.
            next_line = bed[i, :]
            current_start = int(next_line[1])
            current_end = int(next_line[2])
            a = next_line[8]
            anno_start = int(next_line[6])
            anno_end = int(next_line[7])
            our_anno = next_line[3]
            anno_length = int(next_line[9])

            #Get next signals if needed.
            #If we are still on the same region, don't get it.
            if current_start != prev_start:
                if sig_i != 0:
                    sig_s = sigf.readline()
                    sig = [float(s) for s in sig_s.split(",")]
                sum_vec = np.zeros(3)
                sig_i += 1

            #Add to the existing percentages.
            #If the region has peaks, consider only regions above peak threshold.
            #If no peaks exist, consider entire region.
            total_peak_size = wsu.count_above(threshold, "", sig,
                                              current_start, current_end,
                                              current_start, current_end,
                                              BIN_SIZE)
            if a == "1_TssA" or a == "2_TssAFlnk" or a == "10_TssBiv" or a == "11_BivFlnk":
                if total_peak_size > 0:
                    sum_vec[0] += wsu.count_above(threshold, a, sig,
                                                  current_start, current_end,
                                                  anno_start, anno_end,
                                                  BIN_SIZE)
                else:
                    sum_vec[0] += anno_length
            elif a == "6_EnhG" or a == "7_Enh" or a == "12_EnhBiv":
                if total_peak_size > 0:
                    sum_vec[1] += wsu.count_above(threshold, a, sig,
                                                  current_start, current_end,
                                                  anno_start, anno_end,
                                                  BIN_SIZE)
                else:
                    sum_vec[1] += anno_length
            elif a == "9_Het" or a == "15_Quies":
                if total_peak_size > 0:
                    sum_vec[2] += wsu.count_above(threshold, a, sig,
                                                  current_start, current_end,
                                                  anno_start, anno_end,
                                                  BIN_SIZE)
                else:
                    sum_vec[2] += anno_length
            #This case is when there is no annotation. Do not count it.
            else:
                not_annotated_count += 1
            count_in_region += 1

            #Add the ground truth and predicted value based on the region with the maximum count above the threshold.
            next_start = current_start + 1
            if i + 1 < len(bed):
                next_start = int(bed[i + 1, :][1])
            if next_start != current_start and not_annotated_count != count_in_region:

                #Add another element to the ground truth and prediction vectors.
                vec_gt.append(np.zeros(len(sum_vec)))
                vec_pred.append(np.zeros(len(sum_vec)))
                max_idx = np.argmax(sum_vec)
                for k in range(0, len(sum_vec)):
                    #Add ground truth.
                    if k == max_idx:
                        vec_gt[-1][k] = 1
                    else:
                        vec_gt[-1][k] = 0
                    #Add predictions.
                    if annotations[k] == our_anno:
                        vec_pred[-1][k] = 1
                    else:
                        vec_pred[-1][k] = 0
                #If it is unknown according to our analysis, do not consider it.
                #This includes cases where there is no ChromHMM annotation, or where there
                #is signal above the threshold but no ChromHMM annotation within the signal.
                if vec_pred[-1][0] == 0 and vec_pred[-1][1] == 0 and vec_pred[-1][2] == 0:
                    del vec_pred[-1]
                    del vec_gt[-1]
                elif sum_vec[0] == 0 and sum_vec[1] == 0 and sum_vec[2] == 0:
                    del vec_pred[-1]
                    del vec_gt[-1]

                #Reset the annotated and unannotated counts for the next region.
                not_annotated_count = 0
                count_in_region = 0

            #Save the current coordinates as the previous ones for the next iteration.
            prev_start = current_start
            prev_end = current_end

        #Stack all values.
        final_stack_pred = np.stack(vec_pred)
        final_stack_gt = np.stack(vec_gt)
    except:
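        #Any parse error (e.g., a blank signal line) lands here; the empty arrays
        #initialized above are returned unchanged.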
        pass
    #Return value.
    return [final_stack_pred, final_stack_gt]
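The function above returns one row per usable region in two one-hot matrices: predictions and ChromHMM-derived ground truth, with columns ordered promoter, enhancer, heterochromatin/quiescent. As a hedged usage sketch under those assumptions, the toy arrays below stand in for that output and show one plausible way to compare the two; it is an illustration, not necessarily the evaluation used downstream in the original project.

import numpy as np

#Toy stand-ins for final_stack_pred and final_stack_gt (4 regions, 3 classes).
pred = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]])
gt = np.array([[1, 0, 0], [0, 0, 1], [0, 0, 1], [0, 1, 0]])

#Collapse the one-hot rows to class indices and compute overall agreement.
pred_idx = np.argmax(pred, axis=1)
gt_idx = np.argmax(gt, axis=1)
print("Agreement:", np.mean(pred_idx == gt_idx))  #0.5 for these toy arrays.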
Example No. 3
def get_tss_labels_and_ground_truth(bed_file, sig_file, wig, annotations,
                                    threshold):

    #Set up counts of promoter and non-promoter.
    vec_pred = list()
    vec_gt = list()

    bed = np.genfromtxt(bed_file, delimiter='\t', dtype=str)
    sigs = np.genfromtxt(sig_file, delimiter=',', dtype=float)

    #Loop through bed file to compute percentage for each region.
    current_start = -1
    current_end = -1
    prev_start = -1
    prev_end = -1
    sig = sigs[0, :]
    sig_i = -1
    sum_vec = np.zeros(2)

    #Keep track of regions with no ChromHMM annotations.
    #These regions will not be used in the analysis.
    not_annotated_count = 0
    count_in_region = 0

    for i in range(0, bed.shape[0]):

        #Get the next element data.
        next_line = bed[i, :]
        current_start = int(next_line[1])
        current_end = int(next_line[2])
        a = next_line[7]
        anno_start = int(next_line[5])
        anno_end = int(next_line[6])
        our_anno = next_line[3]
        anno_length = int(next_line[8])

        #Get next signals if needed.
        #If we are still on the same region, don't get it.
        if current_start != prev_start:
            sig_i += 1
            sig = sigs[sig_i, :]
            sum_vec = np.zeros(2)

        #Add to the existing percentages.
        #If the region has peaks, consider only regions above peak threshold.
        #If no peaks exist, consider entire region.
        total_peak_size = wsu.count_above(threshold, "", sig, current_start,
                                          current_end, current_start,
                                          current_end, BIN_SIZE)
        if a == "1_TssA" or a == "2_TssAFlnk" or a == "10_TssBiv" or a == "11_BivFlnk":
            if total_peak_size > 0:
                sum_vec[0] += wsu.count_above(threshold, a, sig, current_start,
                                              current_end, anno_start,
                                              anno_end, BIN_SIZE)
            else:
                sum_vec[0] += anno_length
        elif anno_length != 0:
            if total_peak_size > 0:
                sum_vec[1] += wsu.count_above(threshold, a, sig, current_start,
                                              current_end, anno_start,
                                              anno_end, BIN_SIZE)
            else:
                sum_vec[1] += anno_length
        #This case is when there is no annotation. Do not count it.
        else:
            not_annotated_count += 1
        count_in_region += 1

        #Add the ground truth and predicted value based on the region with the maximum count above the threshold.
        next_start = current_start + 1
        if i + 1 < len(bed):
            next_start = int(bed[i + 1, :][1])
        if next_start != current_start and not_annotated_count != count_in_region:
            vec_gt.append(np.zeros(len(sum_vec)))
            vec_pred.append(np.zeros(len(sum_vec)))
            max_idx = np.argmax(sum_vec)
            for k in range(0, len(sum_vec)):
                #Add ground truth.
                if k == max_idx:
                    vec_gt[-1][k] = 1
                else:
                    vec_gt[-1][k] = 0
                #Add predictions.
                if annotations[k] == our_anno:
                    vec_pred[-1][k] = 1
                else:
                    vec_pred[-1][k] = 0
            if sum_vec[0] == 0 and sum_vec[1] == 0:
                del vec_pred[-1]
                del vec_gt[-1]
            #Set count and unannotated count to 0.
            not_annotated_count = 0
            count_in_region = 0
        #Save the current coordinates as the previous ones for the next iteration.
        prev_start = current_start
        prev_end = current_end
    #Return value.
    return [np.stack(vec_pred), np.stack(vec_gt)]
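Both labeling functions above one-hot encode the ground truth from whichever class accumulated the largest count above the threshold, and the prediction from whichever entry of the caller-supplied annotations list matches our_anno. A minimal sketch of that step follows; the class names, counts, and label are placeholders, since the real annotations list is an argument and is not shown in these examples.

import numpy as np

annotations = ["Promoter", "Other"]  #Placeholder class order; the real list is passed in by the caller.
sum_vec = np.array([120.0, 40.0])    #Bases above the threshold per class for one region.
our_anno = "Promoter"                #Label assigned to the region by the upstream analysis.

gt = np.zeros(len(sum_vec))
gt[np.argmax(sum_vec)] = 1  #Ground truth: the dominant ChromHMM-derived class.
pred = np.array([1.0 if a == our_anno else 0.0 for a in annotations])  #Prediction: one-hot on our_anno.
print(gt, pred)  #[1. 0.] [1. 0.]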
def get_all_percentage_pairs(anno, chrom_hmm_anno, start, end, chrom_hmm_start,
                             chrom_hmm_end, chrom_hmm_len, magnitudes, bed,
                             thresh, signals_path):

    #Set up percentage matrix.
    sum_matrix = np.zeros((4, len(magnitudes)))
    cumulative_vec = np.zeros(len(magnitudes))

    #Loop through bed file to compute percentage for each region.
    current_start = -1
    current_end = -1
    current_clust = "none"
    prev_start = -1
    prev_end = -1
    sigs = open(signals_path, "r")
    junk = 0

    for j in range(0, 2):  #Loop through BED file twice.
        next_signal = sigs.readline().split(",")
        if j == 1:
            next_signal = sigs.readline().split(",")
        for i in range(0, bed.shape[0]):
            if len(next_signal) > 1:
                #Get the previous data, if applicable.
                if i > 0:
                    prev_start = int(bed[i - 1, start])
                    prev_end = int(bed[i - 1, end])

                #Get the next element data.
                next_line = bed[i, :]
                current_start = int(next_line[start])
                current_end = int(next_line[end])
                current_clust = next_line[anno]
                a = next_line[chrom_hmm_anno]
                idx = magnitudes.index(current_clust)

                #Get the signal data.
                if (prev_start >= int(next_signal[1]) or current_start > int(
                        next_signal[1])) and current_start > prev_start:
                    next_signal = sigs.readline().split(",")
                if len(next_signal) > 1:
                    if current_start > int(next_signal[1]):
                        next_signal = sigs.readline().split(",")
                    if len(next_signal) > 1:
                        if int(next_signal[1]) == current_start:
                            #Add to the existing percentages.
                            region = [float(v) for v in next_signal[3:]]
                            count_a = wsu.count_above(
                                thresh, a, region, current_start, current_end,
                                int(next_line[chrom_hmm_start]),
                                int(next_line[chrom_hmm_end]), BIN_SIZE)
                            if a == "1_TssA" or a == "2_TssAFlnk" or a == "10_TssBiv" or a == "11_BivFlnk":
                                sum_matrix[0, idx] += int(
                                    next_line[chrom_hmm_len]
                                ) if (int(current_clust) < thresh) else count_a
                            elif a == "6_EnhG" or a == "7_Enh" or a == "12_EnhBiv":
                                sum_matrix[1, idx] += int(
                                    next_line[chrom_hmm_len]
                                ) if (int(current_clust) < thresh) else count_a
                            elif a == "13_ReprPC" or a == "ReprPCWk":
                                sum_matrix[2, idx] += int(
                                    next_line[chrom_hmm_len]
                                ) if (int(current_clust) < thresh) else count_a
                            elif a == "9_Het" or a == "15_Quies":
                                sum_matrix[3, idx] += int(
                                    next_line[chrom_hmm_len]
                                ) if (int(current_clust) < thresh) else count_a

                            #Add to the total sum if the current start and end are not equal to the previous ones.
                            #if(prev_start != current_start and prev_start != "-1"):
                cumulative_vec[idx] = np.sum(sum_matrix[:, idx])

    #Get the set of percentages.
    cumulative_matrix = np.tile(cumulative_vec, (4, 1))
    return [sum_matrix / cumulative_matrix, np.sum(sum_matrix, 0)]
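The final division above turns each column of sum_matrix into fractions of that magnitude bin's cumulative count. A small numeric sketch of the same normalization (the numbers are made up, and a magnitude bin that never accumulates anything would trigger the same division-by-zero warning in the real function):

import numpy as np

sum_matrix = np.array([[30.0, 0.0],
                       [10.0, 5.0],
                       [0.0, 5.0],
                       [10.0, 10.0]])            #Rows: promoter, enhancer, Polycomb, quiescent.
cumulative_vec = np.sum(sum_matrix, axis=0)      #[50. 20.]
cumulative_matrix = np.tile(cumulative_vec, (4, 1))
print(sum_matrix / cumulative_matrix)            #Column-wise fractions; each column sums to 1.
print(np.sum(sum_matrix, 0))                     #Raw totals, returned alongside the fractions.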