示例#1
0
文件: circFilter.py 项目: iprada/DCC
 def read_rep_region(self, regionfile):
     """Load every feature of a GFF/GTF file into an interval tree.

     Args:
         regionfile: Source handed unchanged to ``HTSeq.GFF_Reader``
             (read with ``end_included=True``).

     Returns:
         An ``IntervalTree`` with one entry per feature interval, each
         annotated with the placeholder string ``'.'``.
     """
     reader = HTSeq.GFF_Reader(regionfile, end_included=True)
     tree = IntervalTree()
     for record in reader:
         # Only the interval matters here; the annotation is a dummy value.
         tree.insert(record.iv, annotation='.')
     return tree
示例#2
0
 def read_rep_region(self, regionfile):
     """Index the intervals of all features found in ``regionfile``.

     The file is parsed with ``HTSeq.GFF_Reader(..., end_included=True)``
     and the interval of each feature is inserted into a fresh
     ``IntervalTree`` using ``'.'`` as its annotation.

     Args:
         regionfile: Input passed straight to ``HTSeq.GFF_Reader``.

     Returns:
         The populated ``IntervalTree``.
     """
     gff_features = HTSeq.GFF_Reader(regionfile, end_included=True)
     interval_index = IntervalTree()
     register = interval_index.insert
     for entry in gff_features:
         register(entry.iv, annotation='.')
     return interval_index
示例#3
0
    def selectGeneGtf(self, gtf_file):
        """Build an annotation tree from a GTF/GFF file and dump its exon lines.

        Every feature is inserted into an ``IntervalTree`` under its interval.
        The stored annotation is the feature's attribute mapping with an extra
        ``'type'`` key; if that mapping cannot be built, the raw GFF line is
        stored instead.

        Side effect: all features of type ``'exon'`` are sorted by
        (column 1, start, end) and written to
        ``self.tmp_dir + "tmp_" + basename(gtf_file) + ".exon.sorted"``.

        Args:
            gtf_file: Annotation file handed to ``HTSeq.GFF_Reader``; its
                base name is also used to name the exon output file.

        Returns:
            The populated ``IntervalTree``.
        """
        gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
        annotation_tree = IntervalTree()
        gtf_exon = []
        for feature in gtf:
            # Only exon lines go to the sorted side file; the tree gets all.
            if feature.type == 'exon':
                gtf_exon.append(feature.get_gff_line().split('\t'))

            try:
                # NOTE: this adds the 'type' key to the feature's own attr
                # mapping in place.
                row = feature.attr
                row['type'] = feature.type
            except Exception:
                # Best-effort fallback kept from the original code, narrowed
                # so KeyboardInterrupt/SystemExit are no longer swallowed.
                row = feature.get_gff_line()

            annotation_tree.insert(feature.iv, annotation=row)

        # Columns 4 and 5 (indices 3, 4) are compared numerically, not as text.
        gtf_exon.sort(key=lambda fields: (fields[0], int(fields[3]), int(fields[4])))

        exon_path = self.tmp_dir + "tmp_" + os.path.basename(gtf_file) + '.exon.sorted'
        # Context manager guarantees the handle is closed even if writing fails.
        # Lines are written exactly as rebuilt from get_gff_line(); no newline
        # is appended here.
        with open(exon_path, 'w') as new_gtf:
            new_gtf.writelines('\t'.join(fields) for fields in gtf_exon)
        return annotation_tree
示例#4
0
    def selectGeneGtf(self, gtf_file):
        """Construct the annotation tree and write a sorted exon-only file.

        All features read from ``gtf_file`` are inserted into an
        ``IntervalTree``. Each entry is annotated with the feature's attribute
        mapping (extended with a ``'type'`` key), or with the raw GFF line if
        that mapping cannot be prepared.

        In addition, the GFF lines of all ``'exon'`` features are sorted by
        (first column, start, end) and written to the file
        ``self.tmp_dir + "tmp_" + basename(gtf_file) + ".exon.sorted"``.

        Args:
            gtf_file: Annotation file passed to ``HTSeq.GFF_Reader``.

        Returns:
            The ``IntervalTree`` containing every feature of the file.
        """
        gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
        annotation_tree = IntervalTree()
        exon_rows = []
        for feature in gtf:
            if feature.type == 'exon':
                exon_rows.append(feature.get_gff_line().split('\t'))

            try:
                # The attr mapping is modified in place to carry the type.
                row = feature.attr
                row['type'] = feature.type
            except Exception:
                # Fall back to the plain GFF line; unlike a bare ``except``
                # this lets KeyboardInterrupt/SystemExit propagate.
                row = feature.get_gff_line()

            annotation_tree.insert(feature.iv, annotation=row)

        # Start/end (indices 3 and 4) must be ordered as integers.
        exon_rows = sorted(
            exon_rows, key=lambda x: (x[0], int(x[3]), int(x[4])))

        out_name = (self.tmp_dir + "tmp_" + os.path.basename(gtf_file) +
                    '.exon.sorted')
        # ``with`` closes the file even when writelines raises.
        with open(out_name, 'w') as new_gtf:
            new_gtf.writelines(['\t'.join(s) for s in exon_rows])
        return annotation_tree
示例#5
0
    def compute_scores(self, men, women, dmz, drt):
        """Compute a sparse score matrix between the rows of ``men`` and ``women``.

        For each row of ``men`` the candidate partners are looked up in an
        interval tree built from ``women.rows``, using the mass range returned
        by ``man.get_mass_range(dmz, absolute_mass_tolerance=False)``.
        Candidates passing ``man.is_within_tolerance(...)`` get a distance and
        a weight from ``self.compute_dist``.

        Scores are ``weight * (1 - dist / max_dist)``, then divided by the
        largest score. If that normalisation raises ``ZeroDivisionError``
        (e.g. ``max_score`` is still its initial ``0``), the fallback score
        ``weight * (1 - dist)`` is returned without normalisation.

        Args:
            men: Object whose ``rows`` sequence supplies the matrix rows.
            women: Object whose ``rows`` sequence supplies the matrix columns;
                each candidate's ``row_id`` is used as its column index.
            dmz: Mass tolerance forwarded to the row helpers.
            drt: Second tolerance forwarded to ``is_within_tolerance`` and
                ``compute_dist`` (presumably retention time; confirm with the
                row class).

        Returns:
            Tuple ``(score_arr, Q)`` of sparse matrices of shape
            ``(len(men.rows), len(women.rows))``. ``Q`` holds ``1`` at every
            position that has a stored distance.
        """
        n_row = len(men.rows)
        n_col = len(women.rows)
        dist_arr = lil_matrix((n_row, n_col))
        weight_arr = lil_matrix((n_row, n_col))
        max_dist = 0

        T = IntervalTree(women.rows)
        for i, man in enumerate(men.rows):
            mass_lower, mass_upper = man.get_mass_range(dmz, absolute_mass_tolerance=False)
            candidate_women = T.search(int(mass_lower), int(mass_upper))
            for woman in candidate_women:
                if man.is_within_tolerance(woman, dmz, drt, absolute_mass_tolerance=False):
                    dist, w = self.compute_dist(man, woman, dmz, drt)
                    j = woman.row_id
                    dist_arr[i, j] = dist
                    weight_arr[i, j] = w
                    if dist > max_dist:
                        max_dist = dist

        # COO format exposes (row, col, data) triplets for the stored entries.
        # see http://stackoverflow.com/questions/4319014/iterating-through-a-scipy-sparse-vector-or-matrix
        dist_arr = dist_arr.tocoo()

        def build_scores(to_similarity):
            """Fill score/indicator matrices using ``to_similarity(dist)``."""
            scores = lil_matrix((n_row, n_col))
            indicator = lil_matrix((n_row, n_col))
            best = 0
            # Built-in zip works on Python 2 and 3 (itertools.izip is Py2-only).
            for r, c, v in zip(dist_arr.row, dist_arr.col, dist_arr.data):
                score = to_similarity(v)
                score = weight_arr[r, c] * score
                scores[r, c] = score
                indicator[r, c] = 1
                if score > best:
                    best = score
            return scores, indicator, best

        try:
            score_arr, Q, max_score = build_scores(lambda v: 1 - (v / max_dist))
            # normalise
            score_arr = score_arr * (1 / max_score)
            return score_arr, Q
        except ZeroDivisionError:
            # Normalisation impossible: fall back to unnormalised 1 - dist.
            score_arr, Q, _ = build_scores(lambda v: 1 - v)
            return score_arr, Q
示例#6
0
 def selectGeneGtf(self, gtf_file):
     """Build an interval tree of all features in a GTF/GFF annotation file.

     No feature-type filtering is done: every feature yielded by
     ``HTSeq.GFF_Reader`` is inserted. Each entry is annotated with the
     feature's attribute mapping extended by a ``'type'`` key, or with the
     raw GFF line if that mapping cannot be prepared.

     Args:
         gtf_file: Annotation file passed to ``HTSeq.GFF_Reader``.

     Returns:
         The populated ``IntervalTree``.
     """
     gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
     annotation_tree = IntervalTree()
     for feature in gtf:
         try:
             # NOTE: adds the 'type' key to the feature's attr mapping in place.
             row = feature.attr
             row['type'] = feature.type
         except Exception:
             # Best-effort fallback kept; narrowed from a bare ``except`` so
             # KeyboardInterrupt/SystemExit are not swallowed.
             row = feature.get_gff_line()
         annotation_tree.insert(feature.iv, annotation=row)
     return annotation_tree
示例#7
0
 def selectGeneGtf(self, gtf_file):
     """Index every feature of a GTF/GFF file by its genomic interval.

     All features are inserted (there is no exon/gene filter in this
     method). The annotation stored with each interval is the feature's
     attribute mapping plus a ``'type'`` entry; when that cannot be built
     the feature's raw GFF line is stored instead.

     Args:
         gtf_file: Annotation file handed to ``HTSeq.GFF_Reader``
             (read with ``end_included=True``).

     Returns:
         An ``IntervalTree`` holding one entry per feature.
     """
     annotation_tree = IntervalTree()
     for feature in HTSeq.GFF_Reader(gtf_file, end_included=True):
         try:
             # The attr mapping is updated in place with the feature type.
             row = feature.attr
             row['type'] = feature.type
         except Exception:
             # Catch only ordinary errors; a bare ``except`` would also trap
             # KeyboardInterrupt and SystemExit.
             row = feature.get_gff_line()
         annotation_tree.insert(feature.iv, annotation=row)
     return annotation_tree
    def intersectcirc(self, circ_file, modified_gtf_file, strand=True, isStartBED=True):
        """Collect, per BED line, the custom exon IDs overlapping its position.

        The features of ``modified_gtf_file`` are loaded into an
        ``IntervalTree`` annotated with their attribute mappings. For each
        tab-separated line of ``circ_file`` a 1 bp probe interval is built
        around column 2 and intersected with the tree; the
        ``'custom_exon_id'`` of every hit is recorded.

        Args:
            circ_file: Tab-separated BED-like file; columns 1, 2, 3 and 6
                (chromosome, start, end, strand) are read.
            modified_gtf_file: Annotation file whose feature attributes
                contain a ``'custom_exon_id'`` entry.
            strand: Accepted for interface compatibility; not used here.
            isStartBED: If true the probe is ``[pos, pos + 1)``, otherwise
                ``[pos - 1, pos)``, where ``pos`` is column 2.

        Returns:
            Dict mapping ``HTSeq.GenomicInterval(chrom, col2, col3, strand)``
            to the set of custom exon IDs found. Lines without any hit do not
            appear in the dict.
        """
        # Read all lines up front and close the handle immediately (the
        # original left the file object open).
        with open(circ_file) as bed_handle:
            input_bed_file = bed_handle.readlines()

        exon_gtf_file = HTSeq.GFF_Reader(modified_gtf_file, end_included=True)
        gtf_exon_sorted = IntervalTree()
        for feature in exon_gtf_file:
            gtf_exon_sorted.insert(feature.iv, annotation=feature.attr)

        circ_exon_set = {}
        for bed_line in input_bed_file:
            bed_field = bed_line.split('\t')
            custom_exon_list = []

            # we add 1bp in order for intersect to work correctly
            # different case for start or end bed file
            position = int(bed_field[1])
            if isStartBED:
                start, end = position, position + 1
            else:
                start, end = position - 1, position

            strand_symbol = bed_field[5].strip()

            # in order for the intersect to work, we need at least 1bp frame size
            current_bed_interval = HTSeq.GenomicInterval(
                bed_field[0], start, end, strand_symbol)

            # for later processing however, we again need the "0" bp frame window
            insert_bed_interval = HTSeq.GenomicInterval(
                bed_field[0], position, int(bed_field[2]), strand_symbol)

            # extract all custom exons; the callback runs once per hit
            gtf_exon_sorted.intersect(
                current_bed_interval,
                lambda hit: custom_exon_list.append(hit.annotation['custom_exon_id'])
            )

            # Only create a dict entry when at least one exon was found.
            if custom_exon_list:
                circ_exon_set.setdefault(insert_bed_interval, set()).update(custom_exon_list)

        return circ_exon_set
示例#9
0
def shiftReduceParse(linearTree, string):
    """
    Parse listed tree items from right to left (shift reduce).

    Each item of ``linearTree`` is indexed as ``node[0]`` (name),
    ``node[1]`` (number of child nodes to pop from the buffer) and
    ``node[2]`` (tuple of aligned word indices; the first word has index 1
    and the last word has index ``len(string)``).

    If the first or the last word is not aligned to any node, it is aligned
    to the first or last node respectively. NOTE: this modifies
    ``linearTree`` in place.

    Returns the tree built for the items, or None as soon as a node whose
    alignment is exactly ``(0,)`` (unaligned) is encountered.
    """
    def isAligned(idx):
        # Recomputed on every call because linearTree may have been patched.
        return any(idx in node[2] for node in linearTree)

    treeBuffer = []

    # make sure the first and the last word are aligned to some node
    lastIndex = len(string)
    if not isAligned(1):
        logging.info("Align first word to first semantic node")
        linearTree[0][2] += (1,)
    if not isAligned(lastIndex):
        logging.info("Align last word to last semantic node")
        linearTree[-1][2] += (lastIndex,)

    for node in reversed(linearTree):
        t = IntervalTree()
        t.name = node[0]
        # Add child nodes to the current node
        # by popping them from the buffer
        for _ in range(node[1]):
            t.childNodes.append(treeBuffer.pop())

        # unaligned words: give up on the whole tree
        if node[2] == (0,):
            return None

        # Word indices are 1-based, so the interval start is min - 1.
        minInterval, maxInterval = min(node[2]) - 1, max(node[2])

        # Widen the interval so that it covers all children.
        for child in t.childNodes:
            childInterval = child.interval
            minInterval = min(minInterval, childInterval.start)
            maxInterval = max(maxInterval, childInterval.end)

        t.interval = Interval(minInterval, maxInterval)
        t.alignment = node[2]

        treeBuffer.append(t)
    return treeBuffer[0]
示例#10
0
def generate_interval_tree_from_bed_file(regions_bed_path):
    """Build one ``IntervalTree`` per chromosome from a BED file.

    The intervals are obtained from
    ``TsvHandler(regions_bed_path).get_bed_intervals_by_chromosome`` with
    ``universal_offset=-1`` and ``start_offset=1``. The original author's note
    says this converts Illumina PG-style BED intervals to 0-based, closed
    intervals (not verifiable from this function alone).

    Also prints the chromosome keys to stdout.

    Args:
        regions_bed_path: Path of the BED file given to ``TsvHandler``.

    Returns:
        Dict mapping each chromosome key to an ``IntervalTree`` built from
        that chromosome's intervals.
    """
    handler = TsvHandler(regions_bed_path)
    per_chromosome = handler.get_bed_intervals_by_chromosome(
        universal_offset=-1, start_offset=1)

    trees = {
        name: IntervalTree(per_chromosome[name])
        for name in per_chromosome
    }

    print("chromosomes: ", per_chromosome.keys())

    return trees
    def intersectcirc(self,
                      circ_file,
                      modified_gtf_file,
                      strand=True,
                      isStartBED=True):
        """Find the custom exons that overlap each position of a BED file.

        An ``IntervalTree`` is filled with the features of
        ``modified_gtf_file`` (annotation: the feature's attribute mapping).
        Every line of ``circ_file`` is split on tabs; a 1 bp interval derived
        from column 2 is intersected with the tree and the
        ``'custom_exon_id'`` of each overlapping entry is collected.

        Args:
            circ_file: Tab-separated file; columns 1, 2, 3 and 6 are used as
                chromosome, start, end and strand.
            modified_gtf_file: Annotation file read via ``HTSeq.GFF_Reader``;
                its attributes must provide ``'custom_exon_id'``.
            strand: Unused by this method; kept so existing callers work.
            isStartBED: Selects the probe interval: ``[col2, col2 + 1)`` when
                true, ``[col2 - 1, col2)`` otherwise.

        Returns:
            Dict keyed by ``HTSeq.GenomicInterval(chrom, col2, col3, strand)``
            whose values are sets of custom exon IDs. Only lines with at
            least one overlapping exon get an entry.
        """
        # input the result file of print_start_end_file; the context manager
        # closes the handle that the original code leaked
        with open(circ_file) as circ_handle:
            input_bed_file = circ_handle.readlines()

        exon_gtf_file = HTSeq.GFF_Reader(modified_gtf_file, end_included=True)
        gtf_exon_sorted = IntervalTree()
        for feature in exon_gtf_file:
            row = feature.attr
            gtf_exon_sorted.insert(feature.iv, annotation=row)

        circ_exon_set = {}
        for bed_line in input_bed_file:
            bed_field = bed_line.split('\t')
            custom_exon_list = []

            # we add 1bp in order for intersect to work correctly
            # different case for start or end bed file
            anchor = int(bed_field[1])
            if isStartBED:
                start = anchor
                end = anchor + 1
            else:
                start = anchor - 1
                end = anchor

            # in order for the intersect to work, we need at least 1bp frame size
            current_bed_interval = HTSeq.GenomicInterval(
                bed_field[0], start, end, bed_field[5].strip())

            # for later processing however, we again need the "0" bp frame window
            insert_bed_interval = HTSeq.GenomicInterval(
                bed_field[0], anchor, int(bed_field[2]),
                bed_field[5].strip())

            # extract all custom exons (callback is invoked for each hit)
            gtf_exon_sorted.intersect(
                current_bed_interval, lambda x: custom_exon_list.append(
                    x.annotation['custom_exon_id']))

            # if we found one or more custom exons, add them all to the set
            if custom_exon_list:
                circ_exon_set.setdefault(insert_bed_interval, set()).update(
                    custom_exon_list)

        # return the filled dict
        return circ_exon_set