Exemplo n.º 1
0
def exon_matching(
    exon_tree: IntervalTree,
    ref_exon: Interval,
    match_extend_tolerate_left: int,
    match_extend_tolerate_right: int,
    intervals_adjacent: bool = True,
) -> List[IntervalTree]:
    """
    Find a run of consecutive nodes in ``exon_tree`` that matches ``ref_exon``.

    exon_tree --- tree queried with ``find(ref_exon.start, ref_exon.end)``
                  (per the original author: built from .baseC/.altC exon
                  detection, probably short read data)
    ref_exon --- interval to match (per the original author: probably
                 from PacBio)
    match_extend_tolerate_left --- largest allowed |start difference| between
                                   the first returned node and ref_exon
    match_extend_tolerate_right --- largest allowed |end difference| between
                                    the last returned node and ref_exon
    intervals_adjacent --- when True, the hits must pass
                           c_branch.intervals_all_adjacent or nothing is
                           returned

    Returns the slice of hits from the first node whose start is within the
    left tolerance to the furthest node whose end is within the right
    tolerance, or None when no such path exists.
    """
    hits = exon_tree.find(ref_exon.start, ref_exon.end)
    n_hits = len(hits)
    if n_hits == 0:
        # likely due to very low coverage on transcript
        return None

    # A single integral exon has no splicing, so (when requested) every hit
    # must be adjacent to its neighbour.
    if intervals_adjacent and not c_branch.intervals_all_adjacent(hits):
        return None

    for left in range(n_hits):
        if abs(hits[left].start - ref_exon.start) > match_extend_tolerate_left:
            continue
        # Start is close enough: scan from the far end back toward `left`
        # so the furthest acceptable end wins.
        for right in reversed(range(left, n_hits)):
            end_gap = abs(hits[right].end - ref_exon.end)
            if end_gap <= match_extend_tolerate_right:
                return hits[left:right + 1]
    return None
Exemplo n.º 2
0
 def test_empty(self):
     """Every query on a freshly constructed IntervalTree returns nothing."""
     tree = IntervalTree()
     self.assertEqual([], tree.find(100, 300))
     # Single-argument queries, exercised in the same order as before.
     for query_name in (
         "after",
         "before",
         "after_interval",
         "before_interval",
         "upstream_of_interval",
         "downstream_of_interval",
     ):
         self.assertEqual([], getattr(tree, query_name)(100))
     # traverse() on the empty tree is expected to return None.
     self.assertEqual(None, tree.traverse(lambda x: x.append(1)))
Exemplo n.º 3
0
 def test_empty(self):
     """An IntervalTree with no intervals answers every lookup with an empty result."""
     empty_tree = IntervalTree()
     self.assertEqual([], empty_tree.find(100, 300))
     # The remaining list-returning lookups all take one position argument;
     # they are checked in their original order.
     single_arg_lookups = [
         "after",
         "before",
         "after_interval",
         "before_interval",
         "upstream_of_interval",
         "downstream_of_interval",
     ]
     for lookup in single_arg_lookups:
         self.assertEqual([], getattr(empty_tree, lookup)(100))
     # Traversing the empty tree is expected to return None.
     self.assertEqual(None, empty_tree.traverse(lambda x: x.append(1)))
Exemplo n.º 4
0
def plot_coverage(coords, bams):
    '''Plot read alignment coverage for each coordinate in each BAM file.

    Given the name of a DNA coordinates file and a list of BAM file names,
    one graph per coordinate is generated. The coverage of every BAM file
    for a given coordinate is plotted on the same graph.
    The coordinates file should be in TSV format.

    coords -- name of the coordinates file, passed to get_coords; each
              resulting entry is unpacked as (chrom, start, end) with
              1-based, inclusive start and end
    bams   -- iterable of BAM file names
    '''
    coords = get_coords(coords)
    for chrom, start, end in coords:
        logging.info("processing coord {} {} {}".format(chrom, start, end))
        # Start plotting the graph and generate a name for the output file
        graph_filename = start_graph(chrom, start, end)
        coords_range = range(start, end + 1)
        for bam_filename in bams:
            # interval tree tracks the start and end mapped coordinates
            # of each read in the bam file that lies within our region
            # of interest.
            interval_tree = IntervalTree()
            with pysam.Samfile(bam_filename, "rb") as bam:
                logging.info("processing bam file {}".format(bam_filename))
                # fetch takes a 0-based, half-open region while our input
                # coordinates are 1-based and inclusive: the start shifts
                # down by one and the (exclusive) stop equals the 1-based
                # end. Using end - 1 here would skip reads that overlap only
                # the last base of the region.
                reads = bam.fetch(chrom, start - 1, end)
                # Insert the start and end of each aligned read into the
                # interval tree.
                for read in reads:
                    # Read the property once; reads with no aligned
                    # positions are skipped.
                    positions = read.positions
                    if len(positions) > 0:
                        # Add 1 to convert from 0-based to 1-based coordinates
                        first_pos = positions[0] + 1
                        last_pos = positions[-1] + 1
                        interval_tree.add(first_pos, last_pos, None)
            # For each base position in our region of interest,
            # count the number of reads which overlap this position.
            # This computes the coverage for each position in the region.
            # NOTE(review): whether find(pos, pos) counts reads that start or
            # end exactly at pos depends on the IntervalTree implementation
            # in use -- confirm.
            counts = [
                len(interval_tree.find(pos, pos)) for pos in coords_range
            ]
            # Plot the coverage information for this bam file
            legend_text = bam_name_legend(bam_filename)
            plot_graph(counts, coords_range, legend_text)
        # Close the drawing of the graph for this set of coordinates
        end_graph(graph_filename)
Exemplo n.º 5
0
def plot_coverage(coords, bams):
    '''Plot read alignment coverage for each coordinate in each BAM file.

    Given the name of a DNA coordinates file and a list of BAM file names,
    one graph per coordinate is generated. The coverage of every BAM file
    for a given coordinate is plotted on the same graph.
    The coordinates file should be in TSV format.

    coords -- name of the coordinates file, passed to get_coords; each
              resulting entry is unpacked as (chrom, start, end) with
              1-based, inclusive start and end
    bams   -- iterable of BAM file names
    '''
    coords = get_coords(coords)
    for chrom, start, end in coords:
        logging.info("processing coord {} {} {}".format(chrom, start, end))
        # Start plotting the graph and generate a name for the output file
        graph_filename = start_graph(chrom, start, end)
        coords_range = range(start, end+1)
        for bam_filename in bams:
            # interval tree tracks the start and end mapped coordinates
            # of each read in the bam file that lies within our region
            # of interest.
            interval_tree = IntervalTree()
            with pysam.Samfile(bam_filename, "rb") as bam:
                logging.info("processing bam file {}".format(bam_filename))
                # fetch takes a 0-based, half-open region while our input
                # coordinates are 1-based and inclusive: the start shifts
                # down by one and the (exclusive) stop equals the 1-based
                # end. Using end-1 here would skip reads that overlap only
                # the last base of the region.
                reads = bam.fetch(chrom, start-1, end)
                # Insert the start and end of each aligned read into the
                # interval tree.
                for read in reads:
                    # Read the property once; reads with no aligned
                    # positions are skipped.
                    positions = read.positions
                    if len(positions) > 0:
                        # Add 1 to convert from 0-based to 1-based coordinates
                        first_pos = positions[0] + 1
                        last_pos = positions[-1] + 1
                        interval_tree.add(first_pos, last_pos, None)
            # For each base position in our region of interest,
            # count the number of reads which overlap this position.
            # This computes the coverage for each position in the region.
            # NOTE(review): whether find(pos, pos) counts reads that start or
            # end exactly at pos depends on the IntervalTree implementation
            # in use -- confirm.
            counts = [len(interval_tree.find(pos, pos))
                      for pos in coords_range]
            # Plot the coverage information for this bam file
            legend_text = bam_name_legend(bam_filename)
            plot_graph(counts, coords_range, legend_text)
        # Close the drawing of the graph for this set of coordinates
        end_graph(graph_filename)
Exemplo n.º 6
0
def parse_blast(blast_str, qfeat):
  """Return the BLAST hit intervals that intersect the query feature.

  blast_str -- tab-separated BLAST output, one hit per line; lines that
               contain "WARNING" or "ERROR" and blank lines are skipped
  qfeat     -- mapping with 'start' and 'end' entries convertible to int

  Columns 6-9 of every hit line are parsed as ints and any further columns
  as floats (a malformed number raises ValueError). The first two parsed
  coordinates, ordered low to high, are stored as one Interval. The return
  value is whatever the tree's find() yields for the ordered query span.
  """
  scns_interval = IntervalTree()
  for line in blast_str.split("\n"):
    if "WARNING" in line: continue
    if "ERROR" in line: continue
    # A trailing newline leaves an empty final line with no columns to parse.
    if not line.strip(): continue
    fields = line.split("\t")
    # list(...) so that extend() also works where map() returns an iterator.
    locus = list(map(int, fields[6:10]))
    locus.extend(map(float, fields[10:]))

    # Order both ends in one step: assigning the minimum to s_start before
    # computing the maximum would collapse a reversed hit to a single point.
    s_start, s_end = sorted(locus[:2])
    scns_interval.insert_interval(Interval(s_start, s_end))

  q_start, q_end = sorted((int(qfeat['start']), int(qfeat['end'])))
  intersecting_cns = scns_interval.find(q_start, q_end)
  return intersecting_cns
Exemplo n.º 7
0
class IntervalTreeOverlapDetector(OverlapDetector):
    """Overlap detector that keeps its segments in a bx-python IntervalTree."""

    def __init__(self, excludedSegments=None):
        """
        Create the tree and, when given, add every (start, end) pair of
        excludedSegments to it.
        """
        from bx.intervals.intersection import IntervalTree
        self._intervalTree = IntervalTree()
        if excludedSegments:
            for start, end in excludedSegments:
                # Go through the same retrying add as addSegment so the
                # sporadic bx-python failure cannot abort construction.
                self._addElementHandleBxPythonZeroDivisionException(start, end)

    def overlaps(self, start, end):
        """Return True when the tree finds at least one segment for (start, end)."""
        return bool(self._intervalTree.find(start, end))

    def addSegment(self, start, end):
        """Add the segment (start, end) to the tree, retrying on failure."""
        self._addElementHandleBxPythonZeroDivisionException(start, end)

    def _addElementHandleBxPythonZeroDivisionException(self,
                                                       start,
                                                       end,
                                                       nrTries=10):
        """
        DivisionByZero error is caused by a bug in the bx-python library.
        It happens rarely, so we just execute the add command again when it
        does. Every failure is logged as a warning; once more than nrTries
        attempts have failed, we assume something else is wrong and re-raise
        the last exception.
        """
        cnt = 0
        while True:
            cnt += 1
            try:
                self._intervalTree.add(start, end)
            except Exception as e:
                # NOTE(review): the documented cause is a division by zero;
                # the catch is left broad on purpose until the exact
                # exception type raised by bx-python is confirmed.
                from gold.application.LogSetup import logMessage, logging
                logMessage("Try nr %i. %s" % (cnt, str(e)), level=logging.WARN)
                if cnt > nrTries:
                    # Bare raise keeps the original traceback.
                    raise
                continue
            else:
                break
Exemplo n.º 8
0
def resolve_conflicts(pfam_hit_dict,minDomSize = 9,verbose=False):
    '''
    Resolve overlapping hits for one gene, then merge neighbouring hits that share an id.

    Hits are processed from highest to lowest score. A hit that overlaps an already accepted
    (higher scoring) hit is trimmed, dropped, or split in two around it; afterwards adjacent
    hits with the same id are merged and windows below the minimum size are removed.

    NOTE: pfam_hit_dict is modified in place -- when a hit is split, the original entry is
    deleted and its two remaining pieces are inserted.

    :param pfam_hit_dict: dictionary of hits for the gene in the following format
    hit start,hit end : int
    hit id : str
    score, model coverage percent : float
    {(hit start,hit end):('hit id',score,model coverage percent)}
    :param minDomSize: int, the minimum window size that will be considered a domain
    :param verbose: bool, print progress messages while resolving
    :return:
    a sorted dictionary with the position of the hit as the keys and ('hit id',score,model coverage percent)
    '''
    # initialize output
    # NOTE(review): gene_hits is created once, outside the restart loop below, so entries
    # accepted before a restart are still present when the pass is repeated -- confirm intended.
    gene_hits = SortedDict()
    # redoFlag: set when a hit was split, which requires re-sorting and starting the pass over
    redoFlag = True
    while redoFlag:
        if verbose: print("Sorting through intervals", pfam_hit_dict)
        redoFlag = False
        intervals_scores = [(key,value[1]) for key,value in pfam_hit_dict.items()]
        # sort intervals from pfam hits by score and place the highest score first
        intervals_scores.sort(key=itemgetter(1),reverse=True)
        # initialize intersect tree for quick overlap search
        intersectTree = IntervalTree()
        #add the intervals with the highest scores first
        for (interval,score) in intervals_scores:
            intervalStart = interval[0]
            intervalEnd = interval[1]
            intervalLength = intervalEnd-intervalStart+1
            # if the interval is less than the minimum domain size don't bother
            # NOTE(review): the comparison is strict, so a hit of exactly minDomSize is skipped
            # here, while the final cleanup only removes windows smaller than minDomSize -- confirm.
            if intervalLength > minDomSize:
                intersectingIntervals = [(x.start,x.end) for x in intersectTree.find(intervalStart,intervalEnd)]
                # overLapFlag: set when the hit lies entirely inside an accepted hit
                overLapFlag = False
                # for every interval that you're adding resolve the overlapping intervals
                while len(intersectingIntervals) > 0 and intervalLength > 1:

                    # only the first reported overlap is handled per iteration; the overlaps
                    # are recomputed after each trim
                    start,end = intersectingIntervals[0]

                    # interval completely covers existing coverage, break up into two intervals and redo the process
                    if (intervalStart < start and intervalEnd > end):
                        if verbose: print("Split Interval", interval,intersectingIntervals, pfam_hit_dict[interval])
                        # each piece keeps the id and score; its coverage is scaled by the
                        # piece's share of the current interval length
                        left_scale = calculate_window((intervalStart,start-1))/intervalLength
                        right_scale = calculate_window((end+1,intervalEnd))/intervalLength
                        pfam_hit_dict[(intervalStart,start-1)] = (pfam_hit_dict[interval][0],
                                                                  pfam_hit_dict[interval][1],
                                                                  pfam_hit_dict[interval][2] * left_scale)
                        pfam_hit_dict[(end+1,intervalEnd)] = (pfam_hit_dict[interval][0],
                                                              pfam_hit_dict[interval][1],
                                                              pfam_hit_dict[interval][2] * right_scale)
                        # delete original hit and iterate
                        del pfam_hit_dict[interval]
                        redoFlag = True
                        break
                    else:
                        #completely in the interval
                        if (intervalStart >= start and intervalEnd <= end):
                            #if completely overlapping then ignore since we already sorted by score
                            overLapFlag = True
                            break
                        #intersection covers the left hand side of the interval
                        elif intervalStart >= start:
                            intervalStart = end + 1
                        #intersection covers the right hand side of the interval
                        elif intervalEnd <= end:
                            intervalEnd = start - 1
                        # recalculate the interval length and see if there are still intersecting intervals
                        intervalLength = intervalEnd-intervalStart+1
                        intersectingIntervals = [(x.start,x.end) for x in intersectTree.find(intervalStart,intervalEnd)]

                if redoFlag:
                    if verbose: print("Exiting For Loop to Reinitialize",pfam_hit_dict)
                    break
                # if loop did not break because of an overlap add the annotation after resolving overlap,
                # check for minimum length after you merge intervals
                elif not overLapFlag and intervalLength > minDomSize:
                    if verbose: print("Adding Hit",(intervalStart,intervalEnd),pfam_hit_dict[interval][0])
                    # scale the hitCoverage based on the reduction this works since interval is a tuple and isn't mutated
                    hitCoverage = pfam_hit_dict[interval][2]*(intervalLength/(interval[1]-interval[0]+1.))
                    gene_hits[(intervalStart,intervalEnd)] = (pfam_hit_dict[interval][0],
                                                              pfam_hit_dict[interval][1],
                                                              hitCoverage)
                    # record the accepted (possibly trimmed) window so lower scoring hits are checked against it
                    intersectTree.add_interval(Interval(float(intervalStart),intervalEnd))
    if verbose: print("Merging Hits")
    # Merge Windows Right Next to one another that have the same pFam ID,
    # redoFlag: need to restart the process after a successful merge
    redoFlag = True
    while redoFlag:
        # compare each hit with its right-hand neighbour in sorted key order
        for idx in range(len(gene_hits)-1):
            left_hit = gene_hits.keys()[idx]
            right_hit = gene_hits.keys()[idx+1]
            left_window_size = calculate_window(left_hit)
            right_window_size = calculate_window(right_hit)
            merged_window_size = calculate_window((left_hit[0],right_hit[1]))
            # summed coverage of both hits, scaled by the fraction of the merged window they occupy
            new_coverage = (gene_hits[left_hit][2] + gene_hits[right_hit][2])*\
                           (left_window_size+ right_window_size)/merged_window_size
            # Will merge a hit under the following conditions:
            # 1. Gap between the two hits is less than the minimum domain
            # 2. Cumulative coverage of the two hits is less than 1 (this avoids merging repeats together)
            if right_hit[0]-left_hit[1] < minDomSize and gene_hits[left_hit][0] == gene_hits[right_hit][0] \
                    and new_coverage < 1:
                # merged score is the window-size weighted combination of the two scores
                gene_hits[(left_hit[0],right_hit[1])] = (gene_hits[left_hit][0],
                                                         left_window_size/merged_window_size * gene_hits[left_hit][1] +
                                                         right_window_size/merged_window_size * gene_hits[right_hit][1],
                                                         new_coverage)
                redoFlag = True
                del gene_hits[left_hit]
                del gene_hits[right_hit]
                if verbose: print("Merged", left_hit,right_hit)
                # gene_hits changed, so leave the for loop and rescan from the start
                break
        else:
            # the for loop finished without a merge: nothing left to merge
            redoFlag = False
    if verbose: print("Deleting Domains Under Minimum Domain Size")
    # Finally check if any of the domains are less than the minimum domain size
    keysToDelete = [coordinates for coordinates in gene_hits.keys() if calculate_window(coordinates) < minDomSize]
    for key in keysToDelete:
        del gene_hits[key]
        if verbose: print("Deleting",key)
    if verbose: print("Final Annotation", gene_hits)
    return gene_hits
Exemplo n.º 9
0
def annotate_igrs(genome, igr_df):
    """
    Annotate the inter-genic regions listed in a dataframe with any available annotations from Rfam

    Parameters
    ----------
    genome: src.data.rfam_db.Genome
        The genome object for the organism whose IGRs are being analyzed
    igr_df: pandas.Dataframe
        The dataframe with the columns 'accession', 'start', 'end', 'length', 'gc'.
        NOTE(review): it is also merged on "igr_index", so that must be
        available as a merge key -- confirm against the caller.

    Returns
    -------
    annotated_igr_df: pandas.Dataframe
        One row per IGR with the overlapping Rfam families' 'rfam_acc',
        'rfam_id', 'type' and 'description' joined into single strings, plus
        the added 'category' and 'log_length' columns.
    """

    # Initialize connection to Rfam database
    session = rfam_session()

    # try/finally so the session is closed even when a query or merge fails
    try:
        # Get the list of "rfamseq_acc" numbers for a given organism
        rfamseq_acc_list = session.query(t_genseq.c.rfamseq_acc).filter(
            t_genseq.c.upid == genome.upid).distinct().all()

        # Map each accession (version suffix stripped) to its interval tree
        annotation_tree_dict = {}

        for rfamseq_acc_row in rfamseq_acc_list:

            # Pull rfamseq_acc out of the single-column result row
            rfamseq_acc = rfamseq_acc_row[0]

            rna_query = session.query(t_full_region).filter(
                t_full_region.c.rfamseq_acc == rfamseq_acc)
            rna_list = rna_query.all()

            # Make an interval tree for all of the RNA annotations to allow for rapid overlap search
            annotation_tree = IntervalTree()

            # Go though and add each RNA annotation to the interval tree
            for rna in rna_list:
                # seq_start may be larger than seq_end, so order the two ends
                start = min(rna.seq_start, rna.seq_end)
                end = max(rna.seq_start, rna.seq_end)

                annotation_interval = Interval(start=start,
                                               end=end,
                                               chrom=rna.rfamseq_acc,
                                               value=rna)
                annotation_tree.insert_interval(annotation_interval)

            rfamseq_acc_stripped = rfamseq_acc.partition('.')[0]
            annotation_tree_dict[rfamseq_acc_stripped] = annotation_tree

        # Make an empty list of all the igrs with annotations
        annotated_igr_list = []
        for accession, accession_igr_df in igr_df.groupby('accession'):
            # Lookup the RNA annotation tree for the given accession
            try:
                annotation_tree = annotation_tree_dict[accession]
            except KeyError:
                print("IGR dataframe key: {} not found. Available keys are: {}".
                      format(accession, annotation_tree_dict.keys()))
                # No tree for this accession: skip it instead of searching a
                # tree left over from an earlier iteration.
                continue

            # For each IGR find all of the overlaps with annotated RNAs
            for igr in accession_igr_df.itertuples():

                overlap_list = annotation_tree.find(igr.start, igr.end)
                for overlap in overlap_list:
                    # Add the IGR to the annotated_igr_list
                    annotated_igr_list.append({
                        'igr_index': igr[0],
                        'rfam_acc': overlap.value.rfam_acc
                    })

        # Convert annotated_igr_list into dataframe and merge on the igr_index
        annotated_igr_df = pd.merge(igr_df,
                                    pd.DataFrame(annotated_igr_list,
                                                 columns=["igr_index",
                                                          "rfam_acc"]),
                                    on="igr_index",
                                    how='left')

        # Look up the information for all of the RNA families represented in this genome
        rna_family_query = session.query(Family)\
                                  .with_entities(Family.rfam_acc, Family.rfam_id, Family.description, Family.type)\
                                  .filter(Family.rfam_acc.in_(annotated_igr_df["rfam_acc"].dropna().unique()))
        rna_families_df = pd.read_sql(rna_family_query.statement,
                                      rna_family_query.session.bind)

        merged_igr_df = pd.merge(annotated_igr_df,
                                 rna_families_df,
                                 on="rfam_acc",
                                 how="left")

        # Collapse multiple overlaps of one IGR into a single joined string per column
        combined_descriptions = merged_igr_df.dropna().groupby("igr_index")\
                                                      .agg(dict(rfam_acc=lambda x: ','.join(set(x)),
                                                                rfam_id=lambda x: ','.join(set(x)),
                                                                type=lambda x: ','.join(set(x)),
                                                                description=lambda x: '<br>'.join(set(x))))
        # Keep one row per IGR, then overwrite it with the combined values
        merged_igr_df.drop_duplicates(["igr_index"], inplace=True)
        merged_igr_df.reset_index(inplace=True, drop=True)
        merged_igr_df.update(combined_descriptions)

        merged_igr_df["category"] = merged_igr_df.apply(categorize_igr, axis=1)

        merged_igr_df["log_length"] = np.log(merged_igr_df["length"])
        return merged_igr_df
    finally:
        session.close()
def _bx(es):
    """Add each entry of ``es`` to a fresh IntervalTree, querying the entry's own span after every add.

    Each entry is indexed as (start, end, ...) and stored as the interval's
    value. The hit count of each query is computed and discarded; nothing is
    returned.
    """
    tree = IntervalTree()
    for entry in es:
        lo, hi = entry[0], entry[1]
        tree.add(lo, hi, entry)
        hit_count = len(tree.find(lo, hi))
def _bx(es):
    """Fill a new IntervalTree from ``es`` and look up every entry's span right after inserting it.

    Entries are indexed as (start, end, ...); the whole entry is attached as
    the interval's value. The per-query hit count is computed but not kept,
    and the function returns None.
    """
    tree = IntervalTree()
    for item in es:
        begin = item[0]
        finish = item[1]
        tree.add(begin, finish, item)
        n_found = len(tree.find(begin, finish))