def trim_graph(G, strand, min_trim_length, trim_utr_fraction,
               trim_intron_fraction):
    """Trim low-scoring nodes from the chains of a transcript graph.

    Finds 'chains' of contiguous non-intron nodes, scores the introns
    separating neighboring chains, and trims chain ends (UTR-like tips,
    bidirectional ends) and putative intron-retention chains whose node
    scores fall below fraction-based cutoffs.

    Parameters:
        G -- directed graph (networkx-style: supports adjacency_iter(),
             reverse(copy=False), and per-node NODE_SCORE attributes);
             nodes look like intervals with .start/.end attributes
        strand -- strand code; compared against NEG_STRAND to decide
             orientation
        min_trim_length -- minimum length passed through to the trim_*
             helpers (semantics defined by those helpers)
        trim_utr_fraction -- score fraction cutoff for UTR trimming
        trim_intron_fraction -- score fraction cutoff for intronic trimming

    Returns:
        set of graph nodes selected for removal (G itself is not modified
        here beyond the temporary in-place reversal, which is undone).
    """
    # get 'chains' of contiguous non-intron nodes with edge degree of
    # one or less
    node_chain_map, chains = get_chains(G, introns=False)
    # setup dictionaries of predecessors and successors
    successor_dict = {}
    for n, nbrdict in G.adjacency_iter():
        successor_dict[n] = nbrdict.keys()
    predecessor_dict = {}
    # reverse the graph IN PLACE to enumerate predecessors via the same
    # adjacency iterator, then restore the original orientation
    G.reverse(copy=False)
    for n, nbrdict in G.adjacency_iter():
        predecessor_dict[n] = nbrdict.keys()
    G.reverse(copy=False)
    # setup intron data structures
    introns = {}
    intron_tree = IntervalTree()
    reverse = (strand == NEG_STRAND)
    for u, nbrdict in G.adjacency_iter():
        for v in nbrdict:
            # orient the edge so 'left' genomically precedes 'right'
            if reverse:
                left, right = v, u
            else:
                left, right = u, v
            # skip contiguous nodes (no gap means no intron between them)
            if left.end == right.start:
                continue
            # calculate score of the chains: best node score in the chain
            # containing each endpoint
            u_chain_nodes = chains[node_chain_map[u]]
            u_score = max(G.node[n][NODE_SCORE] for n in u_chain_nodes)
            v_chain_nodes = chains[node_chain_map[v]]
            v_score = max(G.node[n][NODE_SCORE] for n in v_chain_nodes)
            # store scores in intron data structures, keyed by the gap
            # coordinates (left.end, right.start)
            introns[(left.end, right.start)] = (u_score, v_score)
            intron_tree.insert_interval(
                Interval(left.end, right.start, value=(u_score, v_score)))
    # trim chains
    all_trim_nodes = set()
    for parent, nodes in chains.iteritems():
        if strand == NEG_STRAND:
            # NOTE: in-place reversal so 'nodes[0]' is always the 5' end
            nodes.reverse()
        in_degree = len(predecessor_dict[nodes[0]])
        out_degree = len(successor_dict[nodes[-1]])
        trim_nodes = set()
        if ((in_degree == 1) and (out_degree == 1) and
            (parent.start, parent.end) in introns):
            # intron retention - a chain of nodes precisely matches an
            # intron, so we can potentially remove the entire chain
            pred_score, succ_score = introns[(parent.start, parent.end)]
            cutoff_score = trim_intron_fraction * max(pred_score, succ_score)
            trim_nodes.update(trim_intron(G, nodes, cutoff_score))
        else:
            # determine whether this node chain is intronic. intronic node
            # chains are trimmed more strictly due to intronic pre-mrna
            found_intron = False
            max_pred_score = 0.0
            max_succ_score = 0.0
            for hit in intron_tree.find(parent.start, parent.end):
                # ignore contained introns
                if (hit.start > parent.start) and (hit.end < parent.end):
                    continue
                # set intron flag and keep track of highest coverage
                # overlapping intron to make trimming conservative
                found_intron = True
                pred_score, succ_score = hit.value
                if pred_score > max_pred_score:
                    max_pred_score = pred_score
                if succ_score > max_succ_score:
                    max_succ_score = succ_score
            if (in_degree == 0) and (out_degree == 0):
                # isolated chain: trim both directions
                if found_intron:
                    cutoff_score = trim_intron_fraction * max(
                        max_pred_score, max_succ_score)
                    trim_nodes.update(trim_intron(G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_bidirectional(G, nodes, min_trim_length,
                                       trim_utr_fraction))
            elif in_degree == 0:
                # chain starts a path: its 5' end behaves like a UTR
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_succ_score
                    trim_nodes.update(trim_intronic_utr(
                        G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes[::-1], min_trim_length,
                             trim_utr_fraction))
            elif out_degree == 0:
                # chain ends a path: its 3' end behaves like a UTR
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_pred_score
                    trim_nodes.update(
                        trim_intronic_utr(G, nodes[::-1], cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes, min_trim_length, trim_utr_fraction))
        all_trim_nodes.update(trim_nodes)
    if len(all_trim_nodes) > 0:
        logging.debug("\t\t(%s) trimmed %d/%d nodes from graph" %
                      (strand_int_to_str(strand), len(all_trim_nodes),
                       len(G)))
    return all_trim_nodes
def compare_locus(transcripts):
    """Compare non-reference transcripts in a locus to the reference set.

    Builds node/intron/splicing-pattern indexes from the reference
    transcripts, then classifies each non-reference ("test") transcript
    against every reference it overlaps.

    Parameters:
        transcripts -- iterable of transcript objects; each must provide
            .attrs (keyed by GTFAttr.REF / GTFAttr.TRANSCRIPT_ID),
            .strand, and .iterintrons()

    Yields:
        (t, match_stats) for each test transcript, where match_stats is a
        list of MatchStats objects, one per matching reference transcript.
    """
    # store reference introns
    # (strand,start,end) -> ids (set)
    # NOTE: defaultdict(list) replaces the equivalent but wasteful
    # defaultdict(lambda: []) idiom
    ref_intron_dict = collections.defaultdict(list)
    ref_node_dict = collections.defaultdict(list)
    ref_splicing_patterns = collections.defaultdict(list)
    ref_dict = {}
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    test_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # add to dict
            ref_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
            ref_dict[ref_id] = t
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n].append(t)
            # add to introns
            splicing_pattern = []
            for start, end in t.iterintrons():
                intron = (t.strand, start, end)
                ref_intron_dict[intron].append(t)
                splicing_pattern.append(intron)
            # add to splicing patterns (only spliced transcripts)
            if len(splicing_pattern) > 0:
                ref_splicing_patterns[tuple(splicing_pattern)].append(t)
        else:
            test_transcripts.append(t)
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for intron, refs in ref_intron_dict.iteritems():
        strand, start, end = intron
        intron_tree.insert_interval(
            Interval(start, end, strand=strand, value=refs))
    # categorize transcripts
    for t in test_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = [(t.strand, start, end) for start, end in t.iterintrons()]
        splicing_pattern = tuple(introns)
        # keep list of all matching ref transcripts
        matches = collections.defaultdict(Match)
        # dict of reference transcripts -> category -> list of nodes
        for n in nodes:
            if n in ref_node_dict:
                # look for reference transcripts that share this node
                for ref in ref_node_dict[n]:
                    if cmp_strand(t.strand, ref.strand):
                        c = Category.SAME_STRAND
                    else:
                        c = Category.OPP_STRAND
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
            # look for reference introns that overlap this node
            for hit in intron_tree.find(*n):
                if cmp_strand(t.strand, hit.strand):
                    c = Category.INTRONIC_SAME_STRAND
                else:
                    c = Category.INTRONIC_OPP_STRAND
                for ref in hit.value:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
        # dict of introns -> list of reference transcripts
        for intron in introns:
            if intron in ref_intron_dict:
                for ref in ref_intron_dict[intron]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.introns.append(intron)
        # check splicing pattern matches (empty patterns are never keys)
        if splicing_pattern and splicing_pattern in ref_splicing_patterns:
            for ref in ref_splicing_patterns[splicing_pattern]:
                ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                m = matches[ref_id]
                m.splicing = True
        # go through the matches for this transcript and determine
        # the transcript category
        match_stats = []
        for ref_id, m in matches.iteritems():
            ref = ref_dict[ref_id]
            # calculate coverage (bp shared with the reference)
            same_strand_bp = sum(
                (n[1] - n[0]) for n in m.nodes[Category.SAME_STRAND])
            opp_strand_bp = sum(
                (n[1] - n[0]) for n in m.nodes[Category.OPP_STRAND])
            # count shared introns
            num_shared_introns = len(m.introns)
            # decide category for this test/ref transcript pair
            if m.splicing or (num_shared_introns > 0) or (same_strand_bp > 0):
                c = Category.SAME_STRAND
            elif (opp_strand_bp > 0):
                c = Category.OPP_STRAND
            else:
                # count nodes of different types; invariants below hold
                # because the exonic-overlap branches were not taken
                num_same_strand = len(m.nodes[Category.SAME_STRAND])
                num_opp_strand = len(m.nodes[Category.OPP_STRAND])
                num_intronic_same_strand = len(
                    m.nodes[Category.INTRONIC_SAME_STRAND])
                num_intronic_opp_strand = len(
                    m.nodes[Category.INTRONIC_OPP_STRAND])
                assert num_same_strand == 0
                assert num_opp_strand == 0
                num_intronic = (num_intronic_same_strand +
                                num_intronic_opp_strand)
                assert num_intronic > 0
                if (num_intronic == len(nodes)):
                    # completely intronic
                    if num_intronic_same_strand > 0:
                        c = Category.INTRONIC_SAME_STRAND
                    else:
                        c = Category.INTRONIC_OPP_STRAND
                else:
                    # interleaving means some nodes intronic and other
                    # intergenic
                    if num_intronic_same_strand > 0:
                        c = Category.INTERLEAVING_SAME_STRAND
                    else:
                        c = Category.INTERLEAVING_OPP_STRAND
            # create a match object
            ms = MatchStats.from_transcript(t, ref)
            ms.shared_same_strand_bp = same_strand_bp
            ms.shared_opp_strand_bp = opp_strand_bp
            ms.shared_introns = num_shared_introns
            ms.shared_splicing = m.splicing
            ms.category = Category.to_str(c)
            ms.distance = 0
            match_stats.append(ms)
        yield (t, match_stats)
def trim_graph(G, strand, min_trim_length, trim_utr_fraction,
               trim_intron_fraction):
    """Select low-scoring graph nodes to trim from chain ends and introns.

    Chains of contiguous non-intron nodes are examined one at a time:
    chains that exactly span an intron are candidates for intron-retention
    trimming, while chains at path boundaries are trimmed like UTRs.
    Returns the set of nodes chosen for removal.
    """
    # chains of contiguous, non-intron nodes with internal degree <= 1
    node_chain_map, chains = get_chains(G, introns=False)
    # snapshot successors, then reverse the graph in place to snapshot
    # predecessors, and restore the original edge orientation
    successor_dict = dict((n, nbrdict.keys())
                          for n, nbrdict in G.adjacency_iter())
    G.reverse(copy=False)
    predecessor_dict = dict((n, nbrdict.keys())
                            for n, nbrdict in G.adjacency_iter())
    G.reverse(copy=False)
    # collect every intron (gap between non-contiguous neighbors) along
    # with the best chain score on each side of the gap
    introns = {}
    intron_tree = IntervalTree()
    is_neg = (strand == NEG_STRAND)
    for u, nbrdict in G.adjacency_iter():
        for v in nbrdict:
            # orient so 'left' genomically precedes 'right'
            left, right = (v, u) if is_neg else (u, v)
            if left.end == right.start:
                # nodes abut: no intron here
                continue
            u_score = max(G.node[n][NODE_SCORE]
                          for n in chains[node_chain_map[u]])
            v_score = max(G.node[n][NODE_SCORE]
                          for n in chains[node_chain_map[v]])
            introns[(left.end, right.start)] = (u_score, v_score)
            intron_tree.insert_interval(
                Interval(left.end, right.start, value=(u_score, v_score)))
    # examine each chain and accumulate nodes to trim
    all_trim_nodes = set()
    for parent, nodes in chains.iteritems():
        if is_neg:
            # in-place reversal so nodes[0] is always the 5' end
            nodes.reverse()
        in_degree = len(predecessor_dict[nodes[0]])
        out_degree = len(successor_dict[nodes[-1]])
        trim_nodes = set()
        key = (parent.start, parent.end)
        if in_degree == 1 and out_degree == 1 and key in introns:
            # chain exactly matches an intron: possible intron retention,
            # so the whole chain may be removable
            pred_score, succ_score = introns[key]
            cutoff = trim_intron_fraction * max(pred_score, succ_score)
            trim_nodes.update(trim_intron(G, nodes, cutoff))
        else:
            # look for the highest-scoring intron overlapping (but not
            # contained in) this chain; intronic chains are trimmed more
            # strictly because of intronic pre-mRNA
            found_intron = False
            max_pred = 0.0
            max_succ = 0.0
            for hit in intron_tree.find(parent.start, parent.end):
                if hit.start > parent.start and hit.end < parent.end:
                    continue  # intron fully inside the chain: ignore
                found_intron = True
                pred_score, succ_score = hit.value
                max_pred = max(max_pred, pred_score)
                max_succ = max(max_succ, succ_score)
            if in_degree == 0 and out_degree == 0:
                # isolated chain: trim from both ends
                if found_intron:
                    cutoff = trim_intron_fraction * max(max_pred, max_succ)
                    trim_nodes.update(trim_intron(G, nodes, cutoff))
                trim_nodes.update(
                    trim_bidirectional(G, nodes, min_trim_length,
                                       trim_utr_fraction))
            elif in_degree == 0:
                # chain begins a path: treat the 5' end as a UTR
                if found_intron:
                    cutoff = trim_intron_fraction * max_succ
                    trim_nodes.update(trim_intronic_utr(G, nodes, cutoff))
                trim_nodes.update(
                    trim_utr(G, nodes[::-1], min_trim_length,
                             trim_utr_fraction))
            elif out_degree == 0:
                # chain ends a path: treat the 3' end as a UTR
                if found_intron:
                    cutoff = trim_intron_fraction * max_pred
                    trim_nodes.update(
                        trim_intronic_utr(G, nodes[::-1], cutoff))
                trim_nodes.update(
                    trim_utr(G, nodes, min_trim_length, trim_utr_fraction))
        all_trim_nodes.update(trim_nodes)
    if len(all_trim_nodes) > 0:
        logging.debug("\t\t(%s) trimmed %d/%d nodes from graph" %
                      (strand_int_to_str(strand), len(all_trim_nodes),
                       len(G)))
    return all_trim_nodes
def compare_locus(transcripts):
    """Classify each non-reference transcript against the references.

    For every non-reference ("test") transcript in the locus, find all
    reference transcripts it overlaps via shared exon nodes, overlapping
    introns, shared introns, or an identical splicing pattern, and yield
    (test_transcript, list_of_MatchStats).
    """
    # --- reference indexes ---------------------------------------------
    ref_introns = collections.defaultdict(list)   # (strand,start,end) -> refs
    ref_nodes = collections.defaultdict(list)     # exon node -> refs
    ref_patterns = collections.defaultdict(list)  # intron tuple -> refs
    refs_by_id = {}
    # exon boundaries across the whole locus define the node coordinates
    boundaries = find_exon_boundaries(transcripts)
    tests = []
    for tx in transcripts:
        if not bool(int(tx.attrs[GTFAttr.REF])):
            tests.append(tx)
            continue
        rid = tx.attrs[GTFAttr.TRANSCRIPT_ID]
        refs_by_id[rid] = tx
        # register every node (boundary-split exon piece) of this ref
        for node in split_exons(tx, boundaries):
            ref_nodes[node].append(tx)
        # register every intron and the full splicing pattern
        pattern = []
        for start, end in tx.iterintrons():
            ikey = (tx.strand, start, end)
            ref_introns[ikey].append(tx)
            pattern.append(ikey)
        if len(pattern) > 0:
            ref_patterns[tuple(pattern)].append(tx)
    # interval tree over reference introns for fast overlap queries
    itree = IntervalTree()
    for ikey, refs in ref_introns.iteritems():
        istrand, istart, iend = ikey
        itree.insert_interval(
            Interval(istart, iend, strand=istrand, value=refs))
    # --- categorize each test transcript -------------------------------
    for tx in tests:
        tx_nodes = list(split_exons(tx, boundaries))
        tx_introns = [(tx.strand, start, end)
                      for start, end in tx.iterintrons()]
        pattern = tuple(tx_introns)
        # reference transcript id -> Match accumulator
        matches = collections.defaultdict(Match)
        for node in tx_nodes:
            if node in ref_nodes:
                # references sharing this exact node
                for ref_tx in ref_nodes[node]:
                    cat = (Category.SAME_STRAND
                           if cmp_strand(tx.strand, ref_tx.strand)
                           else Category.OPP_STRAND)
                    rid = ref_tx.attrs[GTFAttr.TRANSCRIPT_ID]
                    matches[rid].nodes[cat].append(node)
            # references whose introns overlap this node
            for iv in itree.find(*node):
                cat = (Category.INTRONIC_SAME_STRAND
                       if cmp_strand(tx.strand, iv.strand)
                       else Category.INTRONIC_OPP_STRAND)
                for ref_tx in iv.value:
                    rid = ref_tx.attrs[GTFAttr.TRANSCRIPT_ID]
                    matches[rid].nodes[cat].append(node)
        # introns shared exactly with a reference
        for ikey in tx_introns:
            if ikey in ref_introns:
                for ref_tx in ref_introns[ikey]:
                    rid = ref_tx.attrs[GTFAttr.TRANSCRIPT_ID]
                    matches[rid].introns.append(ikey)
        # identical full splicing pattern
        if len(pattern) > 0 and pattern in ref_patterns:
            for ref_tx in ref_patterns[pattern]:
                rid = ref_tx.attrs[GTFAttr.TRANSCRIPT_ID]
                matches[rid].splicing = True
        # decide a category for every (test, ref) pair
        match_stats = []
        for rid, match in matches.iteritems():
            ref_tx = refs_by_id[rid]
            # base pairs shared on each strand
            same_bp = sum(node[1] - node[0]
                          for node in match.nodes[Category.SAME_STRAND])
            opp_bp = sum(node[1] - node[0]
                         for node in match.nodes[Category.OPP_STRAND])
            n_shared_introns = len(match.introns)
            if match.splicing or n_shared_introns > 0 or same_bp > 0:
                cat = Category.SAME_STRAND
            elif opp_bp > 0:
                cat = Category.OPP_STRAND
            else:
                # no exonic overlap: only intronic overlap can remain
                n_int_same = len(match.nodes[Category.INTRONIC_SAME_STRAND])
                n_int_opp = len(match.nodes[Category.INTRONIC_OPP_STRAND])
                assert len(match.nodes[Category.SAME_STRAND]) == 0
                assert len(match.nodes[Category.OPP_STRAND]) == 0
                n_intronic = n_int_same + n_int_opp
                assert n_intronic > 0
                if n_intronic == len(tx_nodes):
                    # every node of the test transcript lies inside introns
                    cat = (Category.INTRONIC_SAME_STRAND if n_int_same > 0
                           else Category.INTRONIC_OPP_STRAND)
                else:
                    # interleaving: some nodes intronic, others intergenic
                    cat = (Category.INTERLEAVING_SAME_STRAND
                           if n_int_same > 0
                           else Category.INTERLEAVING_OPP_STRAND)
            stats = MatchStats.from_transcript(tx, ref_tx)
            stats.shared_same_strand_bp = same_bp
            stats.shared_opp_strand_bp = opp_bp
            stats.shared_introns = n_shared_introns
            stats.shared_splicing = match.splicing
            stats.category = Category.to_str(cat)
            stats.distance = 0
            match_stats.append(stats)
        yield (tx, match_stats)