import collections
import logging
import operator

# Interval/cluster trees come from bx-python (the project may instead vendor
# these modules; adjust the import paths if so).
from bx.intervals.cluster import ClusterTree
from bx.intervals.intersection import Interval, IntervalTree

# Project-local helpers (GTFFeature, BEDFeature, Gene, parse_gtf, GTFAttr,
# strand constants, graph/trim utilities, Category, Match, MatchStats, etc.)
# are assumed to be imported from elsewhere in this package.


def read_reference_gtf(ref_gtf_file):
    gene_map = {}
    for f in GTFFeature.parse(open(ref_gtf_file)):
        # get gene by id
        gene_id = f.attrs["gene_id"]
        if gene_id not in gene_map:
            g = Gene()
            g.gene_id = gene_id
            g.chrom = f.seqid
            g.strand = f.strand
            g.gene_start = f.start
            g.gene_end = f.end
            gene_map[gene_id] = g
        else:
            g = gene_map[gene_id]
        # update gene bounds
        g.gene_start = min(g.gene_start, f.start)
        g.gene_end = max(g.gene_end, f.end)
        if f.feature_type == "exon":
            g.exons.add((f.start, f.end))
        elif f.feature_type == "CDS":
            g.is_coding = True
        if "gene_name" in f.attrs:
            g.gene_names.add(f.attrs["gene_name"])
        g.annotation_sources.add(f.source)
    logging.info("Sorting genes")
    genes = sorted(gene_map.values(),
                   key=operator.attrgetter('chrom', 'gene_start'))
    del gene_map
    # cluster genes into loci
    logging.debug("Building interval index")
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for i, g in enumerate(genes):
        locus_cluster_trees[g.chrom].insert(g.gene_start, g.gene_end, i)
    for chrom, cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, indexes in cluster_tree.getregions():
            # cluster gene exons and add to interval tree
            exon_tree = IntervalTree()
            for i in indexes:
                g = genes[i]
                # use a separately named ClusterTree so the locus-level
                # 'cluster_tree' loop variable is not clobbered
                exon_cluster_tree = ClusterTree(0, 1)
                for start, end in g.exons:
                    exon_cluster_tree.insert(start, end, 1)
                # replace the exon set with merged exon clusters
                exon_clusters = []
                for start, end, _ in exon_cluster_tree.getregions():
                    exon_clusters.append((start, end))
                g.exons = exon_clusters
                del exon_cluster_tree
                for start, end in g.exons:
                    exon_tree.insert_interval(Interval(start, end, value=g))
            # add to locus interval tree
            locus_trees[chrom].insert_interval(
                Interval(locus_start, locus_end, value=exon_tree))
    logging.debug("Done indexing reference GTF file")
    return locus_trees
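
# Usage sketch (not part of the original module): query the nested index
# returned by read_reference_gtf(). The outer IntervalTree stores loci; each
# hit's .value is an exon-level IntervalTree whose hits carry Gene objects.
# The file name and coordinates below are hypothetical.
def _example_reference_lookup():
    locus_trees = read_reference_gtf("ref.gtf")
    for locus_hit in locus_trees["chr1"].find(1000000, 1001000):
        exon_tree = locus_hit.value
        for exon_hit in exon_tree.find(1000000, 1001000):
            g = exon_hit.value
            print("%s %s:%d-%d" % (g.gene_id, g.chrom,
                                   g.gene_start, g.gene_end))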
def build_interval_tree_from_bed(bed_file):
    trees = collections.defaultdict(lambda: IntervalTree())
    for f in BEDFeature.parse(open(bed_file)):
        tree = trees[f.chrom]
        for start, end in f.exons:
            tree.insert_interval(Interval(start, end, strand=f.strand,
                                          value=f.name))
    return trees
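
# Usage sketch (assumed inputs): look up named BED features overlapping a
# region. Each Interval's .value holds the feature name and .strand its
# strand. "features.bed" and the query coordinates are hypothetical.
def _example_bed_lookup():
    trees = build_interval_tree_from_bed("features.bed")
    for hit in trees["chr1"].find(5000, 6000):
        print("%d-%d(%s) %s" % (hit.start, hit.end, hit.strand, hit.value))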
def build_locus_trees(gtf_file):
    transcripts = []
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            is_ref = bool(int(t.attrs[GTFAttr.REF]))
            if not is_ref:
                continue
            i = len(transcripts)
            transcripts.append(t)
            locus_cluster_trees[t.chrom].insert(t.start, t.end, i)
    # build interval trees of loci
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for chrom, cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, indexes in cluster_tree.getregions():
            # gather the locus transcripts and insert the locus interval
            # once (not once per transcript)
            locus_transcripts = [transcripts[i] for i in indexes]
            locus_trees[chrom].insert_interval(
                Interval(locus_start, locus_end, value=locus_transcripts))
    return locus_trees
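
# Usage sketch: find reference loci overlapping a query interval. Each locus
# hit's .value is the list of reference transcripts clustered into that
# locus. The chromosome and coordinates are hypothetical.
def _example_locus_lookup(gtf_file):
    locus_trees = build_locus_trees(gtf_file)
    for hit in locus_trees["chr1"].find(200000, 210000):
        for ref in hit.value:
            print(ref.attrs[GTFAttr.TRANSCRIPT_ID])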
def trim_graph(G, strand, min_trim_length, trim_utr_fraction,
               trim_intron_fraction):
    # get 'chains' of contiguous non-intron nodes with in/out degree
    # of one or less
    node_chain_map, chains = get_chains(G, introns=False)
    # setup dictionaries of predecessors and successors
    successor_dict = {}
    for n, nbrdict in G.adjacency_iter():
        successor_dict[n] = nbrdict.keys()
    predecessor_dict = {}
    G.reverse(copy=False)
    for n, nbrdict in G.adjacency_iter():
        predecessor_dict[n] = nbrdict.keys()
    G.reverse(copy=False)
    # setup intron data structures
    introns = {}
    intron_tree = IntervalTree()
    reverse = (strand == NEG_STRAND)
    for u, nbrdict in G.adjacency_iter():
        for v in nbrdict:
            if reverse:
                left, right = v, u
            else:
                left, right = u, v
            # skip contiguous nodes
            if left.end == right.start:
                continue
            # calculate score of the chains
            u_chain_nodes = chains[node_chain_map[u]]
            u_score = max(G.node[n][NODE_SCORE] for n in u_chain_nodes)
            v_chain_nodes = chains[node_chain_map[v]]
            v_score = max(G.node[n][NODE_SCORE] for n in v_chain_nodes)
            # store scores in intron data structures
            introns[(left.end, right.start)] = (u_score, v_score)
            intron_tree.insert_interval(
                Interval(left.end, right.start, value=(u_score, v_score)))
    # trim chains
    all_trim_nodes = set()
    for parent, nodes in chains.iteritems():
        if strand == NEG_STRAND:
            nodes.reverse()
        in_degree = len(predecessor_dict[nodes[0]])
        out_degree = len(successor_dict[nodes[-1]])
        trim_nodes = set()
        if ((in_degree == 1) and (out_degree == 1) and
            (parent.start, parent.end) in introns):
            # intron retention - a chain of nodes precisely matches an
            # intron, so we can potentially remove the entire chain
            pred_score, succ_score = introns[(parent.start, parent.end)]
            cutoff_score = trim_intron_fraction * max(pred_score, succ_score)
            trim_nodes.update(trim_intron(G, nodes, cutoff_score))
        else:
            # determine whether this node chain is intronic. intronic node
            # chains are trimmed more strictly due to intronic pre-mrna
            found_intron = False
            max_pred_score = 0.0
            max_succ_score = 0.0
            for hit in intron_tree.find(parent.start, parent.end):
                # ignore contained introns
                if (hit.start > parent.start) and (hit.end < parent.end):
                    continue
                # set intron flag and keep track of highest coverage
                # overlapping intron to make trimming conservative
                found_intron = True
                pred_score, succ_score = hit.value
                if pred_score > max_pred_score:
                    max_pred_score = pred_score
                if succ_score > max_succ_score:
                    max_succ_score = succ_score
            if (in_degree == 0) and (out_degree == 0):
                if found_intron:
                    cutoff_score = trim_intron_fraction * max(max_pred_score,
                                                              max_succ_score)
                    trim_nodes.update(trim_intron(G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_bidirectional(G, nodes, min_trim_length,
                                       trim_utr_fraction))
            elif in_degree == 0:
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_succ_score
                    trim_nodes.update(trim_intronic_utr(G, nodes,
                                                        cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes[::-1], min_trim_length,
                             trim_utr_fraction))
            elif out_degree == 0:
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_pred_score
                    trim_nodes.update(
                        trim_intronic_utr(G, nodes[::-1], cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes, min_trim_length, trim_utr_fraction))
        all_trim_nodes.update(trim_nodes)
    if len(all_trim_nodes) > 0:
        logging.debug("\t\t(%s) trimmed %d/%d nodes from graph" %
                      (strand_int_to_str(strand), len(all_trim_nodes),
                       len(G)))
    return all_trim_nodes
def annotate_locus(transcripts, gtf_sample_attr):
    # store reference introns
    # (strand,start,end) -> ids (set)
    ref_intron_dict = collections.defaultdict(lambda: [])
    ref_node_dict = collections.defaultdict(lambda: ([], []))
    node_score_dict = collections.defaultdict(lambda: [0.0, 0.0])
    all_introns = set()
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    # add transcript to intron and graph data structures
    inp_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n][t.strand].append(t)
            # add to introns
            for start, end in t.iterintrons():
                ref_intron_dict[(t.strand, start, end)].append(t)
                all_introns.add((t.strand, start, end))
        else:
            if t.strand != NO_STRAND:
                score = float(t.attrs[GTFAttr.SCORE])
                for n in split_exons(t, boundaries):
                    node_score_dict[n][t.strand] += score
            inp_transcripts.append(t)
            # add to introns
            for start, end in t.iterintrons():
                all_introns.add((t.strand, start, end))
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for strand, start, end in all_introns:
        intron_tree.insert_interval(Interval(start, end, strand=strand))
    del all_introns
    # categorize transcripts
    strand_transcript_lists = [[], [], []]
    for t in inp_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = set(t.iterintrons())
        # try to resolve strand
        strand = t.strand
        if strand == NO_STRAND:
            strand = resolve_strand(nodes, node_score_dict, ref_node_dict)
        # define opposite strand
        if strand == NO_STRAND:
            opp_strand = NO_STRAND
        else:
            opp_strand = (strand + 1) % 2
        # get all reference transcripts that share introns
        intron_ref_dict = {}
        for start, end in introns:
            if (strand, start, end) in ref_intron_dict:
                refs = ref_intron_dict[(strand, start, end)]
                intron_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref) for ref in refs)
        intron_refs = []
        for ref in intron_ref_dict.itervalues():
            intron_refs.append((ref, list(split_exons(ref, boundaries))))
        # get all reference transcripts that share coverage
        same_strand_ref_dict = {}
        opp_strand_ref_dict = {}
        for n in nodes:
            if n in ref_node_dict:
                strand_refs = ref_node_dict[n]
                same_strand_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                    for ref in strand_refs[strand])
                opp_strand_ref_dict.update(
                    (ref.attrs[GTFAttr.TRANSCRIPT_ID], ref)
                    for ref in strand_refs[opp_strand])
        same_strand_refs = []
        for ref in same_strand_ref_dict.itervalues():
            same_strand_refs.append((ref, list(split_exons(ref, boundaries))))
        opp_strand_refs = []
        for ref in opp_strand_ref_dict.itervalues():
            opp_strand_refs.append((ref, list(split_exons(ref, boundaries))))
        # categorize
        cinf = categorize_transcript(t, nodes, introns, intron_refs,
                                     same_strand_refs, opp_strand_refs,
                                     intron_tree, ignore_test=False)
        if cinf.is_test:
            # recategorize test transcripts
            cinf2 = categorize_transcript(t, nodes, introns, intron_refs,
                                          same_strand_refs, opp_strand_refs,
                                          intron_tree, ignore_test=True)
            cinf = cinf._replace(category=cinf2.category)
        # add annotation attributes
        best_ref_id = (cinf.ref.attrs[GTFAttr.TRANSCRIPT_ID]
                       if cinf.ref is not None else 'na')
        t.attrs[GTFAttr.CATEGORY] = cinf.category
        t.attrs[GTFAttr.TEST] = '1' if cinf.is_test else '0'
        t.attrs[GTFAttr.ANN_REF_ID] = best_ref_id
        t.attrs[GTFAttr.ANN_COV_RATIO] = cinf.ann_cov_ratio
        t.attrs[GTFAttr.ANN_INTRON_RATIO] = cinf.ann_intron_ratio
        # group transcripts by strand
        strand_transcript_lists[strand].append(t)
    # explicitly delete large data structures
    del ref_intron_dict
    del ref_node_dict
    del node_score_dict
    del intron_tree
    del inp_transcripts
    # annotate score and recurrence for transcripts
    for strand_transcripts in strand_transcript_lists:
        # find the intron domains of the transcripts
        boundaries = find_exon_boundaries(strand_transcripts)
        # gather node score/recurrence data
        new_data_func = lambda: {'ids': set(), 'score': 0.0, 'pct': 0.0}
        node_data = collections.defaultdict(new_data_func)
        for t in strand_transcripts:
            sample_id = t.attrs[gtf_sample_attr]
            score = float(t.attrs[GTFAttr.SCORE])
            pctrank = float(t.attrs[GTFAttr.PCTRANK])
            # split exons that cross boundaries to get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                nd = node_data[n]
                nd['ids'].add(sample_id)
                nd['score'] += score
                nd['pct'] += pctrank
        # calculate recurrence and score statistics
        for t in strand_transcripts:
            nodes = list(split_exons(t, boundaries))
            mean_score, mean_pctrank, mean_recur = \
                compute_recurrence_and_score(nodes, node_data)
            t.attrs[GTFAttr.MEAN_SCORE] = mean_score
            t.attrs[GTFAttr.MEAN_PCTRANK] = mean_pctrank
            t.attrs[GTFAttr.MEAN_RECURRENCE] = mean_recur
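
# Usage sketch: annotate_locus() mutates transcript attributes in place, so
# a driver loop only needs to feed it one locus at a time. The sample-id
# attribute name ("sample_id") is an assumption for illustration.
def _example_annotate(gtf_file):
    for locus_transcripts in parse_gtf(open(gtf_file)):
        annotate_locus(locus_transcripts, gtf_sample_attr="sample_id")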
def compare_locus(transcripts):
    # store reference introns
    # (strand,start,end) -> ids (set)
    ref_intron_dict = collections.defaultdict(lambda: [])
    ref_node_dict = collections.defaultdict(lambda: [])
    ref_splicing_patterns = collections.defaultdict(lambda: [])
    ref_dict = {}
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    test_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # add to dict
            ref_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
            ref_dict[ref_id] = t
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n].append(t)
            # add to introns
            splicing_pattern = []
            for start, end in t.iterintrons():
                intron = (t.strand, start, end)
                ref_intron_dict[intron].append(t)
                splicing_pattern.append(intron)
            # add to splicing patterns
            if len(splicing_pattern) > 0:
                ref_splicing_patterns[tuple(splicing_pattern)].append(t)
        else:
            test_transcripts.append(t)
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for intron, refs in ref_intron_dict.iteritems():
        strand, start, end = intron
        intron_tree.insert_interval(
            Interval(start, end, strand=strand, value=refs))
    # categorize transcripts
    for t in test_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = []
        for start, end in t.iterintrons():
            introns.append((t.strand, start, end))
        splicing_pattern = tuple(introns)
        # keep list of all matching ref transcripts
        matches = collections.defaultdict(lambda: Match())
        # dict of reference transcripts -> category -> list of nodes
        for n in nodes:
            if n in ref_node_dict:
                # look for reference transcripts that share this node
                for ref in ref_node_dict[n]:
                    if cmp_strand(t.strand, ref.strand):
                        c = Category.SAME_STRAND
                    else:
                        c = Category.OPP_STRAND
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
            # look for reference introns that overlap this node
            for hit in intron_tree.find(*n):
                if cmp_strand(t.strand, hit.strand):
                    c = Category.INTRONIC_SAME_STRAND
                else:
                    c = Category.INTRONIC_OPP_STRAND
                for ref in hit.value:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
        # dict of introns -> list of reference transcripts
        for intron in introns:
            if intron in ref_intron_dict:
                for ref in ref_intron_dict[intron]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.introns.append(intron)
        # check splicing pattern matches
        if len(splicing_pattern) > 0:
            if splicing_pattern in ref_splicing_patterns:
                for ref in ref_splicing_patterns[splicing_pattern]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.splicing = True
        # go through the matches for this transcript and determine
        # the transcript category
        match_stats = []
        for ref_id, m in matches.iteritems():
            ref = ref_dict[ref_id]
            # calculate coverage
            same_strand_bp = sum((n[1] - n[0])
                                 for n in m.nodes[Category.SAME_STRAND])
            opp_strand_bp = sum((n[1] - n[0])
                                for n in m.nodes[Category.OPP_STRAND])
            # count shared introns
            num_shared_introns = len(m.introns)
            # decide category for this test/ref transcript pair
            if m.splicing or (num_shared_introns > 0) or (same_strand_bp > 0):
                c = Category.SAME_STRAND
            elif opp_strand_bp > 0:
                c = Category.OPP_STRAND
            else:
                # count nodes of different types
                num_same_strand = len(m.nodes[Category.SAME_STRAND])
                num_opp_strand = len(m.nodes[Category.OPP_STRAND])
                num_intronic_same_strand = len(
                    m.nodes[Category.INTRONIC_SAME_STRAND])
                num_intronic_opp_strand = len(
                    m.nodes[Category.INTRONIC_OPP_STRAND])
                assert num_same_strand == 0
                assert num_opp_strand == 0
                num_intronic = (num_intronic_same_strand +
                                num_intronic_opp_strand)
                assert num_intronic > 0
                if num_intronic == len(nodes):
                    # completely intronic
                    if num_intronic_same_strand > 0:
                        c = Category.INTRONIC_SAME_STRAND
                    else:
                        c = Category.INTRONIC_OPP_STRAND
                else:
                    # interleaving: some nodes intronic, others intergenic
                    if num_intronic_same_strand > 0:
                        c = Category.INTERLEAVING_SAME_STRAND
                    else:
                        c = Category.INTERLEAVING_OPP_STRAND
            # create a match object
            ms = MatchStats.from_transcript(t, ref)
            ms.shared_same_strand_bp = same_strand_bp
            ms.shared_opp_strand_bp = opp_strand_bp
            ms.shared_introns = num_shared_introns
            ms.shared_splicing = m.splicing
            ms.category = Category.to_str(c)
            ms.distance = 0
            match_stats.append(ms)
        yield (t, match_stats)
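
# Usage sketch: compare_locus() is a generator yielding one
# (test_transcript, [MatchStats, ...]) pair per non-reference transcript.
# Only fields set above (category, shared_introns) are printed here.
def _example_compare(gtf_file):
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t, match_stats in compare_locus(locus_transcripts):
            for ms in match_stats:
                print("%s %s shared_introns=%d" %
                      (t.attrs[GTFAttr.TRANSCRIPT_ID], ms.category,
                       ms.shared_introns))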