Example No. 1
def test_merge_overlaps_reducer_with_initializer():
    def reducer(old, new):
        return old + [new]
    # empty tree
    e = IntervalTree()
    e.merge_overlaps(data_reducer=reducer, data_initializer=[])
    e.verify()
    assert not e

    # One Interval in tree
    o = IntervalTree.from_tuples([(1, 2, 'hello')])
    o.merge_overlaps(data_reducer=reducer, data_initializer=[])
    o.verify()
    assert len(o) == 1
    assert sorted(o) == [Interval(1, 2, ['hello'])]

    # many Intervals in tree, with gap
    t = IntervalTree.from_tuples(data.ivs1.data)
    t.merge_overlaps(data_reducer=reducer, data_initializer=[])
    t.verify()
    assert len(t) == 2
    assert sorted(t) == [
        Interval(1, 2, ['[1,2)']),
        Interval(4, 15, [
            '[4,7)',
            '[5,9)',
            '[6,10)',
            '[8,10)',
            '[8,15)',
            '[10,12)',
            '[12,14)',
            '[14,15)',
        ])
    ]
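A minimal sketch of the reducer semantics exercised above (assuming the PyPI intervaltree package): with data_initializer=[], each merged interval folds from a fresh list, so the merged data collects the payload of every constituent interval.

from intervaltree import Interval, IntervalTree

t = IntervalTree.from_tuples([(1, 3, 'a'), (2, 4, 'b')])
t.merge_overlaps(data_reducer=lambda old, new: old + [new],
                 data_initializer=[])
# the overlapping pair collapses to one interval carrying both payloads
assert sorted(t) == [Interval(1, 4, ['a', 'b'])]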
Example No. 2
def _get_unparse_intervals_of_inds(
    dfs_inds_to_include: Sequence[int],
    ast: ObjectChoiceNode,
    unparse: UnparseResult
) -> IntervalTree:
    """Given some indicies we wish include, find the intervals of the total
    unparse string which are covered by those indicies"""
    include_set = set(dfs_inds_to_include)
    interval_tree = IntervalTree()
    currently_including = False
    for ind, pointer in enumerate(ast.depth_first_iter()):
        if ind % 2 != 0:
            # Only take into account the choice nodes. Skip the object nodes
            continue
        assert isinstance(pointer.cur_node, ObjectChoiceNode)
        func_need_to_do_here = None
        if ind in include_set:
            if not currently_including:
                func_need_to_do_here = lambda start, end: interval_tree.add(Interval(start, end))
                currently_including = True
        else:
            if currently_including:
                func_need_to_do_here = lambda start, end: interval_tree.chop(start, end)
                currently_including = False
        if func_need_to_do_here:
            span = unparse.pointer_to_span(pointer)
            if span is None or span[1] - span[0] == 0:
                continue
            start, end = span
            func_need_to_do_here(start, end)
    interval_tree.merge_overlaps()
    return interval_tree
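The add/chop toggling above relies on IntervalTree.chop(), which deletes a range from the tree and trims any interval straddling its boundaries. A small illustration, assuming the same intervaltree package:

from intervaltree import Interval, IntervalTree

t = IntervalTree.from_tuples([(0, 10)])
t.chop(3, 7)  # carve [3, 7) out, trimming the straddling interval
assert sorted(t) == [Interval(0, 3), Interval(7, 10)]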
Example No. 3
def test_merge_overlaps_reducer_with_initializer():
    def reducer(old, new):
        return old + [new]
    # empty tree
    e = IntervalTree()
    e.merge_overlaps(data_reducer=reducer, data_initializer=[])
    e.verify()
    assert not e

    # One Interval in tree
    o = IntervalTree.from_tuples([(1, 2, 'hello')])
    o.merge_overlaps(data_reducer=reducer, data_initializer=[])
    o.verify()
    assert len(o) == 1
    assert sorted(o) == [Interval(1, 2, ['hello'])]

    # many Intervals in tree, with gap
    t = trees['ivs1']()
    t.merge_overlaps(data_reducer=reducer, data_initializer=[])
    t.verify()
    assert len(t) == 2
    assert sorted(t) == [
        Interval(1, 2, ['[1,2)']),
        Interval(4, 15, [
            '[4,7)',
            '[5,9)',
            '[6,10)',
            '[8,10)',
            '[8,15)',
            '[10,12)',
            '[12,14)',
            '[14,15)',
        ])
    ]
Example No. 4
def layout_cost(params):

    geometry = params_to_geometry(params)
    pdf = DOC.compile(geometry)
    pdf_document = fitz.open(pdf)
    if pdf_document.pageCount > 1:
        return 10

    page1 = pdf_document[-1]
    full_tree_y = IntervalTree()
    tree_y = IntervalTree()
    blks = page1.getTextBlocks()  # Read text blocks of input page
    # Calculate CropBox & displacement
    disp = fitz.Rect(page1.CropBoxPosition, page1.CropBoxPosition)
    croprect = page1.rect + disp
    full_tree_y.add(Interval(croprect[1], croprect[3]))
    for b in blks:  # loop through the blocks
        r = fitz.Rect(b[:4])  # block rectangle
        # add displacement of original /CropBox
        r += disp
        _, y0, _, y1 = r
        tree_y.add(Interval(y0, y1))
    tree_y.merge_overlaps()
    for i in tree_y:
        full_tree_y.add(i)
    full_tree_y.split_overlaps()
    # For top and bottom margins, we only know they are the first and last elements in the list
    full_tree_y_list = list(sorted(full_tree_y))
    _, bottom_margin = \
        map(get_interval_width, full_tree_y_list[::len(full_tree_y_list) - 1])
    return bottom_margin
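layout_cost depends on split_overlaps() slicing the page-height interval at every text-block boundary, so the first and last fragments of the sorted result are the top and bottom margins. A minimal sketch of that primitive, under the same intervaltree assumption:

from intervaltree import Interval, IntervalTree

t = IntervalTree([Interval(0, 10), Interval(5, 15)])
t.split_overlaps()  # cut at every internal boundary: 5 and 10
assert sorted(t) == [Interval(0, 5), Interval(5, 10), Interval(10, 15)]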
Example No. 5
def concatDifferences(diffs):
	if(len(diffs) > 1):
		points = list()
		tree = IntervalTree()
		for diff in diffs:
			if(diff[0] == diff[1]):
				points.append(diff)
			else:
				tree[diff[0]:diff[1]] = (diff[2], diff[3])
		tree.merge_overlaps(tupleReducer)
		items = tree.items()
		for point in points:
			if(len(tree[point[0]]) == 0):
				items.add((point[0], point[1], (point[2], point[3])))
		
		points = list()
		tree = IntervalTree()
		for item in items:
			if(item[2][0] == item[2][1]):
				points.append([item[2][0], item[2][1], item[0], item[1]])
			else:
				tree[item[2][0]:item[2][1]] = (item[0], item[1])
		tree.merge_overlaps(tupleReducer)
		items = tree.items()
		for point in points:
			if(len(tree[point[0]]) == 0):
				items.add((point[0], point[1], (point[2], point[3])))		
		diffs = list()
		for item in items:
			diffs.append([item[2][0], item[2][1], item[0], item[1]])
	return diffs
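tupleReducer is referenced but not defined in this snippet; a plausible definition (an assumption, not the original helper) widens the stored (begin, end) payload to span both merged intervals:

def tupleReducer(old, new):
    # hypothetical reducer: keep the widest payload span seen so far
    return (min(old[0], new[0]), max(old[1], new[1]))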
Example No. 6
def aln_coverage(aln_list):
    """
    Calculate the coverage across the reported alignments for a given read. This will most
    often involve only a single alignment, but also considers non-overlapping alignments
    reported by BWA MEM scavenged from the XP tag. Reports the number of bases covered
    (<=read_len) and the overlap between them (normally 0).
    :param aln_list: the list of alignments for a read
    :return: dict {coverage: xx, overlap: yy}
    """
    # using an intervaltree for this
    tr = IntervalTree()
    tot = 0
    for ti in aln_list:
        if ti['is_reverse']:
            # reversed reads must be tallied from the opposite end
            n = ti['total']
            for op, nb in ti['cigartuple']:
                if op == 0:
                    tr.addi(n - nb, n)
                    tot += nb
                n -= nb
        else:
            # forward mapped reads tally from start position
            n = 0
            for op, nb in ti['cigartuple']:
                if op == 0:
                    tr.addi(n, n + nb)
                    tot += nb
                n += nb
    # lazy means of merging intervals
    tr.merge_overlaps()
    cov = sum([i.end - i.begin for i in tr])
    return {'coverage': cov, 'overlap': tot - cov, 'has_multi': len(aln_list) > 1}
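A hypothetical call illustrating the aln_list shape the function expects (the dict keys match the code above; the CIGAR values are invented, with op 0 = match and op 4 = soft clip):

alns = [
    {'is_reverse': False, 'total': 100, 'cigartuple': [(0, 50), (4, 50)]},
    {'is_reverse': True, 'total': 100, 'cigartuple': [(0, 40), (4, 60)]},
]
# matches cover [0, 50) and [60, 100): 90 bases, no double-counting
print(aln_coverage(alns))  # {'coverage': 90, 'overlap': 0, 'has_multi': True}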
Example No. 7
    def store(self, key, start, end, data):
        # Which intervals have we already stored?
        redis_itree = self.red.hget(key, "itree")
        if redis_itree is None:
            itree = IntervalTree()
        else:
            itree = loads(redis_itree)

        # Add our new interval into the tree
        interval = Interval(start, end)
        itree = itree | IntervalTree([interval])
        itree.merge_overlaps()

        zset = key + "__zset"  # name of the redis sorted set
        self.red.hmset(key, {"itree": dumps(itree), "zset": zset})

        # Start a redis pipeline. All of these actions are committed to the server in batch and within a single
        # transaction, rather than executing each one separately over the network.
        pipe = self.red.pipeline()

        # Store the data. Each key in the data should be a position, and value can be arbitrary data.
        for k, v in iteritems(data):
            v_serial = v
            if not isinstance(v, str):
                v_serial = msgpack.packb(v, use_bin_type=True)

            pipe.zadd(zset, k, v_serial)

        pipe.execute()
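loads and dumps are imported elsewhere in that module; pickle is a likely choice (an assumption on my part), since IntervalTree instances pickle and compare cleanly:

from pickle import dumps, loads  # assumption: any pickle-compatible serializer works
from intervaltree import Interval, IntervalTree

tree = IntervalTree([Interval(0, 5)])
assert loads(dumps(tree)) == tree  # the tree round-trips intact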
Example No. 8
def main():
    t = IntervalTree()

    with open(instr_file) as f:
        for l in f:
            l = l.strip()
            if (l.endswith("@") or l.endswith("#") or l.startswith("==")):
                continue
            words = l.split()
            if (words[1] != words[2]):
                overhead = 0
                if (words[-1] == "overhead"):
                    overhead = int(words[2]) - int(words[1])
                t[int(words[1]):int(words[2])] = overhead

    t.merge_overlaps(lambda acc, v: acc + v)
    sorted_latencies = sorted(x.end - x.begin - x.data for x in t)

    if (len(sorted_latencies) > 0):
        max_latency = sorted_latencies[-1]
        avg_latency = sum(sorted_latencies) / len(sorted_latencies)
    else:
        max_latency = 0
        avg_latency = 0

    distr = distribution(sorted_latencies)

    out = {}
    out["name"] = bench_name
    out["mean_latency"] = avg_latency
    out["max_latency"] = max_latency
    out["distr_latency"] = distr

    print(json.dumps(out))
Example No. 9
def repeats_main(args):
    paf = PAF.from_file(args.inpaf)

    repeats = defaultdict(list)

    for p in paf:
        qgenome = get_genome_name(p.query, args.sep)
        tgenome = get_genome_name(p.target, args.sep)

        if qgenome != tgenome:
            continue

        query, qinterval = p.query_as_interval()
        target, tinterval = p.target_as_interval()

        if (query == target) and qinterval.overlaps(tinterval):
            filtered = sym_diff(qinterval, tinterval)
            repeats[query].extend(filtered)
        else:
            repeats[query].append(qinterval)
            repeats[target].append(tinterval)

    for seqid, intervals in repeats.items():
        itree = IntervalTree(intervals)
        itree.merge_overlaps()
        for interval in itree:
            bed = BED(seqid, interval.begin, interval.end)
            print(bed, file=args.outfile)
    return
Example No. 10
def test_merge_overlaps_reducer_wo_initializer():
    def reducer(old, new):
        return "%s, %s" % (old, new)
    # empty tree
    e = IntervalTree()
    e.merge_overlaps(data_reducer=reducer)
    e.verify()
    assert not e

    # One Interval in tree
    o = IntervalTree.from_tuples([(1, 2, 'hello')])
    o.merge_overlaps(data_reducer=reducer)
    o.verify()
    assert len(o) == 1
    assert sorted(o) == [Interval(1, 2, 'hello')]

    # many Intervals in tree, with gap
    t = trees['ivs1']()
    t.merge_overlaps(data_reducer=reducer)
    t.verify()
    assert len(t) == 2
    assert sorted(t) == [
        Interval(1, 2,'[1,2)'),
        Interval(4, 15, '[4,7), [5,9), [6,10), [8,10), [8,15), [10,12), [12,14), [14,15)')
    ]
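In contrast to Example No. 1, no data_initializer is passed here, so the first interval's data seeds the fold and the reducer only runs for the remaining intervals. A minimal sketch, again assuming the intervaltree package:

from intervaltree import Interval, IntervalTree

t = IntervalTree.from_tuples([(1, 3, 'a'), (2, 4, 'b')])
t.merge_overlaps(data_reducer=lambda old, new: "%s, %s" % (old, new))
assert sorted(t) == [Interval(1, 4, 'a, b')]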
Example No. 11
def interval_tree(start_data, stop_data, buffer_len):
    starts = []
    stops = []
    t = IntervalTree()

    ## Shrink each interval by the buffer size
    for key, value in start_data.iteritems():
        for i in range(0, len(value)):
            shrunk_start = value[i] + buffer_len / 2.0
            shrunk_stop = stop_data[key][i] + 1 - buffer_len / 2.0
            if shrunk_start < shrunk_stop:
                t[shrunk_start:shrunk_stop] = (shrunk_start, shrunk_stop)

    ## Add chromosome endpoints without buffer
    chrom_start, chrom_stop = get_extremes(start_data, stop_data)
    if chrom_start < t.begin() + 1:
        t[chrom_start:t.begin() + 1] = (chrom_start, t.begin() + 1)
    if t.end() - 1 < chrom_stop:
        t[t.end() - 1:chrom_stop] = (t.end() - 1, chrom_stop)

    ## Merge intervals that overlap in tree to get consensus
    t.merge_overlaps()

    ## Check that original intervals only overlap with one consensus interval
    for key, value in start_data.iteritems():
        for i in range(0, len(value)):
            start = value[i]
            stop = stop_data[key][i] + 1
            if len(t[start:stop]) > 1:
                ## If they overlap with more than one
                ## Remove part of consensus interval
                ## This will never be more than the buffer size/2
                assert (len(t[start:stop]) == 2)
                remove_start = 0
                remove_stop = 0
                min_length = float('inf')
                for interval in t[start:stop]:
                    overlap_start, overlap_stop = get_overlap(
                        (start, stop), (interval[0], interval[1]))
                    if (overlap_stop - overlap_start) < min_length:
                        min_length = overlap_stop - overlap_start
                        remove_start = overlap_start
                        remove_stop = overlap_stop
                print(min_length)
                t.chop(remove_start, remove_stop)
                assert (min_length <= buffer_len / 2.0)
                assert (len(t[start:stop]) < 2)

    ## Get consensus start and stop points
    chrom_len = chrom_stop - chrom_start
    covered = 0.0
    for interval in sorted(t):
        starts.append(interval[0])
        stops.append(interval[1])
        covered = covered + (interval[1] - interval[0])

    print("The percentage of the chromosome covered is: %s" % '{0:.2f}'.format(
        (covered / chrom_len) * 100.0))

    return (starts, stops)
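get_extremes and get_overlap are external helpers; a plausible get_overlap (hypothetical, but consistent with how its result is unpacked above) intersects two (start, stop) pairs:

def get_overlap(a, b):
    # hypothetical helper: intersection of two (start, stop) pairs,
    # valid when the caller already knows the pairs overlap
    return (max(a[0], b[0]), min(a[1], b[1]))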
Example No. 12
    def get_safeguard_speed(self,
                            ego_idx,
                            decision_trajectory,
                            desired_speed,
                            pred_t=4,
                            safe_dis_range=7):

        danger_speed_range = IntervalTree()
        for collision_idx, collision_time in self.collision_points:
            if collision_idx > len(decision_trajectory) - 1:
                continue
            dis_sum = np.cumsum(
                np.linalg.norm(np.diff(decision_trajectory, axis=0), axis=1))
            dis = dis_sum[collision_idx - 1] - dis_sum[ego_idx]

            speed_min = max(0, ((dis - safe_dis_range) / collision_time))
            speed_max = min(desired_speed + 0.01,
                            (dis + safe_dis_range) / collision_time)
            if speed_min >= speed_max:
                continue
            # rospy.logdebug("col_idx:%d,col_time:%.2f,ocp_min:%.2f,ocp_max:%.2f,desired_speed:%.2f)",
            #                                                                 collision_idx,collision_time,
            #                                                                 speed_min,speed_max,desired_speed)
            danger_speed_range[speed_min:speed_max] = (speed_min, speed_max)

        if len(danger_speed_range[desired_speed]) == 0:
            return desired_speed
        else:
            danger_speed_range.merge_overlaps()
            speed = sorted(danger_speed_range)[-1].begin
            return speed
Example No. 13
def find_len_non_overlap(interval: Interval, itree: IntervalTree) -> int:
    overlaps = IntervalTree(itree.overlap(interval))
    overlaps.merge_overlaps()

    len_overlap = sum([intersection(interval, o).length() for o in overlaps])

    return interval.length() - len_overlap
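intersection is not defined in this snippet; a plausible version (an assumption) clips one Interval to another, safe here because itree.overlap() only returns intervals that do overlap the query:

from intervaltree import Interval

def intersection(a: Interval, b: Interval) -> Interval:
    # hypothetical helper: the overlapping region of two intervals
    return Interval(max(a.begin, b.begin), min(a.end, b.end))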
Example No. 14
def test_merge_overlaps_reducer_wo_initializer():
    def reducer(old, new):
        return "%s, %s" % (old, new)
    # empty tree
    e = IntervalTree()
    e.merge_overlaps(data_reducer=reducer)
    e.verify()
    assert not e

    # One Interval in tree
    o = IntervalTree.from_tuples([(1, 2, 'hello')])
    o.merge_overlaps(data_reducer=reducer)
    o.verify()
    assert len(o) == 1
    assert sorted(o) == [Interval(1, 2, 'hello')]

    # many Intervals in tree, with gap
    t = IntervalTree.from_tuples(data.ivs1.data)
    t.merge_overlaps(data_reducer=reducer)
    t.verify()
    assert len(t) == 2
    assert sorted(t) == [
        Interval(1, 2,'[1,2)'),
        Interval(4, 15, '[4,7), [5,9), [6,10), [8,10), [8,15), [10,12), [12,14), [14,15)')
    ]
Example No. 15
class AslrOracle:
  def __init__(self):
    self.queries = 0

    self.InitCache()

  def CheckAddress(self, address):
    return self.CheckRange(address, 0x1000)

  def InitCache(self):
    self.cached_queries = 0
    self.good_regions = IntervalTree()
    self.bad_regions = IntervalTree()

  def InsertToCache(self, start, end, valid):
    if valid:
      self.good_regions.add(Interval(start, end + 1))
      self.good_regions.merge_overlaps()
    else:
      self.bad_regions.add(Interval(start, end))

  def CheckCache(self, start, end):
    good_overlaps = self.good_regions.overlap(start, end)
    for overlap in good_overlaps:
      if (overlap[0] <= start) and (overlap[1] >= end):
        self.cached_queries += 1
        return True

    bad_overlaps = self.bad_regions.envelop(start, end)
    if len(bad_overlaps) > 0:
      self.cached_queries += 1
      return False

    return None
Example No. 16
 def get_length(self):
     gene_tree = IntervalTree()
     for t in self.transcript.values():
         for e in t.exon:
             gene_tree.addi(e[0], e[1])
     gene_tree.merge_overlaps()
     return sum(x.end - x.begin + 1 for x in gene_tree)
Example No. 17
def filter_nstretches(
    itrees: Mapping[str, IntervalTree],
    nstretches: Mapping[str, IntervalTree],
    min_non_overlap: int,
) -> None:
    """
    Remove contigs without much going on outside N stretches.
    """

    for scaffold, itree in nstretches.items():

        # Loop through all of the potential breaks
        for nstretch in itree:
            # Find "contigs" that overlap the potential break
            # We do this in sorted order, from smallest to largest alignment
            # that means shorter ones are removed first
            contigs = sorted(itrees[scaffold].overlap(nstretch),
                             key=lambda x: x.length())

            to_drop = set()

            # Loop through the contigs to test.
            for contig in contigs:
                # Find if they overlap with any other n stretches.
                n_overlaps = nstretches[scaffold].overlap(contig)

                # Get an intervaltree of all contigs overlapping this one.
                contig_overlaps = IntervalTree(
                    itrees[scaffold].overlap(contig))

                # Remove all of the n-chunks from the intervals.
                # Note the "Coords" is still duplicated in the data attribute
                for n_overlap in n_overlaps:
                    contig_overlaps.chop(n_overlap.begin, n_overlap.end)

                # Get the intervals that aren't the overlap under
                # consideration.
                contig_overlaps_itree = IntervalTree(o for o in contig_overlaps
                                                     if o.data != contig.data)
                contig_overlaps_itree.merge_overlaps()

                # Get the fragments of the overlap under consideration
                contig_itree = IntervalTree(o for o in contig_overlaps
                                            if o.data == contig.data)

                # For each of the fragments, find how many new Non-N bases it
                # contributes to the contigging.
                len_non_overlap = sum([
                    find_len_non_overlap(f, contig_overlaps_itree)
                    for f in contig_itree
                ])

                # Remove the contig if it doesn't pass muster
                if len_non_overlap < min_non_overlap:
                    to_drop.add(contig)

            for contig in to_drop:
                itrees[scaffold].remove(contig)
    return
Example No. 18
    def get_cnv_df(self, pat_tree, mat_tree):
        both_alleles = IntervalTree(list(pat_tree) + list(mat_tree))
        both_alleles.split_overlaps()
        both_alleles.merge_overlaps(data_reducer=self.specify_levels)
        seg_df = []
        for segment in both_alleles:
            seg_df.append([self.chr_name, segment.begin, segment.end, segment.data['major'], segment.data['minor']])

        return pd.DataFrame(seg_df, columns=['Chromosome', 'Start.bp', 'End.bp', 'major', 'minor'])
Example No. 19
def get_chrom_features(chrom, txs):
    '''
    Get all merged intervals on given chromosome
    '''
    chrom_features = txs[txs.seqname == chrom]
    chrom_features = zip(chrom_features.start, chrom_features.end)

    chrom_tree = IntervalTree()
    for s, e in chrom_features:
        if e - s > 0:
            chrom_tree.addi(s, e)
    chrom_tree.merge_overlaps()

    return chrom_tree
Example No. 20
def get_gene_lookup(tx_ref_file):
    '''
    Generate start/end coordinate reference
    for genes and output as an interval tree
    dictionary. Also output dataframe containing
    chromosome, start and ends for all exons.
    '''
    ref_trees, ex_trees, ex_ref_out = None, None, None
    if tx_ref_file == '':
        return ref_trees, ex_trees, ex_ref_out

    logging.info('Generating lookup for genes...')
    #TODO: standardise with make_supertranscript for gtf handling
    tx_ref = pd.read_csv(tx_ref_file, comment='#', sep='\t', header=None, low_memory=False)
    tx_ref['gene_id'] = tx_ref[8].apply(lambda x: get_attribute(x, 'gene_id'))
    tx_ref['gene'] = tx_ref[8].apply(lambda x: get_attribute(x, 'gene_name'))

    # create start/end gene lookup, grouping adjacent rows
    # (this prevents merging distant genes with the same IDs)
    gn_ref = tx_ref[[0, 3, 4, 'gene_id', 'gene']]
    gn_ref.columns = ['chrom', 'start', 'end', 'gene_id', 'gene']
    adj_check = (gn_ref.gene_id != gn_ref.gene_id.shift()).cumsum()
    gn_ref = gn_ref.groupby(['chrom', 'gene_id', 'gene', adj_check],
                            as_index=False, sort=False).agg({'start': min, 'end': max})
    gn_ref = gn_ref.drop_duplicates()

    # start/end coordinates for gene matching
    ref_trees = {}
    chroms = np.unique(gn_ref.chrom.values)
    for chrom in chroms:
        chr_ref = gn_ref[gn_ref.chrom == chrom].drop_duplicates()
        ref_tree = IntervalTree()
        for s,e,g in zip(chr_ref['start'].values, chr_ref['end'].values, chr_ref['gene'].values):
            if g != '':
                ref_tree.addi(s-1, e, g)
        ref_trees[chrom] = ref_tree

    # merged exon boundaries for block annotation
    ex_ref = tx_ref[tx_ref[2] == 'exon']
    ex_ref_out = pd.DataFrame()
    ex_trees = {}
    for chrom in chroms:
        chr_ref = ex_ref[ex_ref[0] == chrom].drop_duplicates()
        ex_tree = IntervalTree()
        for s,e in zip(chr_ref[3].values, chr_ref[4].values):
            ex_tree.addi(s-1, e)
        ex_tree.merge_overlaps()
        tmp = pd.DataFrame([(chrom, tree[0], tree[1]) for tree in ex_tree],
                           columns=['chrom', 'start', 'end'])
        ex_ref_out = pd.concat([ex_ref_out, tmp], ignore_index=True)
        ex_trees[chrom] = ex_tree

    return ref_trees, ex_trees, ex_ref_out
Example No. 21
def total_intersection(itree: IntervalTree, interval: Interval) -> int:
    if interval.length() <= 0:
        return 0

    total = 0
    ovlps = IntervalTree(itree.overlap(interval))
    ovlps.merge_overlaps()
    for ovlp in ovlps:
        inter = intersect(interval, ovlp)
        total += inter.length()

    return total
Example No. 22
    def score_document_regions(self, query, doc, fast=False):
        if fast:
            return self.fast_score_document_regions(query, doc)

        try:
            nltk.data.find("tokenizers/punkt")
        except (LookupError, OSError):
            nltk.download("punkt")
        try:
            nltk.data.find("corpora/stopwords")
        except LookupError:
            nltk.download("stopwords")
        result = {}
        stops = stopwords.words("english") if self.skip_stopwords else None

        qfield_values = []
        specified_qfields = list(filter(None, self.queryfield))
        # Choose a query field to do the highlighting with
        if specified_qfields:
            for fname in specified_qfields:
                qfield_values.append(query._asdict()[fname])
        else:
            # Use the first field in the query that is not the id
            # ._asdict() is an OrderedDict, so this is deterministic
            for fname, fval in query._asdict().items():
                if fname != "query_id":
                    qfield_values = [fval]
                    break

        assert len(qfield_values)
        for qfield_value in qfield_values:
            for word in word_tokenize(qfield_value):
                word = word.lower()
                if not word.isalpha() and not word.isnumeric():
                    continue
                if stops and word.lower() in stops:
                    continue
                for dfield, dvalue in zip(doc._fields, doc):
                    if not isinstance(dvalue, str):
                        continue  # skip non-strings for now
                    if dfield not in result:
                        result[dfield] = []
                    for match in re.finditer("\\b" + re.escape(word) + "\\b",
                                             dvalue.lower()):
                        result[dfield].append([match.start(), match.end()])
        for field, values in list(result.items()):
            tree = IntervalTree()
            for start, stop in values:
                tree[start:stop] = 1
            tree.merge_overlaps()
            result[field] = sorted([[i.begin, i.end, 1.0] for i in tree])
        return result
Example No. 23
def get_unique_loci(intervals):

    grouped_intervals = defaultdict(list)
    for genome, seqid, start, end in intervals:
        grouped_intervals[(genome, seqid)].append(Interval(start, end))

    unique_loci = list()
    for (genome, seqid), intvls in grouped_intervals.items():
        itree = IntervalTree(intvls)
        itree.merge_overlaps()
        for intvl in itree:
            unique_loci.append((genome, seqid, intvl.begin, intvl.end))

    return unique_loci
Example No. 24
    def read(self, length, offset, fh):
        """
        Read data from this GhostFile.
        :param length: number of bytes requested
        :param offset: byte offset at which to start reading
        :param fh: OS file handle to read backing bytes from
        :return: the requested bytes, zero-filled where nothing was written
        """
        if offset >= self.__filesize or length == 0:
            return b''

        data = b''

        intervals = IntervalTree(self.__rewritten_intervals[offset:offset+length])
        intervals.merge_overlaps()
        intervals.slice(offset)
        intervals.slice(offset + length)
        intervals = sorted(intervals[offset:offset+length])
        assert offset < self.__filesize
        assert intervals[0].begin >= offset and intervals[-1].end <= offset + length if len(intervals) > 0 else True

        if len(intervals) == 0:
            return b'\x00' * min(length, self.__filesize - offset)

        assert len(intervals) > 0

        # Used to fill any hole at the start of the read range
        end_prev_interval = offset

        # Read the data
        for interv in intervals:
            # Fill any hole before this interval
            data += b'\x00' * (interv.begin - end_prev_interval)

            os.lseek(fh, interv.begin, os.SEEK_SET)
            data += os.read(fh, interv.length())

            end_prev_interval = interv.end

        # Fill any hole at the end of the read range
        data += b'\x00' * (offset + length - intervals[-1].end)

        if offset + length > self.__filesize:
            data = data[0:self.__filesize-offset]

        assert len(data) <= length
        assert offset + len(data) <= self.__filesize
        return data
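The read path combines merge_overlaps() with slice(), which cuts any interval containing a given point in two so the interval boundaries line up exactly with the requested window. A small illustration, assuming the intervaltree package:

from intervaltree import Interval, IntervalTree

t = IntervalTree([Interval(0, 10)])
t.slice(4)  # split the interval that spans offset 4
assert sorted(t) == [Interval(0, 4), Interval(4, 10)]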
Example No. 25
    def calc_full_cnv(self, phylogeny):
        pat_tree = IntervalTree()
        for i in self.paternal_tree:
            weighted_cn = i.data.cn_change * phylogeny.ccfs[i.data.cluster_num]
            pat_tree[i.begin: i.end] = Event(i.data.type, i.data.allele, i.data.cluster_num, weighted_cn)
        pat_tree.split_overlaps()
        pat_tree.merge_overlaps(data_reducer=self.sum_levels)

        mat_tree = IntervalTree()
        for i in self.maternal_tree:
            weighted_cn = i.data.cn_change * phylogeny.ccfs[i.data.cluster_num]
            mat_tree[i.begin: i.end] = Event(i.data.type, i.data.allele, i.data.cluster_num, weighted_cn)
        mat_tree.split_overlaps()
        mat_tree.merge_overlaps(data_reducer=self.sum_levels)

        # could deliver a Chromosome (or child class) instead of just a tree
        return pat_tree, mat_tree
Example No. 26
	def update(self):
		intervals = IntervalTree()
		stack = []
		for match in self.secretsRe[0].finditer(self.fullString):
			intervals.addi(match.start(), match.end(), "begin") #<err.*?>
		for match in self.secretsRe[1].finditer(self.fullString):
			intervals.addi(match.start(), match.end(), "center") #</err><corr>
		for match in self.secretsRe[2].finditer(self.fullString):
			intervals.addi(match.start(), match.end(), "end") #</corr>
		intervals = sorted(intervals)
		hidedIntervals = IntervalTree()
		if(self.mode == "latest"):
			for i in intervals:
				if(i.data == "begin"):
					stack.append(i)
				elif(i.data == "center"):
					token = stack.pop()
					if(len(stack) == 0):
						hidedIntervals.addi(token.begin, i.end, None)
				elif(i.data == "end"):
					hidedIntervals.addi(i.begin, i.end, None)
		else:
			for i in intervals:
				if(i.data == "begin"):
					hidedIntervals.addi(i.begin, i.end, None)
				elif(i.data == "center"):
					stack.append(i)
				elif(i.data == "end"):
					token = stack.pop()
					if(len(stack) == 0):
						hidedIntervals.addi(token.begin, i.end, None)
		hidedIntervals.merge_overlaps()
		# hidedIntervals is always an IntervalTree here, never None
		self.hidedIntervals = sorted(hidedIntervals)
		if(len(self.hidedIntervals) == 0):
			self.visibleString = self.fullString
		else:
			self.visibleString = ""
			curIndex = 0
			for match in self.hidedIntervals:
				if(curIndex < match[0]):
					self.visibleString += self.fullString[curIndex:match[0]]
				curIndex = match[1]
			self.visibleString += self.fullString[curIndex:]
Example No. 27
def sweep_function(binary, start, addr_stack):
    block_tree = IntervalTree()
    branch_stack = [start]
    heapify(branch_stack)
    while branch_stack:
        addr = heappop(branch_stack)
        part = FunctionBlock(start_addr=addr)
        for inst in THUMB_DISASSEMBLER.disasm(binary[addr:], addr):
            i_addr = inst.address
            if block_tree.overlaps(i_addr):
                # we already visited this address (maybe a loop?)
                break
            part.instructions += 1
            part.stop_addr = i_addr + inst.size
            ops = len(inst.operands)
            reg_reads, reg_writes = inst.regs_access()
            if inst.id in BRANCH_IDS:
                if ARM_REG_LR in reg_reads:
                    if inst.cc != ARM_CC_AL:
                        # this is a conditional return
                        continue
                    # this is an unconditional return
                    break
                new_addr = parse_imm(inst.operands[0].imm)
                heappush(branch_stack, new_addr)
                if inst.cc == ARM_CC_AL:
                    part.branches += 1
                    break
                else:
                    part.conditional_branches += 1
            elif ops == 1 and inst.id in CALL_IDS:
                new_addr = parse_imm(inst.operands[0].imm)
                addr_stack.add(new_addr)
                part.calls.add(new_addr)
            elif ops == 2 and inst.id in COND_BRANCH_IDS:
                new_addr = parse_imm(inst.operands[1].imm)
                heappush(branch_stack, new_addr)
                part.conditional_branches += 1
            elif ARM_REG_PC in reg_writes:
                # assume return for any otherwise unmatched changes of PC
                break
        if part.start_addr < part.stop_addr:
            block_tree[part.start_addr:part.stop_addr] = part
    block_tree.merge_overlaps(add)
    return block_tree
Example No. 28
    def fast_score_document_regions(self, query, doc):
        """
        Score document regions with the Aho–Corasick algorithm.
        :param query:
        :param doc:
        :return:
        """
        try:
            nltk.data.find("tokenizers/punkt")
        except (LookupError, OSError):
            nltk.download("punkt")
        try:
            nltk.data.find("corpora/stopwords")
        except LookupError:
            nltk.download("stopwords")
        result = {}
        stops = stopwords.words("english") if self.skip_stopwords else None
        query_tokens = set()
        for qfield_value in query:
            query_tokens.update([
                w.lower() for w in word_tokenize(qfield_value)
                if (w.isalpha() or w.isnumeric()) and (
                    not stops or not w.lower() in stops)
            ])
        if not hasattr(self, "A"):
            self.A = ahocorasick.Automaton()
        self.A.clear()
        for idx, token in enumerate(query_tokens):
            self.A.add_word(token, (idx, token))
        self.A.make_automaton()

        for dfield, dvalue in zip(doc._fields, doc):
            matches = [(end_idx - len(match) + 1, end_idx)
                       for end_idx, (_, match) in self.A.iter(dvalue.lower())]
            result[dfield] = matches

        for field, values in list(result.items()):
            tree = IntervalTree()
            for start, stop in values:
                tree[start:stop + 1] = 1
            tree.merge_overlaps()
            result[field] = sorted([[i.begin, i.end, 1.0] for i in tree])
        return result
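A minimal pyahocorasick sketch of the (end_index, value) pairs that A.iter() yields, which the method above converts into (start, end) spans (assuming the ahocorasick package):

import ahocorasick

A = ahocorasick.Automaton()
for idx, token in enumerate(["he", "she"]):
    A.add_word(token, (idx, token))
A.make_automaton()
# both 'she' and 'he' end at index 3 of 'ushers'
print(list(A.iter("ushers")))  # [(3, (1, 'she')), (3, (0, 'he'))]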
Example No. 29
def paf_to_intervals(pafs: Iterable[PAF]) -> Dict[Tuple[str, str], float]:
    """ """

    pairwise_intervals: Dict[Tuple[str, str],
                             List[Interval]] = defaultdict(list)  # noqa
    lengths: Dict[str, int] = dict()

    for paf in pafs:
        if paf.query == paf.target:
            continue

        lengths[paf.query] = paf.qlen
        lengths[paf.target] = paf.tlen

        if paf.qlen < paf.tlen:
            _, interval = paf.query_as_interval()
            id_ = (paf.query, paf.target)
        elif paf.tlen < paf.qlen:
            _, interval = paf.target_as_interval()
            id_ = (paf.target, paf.query)
        elif paf.query < paf.target:
            _, interval = paf.query_as_interval()
            id_ = (paf.query, paf.target)
        else:
            _, interval = paf.target_as_interval()
            id_ = (paf.target, paf.query)

        pairwise_intervals[id_].append(interval)

    pairwise_covs: Dict[Tuple[str, str], float] = dict()

    for (query, target), intervals in pairwise_intervals.items():
        itree = IntervalTree(intervals)
        itree.merge_overlaps()

        ali_length = 0
        for interval in itree:
            ali_length += interval.length()

        contig_length = lengths[query]
        cov = ali_length / contig_length
        pairwise_covs[(query, target)] = cov

    return pairwise_covs
Example No. 30
def seq_from_exons_introns(exons, introns, join=True):
    """
    Merges exons and introns and returns sequence
    Note that exons and introns are in different formats
    Exons are tuples of (exn_num, chrom, start, stop, strand, gene) (should be nonoverlapping)
    Introns are tuples of (intron_seq, intron_gcoords) that are nonoverlapping by construction
    We ignore the intron sequence IN CASE the coords overlap
    """
    itree = IntervalTree()
    chroms = set()
    strands = set()
    for exn_num, chrom, start, stop, strand, gene in exons:
        chroms.add(chrom)
        strands.add(strand)
        itree[start:stop] = f"exon_{exn_num}"

    assert len(chroms) == 1
    chrom = chroms.pop()
    assert len(strands) == 1
    strand = strands.pop()

    for i, gcoord in enumerate(introns[1]):
        chrom, startstop, strand = gcoord.split(":")
        start, stop = map(int, startstop.split("-"))
        itree[start:stop] = f"ri_{i}"

    itree_orig = itree.copy()
    itree.merge_overlaps(lambda x, y: ";".join([x, y]))
    if len(itree) != len(itree_orig):
        logging.warn(f"Contains overlaps: {itree_orig}")

    # The itree sorts everything in 5' to 3' regardless of strand
    seqs = []
    for interval in itree:
        # Actual sequences are rev comped properly
        # seq = GENOME_FA[chrom][interval.begin:interval.end]
        seq = GENOME_FA.get_seq(chrom, interval.begin, interval.end,
                                strand == "-")
        assert seq.seq
        seqs.append(seq.seq)

    return ''.join(seqs) if join else seqs
Example No. 31
def merge_overlapping(features: Sequence[GFFRecord],
                      pad: int) -> Iterator[Tuple[int, int]]:
    """ """

    itree = IntervalTree(pad_intervals(gffrecords_to_intervals(features), pad))

    itree.merge_overlaps(strict=False)

    seen: Set[Tuple[int, int]] = set()

    for interval in itree:

        if (interval.begin, interval.end) in seen:
            continue
        else:
            seen.add((interval.begin, interval.end))

        yield interval.begin, interval.end

    return
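merge_overlaps(strict=False) also coalesces intervals that merely touch, not only ones that properly overlap, which is what lets the padded neighbours above collapse into single spans. A minimal sketch, assuming intervaltree >= 3.0 (where the strict flag was introduced):

from intervaltree import Interval, IntervalTree

t = IntervalTree.from_tuples([(0, 5), (5, 10)])
t.merge_overlaps(strict=False)  # strict=True would leave both untouched
assert sorted(t) == [Interval(0, 10)]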
Example No. 32
def merge_short_intervals(tree: IntervalTree,
                          min_len: int = 1) -> IntervalTree:
    """Merge short intervals with their neighbour."""
    merged_tree = IntervalTree()
    for i, iv in enumerate(sorted(tree)):
        if iv.length() >= min_len:
            merged_tree.add(iv)
            continue
        if has_adjoining_neighbour_to_right(iv, tree):
            # have to add 2 to the end to force a merge, since interval ends are non-inclusive
            merged_tree.add(Interval(iv.begin, iv.end + 2, data=iv.data))
        elif has_adjoining_neighbour_to_left(iv, tree):
            merged_tree.add(Interval(iv.begin - 1, iv.end, data=iv.data))

    merged_tree.merge_overlaps(data_reducer=data_reducer)

    if any(iv.length() < min_len for iv in merged_tree):
        return merge_short_intervals(merged_tree, min_len=min_len)
    else:
        return merged_tree
Example No. 33
def get_tx_juncs(read):
    '''
    Get all junctions from the given contig
    '''
    starts, ends = zip(*read.get_blocks())

    # merge adjacent 'junctions' (i.e. insertions)
    blocks = IntervalTree()
    for s, e in zip(starts, ends):
        blocks.addi(s, e)
    blocks.merge_overlaps(strict=False)

    starts = np.sort([block[0] for block in blocks])
    ends = np.sort([block[1] for block in blocks])

    chroms = [read.reference_name] * (len(starts) - 1)
    tx_juncs = list(zip(chroms, ends[:-1], starts[1:]))
    tx_juncs = [junc for junc in tx_juncs if (junc[2] - junc[1]) >= MIN_GAP]

    return tx_juncs
Example No. 34
def test_merge_overlaps_empty():
    t = IntervalTree()
    t.merge_overlaps()
    t.verify()

    assert len(t) == 0
Example No. 35
	def get_unknown_meanings(self, w=None, option=None):
		ivt = IntervalTree([Interval(iv.begin,iv.end) for iv in self._content_coding if not iv.data])
		ivt.merge_overlaps()
		return ivt
Example No. 36
class BitwrappedStream(object):

	"""A stream that wraps other streams to provide bit-level
	access"""

	closed = True

	def __init__(self, stream):
		"""Init the bit-wrapped stream

		:stream: The normal byte stream
		"""
		self._stream = stream
		self._bits = collections.deque()

		self.closed = False
		
		# assume that bitfields end on an even boundary,
		# otherwise the entire stream will be treated as
		# a bit stream with no padding
		self.padded = True

		# packed left-to-right
		self.direction = BIT_DIR_LEFT_RIGHT

		self.range_set = IntervalTree()
	
	def is_eof(self):
		"""Return if the stream has reached EOF or not
		without discarding any unflushed bits

		:returns: True/False
		"""
		pos = self._stream.tell()
		byte = self._stream.read(1)
		self._stream.seek(pos, 0)

		return utils.binary(byte) == utils.binary("")
		
	def close(self):
		"""Close the stream
		"""
		self.closed = True
		self._flush_bits_to_stream()
		self._stream.close()
	
	def flush(self):
		"""Flush the stream
		"""
		self._flush_bits_to_stream()
		self._stream.flush()
	
	def isatty(self):
		"""Return if the stream is a tty
		"""
		return self._stream.isatty()
	
	def read(self, num):
		"""Read ``num`` number of bytes from the stream. Note that this will
		automatically resets/ends the current bit-reading if it does not
		end on an even byte AND ``self.padded`` is True. If ``self.padded`` is
		True, then the entire stream is treated as a bitstream.

		:num: number of bytes to read
		:returns: the read bytes, or empty string if EOF has been reached
		"""
		start_pos = self.tell()

		if self.padded:
			# we toss out any uneven bytes
			self._bits.clear()
			res = utils.binary(self._stream.read(num))
		else:
			bits = self.read_bits(num * 8)
			res = bits_to_bytes(bits)
			res = utils.binary(res)

		end_pos = self.tell()
		self._update_consumed_ranges(start_pos, end_pos)

		return res
	
	def read_bits(self, num):
		"""Read ``num`` number of bits from the stream

		:num: number of bits to read
		:returns: a list of ``num`` bits, or an empty list if EOF has been reached
		"""
		if num > len(self._bits):
			needed = num - len(self._bits)
			num_bytes = (needed // 8) + 1
			read_bytes = self._stream.read(num_bytes)

			for bit in bytes_to_bits(read_bytes):
				self._bits.append(bit)

		res = []
		while len(res) < num and len(self._bits) > 0:
			res.append(self._bits.popleft())

		return res
	
	def write(self, data):
		"""Write data to the stream

		:data: the data to write to the stream
		:returns: None
		"""
		if self.padded:
			# flush out any remaining bits first
			if len(self._bits) > 0:
				self._flush_bits_to_stream()
			self._stream.write(data)
		else:
			# nothing to do here
			if len(data) == 0:
				return

			bits = bytes_to_bits(data)
			self.write_bits(bits)
	
	def write_bits(self, bits):
		"""Write the bits to the stream.

		Add the bits to the existing unflushed bits and write
		complete bytes to the stream.
		"""
		for bit in bits:
			self._bits.append(bit)

		while len(self._bits) >= 8:
			byte_bits = [self._bits.popleft() for x in six.moves.range(8)]
			byte = bits_to_bytes(byte_bits)
			self._stream.write(byte)
		
		# there may be unflushed bits leftover and THAT'S OKAY
	
	def tell(self):
		"""Return the current position in the stream (ignoring bit
		position)

		:returns: int for the position in the stream
		"""
		res = self._stream.tell()
		if len(self._bits) > 0:
			res -= 1
		return res
	
	def seek(self, pos, seek_type=0):
		"""Seek to the specified position in the stream with seek_type.
		Unflushed bits will be discarded in the case of a seek.

		The stream will also keep track of which bytes have and have
		not been consumed so that the dom will capture all of the
		bytes in the stream.

		:pos: offset
		:seek_type: direction
		:returns: TODO

		"""
		self._bits.clear()
		return self._stream.seek(pos, seek_type)
	
	def size(self):
		"""Return the size of the stream, or -1 if it cannot
		be determined.
		"""
		pos = self._stream.tell()
		# seek to the end of the stream
		self._stream.seek(0,2)
		size = self._stream.tell()
		self._stream.seek(pos, 0)

		return size
	
	def unconsumed_ranges(self):
		"""Return an IntervalTree of unconsumed ranges, of the format
		(start, end] with the end value not being included
		"""
		res = IntervalTree()

		prev = None

		# normal iteration is not in a predictable order
		ranges = sorted([x for x in self.range_set], key=lambda x: x.begin)

		for rng in ranges:
			if prev is None:
				prev = rng
				continue
			res.add(Interval(prev.end, rng.begin))
			prev = rng
		
		# means we've seeked past the end
		if prev is not None and len(self.range_set[self.tell()]) != 1:
			res.add(Interval(prev.end, self.tell()))

		return res
	
	# -----------------------------
	# PRIVATE FUNCTIONS
	# -----------------------------

	def _update_consumed_ranges(self, start_pos, end_pos):
		"""Update the ``self.consumed_ranges`` array with which
		byte ranges have been consecutively consumed.
		"""
		self.range_set.add(Interval(start_pos, end_pos+1))
		self.range_set.merge_overlaps()
	
	def _flush_bits_to_stream(self):
		"""Flush the bits to the stream. This is used when
		a few bits have been read and ``self._bits`` contains unconsumed/
		flushed bits when data is to be written to the stream
		"""
		if len(self._bits) == 0:
			return 0

		bits = list(self._bits)

		diff = 8 - (len(bits) % 8)
		padding = [0] * diff

		bits = bits + padding

		self._stream.write(bits_to_bytes(bits))

		self._bits.clear()
Example No. 37
nodes_to_process = True

start = time.time()

more_file = True
merge_requested = False
loop_count = 0
while nodes_to_process:
  loop_count += 1
  if loop_count > MIN_MEM_NODE_COUNT:
    loop_count = 0
    merge_requested = True
  #REFILL THE BUFFER
  if (g_person_node_count+g_wine_node_count) < MIN_MEM_NODE_COUNT:
    if merge_requested:
      pt.merge_overlaps()
      wt.merge_overlaps()
      merge_requested = False
    while (g_person_node_count+g_wine_node_count) < MAX_MEM_NODE_COUNT and more_file:
      line = f.readline() #read in line from input
      if line:
        add_line_to_graph(line)
      else:
        more_file = False

  # WINE SECTION
  wine_node_with_fewest_edges = None
  wine_node_with_fewest_edges_edge_count = FEWER_COMPARE
  wine_search_count = 0

  for node in nx.dfs_postorder_nodes(fg, "r"): #dfs postorder is magic and should be worshiped. --Andy Weir