Пример #1
def _coverage(scdf, ocdf, kwargs):
    """Return a copy of ``scdf`` with a trailing ``FractionOverlaps`` column.

    Args:
        scdf: Frame with ``Start`` and ``End`` columns (the intervals that get
            annotated).
        ocdf: Frame with ``Start`` and ``End`` columns (the intervals used to
            build the NCLS index).
        kwargs: Accepted for signature compatibility; not read here.

    Returns:
        ``None`` when ``scdf`` is empty.  Otherwise a copy of ``scdf`` whose
        last column is ``FractionOverlaps``: all ``0.0`` when ``ocdf`` is
        empty, else ``coverage / (End - Start)`` as float64 with NaNs replaced
        through ``np.nan_to_num``.
    """
    if scdf.empty:
        return None

    annotated = scdf.copy()
    new_column_at = annotated.shape[1]

    if ocdf.empty:
        annotated.insert(new_column_at, "FractionOverlaps", 0.0)
        return annotated

    other_tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    self_starts = scdf.Start.values
    self_ends = scdf.End.values

    # presumably the number of covered positions per interval -- confirm
    # against the ncls documentation
    covered = other_tree.coverage(self_starts, self_ends, scdf.index.values)
    ratio = covered / (self_ends - self_starts)
    fractions = np.nan_to_num(ratio.astype("float64"))

    annotated.insert(new_column_at, "FractionOverlaps", fractions)
    return annotated
Пример #2
def _both_dfs(scdf, ocdf, how=False):
    """Return the rows of ``scdf`` and ``ocdf`` that pair up in an NCLS query.

    ``ocdf`` is given a fresh ``0..n-1`` index before the NCLS is built, so
    the second returned frame is indexed by row position in ``ocdf``.

    Args:
        scdf: Frame with ``Start`` and ``End`` columns used as the queries.
        ocdf: Frame with ``Start`` and ``End`` columns used to build the index.
        how: Falsy for all overlaps, ``"containment"`` for containments,
            ``"first"`` for the first overlap per query.

    Returns:
        Tuple ``(scdf_rows, ocdf_rows)``, each reindexed by the index arrays
        returned from the NCLS query.
    """
    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    ocdf = ocdf.reset_index(drop=True)
    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    # Exactly one query must run.  Previously the "first" query also ran after
    # (and overwrote) the "containment" query, and how="first" on its own left
    # the result names unbound.
    if not how:
        query = it.all_overlaps_both
    elif how == "containment":
        query = it.all_containments_both
    else:
        query = it.first_overlap_both

    _self_indexes, _other_indexes = query(starts, ends, indexes)

    scdf = scdf.reindex(_self_indexes)
    ocdf = ocdf.reindex(_other_indexes)

    return scdf, ocdf
Пример #3
    def __init__(self, bedFile):
        """Index the intervals of a BED file in one NCLS per chromosome.

        Every line is split on whitespace and its first three fields are read
        as chromosome, start and end.  The resulting indexes allow a fast
        check of whether a read falls within a specified area.

        Args:
            bedFile: Path of the BED file to read.

        Raises:
            ValueError: If a start or end field is not an integer.
        """
        super(BedObject, self).__init__()

        # The hard work is done by NCLS (https://github.com/biocore-ntnu/ncls),
        # which is also used by the pyranges module.
        self.__ncls = {}

        # Group the coordinates by chromosome first.  This way the file does
        # not have to be sorted by chromosome, and a chromosome that shows up
        # again later extends its index instead of replacing it.
        coordsByChr = {}
        with open(bedFile) as f:
            for line in f:
                # break the line into fields
                lineArray = line.strip().split()
                if len(lineArray) < 3:
                    # blank or incomplete line: nothing to index
                    continue
                chrStarts, chrEnds = coordsByChr.setdefault(
                    lineArray[0], ([], []))
                chrStarts.append(int(lineArray[1]))
                chrEnds.append(int(lineArray[2]))

        for currChr, (chrStarts, chrEnds) in coordsByChr.items():
            # convert to arrays with an explicit dtype (ncls needs that)
            starts = array(chrStarts, dtype=int64)
            # add one to the end, to have inclusive ends
            ends = array(chrEnds, dtype=int64) + 1

            # the third argument is the ids, which could be anything numeric;
            # the start coordinates are reused for it
            self.__ncls[currChr] = NCLS(starts, ends, starts)
Пример #4
def _first_df(scdf, ocdf, how=False, invert=False, n_jobs=1, **kwargs):
    """Filter ``scdf`` by whether its intervals hit anything in ``ocdf``.

    Args:
        scdf: Frame with ``Start`` and ``End`` columns to filter.
        ocdf: Frame with ``Start`` and ``End`` columns used to build the index.
        how: ``"containment"`` uses the containment query; any other accepted
            value (falsy or ``"first"``) uses the overlap query.
        invert: When true, return the rows of ``scdf`` that were NOT reported
            by the query instead of the ones that were.
        n_jobs: When greater than 1, ``scdf`` is deep-copied before use.
        **kwargs: Ignored.

    Returns:
        The selected rows of ``scdf``.
    """
    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    if n_jobs > 1:
        scdf = scdf.copy(deep=True)

    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    # how="first" used to leave ``_indexes`` unbound; for a pure hit/no-hit
    # filter it is served by the plain overlap query.
    if how == "containment":
        _indexes = it.has_containments(starts, ends, indexes)
    else:
        _indexes = it.has_overlaps(starts, ends, indexes)

    # The inverted selection used to sit behind an unconditional return and
    # could never run.
    if invert:
        return scdf.loc[~scdf.index.isin(_indexes)]
    return scdf.reindex(_indexes)
Пример #5
def _overlap(scdf, ocdf, kwargs):
    """Select the rows of ``scdf`` that overlap (or do not overlap) ``ocdf``.

    Args:
        scdf: Frame with ``Start`` and ``End`` columns to filter.
        ocdf: Frame with ``Start`` and ``End`` columns used to build the index.
        kwargs: Mapping with the required keys ``"invert"`` and ``"how"`` and
            the optional key ``"return_indexes"`` (default ``False``).
            ``how`` is falsy for all overlaps, ``"containment"`` or
            ``"first"``.

    Returns:
        ``None`` if either frame is empty.  Otherwise the selected index
        labels when ``return_indexes`` is true, else ``scdf`` reindexed by
        those labels.

    Raises:
        KeyError: If ``"invert"`` is missing, or ``"how"`` is missing while
            both frames are non-empty.
    """
    invert = kwargs["invert"]
    return_indexes = kwargs.get("return_indexes", False)

    if scdf.empty or ocdf.empty:
        return None

    how = kwargs["how"]

    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    # Exactly one query must run; the overlap query used to run after the
    # containment query and overwrite its result.
    if not how:
        _indexes = it.all_overlaps_self(starts, ends, indexes)
    elif how == "containment":
        _indexes = it.has_containment(starts, ends, indexes)
    else:
        _indexes = it.has_overlaps(starts, ends, indexes)

    if invert:
        _indexes = scdf.index.difference(_indexes)

    if return_indexes:
        return _indexes

    return scdf.reindex(_indexes)
Пример #6
    def test_all_containments_both():
        """Querying an NCLS with its own int64 intervals reports ids 0 and 1 on both sides."""
        interval_starts = np.array([5, 10], dtype=np.int64)
        interval_ends = np.array([6, 50], dtype=np.int64)
        interval_ids = np.array([0, 1], dtype=np.int64)

        tree = NCLS(interval_starts, interval_ends, interval_ids)
        query_side, index_side = tree.all_containments_both(
            interval_starts, interval_ends, interval_ids)

        expected = [0, 1]
        assert list(query_side) == expected
        assert list(index_side) == expected
Пример #7
def test_all_containments_both():
    """Self-containment query with coordinates in the billions reports ids 0 and 1."""
    interval_starts = np.array([1291845632, 3002335232], dtype=int)
    interval_ends = np.array([1292894207, 3002597375], dtype=int)
    interval_ids = np.array([0, 1], dtype=int)

    tree = NCLS(interval_starts, interval_ends, interval_ids)
    query_side, index_side = tree.all_containments_both(
        interval_starts, interval_ends, interval_ids)

    expected = [0, 1]
    assert list(query_side) == expected
    assert list(index_side) == expected
Пример #8
 def __init__(self, starts=None, ends=None, indices=None, reduce=False):
     """Build the wrapped NCLS, or leave ``self.ncls`` as ``None``.

     Args:
         starts: Interval starts; when ``None`` no index is built.
         ends: Interval ends; when ``None`` each end is ``start + 1``.
         indices: Interval ids; when ``None`` no index is built.
         reduce: When true, the ``(start, end, index)`` triples are passed
             through ``merge_overlaps`` before indexing.
     """
     self.ncls = None
     if starts is None or indices is None:
         return

     if ends is None:
         ends = [begin + 1 for begin in starts]

     if reduce:
         merged = merge_overlaps(zip(starts, ends, indices))
         starts, ends, indices = list(zip(*merged))

     # NCLS is fed 64-bit integer arrays
     start_arr = np.array(starts, dtype='i8')
     end_arr = np.array(ends, dtype='i8')
     index_arr = np.array(indices, dtype='i8')
     self.ncls = NCLS(start_arr, end_arr, index_arr)
    def __init__(self, node_type, node_id, nodes_list, is_robot=False):
        """Group several nodes under one id and index their timestep ranges.

        Args:
            node_type: Forwarded to the parent constructor.
            node_id: Forwarded to the parent constructor.
            nodes_list: The member nodes; each one gets ``is_robot`` assigned
                and must expose ``first_timestep`` and ``last_timestep``.
            is_robot: Flag forwarded to the parent and copied onto every member.
        """
        super(MultiNode, self).__init__(node_type, node_id, data=None, is_robot=is_robot)
        self.nodes_list = nodes_list
        for member in self.nodes_list:
            member.is_robot = is_robot

        # The multi-node spans from the earliest first timestep to the
        # latest last timestep of its members.
        self.first_timestep = min(member.first_timestep for member in self.nodes_list)
        self._last_timestep = max(member.last_timestep for member in self.nodes_list)

        # One interval per member, identified by its position in nodes_list.
        member_starts = np.array(
            [member.first_timestep for member in self.nodes_list], dtype=np.int64)
        member_ends = np.array(
            [member.last_timestep for member in self.nodes_list], dtype=np.int64)
        member_ids = np.arange(len(self.nodes_list), dtype=np.int64)
        self.interval_tree = NCLS(member_starts, member_ends, member_ids)
Пример #10
def _intersection(scdf, ocdf, kwargs):
    """Clip the intervals of ``scdf`` to the parts they share with ``ocdf``.

    For every pair reported by the NCLS query, the result row is the ``scdf``
    row with ``Start`` set to the larger of the two starts and ``End`` set to
    the smaller of the two ends.

    Args:
        scdf: Frame with ``Start`` and ``End`` columns to clip.
        ocdf: Frame with ``Start`` and ``End`` columns used to build the index.
        kwargs: Mapping with the required key ``"how"``: falsy for all
            overlaps, ``"containment"`` or ``"first"``.

    Returns:
        The clipped frame, or ``None`` when either input is empty or no pair
        was found.
    """
    how = kwargs["how"]

    if ocdf.empty or scdf.empty:
        return None

    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    in_dtype = ocdf.Start.dtype

    oncls = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        query = oncls.all_overlaps_both
    elif how == "containment":
        query = oncls.all_containments_both
    else:  # "first" is the only value left after the assertion
        query = oncls.first_overlap_both

    _self_indexes, _other_indexes = query(starts, ends, indexes)

    scdf, ocdf = scdf.reindex(_self_indexes), ocdf.reindex(_other_indexes)

    # NOTE(review): the tails of these two constructors were truncated in the
    # source; the ``index``/``dtype`` arguments are reconstructed (``in_dtype``
    # was otherwise unused) -- confirm against upstream.
    new_starts = pd.Series(np.where(scdf.Start.values > ocdf.Start.values,
                                    scdf.Start.values, ocdf.Start.values),
                           index=scdf.index,
                           dtype=in_dtype)

    new_ends = pd.Series(np.where(scdf.End.values < ocdf.End.values,
                                  scdf.End.values, ocdf.End.values),
                         index=scdf.index,
                         dtype=in_dtype)

    # Silence the chained-assignment warning only for these two writes and
    # restore whatever setting the caller had, even if an exception is raised.
    with pd.option_context("mode.chained_assignment", None):
        scdf.loc[:, "Start"] = new_starts
        scdf.loc[:, "End"] = new_ends

    if scdf.empty:
        return None
    return scdf
Пример #11
    def test_ncls():
        """Check ``find_overlap`` and ``all_overlaps_both`` with coordinates up to 2**31 - 1.

        NOTE(review): ``starts``, ``ends`` and ``ids`` are not assigned inside
        this function; presumably they come from an enclosing scope or their
        definitions were lost -- confirm.
        """
        print(starts, ends, ids)

        tree = NCLS(starts, ends, ids)

        below_first_interval = list(tree.find_overlap(0, 2))
        assert below_first_interval == []

        whole_range = list(tree.find_overlap(0, 2_147_483_647))
        assert whole_range == [(5, 6, 0), (2_147_483_645, 2_147_483_646, 3)]

        query_ids, index_ids = tree.all_overlaps_both(starts, ends, ids)
        assert list(query_ids) == [0, 3]
        assert list(index_ids) == [0, 3]
Пример #12
def _number_overlapping(scdf, ocdf, kwargs):
    """Append a ``NumberOverlaps`` column counting hits in ``ocdf`` per row.

    Args:
        scdf: Frame with ``Start`` and ``End`` columns to annotate.
        ocdf: Frame with ``Start`` and ``End`` columns used to build the index.
        kwargs: Mapping with the optional key ``"keep_nonoverlapping"``
            (default ``True``).  When true, rows without any hit are kept
            with a count of 0; when false they are left out.

    Returns:
        ``None`` when ``scdf`` is empty, or when ``ocdf`` is empty and
        non-overlapping rows are not kept.  Otherwise a new frame with
        ``NumberOverlaps`` as its last column; ``scdf`` is not modified.
    """
    keep_nonoverlapping = kwargs.get("keep_nonoverlapping", True)

    if scdf.empty:
        return None
    if ocdf.empty:
        # Nothing can overlap.  This early exit used to fall through to the
        # index query when non-overlapping rows were not requested.
        if not keep_nonoverlapping:
            return None
        df = scdf.copy()
        df.insert(df.shape[1], "NumberOverlaps", 0)
        return df

    oncls = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    _self_indexes, _other_indexes = oncls.all_overlaps_both(
        scdf.Start.values, scdf.End.values, scdf.index.values)

    # one entry per reported pair, so the count per label is the hit count
    counts = pd.Series(_self_indexes).value_counts()

    if keep_nonoverlapping:
        df = scdf.copy()
        per_row = counts.reindex(df.index, fill_value=0)
    else:
        # one row per reported pair, as selected by the label lookup
        df = scdf.loc[_self_indexes]
        per_row = counts.reindex(df.index)

    df.insert(df.shape[1], "NumberOverlaps", per_row.values)

    return df
Пример #13
class NestedContainmentList(object):
    """Thin wrapper around an ``NCLS`` index that may also be empty.

    An instance built without ``starts`` or without ``indices`` holds no
    index (``self.ncls`` is ``None``) and reports no overlaps.
    """

    def __init__(self, starts=None, ends=None, indices=None, reduce=False):
        """Build the wrapped index.

        Args:
            starts: Interval starts; when ``None`` no index is built.
            ends: Interval ends; when ``None`` each end is ``start + 1``.
            indices: Interval ids; when ``None`` no index is built.
            reduce: When true, the ``(start, end, index)`` triples are passed
                through ``merge_overlaps`` before indexing.
        """
        self.ncls = None
        if starts is not None and indices is not None:
            if ends is None:
                ends = [s + 1 for s in starts]
            if reduce:
                starts, ends, indices = list(
                    zip(*merge_overlaps(zip(starts, ends, indices))))
            starts = np.array(starts, dtype='i8')
            ends = np.array(ends, dtype='i8')
            indices = np.array(indices, dtype='i8')
            self.ncls = NCLS(starts, ends, indices)

    def find_overlaps(self, start, end):
        """Return a list of everything ``NCLS.find_overlap`` yields for the range.

        NOTE(review): the original loop body was lost; collecting the yielded
        items unchanged is a reconstruction -- confirm against upstream.
        """
        if self.ncls is None:
            # we allow for empty objects, in which case nothing overlaps
            # use case: non-matching seqids
            return []
        return list(self.ncls.find_overlap(start, end))

    @staticmethod
    def from_intervals(intervals, reduce=False):
        """Build an instance from an iterable of ``(start, end, index)`` triples.

        An empty iterable gives an empty instance instead of failing on the
        unpacking of ``zip(*intervals)``.
        """
        intervals = list(intervals)
        if not intervals:
            return NestedContainmentList()
        starts, ends, indices = zip(*intervals)
        starts = np.array(starts, dtype='i8')
        ends = np.array(ends, dtype='i8')
        indices = np.array(indices, dtype='i8')
        obj = NestedContainmentList(starts, ends, indices, reduce=reduce)
        return obj
Пример #14
    def ncls_overlap(self, decimal_places=5, start_idx=0, end_idx=None):
        """Find overlapping pairs between a slice of the intervals and all of them.

        ``self.start`` and ``self.end`` are multiplied by ``10**decimal_places``
        and cast to integers, because NCLS works with integer coordinates.
        The index is built from every interval; only the entries in
        ``[start_idx:end_idx]`` are used as queries.

        Args:
            decimal_places: Number of decimal places of the time axis that are
                kept when converting to integers.
            start_idx: First position of the query slice.
            end_idx: End position (exclusive) of the query slice; ``None``
                means the end of the list.

        Returns:
            A two-column array: the two index arrays returned by
            ``all_overlaps_both`` stacked as columns.
        """
        # if end_idx is None, use the end of the list (this used to be a
        # comparison ``==`` with no effect instead of an assignment)
        if end_idx is None:
            end_idx = len(self.start)

        # convert the retention times to integers so that they are compatible with ncls
        scale = 10**decimal_places
        int_start = (self.start * scale).astype(int)
        int_end = (self.end * scale).astype(int)

        # create the ncls object over all intervals
        ncls = NCLS(int_start, int_end, self.idx)

        # query only the requested slice
        window = slice(start_idx, end_idx)
        pairs = ncls.all_overlaps_both(int_start[window], int_end[window],
                                       self.idx[window])

        # column_stack puts the two index lists side by side (easier
        # iteration for np.vectorize)
        return np.column_stack(pairs)
Пример #15
def test_ncls():
    """Check ``find_overlap`` and ``all_overlaps_both`` with coordinates near 2**63.

    NOTE(review): ``starts``, ``ends`` and ``ids`` are not assigned inside
    this function; presumably they are module-level or their definitions
    were lost -- confirm.
    """
    print(starts, ends, ids)

    tree = NCLS(starts, ends, ids)

    below_first_interval = list(tree.find_overlap(0, 2))
    assert below_first_interval == []

    print("aaa", list(tree.find_overlap(9_223_372_036_854_775_805, 9_223_372_036_854_775_806)))

    whole_range = list(tree.find_overlap(0, 9_223_372_036_854_775_806))
    assert whole_range == [(5, 6, 2147483647), (9223372036854775805, 9223372036854775807, 3)]

    query_ids, index_ids = tree.all_overlaps_both(starts, ends, ids)
    assert list(query_ids) == [2147483647, 3]
    assert list(index_ids) == [2147483647, 3]
Пример #16
    def __init__(self, predicate, records):
        """Group records by chromosome and build one NCLS per chromosome.

        ``predicate`` is called on every record; the ``chrom`` attribute of
        its result selects the per-chromosome ``(starts, ends, ids)`` lists.

        NOTE(review): this block looks truncated.  The body of the
        ``genome_pos is None`` check and the statements that fill ``starts``,
        ``ends``, ``ids`` and ``self.records`` are not present, so as shown
        the lists stay empty.  Restore them from upstream before relying on
        this code.
        """
        self.predicate = predicate
        self.records = []

        # chrom -> (starts, ends, ids), collected before the indexes are built
        working_tree_map = {}

        for idx, record in enumerate(records):
            genome_pos = predicate(record)

            # NOTE(review): the body of this check is missing (presumably it
            # skipped the record) -- confirm.
            if genome_pos is None:

            chrom = genome_pos.chrom

            if not chrom in working_tree_map:
                # (starts, ends, ids)
                working_tree_map[chrom] = ([], [], [])

            starts, ends, ids = working_tree_map[chrom]

            # NOTE(review): nothing is appended to the lists here; lines
            # appear to be missing.

            # rebinding idx has no lasting effect: enumerate() assigns it
            # again on the next iteration
            idx += 1

        tree_map = {}

        # NOTE(review): ``np.long`` is not available in every NumPy release
        # (it was removed in 1.24) -- consider an explicit ``np.int64``.
        for chrom, (starts, ends, ids) in working_tree_map.items():
            tree_map[chrom] = NCLS(np.array(starts, dtype=np.long),
                                   np.array(ends, dtype=np.long),
                                   np.array(ids, dtype=np.long))

        self.tree_map = tree_map
def gmap_parse_ncls(gmapFile, cutoff):
    """Index the ``cDNA_match`` lines of a GMAP GFF3 file by coordinate range.

    Lines starting with ``#``, lines whose third column is not
    ``cDNA_match`` and lines whose sixth column (identity) is below
    ``cutoff`` are skipped.  Every remaining line gets a running integer id.

    Coordinates are not unique across contigs, so results retrieved from the
    index must be filtered by contig id through the returned dictionary.

    Args:
        gmapFile: Path of the tab-separated GFF3 file.
        cutoff: Minimum identity a hit needs in order to be kept.

    Returns:
        Tuple ``(ncls, gmapLoc)``: the NCLS built from the kept hits and a
        dict that maps each hit id to its contig id (first column).
    """
    gmapLoc = {}
    starts = []
    ends = []
    ids = []
    ongoingCount = 0
    with open(gmapFile, 'r') as fileIn:
        for line in fileIn:
            # Skip unnecessary lines
            if line.startswith('#'):
                continue
            sl = line.split('\t')
            # Too short to hold an identity column (e.g. a blank line), or
            # not a cDNA_match feature
            if len(sl) < 6 or sl[2] != 'cDNA_match':
                continue
            # Get details from the line
            contigID = sl[0]
            contigStart = int(sl[3])
            contigStop = int(sl[4])
            identity = float(sl[5])
            # Only hold on to hits that will pass the cutoff check
            if identity < cutoff:
                continue
            # Add to our NCLS input.  The end is stored as stop + 1 because
            # the GFF3 stop coordinate is 1-based.
            starts.append(contigStart)
            ends.append(contigStop + 1)
            ids.append(ongoingCount)
            gmapLoc[ongoingCount] = contigID
            ongoingCount += 1
    # Build the NCLS object.  The dtype is fixed so that a file without any
    # kept hit still produces integer arrays.
    starts = pd.Series(starts, dtype='int64')
    ends = pd.Series(ends, dtype='int64')
    ids = pd.Series(ids, dtype='int64')
    ncls = NCLS(starts.values, ends.values, ids.values)
    return ncls, gmapLoc
Пример #18
def main(argv):
    """Count, for every interval of one BED file, the overlaps in another.

    The first file is loaded and indexed per contig with NCLS; the second is
    streamed line by line.  For each streamed line the three BED columns are
    printed followed by the number of overlapping loaded intervals (0 when
    the contig was never loaded).  Timings go to standard error.

    Args:
        argv: ``[program, loaded.bed, streamed.bed]``.

    Raises:
        SystemExit: With status 1 when fewer than two file arguments are given.
    """
    if len(argv) < 3:
        print("Usage: bedcov.py <loaded.bed> <streamed.bed>")
        # stop here; continuing would fail on the missing arguments
        sys.exit(1)

    # contig -> [starts, ends, ids, index]
    bed, i = {}, 0
    start = timer()
    with open(argv[1]) as fp:
        for line in fp:
            # strip only the line terminator, so a last line without one
            # does not lose a digit
            t = line.rstrip("\n").split("\t")
            if t[0] not in bed:
                bed[t[0]] = [[], [], [], None]
            bed[t[0]][0].append(int(t[1]))
            bed[t[0]][1].append(int(t[2]))
            bed[t[0]][2].append(i)
            i += 1
    sys.stderr.write("Read in {} sec\n".format(timer() - start))
    start = timer()
    for ctg in bed:
        # explicit 64-bit integers: the ``np.long`` alias is not available in
        # every NumPy release
        bed[ctg][3] = NCLS(np.array(bed[ctg][0], dtype=np.int64),
                           np.array(bed[ctg][1], dtype=np.int64),
                           np.array(bed[ctg][2], dtype=np.int64))
    sys.stderr.write("Index in {} sec\n".format(timer() - start))
    start = timer()
    with open(argv[2]) as fp:
        for line in fp:
            t = line.rstrip("\n").split("\t")
            if t[0] not in bed:
                print("{}\t{}\t{}\t0".format(t[0], t[1], t[2]))
            else:
                it = bed[t[0]][3].find_overlap(int(t[1]), int(t[2]))
                cnt = sum(1 for _ in it)
                print("{}\t{}\t{}\t{}".format(t[0], t[1], t[2], cnt))
    sys.stderr.write("Query in {} sec\n".format(timer() - start))
Пример #19
def buildNCLSindex(sites):
    """Build an NCLS with one interval ``[site, site + 1)`` per entry of ``sites``.

    The id of each interval is its position in ``sites``.
    """
    site_starts = array(sites, dtype=int64)
    site_ends = array(site_starts + 1, dtype=int64)
    positions = arange(len(site_starts))

    return NCLS(site_starts, site_ends, positions)
Пример #20
    def overlap(self, im=True, decimal_places=5):
        """Find pairs of entries that overlap in retention time and in m/z (and IM).

        Retention-time overlaps are found with NCLS first; the surviving
        pairs are then filtered with the m/z (+ ion mobility) check.

        NOTE(review): several lines of this method were truncated in the
        source (the integer scaling and the ``else`` branches); they are
        reconstructed here -- confirm against upstream.

        Args:
            im: When true, pairs are checked with the combined m/z and ion
                mobility test; otherwise with ``mzTable.vec_idx_overlap``.
            decimal_places: Number of decimal places of the retention-time
                axis that are kept when converting to integers.

        Returns:
            The rows of the pair array that pass the check (an empty array
            when no two distinct entries overlap in retention time), passed
            through ``Precursor.unindex`` when a substitute index was used.
        """
        # If the table index is not simply 0..n-1, query with such a linear
        # index instead and translate back before returning.
        table_idx = self.retentionTable.idx
        linear_idx = np.arange(0, len(self.retentionTable.start))
        use_hidden = not np.all(table_idx == linear_idx)
        hidden_idx = linear_idx if use_hidden else table_idx

        # convert the retention times to integers so that they are compatible with ncls
        scale = 10**decimal_places
        ret_int_start = (self.retentionTable.start * scale).astype(int)
        ret_int_end = (self.retentionTable.end * scale).astype(int)

        # create the ncls object
        ncls = NCLS(ret_int_start, ret_int_end, hidden_idx)

        # find all pairwise retention time overlaps; column_stack puts the
        # two index lists side by side (easier iteration for np.vectorize)
        ret_idx = np.column_stack(
            ncls.all_overlaps_both(ret_int_start, ret_int_end, hidden_idx))

        # filter out pairs where x == y: they overlap, but are of no interest
        ret_idx = ret_idx[ret_idx[:, 0] != ret_idx[:, 1]]

        if ret_idx.size == 0:
            # no overlap in retention time, so nothing else to check
            rslt = np.array([])
        elif im:
            # with the im flag on, check both the mz and im dimensions
            rslt = ret_idx[self.__vecMzImOverlap(ret_idx[:, 0],
                                                 ret_idx[:, 1])]
        else:
            rslt = ret_idx[self.mzTable.vec_idx_overlap(
                ret_idx[:, 0], ret_idx[:, 1])]

        # translate back to the table's own index if needed
        if use_hidden:
            return Precursor.unindex(rslt, self.retentionTable.idx)
        return rslt
Пример #21
 def as_ncls_dict(self) -> Dict[Chrom, NCLS]:
     """Return a dict with one NCLS per group of ``self._obj.groupby("chrom")``.

     NOTE(review): the ``NCLS(`` call below is truncated -- its arguments
     and closing parenthesis are missing, so this block does not parse as
     shown.  Restore them from upstream.
     """
     res = {}
     for chrom, chrom_df in self._obj.groupby("chrom"):
         res[chrom] = NCLS(
     return res
Пример #22
def _subtraction(scdf, ocdf, **kwargs):
    """Replace ``Start``/``End`` of ``scdf`` rows using ``set_difference_helper``.

    Returns ``scdf`` unchanged when either frame is empty.  Requires the
    keyword ``strandedness``.

    NOTE(review): this block looks truncated -- the ``set_difference_helper``
    call is missing its final argument(s) and closing parenthesis, and the
    last ``return None`` follows another ``return`` at the same level (an
    ``else:`` was presumably lost).  Restore from upstream.
    """

    if ocdf.empty or scdf.empty:
        return scdf

    strandedness = kwargs["strandedness"]
    strand = True if strandedness else False

    # chromosome (and strand) are read from the first row and stored in the
    # local kwargs; nothing in the visible code reads them back
    chromosome = scdf.Chromosome.head(1).iloc[0]
    kwargs["chromosome"] = chromosome

    if "Strand" in ocdf and strand:
        strand = scdf.Strand.head(1).iloc[0]
        kwargs["strand"] = strand

    o = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    # NOTE(review): call truncated after the trailing comma
    idx_self, new_starts, new_ends = o.set_difference_helper(
        scdf.Start.values, scdf.End.values, scdf.index.values,

    # labels of scdf that the helper did not report at all
    missing_idx = pd.Index(scdf.index).difference(idx_self)

    # despite its name this mask selects the entries that are KEPT:
    # those whose new start is not -1
    idx_to_drop = new_starts != -1

    new_starts = new_starts[idx_to_drop]
    new_ends = new_ends[idx_to_drop]

    idx_self = idx_self[idx_to_drop]
    new_starts = pd.Series(new_starts, index=idx_self)
    new_ends = pd.Series(new_ends, index=idx_self)

    # sort everything by label so the positional ``.values`` assignment
    # below lines up with the rows selected by ``isin``
    scdf = scdf.reindex(missing_idx.union(idx_self)).sort_index()
    new_starts = new_starts.sort_index()
    new_ends = new_ends.sort_index()

    if len(idx_self):
        scdf.loc[scdf.index.isin(idx_self), "Start"] = new_starts.values
        scdf.loc[scdf.index.isin(idx_self), "End"] = new_ends.values

    if not scdf.empty:
        return scdf
        return None
Пример #23
 def filter_by_human_annotations(self, article, annotations):
     """Split ``annotations`` by whether they overlap ``article['annotations']``.

     Returns ``(new_annotations, num_filtered)``; the final assertion requires
     the two to add up to ``len(annotations)``.

     NOTE(review): this block looks truncated -- three statements inside the
     loop are cut off mid-expression and nothing is ever appended to
     ``new_annotations`` in the visible code (an ``append`` and an ``else:``
     were presumably lost).  Restore from upstream.
     """
     # interval index over the annotations stored on the article
     ncls = NCLS(*get_intervals(article['annotations']))
     new_annotations = []
     num_filtered = 0
     for annotation in annotations:
         entity_start, entity_end = get_start_end(annotation)
         matched_human_annotation = list(
             ncls.find_overlap(entity_start, entity_end))
         # NOTE(review): the lines below are cut off mid-expression
         if len(matched_human_annotation) == 0:
             human_annotation = article['annotations'][
             human_annotation_start, human_annotation_end = get_start_end(
             assert intersect(human_annotation_start, human_annotation_end,
                              entity_start, entity_end)
             num_filtered += 1
     # every annotation must have been either kept or counted as filtered
     assert len(new_annotations) + num_filtered == len(annotations)
     return new_annotations, num_filtered
Пример #24
def test_ncls():
    """Compare ``all_overlaps_both_stack`` with ``all_overlaps_both`` on one million intervals.

    NOTE(review): this block looks truncated -- the ``all_overlaps_both_stack``
    call is cut off, and ``indexes`` is only assigned in a commented-out line,
    so it is undefined as shown.  Restore from upstream.
    """
    # one million intervals [i, i + 100), with the start reused as id
    starts = pd.Series(range(0, int(1e6)))
    ends = starts + 100
    ids = starts

    print(starts, ends, ids)

    ncls = NCLS(starts.values, ends.values, ids.values)

    # starts = pd.Series([0, 4])
    # ends = pd.Series([2, 5])
    # indexes = pd.Series([98, 99])
    print(starts, ends, indexes)
    # NOTE(review): call truncated after the second argument
    it = ncls.all_overlaps_both_stack(starts.values, ends.values,
    it2 = ncls.all_overlaps_both(starts.values, ends.values, indexes.values)

    assert it == it2
Пример #25
def _both_indexes(scdf, ocdf, how=False):
    """Return the paired index arrays of an NCLS query of ``scdf`` against ``ocdf``.

    Args:
        scdf: Frame with ``Start`` and ``End`` columns used as the queries.
        ocdf: Frame with ``Start`` and ``End`` columns used to build the index.
        how: Falsy for all overlaps, ``"containment"`` for containments; any
            other accepted value (``"first"`` or a non-zero int) uses the
            first-overlap query.

    Returns:
        Tuple ``(_self_indexes, _other_indexes)`` exactly as returned by the
        NCLS query.
    """
    assert (how in "containment first".split() + [False, None]) or isinstance(
        how, int)
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    # Exactly one query must run.  Previously the first-overlap query also
    # ran after (and overwrote) the containment query, and other truthy
    # values of ``how`` left the result names unbound.
    if not how:
        query = it.all_overlaps_both
    elif how == "containment":
        query = it.all_containments_both
    else:
        query = it.first_overlap_both

    _self_indexes, _other_indexes = query(starts, ends, indexes)

    return _self_indexes, _other_indexes
Пример #26
    def overlap_against_window(self, exp, im=True, decimal_places=5):
        """Find overlaps between this object's retention-time intervals and the time table of ``exp.ms2``.

        The NCLS is built from this object's retention table and queried with
        the MS2 time intervals; the resulting pair array is flipped
        left-to-right and then filtered by an m/z (+ ion mobility) check.

        NOTE(review): this block looks truncated -- the four integer-scaling
        expressions, the ``all_overlaps_both`` call and both ``return``
        expressions are cut off mid-expression, and an ``else:`` before the
        second ``return`` was presumably lost.  Restore from upstream.
        """
        print("starting overlap against window")
        ms2 = exp.ms2
        # decimal_places = accuracy of the retention-time axis (how many decimal places to take into account)

        # convert the retention times to integers so that they are compatible with ncls
        # NOTE(review): the four expressions below are truncated
        ret_int_start = (self.retentionTable.start *
        ret_int_end = (self.retentionTable.end *

        ms2_time_int_start = (ms2.timeTable.start *
        ms2_time_int_end = (ms2.timeTable.end *


        # create the ncls object
        ncls = NCLS(ret_int_start, ret_int_end, self.retentionTable.idx)

        # find all pairwise retention time overlaps and store the two index lists as columns
        ret_idx = np.column_stack(
            ncls.all_overlaps_both(ms2_time_int_start, ms2_time_int_end,
        )  # column_stack puts the two lists side by side (easier iteration for np.vectorize)
        # swap the two columns
        ret_idx = np.fliplr(ret_idx)

        # if the im flag is on, check for overlap in both the mz and im dimensions
        if im:
            return ret_idx[self.vec_mz_im_overlap(ret_idx[:, 0],
                                                  ret_idx[:, 1],
            return ret_idx[self.mzTable.vec_idx_overlap(ret_idx[:, 0],
                                                        ret_idx[:, 1],
def create_ncls(seed):
    """Build an NCLS over ten million random intervals of width 100.

    Starts are drawn with ``randint(0, int(1e8), ...)`` and are reused as
    the interval ids.

    NOTE(review): ``seed`` is accepted but never used in the visible body;
    presumably a seeding call was lost -- confirm.
    """
    interval_count = int(1e7)
    interval_width = 100

    lower = randint(0, int(1e8), interval_count)
    upper = lower + interval_width

    return NCLS(lower, upper, lower)
Пример #28
def _both_dfs(scdf, ocdf, how=False, **kwargs):
    """Return the rows of ``scdf`` and ``ocdf`` that pair up in an NCLS query.

    Args:
        scdf: Frame with ``Start`` and ``End`` columns used as the queries.
        ocdf: Frame with ``Start`` and ``End`` columns used to build the index.
        how: Falsy for all overlaps, ``"containment"`` for containments,
            ``"first"`` for the first overlap per query.
        **kwargs: Ignored.

    Returns:
        Tuple ``(scdf_rows, ocdf_rows)`` selected by label with ``.loc`` from
        the index arrays returned by the NCLS query.
    """
    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    # Exactly one query must run.  Previously the "first" query also ran after
    # (and overwrote) the "containment" query, and how="first" on its own left
    # the result names unbound.
    if not how:
        query = it.all_overlaps_both
    elif how == "containment":
        query = it.all_containments_both
    else:
        query = it.first_overlap_both

    _self_indexes, _other_indexes = query(starts, ends, indexes)

    return scdf.loc[_self_indexes], ocdf.loc[_other_indexes]
Пример #29
def create_eij_ncls_dict(standard_event_dict):
    """Index the exon-intron junctions of all events per chromosome and strand.

    Args:
        standard_event_dict: Mapping ``event -> dict`` where each value has
            the keys ``"strand"``, ``"chrom"``, ``"included_ei_junctions"``
            and ``"excluded_ei_junctions"``.

    Returns:
        Tuple of three dicts:

        * ``chrom -> strand -> NCLS`` built from the junction positions
          minus 1 (to make them 0-based);
        * ``"<chrom>_<eij>_<strand>" -> set`` of ``"<event>_included"`` /
          ``"<event>_excluded"`` labels;
        * ``"<chrom>_<eij>_<strand>" -> 0`` (counters initialised to zero).
    """
    eij_by_chrom_strand = {}
    eij_indexed_event_dict = {}
    eij_only_count_dict = {}
    ncls_by_chrom_strand = {}

    # the included and the excluded junctions are handled identically apart
    # from the suffix attached to the event name
    junction_groups = (
        ("included_ei_junctions", "_included"),
        ("excluded_ei_junctions", "_excluded"),
    )

    # ``.items()`` instead of the Python 2 only ``.iteritems()``
    for event, event_val in standard_event_dict.items():

        strand = event_val["strand"]
        chrom = event_val["chrom"]

        for junction_key, suffix in junction_groups:
            for eij in event_val[junction_key]:

                eij_by_chrom_strand.setdefault(chrom, {}).setdefault(
                    strand, set()).add(int(eij))

                eij_index = chrom + "_" + str(eij) + "_" + strand

                # NOTE(review): the head of this statement was lost in the
                # source; only the ``set()).add(...)`` tail survived.
                eij_indexed_event_dict.setdefault(eij_index,
                                                  set()).add(event + suffix)

                eij_only_count_dict.setdefault(eij_index, 0)

    for chrom, chrom_dict in eij_by_chrom_strand.items():

        ncls_by_chrom_strand[chrom] = {}

        for strand, strand_dict in chrom_dict.items():

            starts = np.array(list(strand_dict)) - 1  ## to make 0-based
            # start, end and id are all the same position
            ends = starts
            ids = starts

            ncls_by_chrom_strand[chrom][strand] = NCLS(starts, ends, ids)

    return (ncls_by_chrom_strand, eij_indexed_event_dict, eij_only_count_dict)
Пример #30
def _subtraction(scdf, ocdf, **kwargs):
    """Replace ``Start``/``End`` of ``scdf`` rows using ``set_difference_helper``.

    ``ocdf`` is passed through ``_cluster`` before it is indexed.  Entries
    for which the helper reports a new start of ``-1`` are discarded; rows
    the helper does not report at all are kept unchanged.

    Args:
        scdf: Frame with ``Start`` and ``End`` columns to subtract from.
        ocdf: Frame with ``Start`` and ``End`` columns to subtract.
        **kwargs: Must contain ``"key"`` (parsed with ``parse_grpby_key``)
            and, when both frames are non-empty, ``"strandedness"``.

    Returns:
        ``scdf`` itself when either frame is empty, otherwise the reindexed
        frame with updated coordinates.
    """
    chromosome, strand = parse_grpby_key(kwargs["key"])

    if ocdf.empty or scdf.empty:
        return scdf

    # the strand parsed from the key is replaced by a plain flag
    strand = bool(kwargs["strandedness"])

    clustered = _cluster(ocdf, chromosome, strand)
    tree = NCLS(clustered.Start.values, clustered.End.values,
                clustered.index.values)

    hit_idx, trimmed_starts, trimmed_ends = tree.set_difference_helper(
        scdf.Start.values, scdf.End.values, scdf.index.values)

    # labels of scdf that the helper did not report at all
    untouched_idx = pd.Index(scdf.index).difference(hit_idx)

    # keep only the entries whose new start is not -1
    keep = trimmed_starts != -1
    hit_idx = hit_idx[keep]

    start_by_label = pd.Series(trimmed_starts[keep], index=hit_idx).sort_index()
    end_by_label = pd.Series(trimmed_ends[keep], index=hit_idx).sort_index()
    hit_idx = np.sort(hit_idx)

    scdf = scdf.reindex(untouched_idx.union(hit_idx))

    if len(hit_idx):
        touched = scdf.index.isin(hit_idx)
        scdf.loc[touched, "Start"] = start_by_label
        scdf.loc[touched, "End"] = end_by_label

    return scdf