Пример #1
0
def _coverage(scdf, ocdf, kwargs):
    """Return a copy of ``scdf`` with a trailing ``FractionOverlaps`` column.

    Parameters
    ----------
    scdf : pandas.DataFrame
        Frame with ``Start`` and ``End`` columns; never modified in place.
    ocdf : pandas.DataFrame
        Frame with ``Start`` and ``End`` columns used to build the NCLS.
    kwargs : dict
        Accepted for signature compatibility; not read by this function.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``scdf`` is empty. Otherwise the copy, where the new
        column is 0.0 everywhere if ``ocdf`` is empty, or else the value
        returned by ``NCLS.coverage`` divided by ``End - Start`` per row.
    """
    if scdf.empty:
        return None

    result = scdf.copy()
    new_column_position = result.shape[1]

    if ocdf.empty:
        result.insert(new_column_position, "FractionOverlaps", 0.0)
        return result

    self_starts = scdf.Start.values
    self_ends = scdf.End.values

    other_tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)
    covered = other_tree.coverage(self_starts, self_ends, scdf.index.values)

    # NaN results (e.g. from a 0 / 0 on zero-length rows) are mapped to 0.
    fractions = np.nan_to_num(
        (covered / (self_ends - self_starts)).astype("float64"))

    result.insert(new_column_position, "FractionOverlaps", fractions)
    return result
Пример #2
0
def _both_dfs(scdf, ocdf, how=False):
    """Return the rows of ``scdf`` and ``ocdf`` paired up by an NCLS query.

    ``ocdf`` is first renumbered with a fresh 0..n-1 index, so the ids stored
    in the NCLS are positions in that renumbered frame.

    ``how`` selects the NCLS query: a falsy value uses ``all_overlaps_both``,
    ``"containment"`` uses ``all_containments_both`` and anything else that
    passes the assertion (i.e. ``"first"``) uses ``first_overlap_both``.

    Returns a ``(scdf, ocdf)`` tuple, each reindexed by the ids the query
    returned for that side.
    """
    assert how in "containment first".split() + [False, None]

    ocdf = ocdf.reset_index(drop=True)
    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        query = tree.all_overlaps_both
    elif how == "containment":
        query = tree.all_containments_both
    else:
        query = tree.first_overlap_both

    self_hits, other_hits = query(scdf.Start.values, scdf.End.values,
                                  scdf.index.values)

    return scdf.reindex(self_hits), ocdf.reindex(other_hits)
Пример #3
0
    def __init__(self, bedFile):
        """Build one NCLS index per chromosome from a BED file.

        Each non-blank line is split on whitespace; field 0 is used as the
        chromosome name, fields 1 and 2 as integer start and end. The NCLS
        (https://github.com/biocore-ntnu/ncls, also used by pyranges) gives a
        fast check of whether a read falls within one of the listed areas.

        Intervals are grouped by chromosome name regardless of their order in
        the file, so a chromosome that reappears later in the file extends its
        index instead of replacing it.

        :param bedFile: path of the BED file to read.
        :raises ValueError: if a start or end field is not an integer.
        :raises IndexError: if a non-blank line has fewer than three fields.
        """
        super(BedObject, self).__init__()

        self.__ncls = {}

        # chromosome name -> ([starts], [ends]), in order of first appearance
        intervals = {}

        with open(bedFile) as f:
            for line in f:
                # break the line into fields
                lineArray = line.split()
                if not lineArray:
                    # blank lines carry no interval
                    continue

                chromStarts, chromEnds = intervals.setdefault(
                    lineArray[0], ([], []))
                chromStarts.append(int(lineArray[1]))
                chromEnds.append(int(lineArray[2]))

        for currChr, (chromStarts, chromEnds) in intervals.items():
            # convert to array with dtype (ncls needs that)
            starts = array(chromStarts, dtype=int64)
            # add one to the end, to have inclusive ends
            ends = array(chromEnds, dtype=int64) + 1

            # the third argument is the ids, which could be anything numeric;
            # the starts are reused for that
            self.__ncls[currChr] = NCLS(starts, ends, starts)
Пример #4
0
def _first_df(scdf, ocdf, how=False, invert=False, n_jobs=1, **kwargs):
    """Select the rows of ``scdf`` that do (or do not) hit an interval of ``ocdf``.

    Parameters
    ----------
    scdf, ocdf : pandas.DataFrame
        Frames with ``Start`` and ``End`` columns.
    how : {False, None, "containment", "first"}
        ``"containment"`` queries the NCLS with ``has_containments``; every
        other accepted value queries it with ``has_overlaps``.
    invert : bool
        When true, return the rows whose index is NOT among the ids reported
        by the query.
    n_jobs : int
        When greater than 1, ``scdf`` is deep-copied before it is used for the
        result.
    **kwargs
        Ignored.

    Returns
    -------
    pandas.DataFrame
        ``scdf`` reindexed by the reported ids, or (with ``invert``) the rows
        of ``scdf`` whose index is not among them.
    """
    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    if n_jobs > 1:
        scdf = scdf.copy(deep=True)

    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if how == "containment":
        _indexes = it.has_containments(starts, ends, indexes)
    else:
        # Covers both the falsy default and "first". "first" used to fall
        # through every branch and leave _indexes unbound.
        _indexes = it.has_overlaps(starts, ends, indexes)

    if invert:
        return scdf.loc[~scdf.index.isin(_indexes)]
    return scdf.reindex(_indexes)
Пример #5
0
def _overlap(scdf, ocdf, kwargs):
    """Pick the rows of ``scdf`` reported by an NCLS query against ``ocdf``.

    Keys read from ``kwargs``:

    * ``"invert"`` (required) - when true, the complement of the reported ids
      within ``scdf.index`` is used instead.
    * ``"return_indexes"`` (optional, default ``False``) - return the ids
      themselves rather than the reindexed frame.
    * ``"how"`` (required once both frames are non-empty) - a falsy value
      uses ``all_overlaps_self``, ``"containment"`` uses ``has_containment``
      and ``"first"`` uses ``has_overlaps``.

    Returns ``None`` when either frame is empty.
    """
    want_complement = kwargs["invert"]
    ids_only = kwargs.get("return_indexes", False)

    if scdf.empty or ocdf.empty:
        return None

    how = kwargs["how"]
    assert how in "containment first".split() + [False, None]

    query_args = (scdf.Start.values, scdf.End.values, scdf.index.values)
    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        hits = tree.all_overlaps_self(*query_args)
    elif how == "containment":
        hits = tree.has_containment(*query_args)
    else:
        hits = tree.has_overlaps(*query_args)

    if want_complement:
        hits = scdf.index.difference(hits)

    return hits if ids_only else scdf.reindex(hits)
Пример #6
0
    def test_all_containments_both():
        """Query an NCLS with its own two intervals via ``all_containments_both``.

        Both returned id sequences are expected to equal ``[0, 1]``.
        """
        coordinates = [(5, 6), (10, 50)]

        starts = np.array([lo for lo, _ in coordinates], dtype=np.int64)
        ends = np.array([hi for _, hi in coordinates], dtype=np.int64)
        ids = np.array([0, 1], dtype=np.int64)

        tree = NCLS(starts, ends, ids)
        contained, containers = tree.all_containments_both(starts, ends, ids)

        print(tree.intervals())

        expected = [0, 1]
        assert list(contained) == expected and expected == list(containers)
Пример #7
0
def test_all_containments_both():
    """Check ``all_containments_both`` with coordinates beyond the 32-bit range.

    The second interval starts above 2**31, so the arrays are created
    explicitly as ``np.int64``. The builtin ``int`` dtype is platform
    dependent and cannot hold these values where it maps to a 32-bit integer.

    Both returned id sequences are expected to equal ``[0, 1]``.
    """
    starts = np.array([1291845632, 3002335232], dtype=np.int64)
    ends = np.array([1292894207, 3002597375], dtype=np.int64)
    ids = np.array([0, 1], dtype=np.int64)

    ncls = NCLS(starts, ends, ids)
    subs, covers = ncls.all_containments_both(starts, ends, ids)

    print(ncls.intervals())

    assert list(subs) == [0, 1] == list(covers)
Пример #8
0
 def __init__(self, starts=None, ends=None, indices=None, reduce=False):
     """Build the wrapped NCLS, or leave ``self.ncls`` as ``None``.

     No NCLS is built unless both ``starts`` and ``indices`` are given.
     When ``ends`` is omitted every interval ends at ``start + 1``. With
     ``reduce`` the triples are passed through ``merge_overlaps`` first.
     """
     self.ncls = None
     if starts is None or indices is None:
         return

     if ends is None:
         ends = [s + 1 for s in starts]

     if reduce:
         merged = merge_overlaps(zip(starts, ends, indices))
         starts, ends, indices = list(zip(*merged))

     # The three columns are handed to NCLS as 8-byte integer arrays.
     starts, ends, indices = (np.array(column, dtype='i8')
                              for column in (starts, ends, indices))
     self.ncls = NCLS(starts, ends, indices)
    def __init__(self, node_type, node_id, nodes_list, is_robot=False):
        """Wrap several nodes as one and index their timestep ranges.

        The parent initialiser is called with ``data=None``. ``is_robot`` is
        propagated to every member of ``nodes_list``. ``first_timestep`` and
        ``_last_timestep`` are the minimum first and maximum last timestep of
        the members, and ``interval_tree`` is an NCLS over each member's
        (first_timestep, last_timestep) with the member's list position as id.
        """
        super(MultiNode, self).__init__(node_type, node_id, data=None, is_robot=is_robot)
        self.nodes_list = nodes_list

        for member in self.nodes_list:
            member.is_robot = is_robot

        first_steps = [member.first_timestep for member in self.nodes_list]
        last_steps = [member.last_timestep for member in self.nodes_list]

        self.first_timestep = min(first_steps)
        self._last_timestep = max(last_steps)

        self.interval_tree = NCLS(
            np.array(first_steps, dtype=np.int64),
            np.array(last_steps, dtype=np.int64),
            np.arange(len(self.nodes_list), dtype=np.int64),
        )
Пример #10
0
def _intersection(scdf, ocdf, kwargs):
    """Clip the rows of ``scdf`` to the intervals of ``ocdf`` they are paired with.

    Parameters
    ----------
    scdf, ocdf : pandas.DataFrame
        Frames with ``Start`` and ``End`` columns.
    kwargs : dict
        Must contain ``"how"``: a falsy value pairs rows with
        ``all_overlaps_both``, ``"containment"`` with
        ``all_containments_both`` and ``"first"`` with ``first_overlap_both``.

    Returns
    -------
    pandas.DataFrame or None
        One row per reported pair, taken from ``scdf``, whose ``Start`` is the
        larger of the two starts and whose ``End`` is the smaller of the two
        ends (both in the dtype of ``ocdf.Start``). ``None`` when either input
        is empty or no pair is reported.
    """
    how = kwargs["how"]

    if ocdf.empty or scdf.empty:
        return None

    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    in_dtype = ocdf.Start.dtype

    oncls = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        _self_indexes, _other_indexes = oncls.all_overlaps_both(
            starts, ends, indexes)
    elif how == "containment":
        _self_indexes, _other_indexes = oncls.all_containments_both(
            starts, ends, indexes)
    else:
        # The assertion above leaves "first" as the only remaining value.
        _self_indexes, _other_indexes = oncls.first_overlap_both(
            starts, ends, indexes)

    scdf, ocdf = scdf.reindex(_self_indexes), ocdf.reindex(_other_indexes)

    if scdf.empty:
        return None

    new_starts = pd.Series(np.where(scdf.Start.values > ocdf.Start.values,
                                    scdf.Start.values, ocdf.Start.values),
                           index=scdf.index,
                           dtype=in_dtype)

    new_ends = pd.Series(np.where(scdf.End.values < ocdf.End.values,
                                  scdf.End.values, ocdf.End.values),
                         index=scdf.index,
                         dtype=in_dtype)

    # Silence the chained-assignment check only for these two writes. The
    # context manager puts back whatever setting the caller had, even if an
    # assignment raises, instead of forcing the global option to 'warn'.
    with pd.option_context("mode.chained_assignment", None):
        scdf.loc[:, "Start"] = new_starts
        scdf.loc[:, "End"] = new_ends

    return scdf
Пример #11
0
    def test_ncls():
        """Exercise ``find_overlap`` and ``all_overlaps_both`` on the enclosing fixtures.

        ``starts``, ``ends`` and ``ids`` are taken from the surrounding scope.
        """
        print(starts, ends, ids)

        tree = NCLS(starts, ends, ids)
        print(tree)
        print(tree.intervals())

        nothing_expected = list(tree.find_overlap(0, 2))
        assert nothing_expected == []

        expected_hits = [(5, 6, 0), (2_147_483_645, 2_147_483_646, 3)]
        assert list(tree.find_overlap(0, 2_147_483_647)) == expected_hits

        first_ids, second_ids = tree.all_overlaps_both(starts, ends, ids)
        assert list(first_ids) == [0, 3]
        assert list(second_ids) == [0, 3]
Пример #12
0
def _number_overlapping(scdf, ocdf, kwargs):
    """Return a copy of ``scdf`` with a trailing ``NumberOverlaps`` column.

    The count for a row is the number of times its index appears among the
    self ids returned by ``NCLS.all_overlaps_both``.

    ``kwargs["keep_nonoverlapping"]`` (default ``True``) decides what happens
    to rows that are never reported: they are kept with a count of 0, or,
    when false, only the reported rows are selected.

    Returns ``None`` when ``scdf`` is empty, or when ``ocdf`` is empty and
    non-overlapping rows are not kept.
    """
    keep_nonoverlapping = kwargs.get("keep_nonoverlapping", True)

    if scdf.empty:
        return None

    if ocdf.empty:
        if not keep_nonoverlapping:
            return None
        untouched = scdf.copy()
        untouched.insert(untouched.shape[1], "NumberOverlaps", 0)
        return untouched

    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)
    hit_self, _hit_other = tree.all_overlaps_both(
        scdf.Start.values, scdf.End.values, scdf.index.values)

    # Count occurrences of each self id, ordered by first appearance.
    hits = pd.Series(hit_self)
    counts = hits.value_counts()[hits.unique()].reset_index()
    counts.columns = ["Index", "Count"]

    result = scdf.copy()

    if keep_nonoverlapping:
        absent = np.setdiff1d(scdf.index, hit_self)
        zero_rows = pd.DataFrame(data={"Index": absent, "Count": 0},
                                 index=absent)
        counts = pd.concat([counts, zero_rows])
    else:
        result = result.loc[hit_self]

    counts = counts.set_index("Index")
    result.insert(result.shape[1], "NumberOverlaps", counts)

    return result
Пример #13
0
class NestedContainmentList(object):
    """Thin wrapper around an ``NCLS`` that may also be empty.

    An instance with ``self.ncls is None`` holds no intervals; every lookup
    on it returns an empty list.
    """

    def __init__(self, starts=None, ends=None, indices=None, reduce=False):
        """Build the wrapped NCLS, or leave the object empty.

        No NCLS is built unless both ``starts`` and ``indices`` are given.
        When ``ends`` is omitted every interval ends at ``start + 1``. With
        ``reduce`` the triples are passed through ``merge_overlaps`` first.
        """
        self.ncls = None
        if starts is not None and indices is not None:
            if ends is None:
                ends = [s + 1 for s in starts]
            if reduce:
                starts, ends, indices = list(
                    zip(*merge_overlaps(zip(starts, ends, indices))))
            starts = np.array(starts, dtype='i8')
            ends = np.array(ends, dtype='i8')
            indices = np.array(indices, dtype='i8')
            self.ncls = NCLS(starts, ends, indices)

    def find_overlaps(self, start, end):
        """Return an ``Interval`` for each hit of ``NCLS.find_overlap(start, end)``."""
        if self.ncls is None:
            # we allow for empty objects, in which case nothing overlaps
            # use case: non-matching seqids
            return []
        return [
            Interval(*overlap)
            for overlap in self.ncls.find_overlap(start, end)
        ]

    @staticmethod
    def from_intervals(intervals, reduce=False):
        """Build an instance from an iterable of ``(start, end, index)`` triples.

        An empty iterable yields an empty object instead of failing on the
        tuple unpacking below.
        """
        intervals = list(intervals)
        if not intervals:
            return NestedContainmentList()
        starts, ends, indices = zip(*intervals)
        starts = np.array(starts, dtype='i8')
        ends = np.array(ends, dtype='i8')
        indices = np.array(indices, dtype='i8')
        return NestedContainmentList(starts, ends, indices, reduce=reduce)
Пример #14
0
    def ncls_overlap(self, decimal_places=5, start_idx=0, end_idx=None):
        """Find overlaps between a slice of the intervals and all of them.

        ``self.start`` and ``self.end`` are multiplied by
        ``10**decimal_places`` and cast to integers so that they can be
        indexed by NCLS; ``decimal_places`` is therefore the number of
        decimal places of the axis that is taken into account.

        :param decimal_places: decimal places kept when converting to integers.
        :param start_idx: first position of the queried slice.
        :param end_idx: end position (exclusive) of the queried slice;
            ``None`` means the end of the list.
        :return: N x 2 array from ``np.column_stack`` over the two id arrays
            returned by ``NCLS.all_overlaps_both``, one row per pair.
        """
        # if end_idx is None, use the end of the list (this was a no-op
        # `==` comparison before)
        if end_idx is None:
            end_idx = len(self.start)

        # convert the retention times to integers so that it is compatible with ncls
        scale = 10**decimal_places
        int_start = (self.start * scale).astype(int)
        int_end = (self.end * scale).astype(int)

        # create the ncls object over ALL intervals
        ncls = NCLS(int_start, int_end, self.idx)

        # query only the requested slice; column_stack puts the two id arrays
        # side by side (easier iteration for np.vectorize)
        queried = slice(start_idx, end_idx)
        return np.column_stack(
            ncls.all_overlaps_both(int_start[queried], int_end[queried],
                                   self.idx[queried]))
Пример #15
0
def test_ncls():
    """Exercise ``find_overlap`` and ``all_overlaps_both`` near the int64 limit.

    ``starts``, ``ends`` and ``ids`` are module-level fixtures.
    """
    print(starts, ends, ids)

    tree = NCLS(starts, ends, ids)
    print(tree)
    print(tree.intervals())

    nothing_expected = list(tree.find_overlap(0, 2))
    assert nothing_expected == []

    top_of_range = list(
        tree.find_overlap(9_223_372_036_854_775_805,
                          9_223_372_036_854_775_806))
    print("aaa", top_of_range)

    expected_hits = [(5, 6, 2147483647),
                     (9223372036854775805, 9223372036854775807, 3)]
    assert list(tree.find_overlap(0, 9_223_372_036_854_775_806)) == expected_hits

    first_ids, second_ids = tree.all_overlaps_both(starts, ends, ids)
    assert list(first_ids) == [2147483647, 3]
    assert list(second_ids) == [2147483647, 3]
Пример #16
0
    def __init__(self, predicate, records):
        """Index ``records`` by genomic position, one NCLS per chromosome.

        ``predicate(record)`` must return ``None`` (the record is skipped) or
        an object with ``chrom``, ``start`` and ``end`` attributes.

        After construction ``self.records`` holds the records that were not
        skipped and ``self.tree_map`` maps each chromosome to its NCLS.
        """
        self.predicate = predicate
        self.records = []

        # chrom -> (starts, ends, ids)
        working_tree_map = {}

        for idx, record in enumerate(records):
            genome_pos = predicate(record)

            if genome_pos is None:
                continue

            starts, ends, ids = working_tree_map.setdefault(
                genome_pos.chrom, ([], [], []))
            starts.append(genome_pos.start)
            ends.append(genome_pos.end)
            # NOTE(review): idx is the position in the incoming `records`,
            # not in self.records; the two differ as soon as a record is
            # skipped. Confirm which one lookups by id expect.
            ids.append(idx)

            self.records.append(record)

        # np.int64 rather than np.long: that alias is missing from NumPy
        # 1.24-1.26 and is platform-width where it exists.
        self.tree_map = {
            chrom: NCLS(np.array(starts, dtype=np.int64),
                        np.array(ends, dtype=np.int64),
                        np.array(ids, dtype=np.int64))
            for chrom, (starts, ends, ids) in working_tree_map.items()
        }
def gmap_parse_ncls(gmapFile, cutoff):
    """Index the ``cDNA_match`` lines of a GMAP gff3 file in an NCLS.

    Only lines whose third column is ``cDNA_match`` and whose sixth column
    (identity) is at least ``cutoff`` are kept. Comment lines starting with
    ``#`` and blank lines are skipped.

    :param gmapFile: path of the tab-separated gff3 file.
    :param cutoff: minimum identity a hit needs to be kept.
    :return: ``(ncls, gmapLoc)``. ``ncls`` holds one interval per kept line,
        from column 4 to column 5 + 1, with a running integer as id.
        ``gmapLoc`` maps that id to the contig ID (column 1). Coordinates are
        not unique across contigs, so query results are meant to be filtered
        by contig ID through ``gmapLoc``.
    """
    gmapLoc = {}
    starts = []
    ends = []
    ids = []
    ongoingCount = 0
    with open(gmapFile, 'r') as fileIn:
        for line in fileIn:
            # Skip unnecessary lines (comments, and blank lines that would
            # otherwise fail on the column lookups below)
            if line.startswith('#') or not line.strip():
                continue
            sl = line.split('\t')
            # No other line type is expected in a GMAP gff3 file produced
            # with PASA's settings; the check future-proofs the script.
            if sl[2] != 'cDNA_match':
                continue
            # Get details from line including start, stop, and identity
            contigID = sl[0]
            contigStart = int(sl[3])
            contigStop = int(sl[4])
            identity = float(sl[5])
            # Only hold onto hits that will pass the cutoff check
            if identity < cutoff:
                continue
            starts.append(contigStart)
            # +1 on the stop, since gff3 is 1-based while NCLS indexes 0-based
            ends.append(contigStop + 1)
            ids.append(ongoingCount)
            gmapLoc[ongoingCount] = contigID
            ongoingCount += 1
    # Build the NCLS object. The dtype is explicit so that an empty hit list
    # still yields integer arrays.
    starts = pd.Series(starts, dtype='int64')
    ends = pd.Series(ends, dtype='int64')
    ids = pd.Series(ids, dtype='int64')
    ncls = NCLS(starts.values, ends.values, ids.values)
    return ncls, gmapLoc
Пример #18
0
def main(argv):
	"""Count, for each interval of a streamed BED file, the overlaps in a loaded one.

	``argv[1]`` is read completely and indexed with one NCLS per contig.
	``argv[2]`` is then read line by line; for every line the contig, start
	and end are printed followed by the number of intervals returned by
	``find_overlap`` (0 for contigs absent from the loaded file).
	Timings of the three phases are written to stderr.

	Exits with status 1 after printing a usage message when fewer than two
	file arguments are given.
	"""
	if len(argv) < 3:
		print("Usage: bedcov.py <loaded.bed> <streamed.bed>")
		sys.exit(1)

	# contig -> [starts, ends, ids, NCLS]
	bed, i = {}, 0
	start = timer()
	with open(argv[1]) as fp:
		for line in fp:
			# Strip only the line terminator: slicing off the last character
			# corrupts the final field when the file has no trailing newline.
			t = line.rstrip("\r\n").split("\t")
			if len(t) < 3:
				# blank or truncated line, nothing to index
				continue
			if t[0] not in bed:
				bed[t[0]] = [[], [], [], None]
			bed[t[0]][0].append(int(t[1]))
			bed[t[0]][1].append(int(t[2]))
			bed[t[0]][2].append(i)
			i += 1
	sys.stderr.write("Read in {} sec\n".format(timer() - start))
	start = timer()
	for ctg in bed:
		# np.int64 instead of np.long, which is missing from NumPy 1.24-1.26
		bed[ctg][3] = NCLS(np.array(bed[ctg][0], dtype=np.int64), np.array(bed[ctg][1], dtype=np.int64), np.array(bed[ctg][2], dtype=np.int64))
	sys.stderr.write("Index in {} sec\n".format(timer() - start))
	start = timer()
	with open(argv[2]) as fp:
		for line in fp:
			t = line.rstrip("\r\n").split("\t")
			if len(t) < 3:
				continue
			if t[0] not in bed:
				print("{}\t{}\t{}\t0".format(t[0], t[1], t[2]))
			else:
				# int() replaces the Python-2-only long()
				it = bed[t[0]][3].find_overlap(int(t[1]), int(t[2]))
				cnt = sum(1 for _ in it)
				print("{}\t{}\t{}\t{}".format(t[0], t[1], t[2], cnt))
	sys.stderr.write("Query in {} sec\n".format(timer() - start))
Пример #19
0
def buildNCLSindex(sites):
    """Index single positions as one-long intervals in an NCLS.

    Every entry ``s`` of ``sites`` becomes the interval ``(s, s + 1)``; its
    id is its position in ``sites``.

    :param sites: sequence of integer positions.
    :return: the NCLS built from those intervals.
    """
    starts = array(sites, dtype=int64)
    ends = starts + 1
    # Explicit int64 so the ids match the coordinate arrays even where the
    # platform default integer is narrower.
    idxs = arange(len(starts), dtype=int64)

    return NCLS(starts, ends, idxs)
Пример #20
0
    def overlap(self, im=True, decimal_places=5):
        """Return index pairs of entries that overlap in retention time and m/z.

        Retention times are scaled by ``10**decimal_places`` and cast to
        integers so NCLS can index them. Pairs of an entry with itself are
        discarded. The remaining pairs are filtered with the m/z + ion
        mobility check when ``im`` is true, otherwise with the m/z check only.

        When ``retentionTable.idx`` is not simply 0..n-1, positions are used
        as ids for the lookup and the result is passed through
        ``Precursor.unindex`` before being returned.
        """
        table = self.retentionTable
        positions = np.arange(0, len(table.start))

        # A positional ("hidden") index is needed unless idx is already linear.
        use_hidden = not np.all(table.idx == positions)
        hidden_idx = positions if use_hidden else table.idx

        # convert the retention times to integers so that it is compatible with ncls
        scale = 10**decimal_places
        ret_int_start = (table.start * scale).astype(int)
        ret_int_end = (table.end * scale).astype(int)

        tree = NCLS(ret_int_start, ret_int_end, hidden_idx)

        # one row per overlapping pair, the two ids side by side
        pairs = np.column_stack(
            tree.all_overlaps_both(ret_int_start, ret_int_end, hidden_idx))

        # an entry always overlaps itself; those pairs are of no interest
        pairs = pairs[pairs[:, 0] != pairs[:, 1]]

        if pairs.size == 0:
            # no overlap in retention time, nothing further to check
            rslt = np.array([])
        elif im:
            rslt = pairs[self.__vecMzImOverlap(pairs[:, 0], pairs[:, 1])]
        else:
            rslt = pairs[self.mzTable.vec_idx_overlap(pairs[:, 0],
                                                      pairs[:, 1])]

        if not use_hidden:
            return rslt
        return Precursor.unindex(rslt, table.idx)
Пример #21
0
 def as_ncls_dict(self) -> Dict[Chrom, NCLS]:
     """Build one NCLS per ``chrom`` group of the wrapped frame.

     Each NCLS is built from the group's ``start``, ``end`` and index,
     all cast to int64.
     """
     def build(chrom_df):
         columns = (chrom_df.start, chrom_df.end, chrom_df.index)
         return NCLS(*(col.values.astype(np.int64) for col in columns))

     return {
         chrom: build(chrom_df)
         for chrom, chrom_df in self._obj.groupby("chrom")
     }
Пример #22
0
def _subtraction(scdf, ocdf, **kwargs):
    """Apply ``NCLS.set_difference_helper`` to ``scdf`` against ``ocdf``.

    ``scdf`` is returned unchanged when either frame is empty. Otherwise the
    helper reports, per self id, a new start and end. Entries whose new start
    is ``-1`` are discarded, and rows whose id was reported only through such
    entries are dropped. Rows never reported are kept as they are. The result
    is sorted by index.

    ``kwargs`` must contain ``"strandedness"``.

    Returns the resulting frame, or ``None`` when no row is left.
    """
    if ocdf.empty or scdf.empty:
        return scdf

    strand = bool(kwargs["strandedness"])

    kwargs["chromosome"] = scdf.Chromosome.head(1).iloc[0]

    if strand and "Strand" in ocdf:
        kwargs["strand"] = scdf.Strand.head(1).iloc[0]

    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    hit_idx, cut_starts, cut_ends = tree.set_difference_helper(
        scdf.Start.values, scdf.End.values, scdf.index.values,
        scdf.__num__.values)

    # ids never reported by the helper are carried over untouched
    untouched_idx = pd.Index(scdf.index).difference(hit_idx)

    # a new start of -1 marks an entry to discard
    keep = cut_starts != -1
    hit_idx = hit_idx[keep]

    cut_starts = pd.Series(cut_starts[keep], index=hit_idx).sort_index()
    cut_ends = pd.Series(cut_ends[keep], index=hit_idx).sort_index()

    scdf = scdf.reindex(untouched_idx.union(hit_idx)).sort_index()

    if len(hit_idx):
        changed = scdf.index.isin(hit_idx)
        scdf.loc[changed, "Start"] = cut_starts.values
        scdf.loc[changed, "End"] = cut_ends.values

    return None if scdf.empty else scdf
Пример #23
0
 def filter_by_human_annotations(self, article, annotations):
     """Drop the annotations that overlap one of the article's own annotations.

     An NCLS is built from ``get_intervals(article['annotations'])``. Every
     annotation whose span has no hit in it is kept; the others are counted.

     Returns ``(kept_annotations, number_dropped)``.
     """
     human_annotations = article['annotations']
     tree = NCLS(*get_intervals(human_annotations))

     kept = []
     dropped = 0
     for candidate in annotations:
         span_start, span_end = get_start_end(candidate)
         hits = list(tree.find_overlap(span_start, span_end))
         if not hits:
             kept.append(candidate)
             continue

         # Sanity check: the first hit's id indexes article['annotations'],
         # and that annotation must really intersect the span.
         first_hit_id = hits[0][2]
         human_start, human_end = get_start_end(
             human_annotations[first_hit_id])
         assert intersect(human_start, human_end, span_start, span_end)
         dropped += 1

     assert len(kept) + dropped == len(annotations)
     return kept, dropped
Пример #24
0
def test_ncls():
    """Check that ``all_overlaps_both_stack`` agrees with ``all_overlaps_both``.

    One million intervals ``(i, i + 100)`` are indexed with their start as id
    and then queried against themselves with both methods.
    """
    starts = pd.Series(range(0, int(1e6)))
    ends = starts + 100
    ids = starts
    # The query uses the same ids as the index. `indexes` was previously
    # only defined in commented-out code, which made the test fail with a
    # NameError.
    indexes = ids

    print(starts, ends, ids)

    ncls = NCLS(starts.values, ends.values, ids.values)

    print(starts, ends, indexes)
    it = ncls.all_overlaps_both_stack(starts.values, ends.values,
                                      indexes.values)
    it2 = ncls.all_overlaps_both(starts.values, ends.values, indexes.values)

    print(it)
    print(it2)

    # Comparing the tuples directly with `==` has no well-defined truth
    # value for arrays, so compare component by component.
    # NOTE(review): assumes both methods return numpy arrays - confirm.
    assert len(it) == len(it2)
    for stacked, plain in zip(it, it2):
        assert len(stacked) == len(plain)
        assert (stacked == plain).all()
Пример #25
0
def _both_indexes(scdf, ocdf, how=False):
    """Return the paired ``(self_ids, other_ids)`` of an NCLS query.

    ``how`` selects the query run against the NCLS built from ``ocdf``: a
    falsy value uses ``all_overlaps_both``, ``"containment"`` uses
    ``all_containments_both`` and every other accepted value (``"first"`` or
    a non-zero int) uses ``first_overlap_both``.
    """
    accepted = "containment first".split() + [False, None]
    assert (how in accepted) or isinstance(how, int)

    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        query = tree.all_overlaps_both
    elif how == "containment":
        query = tree.all_containments_both
    else:
        query = tree.first_overlap_both

    self_hits, other_hits = query(scdf.Start.values, scdf.End.values,
                                  scdf.index.values)
    return self_hits, other_hits
Пример #26
0
    def overlap_against_window(self, exp, im=True, decimal_places=5):
        """Pair own entries with the MS2 windows of ``exp`` they overlap.

        Own retention times and the MS2 time table are scaled by
        ``10**decimal_places`` and cast to integers so NCLS can index them.
        The own intervals are indexed and the MS2 windows are used as
        queries; the resulting id columns are flipped left-to-right before
        filtering.

        The pairs are filtered with the m/z + ion mobility check when ``im``
        is true, otherwise with the m/z check only.
        """
        print("starting overlap against window")
        ms2 = exp.ms2
        scale = 10**decimal_places

        # convert the times to integers so that they are compatible with ncls
        ret_int_start = (self.retentionTable.start * scale).astype(int)
        ret_int_end = (self.retentionTable.end * scale).astype(int)

        ms2_time_int_start = (ms2.timeTable.start * scale).astype(int)
        ms2_time_int_end = (ms2.timeTable.end * scale).astype(int)

        print(ret_int_start)
        print(ret_int_end)

        tree = NCLS(ret_int_start, ret_int_end, self.retentionTable.idx)

        # one row per overlapping pair, then swap the two columns
        pairs = np.fliplr(
            np.column_stack(
                tree.all_overlaps_both(ms2_time_int_start, ms2_time_int_end,
                                       ms2.timeTable.idx)))

        left, right = pairs[:, 0], pairs[:, 1]
        if im:
            selector = self.vec_mz_im_overlap(left, right, idx2_data=ms2)
        else:
            selector = self.mzTable.vec_idx_overlap(left, right,
                                                    yData=ms2.mzTable)
        return pairs[selector]
def create_ncls(seed):
    """Build an NCLS of ten million random intervals of length 100.

    NumPy's global random state is seeded with ``seed``; starts are drawn
    with ``randint(0, int(1e8), ...)`` and are also used as ids.
    """
    np.random.seed(seed)

    interval_count = int(1e7)
    interval_starts = randint(0, int(1e8), interval_count)

    tree = NCLS(interval_starts, interval_starts + 100, interval_starts)

    print("returning")
    return tree
Пример #28
0
def _both_dfs(scdf, ocdf, how=False, **kwargs):
    """Return the rows of ``scdf`` and ``ocdf`` paired up by an NCLS query.

    ``how`` selects the query run against the NCLS built from ``ocdf``: a
    falsy value uses ``all_overlaps_both``, ``"containment"`` uses
    ``all_containments_both`` and ``"first"`` uses ``first_overlap_both``.
    Extra keyword arguments are ignored.

    Returns ``(scdf.loc[self_ids], ocdf.loc[other_ids])``.
    """
    assert how in "containment first".split() + [False, None]

    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        query = tree.all_overlaps_both
    elif how == "containment":
        query = tree.all_containments_both
    else:
        query = tree.first_overlap_both

    self_hits, other_hits = query(scdf.Start.values, scdf.End.values,
                                  scdf.index.values)

    return scdf.loc[self_hits], ocdf.loc[other_hits]
Пример #29
0
def create_eij_ncls_dict(standard_event_dict):
    """Index the exon-intron junctions of all events by chromosome and strand.

    Parameters
    ----------
    standard_event_dict : dict
        Maps an event name to a dict with the keys ``"strand"``, ``"chrom"``,
        ``"included_ei_junctions"`` and ``"excluded_ei_junctions"``.

    Returns
    -------
    tuple
        ``(ncls_by_chrom_strand, eij_indexed_event_dict, eij_only_count_dict)``

        * ``ncls_by_chrom_strand[chrom][strand]`` is an NCLS over the distinct
          junction positions of that chromosome and strand, shifted by -1
          to make them 0-based.
        * ``eij_indexed_event_dict`` maps ``"<chrom>_<junction>_<strand>"`` to
          the set of ``"<event>_included"`` / ``"<event>_excluded"`` labels.
        * ``eij_only_count_dict`` maps the same keys to 0.
    """
    eij_by_chrom_strand = {}
    eij_indexed_event_dict = {}
    eij_only_count_dict = {}
    ncls_by_chrom_strand = {}

    # (key in the event dict, suffix appended to the event name)
    junction_kinds = (("included_ei_junctions", "_included"),
                      ("excluded_ei_junctions", "_excluded"))

    # .items() works on Python 2 and 3; .iteritems() exists on Python 2 only.
    for event, event_val in standard_event_dict.items():

        strand = event_val["strand"]
        chrom = event_val["chrom"]

        for junction_key, suffix in junction_kinds:
            for eij in event_val[junction_key]:

                eij_by_chrom_strand.setdefault(chrom, {}).setdefault(
                    strand, set()).add(int(eij))

                eij_index = chrom + "_" + str(eij) + "_" + strand

                eij_indexed_event_dict.setdefault(eij_index,
                                                  set()).add(event + suffix)

                eij_only_count_dict.setdefault(eij_index, 0)

    for chrom, chrom_dict in eij_by_chrom_strand.items():

        ncls_by_chrom_strand[chrom] = {}

        for strand, junctions in chrom_dict.items():

            # to make 0-based; sorted only to make the array deterministic
            starts = np.array(sorted(junctions), dtype=np.int64) - 1
            # NOTE(review): start, end and id are the same array, i.e. the
            # intervals have zero length - confirm that queries are written
            # with this in mind.
            ncls_by_chrom_strand[chrom][strand] = NCLS(starts, starts, starts)

    return (ncls_by_chrom_strand, eij_indexed_event_dict, eij_only_count_dict)
Пример #30
0
def _subtraction(scdf, ocdf, **kwargs):
    """Apply ``NCLS.set_difference_helper`` to ``scdf`` against clustered ``ocdf``.

    ``kwargs`` must contain ``"key"`` (parsed with ``parse_grpby_key``) and,
    when both frames are non-empty, ``"strandedness"``.

    ``scdf`` is returned unchanged when either frame is empty. Otherwise
    ``ocdf`` is clustered first and the helper reports, per self id, a new
    start and end. Entries whose new start is ``-1`` are discarded, and rows
    whose id was reported only through such entries are dropped. Rows never
    reported are kept as they are.
    """
    chromosome, strand = parse_grpby_key(kwargs["key"])

    if ocdf.empty or scdf.empty:
        return scdf

    strand = bool(kwargs["strandedness"])

    clustered = _cluster(ocdf, chromosome, strand)
    tree = NCLS(clustered.Start.values, clustered.End.values,
                clustered.index.values)

    hit_idx, cut_starts, cut_ends = tree.set_difference_helper(
        scdf.Start.values, scdf.End.values, scdf.index.values)

    # ids never reported by the helper are carried over untouched
    untouched_idx = pd.Index(scdf.index).difference(hit_idx)

    # a new start of -1 marks an entry to discard
    keep = cut_starts != -1
    hit_idx = hit_idx[keep]

    cut_starts = pd.Series(cut_starts[keep], index=hit_idx).sort_index()
    cut_ends = pd.Series(cut_ends[keep], index=hit_idx).sort_index()
    hit_idx = np.sort(hit_idx)

    scdf = scdf.reindex(untouched_idx.union(hit_idx))

    if len(hit_idx):
        # assigned as Series, as in the original, so pandas aligns on index
        scdf.loc[scdf.index.isin(hit_idx), "Start"] = cut_starts
        scdf.loc[scdf.index.isin(hit_idx), "End"] = cut_ends

    return scdf