def test_comparisons(self):
    """Check equality and ordering of a SequenceRange against ranges and tuples."""
    base = same_00 = SequenceRange(self.pep_start, self.pep_stop)
    twin = SequenceRange(self.pep_start, self.pep_stop)
    longer_01 = SequenceRange(self.pep_start, self.pep_stop + 1)
    shifted_10 = SequenceRange(self.pep_start + 1, self.pep_stop)
    as_tuple = (self.pep_start, self.pep_stop)

    # --- equality / inequality (both operand orders are exercised) ---
    assert base == twin
    assert base == as_tuple
    assert as_tuple == base
    assert as_tuple != longer_01
    assert as_tuple != 123
    assert base is not None
    assert base not in (None, 66)
    # a tuple of strings must not compare equal to the range
    assert base != (str(self.pep_start), str(self.pep_stop))

    # --- ordering against an unrelated type has to raise ---
    with pytest.raises(TypeError):
        base < "Wrong type!!"
    with pytest.raises(TypeError):
        "Wrong type!!" < base
    with pytest.raises(TypeError):
        base > "Wrong type!!"

    # --- a range compared with itself ---
    assert not base < base and not base > base
    assert base <= base and base >= base and base == base

    # --- strict and non-strict ordering ---
    # expected order: (x, y) < (x, y + 1) < (x + 1, y)
    assert same_00 < longer_01 < shifted_10
    assert shifted_10 > longer_01 > same_00
    assert as_tuple < longer_01 < shifted_10 and as_tuple <= longer_01 <= shifted_10
    assert shifted_10 > longer_01 > as_tuple and shifted_10 >= longer_01 >= as_tuple
def test___sub__(self):
    """Check subtraction between SequenceRange and SequencePoint."""
    moved = SequenceRange(5, 20) - SequencePoint(3)
    assert moved == SequenceRange(3, 18)

    with pytest.raises(ValueError):
        # 20 - 5, 20 - 10 -> 15, 10 = start after stop, makes no sense!!
        inverted = SequencePoint(20) - SequenceRange(5, 10)
        inverted.validate()

    moved_again = SequenceRange(10, 15) - SequencePoint(5)
    assert moved_again == SequenceRange(6, 11)
def test___iter__(self):
    """Check that iterating a SequenceRange yields its points from start to stop."""
    span = SequenceRange(5, 10)
    # list(span) should be equivalent to list(span.__iter__())
    points = list(span)

    assert span.length == len(points)
    assert points[0].pos == span.start.pos
    assert points[-1].pos == span.stop.pos

    # a range with start == stop still yields its point first
    single = list(SequenceRange(5, 5))
    assert single[0] == SequencePoint(5)
def test___repr__(self):
    """Check repr() of a SequenceRange and of its pos / index attributes."""
    without_seq = SequenceRange(10, 20)
    assert repr(without_seq) == "SequenceRange(10, 20, seq=None)"

    seq = "A" * 11
    expected_with_seq = 'SequenceRange(10, 20, seq="{}")'.format(seq)
    assert repr(SequenceRange(10, 20, seq=seq)) == expected_with_seq

    assert repr(SequenceRange(10, 20).pos) == "(10, 20)"
    assert repr(SequenceRange.from_index(10, 20).index) == "(10, 20)"
def test___add__(self):
    """Check addition of ranges, points and plain ints in every operand order."""
    # range + point, and point + range
    range_plus_point = SequenceRange(2, 3) + SequencePoint(2)
    assert range_plus_point == SequenceRange(3, 4)
    point_plus_range = SequencePoint(2) + SequenceRange(2, 3)
    assert point_plus_range == SequenceRange(3, 4)

    # mixing in a plain int at the end, the front and the middle
    int_last = SequenceRange(2, 3) + SequencePoint(2) + 5
    assert int_last == SequenceRange(8, 9)
    int_first = 5 + SequenceRange(2, 3) + SequencePoint(2)
    assert int_first == SequenceRange(8, 9)
    int_middle = SequenceRange(2, 3) + 5 + SequencePoint(2)
    assert int_middle == SequenceRange(8, 9)
def test_math_seq(self):
    """Check what happens to ``seq`` when doing arithmetic on a SequenceRange."""
    origin = SequenceRange(12, 15, seq="EVIL")
    plus_two = SequenceRange(14, 17, seq="EVIL")
    minus_two = SequenceRange(10, 13, seq="EVIL")

    # shifting by a plain int should retain the seq
    assert origin + 2 == plus_two == 2 + origin
    assert origin - 2 == minus_two == -2 + origin

    # arithmetic with another range should reset seq to None
    added = origin + SequenceRange(1, 2)
    assert added.seq is None
    subtracted = origin - SequenceRange(1, 2)
    assert subtracted.seq is None
def test_from_sequence(self, glucagon_peptides, glucagon_seq):
    """Check SequenceRange.from_sequence on the class example and the glucagon peptides."""
    wanted = SequenceRange(self.pep_start, self.pep_stop, seq=self.pep_seq)
    found = SequenceRange.from_sequence(self.protein_seq, self.pep_seq)
    assert found == wanted

    # a peptide that does not occur in the protein must raise
    with pytest.raises(IndexError):
        SequenceRange.from_sequence('PROTEINSEQ', "PEPTIDESEQ")

    # only the sequence of each fixture entry is used here
    for _start, _stop, peptide_seq in glucagon_peptides:
        located = SequenceRange.from_sequence(glucagon_seq, peptide_seq)
        self._assert(located, peptide_seq, glucagon_seq)
def test_from_index_and_length(self, glucagon_peptides, glucagon_seq):
    """Check SequenceRange.from_index when given an index plus a length."""
    # simple case: the class level example peptide
    offset = self.protein_seq.index(self.pep_seq)
    built = SequenceRange.from_index(offset, length=len(self.pep_seq))
    self._assert(built, self.pep_seq, self.protein_seq)

    # every glucagon peptide (only the sequence of each entry is used)
    for _start, _stop, peptide_seq in glucagon_peptides:
        built = SequenceRange.from_index(glucagon_seq.index(peptide_seq),
                                         length=len(peptide_seq))
        self._assert(built, peptide_seq, glucagon_seq)

    # a SequenceRange as the index combined with a length must raise
    with pytest.raises(ValueError):
        SequenceRange.from_index(SequenceRange(10), length=20)
def test_from_slices(self, glucagon_peptides, glucagon_seq):
    """Check SequenceRange.from_slice with (start, stop) ints and with a slice object."""
    slice_start, slice_stop = 5, 9

    from_ints = SequenceRange.from_slice(slice_start, slice_stop)
    from_slice_obj = SequenceRange.from_slice(slice(slice_start, slice_stop))
    reference = SequenceRange(self.pep_start, self.pep_stop)
    assert reference == from_ints == from_slice_obj

    # the slice of the range must cut the peptide out of the protein
    as_slice = from_ints.slice
    assert self.pep_seq == self.protein_seq[as_slice.start:from_ints.slice.stop]
    assert self.pep_seq == self.protein_seq[from_ints.slice]

    with pytest.raises(ValueError):
        # a slice has to have step=1 or None
        SequenceRange.from_slice(slice(2, 10, 2))
def test_bugs(self):
    """Regression tests: make sure previously fixed bugs do not recur."""
    # '53' is an abc.Sequence of length 2, so it used to be interpreted
    # much like ('5', '3'); int, str and bytes inputs must all agree.
    regression_cases = (
        ((53, '53', b'53'), (63, '63', b'63'), 53),
        ((153, '153', b'153'), (163, '163', b'163'), 153),
    )
    for start_variants, stop_variants, anchor in regression_cases:
        for start in start_variants:
            for stop in stop_variants:
                assert SequenceRange(start, stop, seq='A' * 11) == \
                    SequenceRange(anchor, seq='A' * 11)
def _iter_build_lpv(self, min_overlap):
    """
    Phase 1) walk the sorted peptides and yield merged ranges ("lpv").

    Each peptide is either skipped (it is contained in the current lpv),
    merged into the current lpv (the overlap is at least ``min_overlap``)
    or it closes the current lpv and starts a new one.
    Nothing is yielded when there are no peptides.
    """
    # sorted in reverse so the smallest peptide sits at the end of the list
    queue = sorted(self.peptides, reverse=True)
    if not queue:
        return

    current = queue.pop()
    # walking the rest backwards visits peptides in the same order as
    # repeatedly popping from the end would
    for candidate in reversed(queue):
        # Step 1) candidate is inside the current lpv - ignore it -
        if candidate in current:
            continue

        # Step 2) candidate overlaps enough - extend the current lpv -
        if current.stop - candidate.start + 1 >= min_overlap:
            current = SequenceRange(current.start, candidate.stop,
                                    full_sequence=self.protein_sequence)
        else:
            # no extension, not internal -> emit and start a new lpv
            yield current
            current = candidate

    yield current
def get_clusters(cls, h_cluster):
    """
    Map every cluster number to the SequenceRange it spans.

    Cluster numbers run from 1 to ``h_cluster.max()``; each range is built
    with ``SequenceRange.from_index`` from the first and the last index at
    which ``h_cluster`` equals that number.
    """
    cluster_numbers = range(1, h_cluster.max() + 1)
    members_by_number = (
        (number, np.where(h_cluster == number)[0]) for number in cluster_numbers
    )
    return {
        number: SequenceRange.from_index(members[0], members[-1])
        for number, members in members_by_number
    }
def get_bond_slice(cls, peptide):
    """
    Return the slice covering the bonds of ``peptide``.

    The range is shortened by one position at its stop before it is turned
    into a slice. Example from the original note: SequenceRange(10, 20) has
    a length of 11, but only 10 bonds, thus

        SequenceRange(10, 20).slice                  -> slice(9, 20)
        cls.get_bond_slice(SequenceRange(10, 20))    -> slice(9, 19)
    """
    bond_range = SequenceRange(peptide.start, peptide.stop - 1)
    return bond_range.slice
def make_histograms(cls, df, length, n_samples, ladder_window=5):
    """
    Build per-position intensity histograms from the rows of ``df``.

    For every ``(pep_var_id, peptide_series)`` pair from ``cls.iterrows(df)``
    the summed intensity divided by ``n_samples`` is added to:

    * the coverage histogram, over the whole slice of the peptide
    * the start / stop histograms, at the peptide's first / last index
    * the ac / am histograms, when ``mod_seq`` starts with '_(ac)' /
      ends with '_(am)'

    ``ladder_window`` is accepted but not used in this function.

    Returns the tuple (histogram, histogram_start, histogram_stop,
    histogram_ac, histogram_am, histogram_bonds, first, last), where
    histogram_bonds is min / max of each pair of neighbouring coverage
    values and first / last mark non-zero positions whose coverage equals
    the start / stop histogram.
    """
    coverage, starts, stops, acetylated, amidated = (
        np.zeros(length) for _ in range(5)
    )

    for pep_var_id, peptide_series in cls.iterrows(df):
        span = SequenceRange(pep_var_id.start, pep_var_id.stop)
        mean_intensity = peptide_series.sum() / n_samples

        if pep_var_id.mod_seq.startswith('_(ac)'):
            acetylated[span.start.index] += mean_intensity
        if pep_var_id.mod_seq.endswith('_(am)'):
            amidated[span.stop.index] += mean_intensity

        starts[span.start.index] += mean_intensity
        stops[span.stop.index] += mean_intensity
        coverage[span.slice] += mean_intensity

    # one bond between every pair of neighbouring positions
    neighbours = np.stack((coverage[1:], coverage[:-1]))
    with np.errstate(invalid='ignore'):  # 0 / 0 where nothing is covered
        bond_ratio = neighbours.min(axis=0) / neighbours.max(axis=0)

    covered = coverage != 0
    first = (coverage == starts) & covered
    last = (coverage == stops) & covered

    return (coverage, starts, stops, acetylated, amidated, bond_ratio,
            first, last)
def diversify_score(self, dampening=2.0):
    """
    This method modifies a series of scores (usually ppv predictions) and
    tries to make a tradeoff between high predictions and high similarity
    to previous predictions. The algorithm to achieve this is very simple:

    1) take the highest score
    2) downvote all who share amino acids with it by a factor of 'dampening'
    3) goto 1

    :param dampening: divisor applied to the scores of overlapping peptides
    :return: a float Series indexed like ``self.series`` holding the score
        each entry had at the moment it was picked
    """
    # create peptides[campaign][protein_id] = {SequenceRange(..), ..}
    # so we can quickly find overlapping peptides
    peptides = collections.defaultdict(lambda: collections.defaultdict(set))
    for campaign_id, entry_id, _score in self.series.peputils.iteritems(keep_campaign_id=True):
        sequence_range = SequenceRange(entry_id.start, entry_id.stop)
        peptides[campaign_id][entry_id.protein_id].add(sequence_range)

    # the algorithm
    scores = self.series.copy()
    # dtype is given explicitly: without it pandas >= 2.0 creates an
    # object-dtype Series here (and pandas 1.x emits a warning), whereas
    # float64 was the old default.
    new_score = pd.Series(index=self.series.index, dtype=float)
    while scores.shape[0] != 0:
        # add best score to new_score
        best_id = scores.idxmax()
        new_score[best_id] = scores[best_id]

        # penalize overlapping peptides
        overlapping_ids = self._get_overlaping_ids(*best_id, peptides)
        scores[overlapping_ids] /= dampening
        del scores[best_id]
    return new_score
def _iter_split_ptm(self, lpv_iter):
    """
    Phase 2) yield variants of every lpv, split at the positions flagged
    in ``self.h_ac`` / ``self.h_am``.

    For each lpv this yields one range per candidate start (keeping the
    lpv stop), one range per candidate stop (keeping the lpv start) and
    finally the lpv itself.

    Warning: the same range can be yielded more than once, so convert the
    result to a set!
    """
    for lpv in lpv_iter:
        # positions covered by the lpv, aligned with lpv.slice
        positions = np.arange(len(lpv)) + lpv.start.pos

        flagged_starts = positions[self.h_ac[lpv.slice] != 0]
        flagged_stops = positions[self.h_am[lpv.slice] != 0]
        # the lpv's own boundaries are always candidates
        candidate_starts = set(flagged_starts) | {lpv.start.pos}
        candidate_stops = set(flagged_stops) | {lpv.stop.pos}

        yield from (
            SequenceRange(begin, lpv.stop, full_sequence=self.protein_sequence)
            for begin in candidate_starts
        )
        yield from (
            SequenceRange(lpv.start, end, full_sequence=self.protein_sequence)
            for end in candidate_stops
        )
        yield lpv
def glucagon_known_peptides():
    """
    Load the known mouse glucagon peptides as a set of SequenceRange.

    Reads the tab separated file "glucagon/mouse_glucagon.known" below
    TEST_DATA, skips its header line and builds one range per remaining
    line from its start, stop and sequence columns (columns 2-4).
    """
    known_path = pjoin(TEST_DATA, "glucagon/mouse_glucagon.known")
    with open(known_path) as handle:
        handle.readline()  # skip header
        rows = [raw_line.rstrip().split('\t') for raw_line in handle]

    # start and stop are passed on as the strings read from the file
    return {
        SequenceRange(start, stop, seq=sequence)
        for _protein_id, start, stop, sequence, *_ in rows
    }
def get_valid_peptides(cls, valid_starts, valid_stops, protein_sequence):
    """
    Pair up valid starts and stops into peptides.

    ``valid_starts`` and ``valid_stops`` map a position to a value; a
    start and a stop are combined when the start is smaller than the stop
    and both map to the same value.

    :return: dict mapping each resulting SequenceRange to that shared value
    """
    return {
        SequenceRange(begin, end, full_sequence=protein_sequence): begin_value
        for begin, begin_value in valid_starts.items()
        for end, end_value in valid_stops.items()
        if begin < end and begin_value == end_value
    }
def get_known_peptides(cls, known_file: str) -> typing.Dict[str, set]:
    """
    Read known peptides from a tab separated file, grouped by protein id.

    The first line is skipped as a header. Every other line is unpacked
    into a ``Known`` record; only records whose type is 'peptide' or
    'propeptide' are kept, each as a SequenceRange built from the integer
    start / stop and the seq of the record.

    :param known_file: path of the file to read
    :return: plain dict of protein_id -> set of SequenceRange
    """
    wanted_types = ('peptide', 'propeptide')
    peptides_by_protein = {}

    with open(known_file) as handle:
        handle.readline()  # skip header
        for raw_line in handle:
            fields = raw_line.rstrip('\r\n').split('\t')
            record = Known(*fields)
            if record.type not in wanted_types:
                continue
            span = SequenceRange(int(record.start), int(record.stop),
                                 seq=record.seq)
            peptides_by_protein.setdefault(record.protein_id, set()).add(span)

    return peptides_by_protein
def make_sample_frequency_histogram(self, df):
    """
    Return, per position, the fraction of samples that cover it.

    A (self.length x number of columns in df) table of zeros is created.
    For every ``(pep_var_id, peptide_series)`` pair from
    ``self.iterrows(df)`` the rows of the peptide's slice are set to 1 in
    each column where the peptide has a non-NaN value. The row sums are
    finally divided by ``self.n_samples``.

    :raises ValueError: if df has more columns than ``self.n_samples``
    :return: numpy array of length ``self.length``
    """
    histogram_samples = pd.DataFrame(np.zeros((self.length, df.shape[1])),
                                     columns=df.columns)
    columns = histogram_samples.columns

    for pep_var_id, peptide_series in self.iterrows(df):
        rows = SequenceRange(pep_var_id.start, pep_var_id.stop).slice
        # Only the labels of the non-NaN entries are needed. The index is
        # iterated because Series.iteritems() was removed in pandas 2.0.
        for group in peptide_series.dropna().index:
            # One positional write; the chained form frame[col][rows] = 1
            # does not write through under pandas Copy-on-Write.
            histogram_samples.iloc[rows, columns.get_loc(group)] = 1

    if not (0 <= histogram_samples.shape[1] <= self.n_samples):
        raise ValueError(
            "max_samples, higher than the accual number of samples!!!")
    return histogram_samples.sum(axis=1).values / self.n_samples
def test_deprecation(self):
    """
    Accessing ``pos`` / ``index`` must not emit a DeprecationWarning, but
    accessing ``.start`` / ``.stop`` on either of them must.
    """
    import warnings

    sr = SequenceRange(1, 2)

    # pytest.warns(None) is deprecated since pytest 7 and raises in
    # pytest 8. Turning DeprecationWarning into an error makes the
    # "no deprecation here" expectation explicit.
    with warnings.catch_warnings():
        warnings.simplefilter("error", DeprecationWarning)
        sr.pos
        sr.index

    with pytest.warns(DeprecationWarning):
        sr.pos.start
    with pytest.warns(DeprecationWarning):
        sr.pos.stop
    with pytest.warns(DeprecationWarning):
        sr.index.start
    with pytest.warns(DeprecationWarning):
        sr.index.stop
def test___hash__(self):
    """Check that SequenceRange is hashable and behaves as expected in a set."""
    # hashing must simply not raise
    hash(SequenceRange(1, 2))

    seen = set()
    # ((start, stop), third argument, fourth argument) for self._assert_hash
    steps = (
        ((1, 1), 0, 1),
        ((1, 1), 1, 1),
        ((1, 2), 1, 2),
        ((2, 2), 2, 3),
        ((1, 2), 3, 3),
    )
    for (start, stop), third, fourth in steps:
        self._assert_hash(seen, SequenceRange(start, stop), third, fourth)
def test_immutability(self):
    """Check that the public attributes of a SequenceRange can not be assigned."""
    frozen = SequenceRange(1, 2)

    # (attribute, factory for the value we try to assign); the value is
    # built inside the ``raises`` block, right before the assignment
    attempts = (
        ("pos", lambda: (1, 2)),
        ("index", lambda: (1, 2)),
        ("slice", lambda: (1, 2)),
        ("start", lambda: SequencePoint(2)),
        ("stop", lambda: SequencePoint(2)),
    )
    for attribute, make_value in attempts:
        with pytest.raises(AttributeError):
            setattr(frozen, attribute, make_value())
def test___contains__(self):
    """Check membership of points and ranges in SequenceRange(5, 20)."""
    peptide = SequenceRange(5, 20)

    # points: both boundaries and the middle are inside
    for inside_pos in (5, 10, 20):
        self._in(SequencePoint, inside_pos, peptide)
    # points: one step outside either boundary
    for outside_pos in (4, 21):
        self._not_in(SequencePoint, outside_pos, peptide)

    # ranges fully covered by the peptide
    for inside_range in ((5, 10), (10, 15), (15, 20)):
        self._in(SequenceRange, inside_range, peptide)
    # ranges reaching beyond either boundary
    for outside_range in ((1, 5), (4, 11), (10, 21)):
        self._not_in(SequenceRange, outside_range, peptide)
def count_ladders(cls, position_counts, h_cluster, clusters, ladder_window=10):
    """
    Return, for each counted position, its share of the counts around it.

    The window around a position is position +/- ``ladder_window``,
    clipped to the boundaries of the cluster the position belongs to.
    The value stored at the position's index is

        count at the position / sum of all counts inside the window

    Example: 5 peptides stop at position 100 and 10 more stop within the
    window around it -> 5 / (10 + 5) = 0.3333..
    Thus close to 0 means loads of nearby start/stop positions, and 1
    means this is the only one. Indexes without a count stay 0.

    :param position_counts: mapping of position -> count
    :param h_cluster: array holding the cluster number of every index
    :param clusters: mapping of cluster number -> range of that cluster
    :param ladder_window: how far the window reaches in each direction
    :return: array with one entry per entry of ``h_cluster``
    """
    # NOTE(review): an older TODO here contrasted weighting every position
    # as 1 (1 / (1 + 5) = 1/6) with weighting by count (5 / (5 + 10) = 1/3,
    # "ideal"); the code below weights by count.
    n_positions = h_cluster.shape[0]

    # every count has to be in place before any window can be summed
    counts = np.zeros(n_positions)
    for position, count in position_counts.items():
        counts[position.index] = count

    h_ladder = np.zeros(n_positions)
    for position, count in position_counts.items():
        cluster = clusters[h_cluster[position.index]]
        # the ladder has to stay within the cluster boundaries
        window_start = max(cluster.start, position.pos - ladder_window)
        window_stop = min(cluster.stop, position.pos + ladder_window)
        window = SequenceRange(window_start, window_stop)
        h_ladder[position.index] = count / counts[window.slice].sum()

    return h_ladder
def test_can_not_create_a_sequence_from_range_if_start_and_stop_are_different(
        self):
    """A range with start != stop can not be converted to a SequencePoint."""
    with pytest.raises(TypeError):
        wide_range = SequenceRange(10, 12)
        assert SequencePoint(wide_range)
def test_can_create_a__point_from_range_if_start_and_stop_are_the_same(
        self):
    """A range built from a single position converts to the matching point."""
    converted = SequencePoint(SequenceRange(10))
    assert converted == SequencePoint(10)
def test_wierd_stuff(self):
    """A range shifted down by a plain int must still report containing 2."""
    shifted = SequenceRange(10, 20) - 15
    assert shifted.contains(2)
def test2b(self):
    """Adding a point to a single position range compares equal to the int 3."""
    total = SequenceRange(2) + SequencePoint(2)
    assert total == 3
def test_conversion(self):
    """SequenceRange accepts SequencePoint objects wherever it accepts ints."""
    # a single point gives a range with start == stop
    assert SequenceRange(1, 1) == SequenceRange(SequencePoint(1))

    # points and ints can be mixed freely for start and stop
    mixed_arguments = (
        (SequencePoint(1), SequencePoint(2)),
        (SequencePoint(1), 2),
        (1, SequencePoint(2)),
    )
    for start, stop in mixed_arguments:
        assert SequenceRange(1, 2) == SequenceRange(start, stop)