def pcr_sequences_for_amplicon(amplicon, padding_pos5=0, padding_pos3=0, include_snps=False):
    """
    Returns the PCRSequence objects representing the specified amplicon.

    One PCRSequence is built per entry of ``amplicon.cached_sequences``; each
    is made of the amplicon itself plus a prefix and a suffix taken from the
    cached padding, all expressed on the '+' strand.

    If include_snps=True, each object will include an attribute 'snps', which
    will include a list of SNP dictionaries (like the SNPDBCache object, only
    SQLAlchemy-agnostic). Each dictionary holds the non-underscore attributes
    of the SNP object, and the 'class_' value is additionally exposed under
    the key 'class'. When include_snps is False this function does not set
    the 'snps' attribute.

    :param amplicon: Object exposing a ``cached_sequences`` iterable.
    :param padding_pos5: The amount of padding to prefix the amplicon.
                         Max MAX_CACHE_PADDING.
    :param padding_pos3: The amount of padding to suffix the amplicon.
                         Max MAX_CACHE_PADDING.
    :param include_snps: Whether to include the snps attribute on each amplicon.
    :raises ValueError: If either padding is negative or above MAX_CACHE_PADDING.
    :return: List of PCRSequence objects, in ``cached_sequences`` order.
    """
    # The call form of raise is valid on both Python 2 and Python 3; the
    # "raise ValueError, msg" statement form is a syntax error on Python 3.
    for padding in (padding_pos5, padding_pos3):
        if padding < 0 or padding > MAX_CACHE_PADDING:
            raise ValueError("Illegal padding value: %s" % padding)

    pseqs = []
    for seq in amplicon.cached_sequences:
        main = SimpleGenomeSequence(seq.chromosome, seq.start_pos, seq.end_pos, '+', seq.positive_amplicon)

        # The returned padding could be of a different length than requested,
        # so the prefix/suffix coordinates are derived from the actual lengths.
        padding_pos5_seq = seq.padding_pos5(padding_pos5, '+')
        padding_pos3_seq = seq.padding_pos3(padding_pos3, '+')
        prefix = SimpleGenomeSequence(seq.chromosome,
                                      seq.start_pos - len(padding_pos5_seq),
                                      seq.start_pos - 1,
                                      '+',
                                      padding_pos5_seq)
        suffix = SimpleGenomeSequence(seq.chromosome,
                                      seq.end_pos + 1,
                                      seq.end_pos + len(padding_pos3_seq),
                                      '+',
                                      padding_pos3_seq)

        pseq = PCRSequence(main, prefix, suffix)
        if include_snps:
            snps = seq.snps_in_range(padding_pos5=len(padding_pos5_seq),
                                     padding_pos3=len(padding_pos3_seq))
            # Copy only the public attributes so the result carries no
            # underscore-prefixed (ORM/internal) state.
            pseq.snps = [dict((k, v) for k, v in snp.__dict__.items() if not k.startswith('_'))
                         for snp in snps]
            for snp in pseq.snps:
                # 'class_' is kept; 'class' is added as an alias.
                snp['class'] = snp['class_']
        pseqs.append(pseq)
    return pseqs
def sequences_snps_for_assay(config, assay, seq_source, snp_source, left_padding=0, right_padding=0, cache=True):
    """
    Return the PCR sequences for an assay, each carrying a ``snps`` attribute.

    Two paths exist:

    * If the assay has cached sequences and every one of them reports
      ``cached(left_padding, right_padding)``, the result is rebuilt from
      those cache rows and returned immediately; ``seq_source`` and
      ``snp_source`` are not consulted and nothing is written.
    * Otherwise the sequences are looked up through ``seq_source`` according
      to ``assay.assay_type`` (primer, location or SNP), SNPs are attached
      from ``snp_source.snps_in_range``, and, when ``cache`` is true,
      ``assay.cached_sequences`` is replaced with freshly built cache rows
      and the Session is committed.

    :param config: Passed through to ``dg_seq`` and ``tm_probe`` when caching.
    :param assay: The assay to resolve; a falsy value yields an empty list.
    :param seq_source: Provider of the sequence lookup methods.
    :param snp_source: Provider of ``snps_by_rsid`` and ``snps_in_range``.
    :param left_padding: Padding requested before the amplicon.
    :param right_padding: Padding requested after the amplicon.
    :param cache: Whether to store freshly looked-up results on the assay.
    :return: List of sequence objects (empty if the assay is falsy or its
             type matches none of the handled types).
    """
    if not assay:
        return []

    stored = assay.cached_sequences
    if stored and all(entry.cached(left_padding, right_padding) for entry in stored):
        from_cache = []
        for entry in stored:
            body = SimpleGenomeSequence(
                entry.chromosome, entry.start_pos, entry.end_pos, "+", entry.positive_amplicon
            )
            upstream = SimpleGenomeSequence(
                entry.chromosome,
                entry.start_pos - left_padding,
                entry.start_pos - 1,
                "+",
                entry.padding_pos5(left_padding, "+"),
            )
            downstream = SimpleGenomeSequence(
                entry.chromosome,
                entry.end_pos + 1,
                entry.end_pos + right_padding,
                "+",
                entry.padding_pos3(right_padding, "+"),
            )
            pcr_seq = PCRSequence(body, upstream, downstream)
            # TODO unify SNP object
            pcr_seq.snps = [
                dict((name, value) for name, value in row.__dict__.items() if not name.startswith("_"))
                for row in entry.snps
            ]
            for snp_fields in pcr_seq.snps:
                snp_fields["class"] = snp_fields["class_"]
            from_cache.append(pcr_seq)
        return from_cache

    # Cache miss (or partial cache): look everything up from the sources.
    found = []
    kind = assay.assay_type
    if kind == Assay.TYPE_PRIMER:
        found = seq_source.sequences_for_primers(
            assay.primer_fwd, assay.primer_rev, left_padding, right_padding
        )
    elif kind == Assay.TYPE_LOCATION:
        found.append(
            seq_source.sequence_around_loc(
                assay.chromosome, assay.probe_pos, assay.amplicon_width, left_padding, right_padding
            )
        )
    elif kind == Assay.TYPE_SNP:
        # TODO: make SNP object so that access style is same as assay
        for hit in snp_source.snps_by_rsid(assay.snp_rsid):
            if hit["refUCSC"] == "-":
                # deletion: the region starts at chromEnd
                region_start = hit["chromEnd"]
            else:
                region_start = hit["chromStart"] + 1
            found.append(
                seq_source.sequence_around_region(
                    hit["chrom"][3:],
                    region_start,
                    hit["chromEnd"],
                    assay.amplicon_width,
                    left_padding,
                    right_padding,
                )
            )

    for item in found:
        item.snps = snp_source.snps_in_range(item.chromosome, item.start, item.end)

    if cache:
        # TODO: library method? -- given PCR object, SNP dict? or unify display objects
        # to and from their DB representation?
        # (cache-row keyword, SNP dict key) pairs; only 'class_' differs from its key.
        snp_columns = (
            ("bin", "bin"),
            ("chrom", "chrom"),
            ("chromStart", "chromStart"),
            ("chromEnd", "chromEnd"),
            ("name", "name"),
            ("score", "score"),
            ("strand", "strand"),
            ("refNCBI", "refNCBI"),
            ("refUCSC", "refUCSC"),
            ("observed", "observed"),
            ("molType", "molType"),
            ("class_", "class"),
            ("valid", "valid"),
            ("avHet", "avHet"),
            ("avHetSE", "avHetSE"),
            ("func", "func"),
            ("locType", "locType"),
            ("weight", "weight"),
        )
        assay.cached_sequences = []
        for item in found:
            record = HG19AssayCache(
                chromosome=item.chromosome,
                start_pos=item.amplicon.start,
                end_pos=item.amplicon.end,
                seq_padding_pos5=left_padding,
                seq_padding_pos3=right_padding,
                positive_sequence=item.merged_positive_sequence.sequence,
            )
            record.amplicon_dg = dg_seq(config, record.positive_amplicon)
            record.amplicon_tm = tm_probe(config, record.positive_amplicon)
            for snp in item.snps:
                snp_kwargs = dict((column, snp[key]) for column, key in snp_columns)
                record.snps.append(SNP131AssayCache(**snp_kwargs))
            assay.cached_sequences.append(record)
        Session.commit()

    return found
def cut(self):
    """
    Build the restriction-site summary for one enzyme over one sequence.

    Inputs are read from ``self.form_result``: 'left_padding',
    'right_padding', 'enzyme', 'assay_id' and (when no assay id is given)
    'positive_sequence'. Only the first resolved sequence is analysed.

    For each key of the enzyme's cut data, the first entry supplies the
    original positive/negative strand sites and each following entry
    supplies the sites found with one named SNP applied. Sites are then
    labelled:

    * 'stable_re_site'     -- original sites not cancelled by any SNP
    * 'snp_re_site'        -- extra sites attributed to a SNP
    * 'snp_cancel_re_site' -- original sites attributed as lost to a SNP

    :return: dict holding one site count per cut-data key, plus
             'positive_cuts', 'negative_cuts', 're_width_pct' and, when the
             amplicon is uncut and flanked by positive-strand sites on both
             sides, 'fragment' (len, loff, roff, gc).
    """
    form = self.form_result
    left_padding = form['left_padding']
    right_padding = form['right_padding']
    enzyme = Session.query(Enzyme).get(form['enzyme'])
    cutseq = enzyme.cutseq

    # TODO change to single amplicon?
    if form['assay_id']:
        group = Session.query(SequenceGroup).get(form['assay_id'])
        grouped = pcr_sequences_snps_for_group(group, padding_pos5=left_padding, padding_pos3=right_padding)
        sequences = [pseq for _amp, pseqs in grouped for pseq in pseqs]
    else:
        typed = form['positive_sequence']
        manual = PCRSequence(SimpleGenomeSequence(0, 0, len(typed)-1, '+', full_sequence=typed))
        manual.snps = []
        sequences = [manual]

    # TODO: this is arbitrary
    target = sequences[0]
    location_cut_data = self.__enzyme_cut_locations(target, [enzyme])

    # TODO support multiple sequences, somehow.
    pos_seq = target.merged_positive_sequence
    total_width = len(pos_seq)
    re_width_pct = 100*float(len(cutseq))/total_width
    enzyme_cut_data = location_cut_data[form['enzyme']]

    positive_matches = []
    negative_matches = []
    return_dict = {}
    snp_by_name = dict((s['name'], s) for s in target.snps)

    def positive_window(start, end):
        # Coordinate window of a positive-strand site.
        return pos_seq.start+start-1, pos_seq.start+end

    def negative_window(start, end):
        # Coordinate window of a negative-strand site, measured from the sequence end.
        return pos_seq.end-(end+1), pos_seq.end-start

    def snp_edge_inside(snp, low, high):
        # True when chromEnd or chromStart of the SNP lies within [low, high].
        return (low <= snp['chromEnd'] <= high) or (low <= snp['chromStart'] <= high)

    def snp_covers(snp, low, high):
        # True when [low, high] lies entirely between chromStart and chromEnd.
        return low >= snp['chromStart'] and high <= snp['chromEnd']

    def add_gained(shifted_sites, window, snp, gained):
        # Record every shifted site whose window contains a SNP edge.
        for start, end, strand in shifted_sites:
            low, high = window(start, end)
            if snp_edge_inside(snp, low, high):
                gained.append((start, end, strand))

    def add_cancelled(original_sites, window, snp, cancelled):
        # Record (once) every original site that touches or sits inside the SNP.
        for start, end, strand in original_sites:
            low, high = window(start, end)
            if not (snp_edge_inside(snp, low, high) or snp_covers(snp, low, high)):
                continue
            site = (start, end, strand)
            if site not in cancelled:
                cancelled.append(site)

    # keys here are going to be amplicon_cuts, left_cuts and right_cuts, left_cut
    for k, v in sorted(enzyme_cut_data.items()):
        _blank, original_positives, original_negatives = v[0]
        snp_positives, snp_negatives = [], []
        cancel_positives, cancel_negatives = [], []

        for snp_name, shifted_positives, shifted_negatives in v[1:]:
            snp = snp_by_name[snp_name]
            # TODO: this is an oversimplification but will probably only result
            # in a shift in a particular restriction site
            #
            # TODO: I think this is sketchy right at the edges, needs to be tested. (> vs >=, etc)
            # A site that cannot be attributed to the SNP is silently ignored.
            if len(shifted_positives) > len(original_positives):
                add_gained(shifted_positives, positive_window, snp, snp_positives)
            elif len(shifted_positives) < len(original_positives):
                add_cancelled(original_positives, positive_window, snp, cancel_positives)

            if len(shifted_negatives) > len(original_negatives):
                # find where the new snp is (an insertion can leave nothing matched here)
                add_gained(shifted_negatives, negative_window, snp, snp_negatives)
            elif len(shifted_negatives) < len(original_negatives):
                add_cancelled(original_negatives, negative_window, snp, cancel_negatives)

        # Cancelled sites are reported separately, so take them out of the stable lists.
        for site in cancel_positives:
            original_positives.remove(site)
        for site in cancel_negatives:
            original_negatives.remove(site)

        return_dict[k] = sum(len(sites) for sites in (
            original_positives, cancel_positives,
            original_negatives, cancel_negatives,
            snp_positives, snp_negatives))

        labelled_positives = (
            (original_positives, 'stable_re_site'),
            (snp_positives, 'snp_re_site'),
            (cancel_positives, 'snp_cancel_re_site'),
        )
        for sites, css_class in labelled_positives:
            for start, end, strand in sites:
                positive_matches.append({'offset': start,
                                         'pos': '%s%%' % (start*100.0/total_width),
                                         'class': css_class})

        labelled_negatives = (
            (original_negatives, 'stable_re_site'),
            (snp_negatives, 'snp_re_site'),
            (cancel_negatives, 'snp_cancel_re_site'),
        )
        for sites, css_class in labelled_negatives:
            for start, end, strand in sites:
                negative_matches.append({'offset': start,
                                         'pos': '%s%%' % (100-(start*100.0/total_width)-re_width_pct),
                                         'class': css_class})

    return_dict['positive_cuts'] = positive_matches
    return_dict['negative_cuts'] = negative_matches
    return_dict['re_width_pct'] = "%s%%" % re_width_pct

    # figure out amplicon position
    amplicon_start = left_padding
    amplicon_end = len(pos_seq) - (right_padding+1)

    left_offsets = [match['offset'] for match in positive_matches
                    if match['offset'] < (amplicon_start - len(cutseq))]
    rightmost_left = max(left_offsets)+len(cutseq) if left_offsets else None

    right_offsets = [match['offset'] for match in positive_matches
                     if match['offset'] > amplicon_end]
    leftmost_right = min(right_offsets) if right_offsets else None

    amplicon_cuts = [match for match in positive_matches
                     if amplicon_start <= match['offset'] <= amplicon_end]

    # todo: bug if the cutters are asymmetric and the negative cutsite is shorter (redmine 669)
    flanked = rightmost_left is not None and leftmost_right is not None
    if flanked and not amplicon_cuts:
        inner_seq = pos_seq.sequence[rightmost_left:leftmost_right]
        inner_gc = gc_content(inner_seq)
        return_dict['fragment'] = {'len': leftmost_right - rightmost_left,
                                   'loff': amplicon_start - rightmost_left,
                                   'roff': leftmost_right - amplicon_end,
                                   'gc': "%.2f%%" % (inner_gc*100)}

    return return_dict