def construct_design(doc, root, target_design):
    """Annotate ``root`` with one SequenceAnnotation per part, in design order.

    Args:
        doc: SBOL document; ``doc.components`` is indexed by part URI and
            ``doc.annotations`` / ``doc.sequences`` are only measured with
            ``len`` to number newly created objects.
        root: component that receives the annotations; its ``uri`` prefixes
            every new annotation URI.
        target_design: iterable of part URIs, upstream to downstream.

    Returns:
        ``root``. When ``target_design`` is empty, ``root`` is returned
        untouched (previously this raised ``IndexError``).
    """
    n_annotations = len(doc.annotations)
    n_sequences = len(doc.sequences)

    sbol_parts = []
    for uri in target_design:
        part = doc.components[uri]

        n_annotations += 1
        annotation = sbol.SequenceAnnotation(
            doc, '%s/SA_%d' % (root.uri, n_annotations))

        # Parts without a sequence get a one-base placeholder so that the
        # annotation still has a non-zero extent.
        if not part.sequence:
            part.sequence = sbol.DNASequence(
                doc, '%s/Seq_%d' % (part.uri, n_sequences + 1))
            part.sequence.nucleotides = 'n'

        # Coordinates are relative to the part itself here; the calls below
        # are what place the annotation on the root.
        annotation.start = 1
        annotation.end = len(part.sequence.nucleotides)
        annotation.orientation = '+'
        sbol_parts.append(annotation)
        annotation.subcomponent = part

    if not sbol_parts:
        # Nothing to assemble; avoid indexing an empty list below.
        return root

    root.annotations.append(sbol_parts[0])
    # Chain every annotation after its upstream neighbour.
    for upstream_ann, downstream_ann in zip(sbol_parts, sbol_parts[1:]):
        insert_annotation_downstream(root, upstream_ann, downstream_ann)
    assemble_subcomponents(root)
    return root
def testAnnotations(self):
    """Check that annotations added to the first testee are counted and found."""
    for expected_count in range(NUM_SLOW_TESTS):
        testee = self.testees[0]
        self.assertEqual(len(testee.annotations), expected_count)

        # Register the URI before creating the object, as elsewhere in
        # this test suite.
        new_uri = random_uri()
        self.uris.append(new_uri)
        ann = sbol.SequenceAnnotation(self.doc, new_uri)

        self.assertNotIn(ann, testee.annotations)
        testee.annotations += ann
        self.assertIn(ann, testee.annotations)
def makeAnnot(f, parent=dc):
    """Create a DNAComponent plus SequenceAnnotation for feature ``f``.

    The annotation is appended to ``parent.annotations`` and the shared
    counter ``fid[0]`` is advanced. On a circular molecule
    (``self.shape == 'c'``), a feature attached to the top-level component
    whose end exceeds ``l`` is emitted as two annotations, one on each side
    of the origin.

    Returns the new DNAComponent, or None when the feature was split.
    Note that a split leaves ``f.start`` / ``f.end`` modified.
    """
    crosses_origin = self.shape == 'c' and f.end > l and parent == dc
    if crosses_origin:
        original_end = f.end
        # First piece: from the feature start up to ``l``.
        f.end = l
        makeAnnot(f, parent)
        # Second piece: the remainder, restarted at position 0.
        f.end = original_end - l
        f.start = 0
        makeAnnot(f, parent)
        return None

    tag = str(fid[0])

    dcf = sbol.DNAComponent(doc, "#dc_" + tag)
    dcf.display_id = str(f.type)
    qualifier_pairs = ["%s:%s" % (q.name, q.data) for q in f.qualifiers.all()]
    dcf.description = str(";".join(qualifier_pairs))

    sa = sbol.SequenceAnnotation(doc, "#sa_" + tag)
    sa.subcomponent = dcf
    sa.strand = '+' if f.direction == 'f' else '-'
    # SBOL 1-based
    sa.start = f.start + 1
    sa.end = f.end + 1

    parent.annotations.append(sa)
    fid[0] += 1
    return dcf
def _qc_covered_span(query_seq):
    """Return (first, last) alignment columns holding a query base, or None."""
    covered = [i for i, token in enumerate(query_seq) if token != '-']
    if not covered:
        return None
    return covered[0], covered[-1]


def _qc_reference_map(reference_seq, first_col, last_col):
    """Map 1-based reference coordinates to covered alignment columns."""
    ref_map = {}
    i_ref = 0
    for i_alignment, ref_base in enumerate(reference_seq):
        if ref_base == '-':
            # A gap in the reference consumes no reference coordinate.
            continue
        i_ref += 1
        # Do not map design coordinates that the query does not cover.
        if first_col <= i_alignment <= last_col:
            ref_map[i_ref] = i_alignment
    return ref_map


def _qc_runs(terms):
    """Collapse ``terms`` into [first_index, last_index, term] runs of equal terms."""
    runs = []
    for index, term in enumerate(terms):
        if runs and runs[-1][2] == term:
            runs[-1][1] = index
        else:
            runs.append([index, index, term])
    return runs


def qc(design, data=None, infile=None):
    """Align sequencing data to ``design`` and annotate matches and errors.

    The sequencing reads (one FASTA record, or several that are first
    collapsed into a consensus) are aligned against the design sequence.
    For every annotation found on the design, the covered bases are
    classified with ``verify_base`` and each run of identically classified
    bases is recorded as a new SequenceAnnotation on that annotation's
    subcomponent. Progress is printed to stdout.

    Args:
        design: component exposing ``uri``, ``display_id``,
            ``sequence.nucleotides`` and ``doc``.
        data: FASTA text with the sequencing result(s).
        infile: path to a FASTA file; when given, its content replaces
            ``data``.

    Returns:
        None. The design's document is modified in place.

    Raises:
        ValueError: if neither ``data`` nor ``infile`` supplies any text
            (previously this surfaced as an unbound ``clone`` variable).
    """
    if infile:
        with open(infile, "r") as f:
            data = f.read()
    if not data:
        raise ValueError("qc() requires sequencing data via 'data' or 'infile'")

    if len(parse_fasta(data)) > 1:
        multialignment = align(data)
        clone = find_consensus(multialignment)
    else:
        clone = data

    target_design = write_to_fasta([(design.uri, design.sequence.nucleotides)])
    alignment_qc = align(target_design + '\r\n' + clone,
                         outfile='%s.align' % design.display_id)

    # Scan alignment and classify mutations
    design_seq = design.sequence.nucleotides
    aligned_records = parse_fasta(alignment_qc)
    reference_seq = aligned_records[0][1][:]
    query_seq = aligned_records[1][1][:]
    assert len(reference_seq) == len(query_seq)

    # If the design sequence is not fully covered by sequencing data, there
    # may be '---' padding at either end of the query sequence, e.g.
    #   ref  actggtca
    #   qry  --tggt--
    # Only the columns between the first and last query base (inclusive)
    # count as covered.
    span = _qc_covered_span(query_seq)
    if span is None:
        # The query contributes no bases at all: nothing can be verified.
        return
    i_left, i_right = span

    # Translate reference coordinates (indexed from one) into alignment
    # coordinates, for covered columns only.
    ref_map = _qc_reference_map(reference_seq, i_left, i_right)

    # Collect each annotation on the design once, in first-seen order.
    # NOTE(review): i_design runs 0..len-1 while annotation start/end are
    # compared against one-based coordinates below -- confirm which
    # convention getSequenceAnnotationsAtBaseNo expects.
    leaf_annotations = []
    for i_design in range(len(design_seq)):
        for ann in getSequenceAnnotationsAtBaseNo(design, i_design):
            if ann not in leaf_annotations:
                leaf_annotations.append(ann)

    doc = design.doc

    # Slice the alignment into segments that pertain to each annotation,
    # then determine the covered bases in the annotation. All, part, or
    # none of an annotation may be covered.
    for ann in leaf_annotations:
        # Covered reference coordinates that fall inside this annotation,
        # sorted so that list positions follow sequence order.
        covered_coordinates = sorted(
            x for x in ref_map if ann.start <= x <= ann.end)
        if not covered_coordinates:
            continue

        # Now translate into alignment coordinates
        alignment_coordinates = [ref_map[x] for x in covered_coordinates]
        alignment_start = min(alignment_coordinates)
        alignment_end = max(alignment_coordinates)

        print("Verifying %s from %d to %d" % (ann.subcomponent.display_id,
                                              ann.start, ann.end))
        # NOTE: the slice end is exclusive, so the final covered column is
        # not shown in these two diagnostic lines.
        print(''.join(reference_seq[alignment_start:alignment_end]))
        print(''.join(query_seq[alignment_start:alignment_end]))

        # Classification of alignment, one term per covered base.
        base_comparisons = []
        for x in alignment_coordinates:
            comparison = verify_base(reference_seq[x], query_seq[x])
            if comparison is None:
                print(x, reference_seq[x], query_seq[x])
            base_comparisons.append(comparison)

        # Each run of equal classifications becomes one region of interest,
        # including single-base runs and a differing final base.
        regions = [
            ((covered_coordinates[first], covered_coordinates[last]), term)
            for first, last, term in _qc_runs(base_comparisons)
        ]

        # Create SequenceAnnotations for QC'd regions
        for i_region, region in enumerate(regions):
            print(i_region)
            qc_start, qc_end = region[0]
            qc_classification = region[1]
            if not qc_classification:
                # Unclassified bases produce no annotation.
                continue

            n_components = len(doc.components)
            n_annotations = len(doc.annotations)
            if qc_classification == SO_NUCLEOTIDE_MATCH:
                # The reference sequence matches the query sequence
                category = "MatchedSequence"
            else:
                # A mismatch was identified
                category = "AssemblyErrors"

            region_uri = "%s/%s/SA%d" % (design.uri, category, n_annotations)
            annotated_region = sbol.SequenceAnnotation(doc, region_uri)
            annotated_region.start = qc_start
            annotated_region.end = qc_end
            annotated_region.subcomponent = sbol.DNAComponent(
                doc, "%s/DC%d" % (region_uri, n_components))
            annotated_region.subcomponent.display_id = ""
            annotated_region.subcomponent.type = qc_classification

            print("Adding %s to %s from %d to %d" %
                  (annotated_region.uri, ann.subcomponent.display_id,
                   annotated_region.start, annotated_region.end))
            ann.subcomponent.annotations.append(annotated_region)
def createTestees(self):
    """Create one SequenceAnnotation under a fresh random URI and track both."""
    new_uri = random_uri()
    self.uris.append(new_uri)
    annotation = sbol.SequenceAnnotation(self.doc, new_uri)
    self.testees.append(annotation)