def _get_nucseq(feature, seq_record, extension):
    """Extract a feature plus up to ``extension`` flanking bases on each side.

    The flanks are clipped at the boundaries of ``seq_record``.  The returned
    ``Extended_CDS`` carries the feature id, the extended nucleotide string
    (oriented along the feature's strand), the offsets ``cds_start`` and
    ``cds_end`` of the original feature inside that string, the forward-strand
    window coordinates ``start``/``end`` and the strand.

    Raises:
        AssertionError: if slicing the extended sequence with the computed
            offsets does not reproduce ``feature.extract(seq_record)``.
    """
    # Read the strand from the location everywhere; the original mixed
    # ``feature.strand`` and ``feature.location.strand``.
    strand = feature.location.strand

    start = feature.location.start - extension
    cds_start = extension
    cds_end = cds_start + len(feature.location)
    if start < 0:
        # The left flank runs off the record: clip the window to 0 and move
        # the CDS offsets left by the number of missing bases.
        overhang = 0 - start
        start = start + overhang
        cds_start = cds_start - overhang
        cds_end = cds_end - overhang

    end = feature.location.end + extension
    if end > len(seq_record):
        # The right flank runs off the record: clip only the window end.
        end = end - (end - len(seq_record))

    if strand == -1:
        # The extracted sequence is reverse-complemented, so the offsets are
        # mirrored within the window.
        length = end - start
        cds_start, cds_end = length - cds_end, length - cds_start

    extended_location = FeatureLocation(start=start, end=end, strand=strand)
    nucseq = extended_location.extract(seq_record)
    extended = str(nucseq.seq)

    # Sanity check: the offsets must cut the original feature back out.
    assert extended[cds_start:cds_end] == str(feature.extract(seq_record).seq)
    return Extended_CDS(feature.id, extended, int(cds_start), int(cds_end),
                        int(start), int(end), int(strand))
def get_aa_translation_from_location(
        self, location: FeatureLocation,
        transl_table: Union[str, int] = None) -> Seq:
    """ Obtain the translation for a feature based on its location

        Arguments:
            location: the location to extract from ``self.seq``
            transl_table: the translation table to use; when None,
                          ``self._transl_table`` is used

        Returns:
            a Seq of the translation, with '*', 'B', 'J', 'O', 'U' and 'Z'
            replaced by 'X' and any gap characters removed

        Raises:
            ValueError: if the location ends beyond the available sequence
    """
    if location.end > len(self.seq):
        raise ValueError("location outside available sequence")
    if transl_table is None:
        transl_table = self._transl_table

    extracted = location.extract(self.seq).ungap('-')
    # trim any trailing partial codon so only whole codons are translated
    remainder = len(extracted) % 3
    if remainder != 0:
        extracted = extracted[:-remainder]

    seq = extracted.translate(to_stop=True, table=transl_table)
    if not seq:
        # go past stop codons and hope for something to work with
        seq = extracted.translate(table=transl_table)

    # replace ambiguous proteins with an explicit unknown
    string_version = str(seq)
    for invalid in "*BJOUZ":
        string_version = string_version.replace(invalid, "X")
    # Strip gaps from the string that is actually returned; the original
    # stripped them from a temporary Seq that was then discarded.
    string_version = string_version.replace("-", "")
    return Seq(string_version)
def test_reference_in_location_record(self):
    """Test location with reference to another record."""
    parent_record = SeqRecord.SeqRecord(seq=Seq.Seq("actg"))
    another_record = SeqRecord.SeqRecord(seq=Seq.Seq("gtcagctac"))
    location = FeatureLocation(5, 8, ref="ANOTHER.7")

    # No references mapping supplied at all.
    with self.assertRaisesRegex(
        ValueError,
        r"Feature references another sequence \(ANOTHER\.7\), references mandatory",
    ):
        location.extract(parent_record)

    # A mapping is supplied but lacks the referenced id.
    with self.assertRaisesRegex(
        ValueError,
        r"Feature references another sequence \(ANOTHER\.7\), not found in references",
    ):
        location.extract(parent_record, references={"SOMEOTHER.2": another_record})

    # Compare the extracted record's sequence, as the compound-location
    # record test does, instead of comparing a SeqRecord with a str.
    self.assertEqual(
        location.extract(
            parent_record, references={"ANOTHER.7": another_record}
        ).seq,
        "cta",
    )
def test_reference_in_location_sequence(self):
    """A location carrying ``ref`` is extracted from the referenced Seq."""
    parent = Seq.Seq("actg")
    refs = {"ANOTHER.7": Seq.Seq("gtcagctac")}
    loc = FeatureLocation(5, 8, ref="ANOTHER.7")

    # Positions 5..8 only exist in the referenced sequence, not in the parent.
    extracted = loc.extract(parent, references=refs)
    self.assertEqual(extracted, "cta")
def createSeqFromTblastn(subject_fna, sseq_seq_faa, exonerate_target_id, start_match, end_match):
    """
    Use the result from the tBlastn to extract a region from the subject genome.

    The region extracted corresponds to the match region and 10kb before and
    10kb after, clipped to the bounds of the contig.  The region is always
    taken from the forward strand, whatever the orientation of the match.
    Nothing is done if the output file already exists.

    Parameters
    ----------
    subject_fna: str
        path to subject fasta sequence (genome)
    sseq_seq_faa: str
        path to output fasta sequence
    exonerate_target_id: str
        ID of the contig/scaffold/chromosome where a match has been found
    start_match: int
        start of the match
    end_match: int
        end of the match

    Raises
    ------
    IndexError
        if no record with ID ``exonerate_target_id`` is present in
        ``subject_fna``
    """
    if os.path.exists(sseq_seq_faa):
        return

    flank = 10000

    with open(subject_fna, "r") as fna:
        # Stop parsing at the first matching record instead of loading every
        # match into a list.
        sseq_seq = next(
            (rec for rec in SeqIO.parse(fna, "fasta") if rec.id == exonerate_target_id),
            None,
        )
    if sseq_seq is None:
        # Same exception type the original ``[...][0]`` lookup raised.
        raise IndexError(
            "sequence '{}' not found in {}".format(exonerate_target_id, subject_fna)
        )

    sseq_seq.description = "tblastn identified sequence"
    # The ID records the match coordinates as given, before any flanking.
    sseq_seq.id = exonerate_target_id + "_" + str(start_match) + "_" + str(end_match)

    # Order the coordinates so reverse matches (start > end) and zero-length
    # matches (start == end, previously left unextracted) share one code path.
    low, high = sorted((start_match, end_match))
    low = max(low - flank, 0)
    high = min(high + flank, len(sseq_seq.seq))

    sseq_seq.seq = FeatureLocation(low, high).extract(sseq_seq.seq)
    SeqIO.write(sseq_seq, sseq_seq_faa, "fasta")
def test_reference_in_compound_location_sequence(self):
    """A compound location may mix parent parts with referenced parts."""
    parent = Seq.Seq("aaccaaccaaccaaccaa")
    refs = {"ANOTHER.7": Seq.Seq("ttggttggttggttggtt")}

    local_part = FeatureLocation(2, 6)
    foreign_part = FeatureLocation(5, 8, ref="ANOTHER.7")
    loc = local_part + foreign_part

    # "ccaa" comes from the parent, "tgg" from the referenced sequence.
    extracted = loc.extract(parent, references=refs)
    self.assertEqual(extracted, "ccaatgg")
def __init__(self, logger, sequences, reference, dateFormat):
    """Build the sequence set from parsed JSON.

    ``sequences`` maps a name to a dict with ``"seq"`` and ``"attributes"``
    entries; ``reference`` supplies ``"included"``, ``"strain"``, ``"seq"``
    and optionally ``"genes"`` (name -> dict with ``"start"``, ``"end"``
    and ``"strand"``).  ``dateFormat`` is handed to ``parse_date``.
    """
    super(sequence_set, self).__init__()
    self.log = logger

    # One SeqRecord per entry; raw dates are expanded into num_date / date.
    self.seqs = {}
    for name, data in sequences.iteritems():
        record = self.seqs[name] = SeqRecord(
            Seq(data["seq"], generic_dna), id=name, name=name, description=name)
        record.attributes = data["attributes"]
        parsed_date = parse_date(record.attributes["raw_date"], dateFormat)
        record.attributes["num_date"] = parsed_date[1]
        record.attributes["date"] = parsed_date[2]

    # The reference is not added to self.seqs here (if it is to be analysed
    # it is already among the sequences), but it is kept for alignment.
    self.reference_in_dataset = reference["included"]
    name = reference["strain"]
    self.reference_seq = SeqRecord(
        Seq(reference["seq"], generic_dna), id=name, name=name, description=name)

    if "genes" not in reference or not len(reference["genes"]):
        self.proteins = None
    else:
        self.proteins = {}
        for gene_name, coords in reference["genes"].iteritems():
            gene_start, gene_end, gene_strand = coords["start"], coords["end"], coords["strand"]
            feature = FeatureLocation(start=gene_start, end=gene_end, strand=gene_strand)
            # Translate to find out whether the gene ends with a stop codon.
            translation = Seq(feature.extract(str(self.reference_seq.seq))).translate()
            if translation.endswith("*"):
                # Drop the final codon so the stored location omits the stop.
                feature = FeatureLocation(start=gene_start, end=gene_end - 3, strand=gene_strand)
            self.proteins[gene_name] = feature

    # other things:
    timestamp = time.strftime('%Y%m%d-%H%M%S', time.gmtime())
    suffix = str(random.randint(0, 1000000))
    self.run_dir = '_'.join(['temp', timestamp, suffix])
    self.nthreads = 2  # should load from config file
def test_reference_in_compound_location_record(self):
    """Compound locations referencing another record need ``references``."""
    host = SeqRecord.SeqRecord(Seq.Seq("aaccaaccaaccaaccaa"))
    referenced = SeqRecord.SeqRecord(Seq.Seq("ttggttggttggttggtt"))
    loc = FeatureLocation(2, 6) + FeatureLocation(5, 8, ref="ANOTHER.7")

    # Without any references mapping the extraction must be refused.
    no_refs_msg = r"Feature references another sequence \(ANOTHER\.7\), references mandatory"
    with self.assertRaisesRegex(ValueError, no_refs_msg):
        loc.extract(host)

    # A mapping that lacks the referenced id is refused as well.
    wrong_refs_msg = r"Feature references another sequence \(ANOTHER\.7\), not found in references"
    with self.assertRaisesRegex(ValueError, wrong_refs_msg):
        loc.extract(host, references={"SOMEOTHER.2": referenced})

    # With the right mapping both parts are joined.
    extracted = loc.extract(host, references={"ANOTHER.7": referenced})
    self.assertEqual(extracted.seq, "ccaatgg")
def extractUpstream(r, f, leftmost=200):
    """Return the region immediately upstream of feature ``f`` in record ``r``.

    Parameters
    ----------
    r : record the feature belongs to; must support ``len()`` and
        ``FeatureLocation.extract`` and yield an object with ``id``, ``name``
        and ``description`` attributes (presumably a SeqRecord -- confirm
        against callers)
    f : feature with a ``location`` and a ``locus_tag`` qualifier
    leftmost : maximum number of upstream bases to extract (default 200)

    Returns
    -------
    The extracted upstream region, with its ``id`` set to the feature's
    locus tag and ``name``/``description`` blanked.  The region is shorter
    than ``leftmost`` when the feature lies near an end of the record.
    """
    from Bio.SeqFeature import FeatureLocation

    location = f.location
    start, end, strand = location.start, location.end, location.strand
    if strand == 1:
        # Clamp at 0: a negative start would be read as an index from the
        # end of the sequence when slicing.
        start_, end_ = max(start - leftmost, 0), start
    else:
        # Locations are 0-based and end-exclusive, so the first upstream base
        # of a reverse-strand feature is at index ``end`` (the original
        # ``end + 1`` skipped it).
        start_, end_ = end, min(end + leftmost, len(r))
    fl = FeatureLocation(start_, end_, strand)
    upstream = fl.extract(r)
    upstream.id = f.qualifiers['locus_tag'][0]
    upstream.name, upstream.description = '', ''
    return upstream
def find_cds():
    """Check the CDS annotated in the current record's description.

    Reads the module-level ``record_dict`` and ``keys``.  The description is
    split on ``|``; the first field starting with ``CDS`` is expected to look
    like ``CDS:<start>-<end>`` with 1-based inclusive coordinates.

    Returns:
        1 if the translated CDS contains a stop codon ("*"), otherwise 0.
        0 is also returned when no CDS field is present.
    """
    # NOTE(review): the original used ``else return ...`` which is not valid
    # Python; the control flow below is the closest valid reading -- confirm
    # that a missing CDS and a stop-free CDS should both yield 0.
    seq_des = str(record_dict[keys].description).split("|")
    for i in seq_des:
        if re.match("CDS", i):
            feature, cds_start, cds_end = re.split(":|-", i)
            # convert the 1-based inclusive start to a 0-based location
            f = FeatureLocation(int(cds_start) - 1, int(cds_end))
            cds_sequence = f.extract(record_dict[keys].seq)
            protein_sequence = cds_sequence.translate()
            if "*" not in protein_sequence:
                return 0
            else:
                return 1
    return 0
def find_cds():
    """Classify the CDS annotated in the current record's description.

    Reads the module-level ``record_dict`` and ``keys``.  The description is
    split on ``|``; the first field starting with ``CDS:`` is expected to
    look like ``CDS:<start>-<end>`` with 1-based inclusive coordinates.  The
    record id is printed when such a field is found.

    Returns:
        3 if the translated CDS contains no stop codon ("*"),
        1 if it contains one,
        0 if no field starts with ``CDS:``.
    """
    seq_des = str(record_dict[keys].description).split("|")
    for des in seq_des:
        if re.match("CDS:", des) is None:
            continue
        print(record_dict[keys].id)
        feature, cds_start, cds_end = re.split(":|-", des)
        # convert the 1-based inclusive start to a 0-based location
        f = FeatureLocation(int(cds_start) - 1, int(cds_end))
        cds_sequence = f.extract(record_dict[keys].seq)
        protein_sequence = cds_sequence.translate()
        if "*" not in protein_sequence:
            return 3
        return 1
    # Also reached when "CDS:" occurs only in the middle of a field; the
    # original fell through and returned None in that case.
    return 0
def get_stop(
    cdss: List[GFF3Record],
    seq: SeqRecord,
    strand: Strand,
) -> Tuple[str, int, int]:
    """Return the terminal codon of a CDS list and its coordinates.

    For ``Strand.MINUS`` the codon is the first three bases of the first CDS,
    extracted on the reverse strand; otherwise it is the last three bases of
    the last CDS on the forward strand.

    Returns a ``(codon, start, end)`` tuple.
    """
    assert len(cdss) > 0

    if strand == Strand.MINUS:
        start = cdss[0].start
        end, orientation = start + 3, -1
    else:
        end = cdss[-1].end
        start, orientation = end - 3, +1

    codon = FeatureLocation(start, end, orientation).extract(seq)
    return str(codon.seq), start, end
def get_aa_translation_from_location(
        self, location: FeatureLocation,
        transl_table: Union[str, int] = None) -> Seq:
    """ Translate the nucleotides covered by a location.

        Uses ``self._transl_table`` when no table is given.  A trailing
        partial codon is dropped, translation stops at the first stop codon
        unless that leaves nothing, and any remaining '*' becomes 'X' while
        '-' is removed.
    """
    table = self._transl_table if transl_table is None else transl_table

    nucleotides = location.extract(self.seq).ungap('-')
    overhang = len(nucleotides) % 3
    if overhang != 0:
        nucleotides = nucleotides[:-overhang]

    protein = nucleotides.translate(to_stop=True, table=table)
    if not protein:
        # go past stop codons and hope for something to work with
        protein = nucleotides.translate(table=table)

    # Rebuild the Seq only when a character actually needs replacing.
    for unwanted, substitute in (("*", "X"), ("-", "")):
        if unwanted in str(protein):
            protein = Seq(str(protein).replace(unwanted, substitute),
                          Bio.Alphabet.generic_protein)
    return protein
def getgenefromgbk(gbkfile, location):  # change to work with locations
    """Parse a gene sequence from a gbk file using the gene location.

    parameters
    ----------
    gbkfile
        string, path to gbk file + file
    location
        comma-separated string "scaffold,start,end,strand"; start may carry
        a leading "<" and end a trailing ">" to mark fuzzy boundaries
    returns
    ----------
    ret = DNA sequence of the housekeeping gene extracted at the location
          from the record whose name (up to the first ".") ends in the
          scaffold number; "" when no record matches
    abs_loc = validation, contains the location of HG on specific scaffold.
              [scaffold, start, end]
    """
    # NOTE(review): the source reached review with its line structure
    # flattened; this layout assumes the coordinate clean-up and the return
    # sit after the record loop -- confirm against the original file.
    scaff_number, start, end, strand = location.split(",")
    scaff_number = int(scaff_number)

    # Making the FeatureLocation
    if "<" in start:
        f_start = BeforePosition(start.strip("<"))
    else:
        f_start = ExactPosition(start)
    if ">" in end:
        f_end = AfterPosition(end.strip(">"))
    else:
        f_end = ExactPosition(end)
    f = FeatureLocation(f_start, f_end, int(strand))

    ret = ""
    for record in SeqIO.parse(gbkfile, "genbank"):
        record_no = record.name.split(".")[0]
        # the last three characters of the name are the scaffold number
        if int(record_no[-3:]) == scaff_number:
            ret = f.extract(record.seq)  # The DNA sequence of the housekeepinggene

    # VALIDATION: plain integer coordinates without the fuzzy markers
    start = int(start.replace(">", "").replace("<", ""))
    end = int(end.replace(">", "").replace("<", ""))
    abs_loc = [scaff_number, start, end]
    return (ret, abs_loc)
def __init__(self, logger, sequences, reference, dateFormat):
    """Build the sequence set from parsed JSON.

    ``sequences`` maps a name to a dict with ``"seq"`` and ``"attributes"``
    entries; ``reference`` supplies ``"included"``, ``"strain"``, ``"seq"``
    and optionally ``"genes"`` (name -> dict with ``"start"``, ``"end"``
    and ``"strand"``).  ``dateFormat`` is handed to ``parse_date``.
    """
    super(sequence_set, self).__init__()
    self.log = logger

    # One SeqRecord per entry; raw dates are expanded into num_date / date.
    self.seqs = {}
    for name, data in sequences.items():
        record = self.seqs[name] = SeqRecord(
            Seq(data["seq"], generic_dna), id=name, name=name, description=name)
        record.attributes = data["attributes"]
        parsed_date = parse_date(record.attributes["raw_date"], dateFormat)
        record.attributes["num_date"] = parsed_date[1]
        record.attributes["date"] = parsed_date[2]

    # The reference is not added to self.seqs here (if it is to be analysed
    # it is already among the sequences), but it is kept for alignment.
    self.reference_in_dataset = reference["included"]
    name = reference["strain"]
    self.reference_seq = SeqRecord(
        Seq(reference["seq"], generic_dna), id=name, name=name, description=name)

    if "genes" not in reference or not len(reference["genes"]):
        self.proteins = None
    else:
        self.proteins = {}
        for gene_name, coords in reference["genes"].items():
            gene_start, gene_end, gene_strand = coords["start"], coords["end"], coords["strand"]
            feature = FeatureLocation(start=gene_start, end=gene_end, strand=gene_strand)
            # Translate to find out whether the gene ends with a stop codon.
            translation = Seq(feature.extract(str(self.reference_seq.seq))).translate()
            if translation.endswith("*"):
                # Drop the final codon so the stored location omits the stop.
                feature = FeatureLocation(start=gene_start, end=gene_end - 3, strand=gene_strand)
            self.proteins[gene_name] = feature

    # other things:
    timestamp = time.strftime('%Y%m%d-%H%M%S', time.gmtime())
    suffix = str(random.randint(0, 1000000))
    self.run_dir = '_'.join(['temp', timestamp, suffix])
    self.nthreads = 2  # should load from config file
def cds_extract(seq, location, codon_start=1):
    """Extract a CDS and its translation as plain strings.

    Wraps ``FeatureLocation.extract``.  When the location's start is not an
    ``ExactPosition`` the extraction starts at ``codon_start - 1`` instead.
    An incomplete trailing codon is either completed with 'N' (when the
    ``aa_or_X_given_codon`` table resolves it to an amino acid) or trimmed.
    A terminal stop codon is removed from both returned strings.

    Returns a ``(cds, cds_translation)`` tuple.
    """
    if location.start.__class__.__name__ != 'ExactPosition':
        # Rebuild the location so that it begins at the reading frame start.
        location = FeatureLocation(
            codon_start - 1, location.end, location.strand,
            location.ref, location.ref_db)
    extracted = location.extract(seq)

    cds = str(extracted)
    cds_translation = str(extracted.translate())

    # Fix hanging codons (non-triplets at 3' end): the final cds must be a
    # whole number of triplets, either trimmed or padded with one 'N'.
    overhang = len(cds) % 3
    if overhang != 0:
        hanging_codon = cds[-overhang:]
        hanging_aa = aa_or_X_given_codon[hanging_codon]  # E.g. 'CG(N)' -> 'R'
        if hanging_aa == 'X':
            cds = cds[:-overhang]
        else:
            assert len(hanging_codon) == 2
            cds += 'N'
            cds_translation += hanging_aa

    # Remove stop codons
    if cds_translation.endswith('*'):
        cds, cds_translation = cds[:-3], cds_translation[:-1]

    return cds, cds_translation
def _concatenate_features(left_id, right_id):
    '''
    Let the N-term part 'parent' and C-term part 'child'.
    Two features are concatenated based on the parent feature.
    Child features will be removed in the downstream process.

    Both features are looked up in ``self.genome.features`` (``self`` comes
    from the enclosing scope).  They must be CDSs on the same sequence and
    strand, the merged translation must contain exactly one internal stop,
    and the parent's stop codon must be TGA (recoded as selenocysteine, U)
    or TAG (recoded as pyrrolysine, O).  On success the parent's location
    and its translation, transl_except and note qualifiers are updated in
    place and its hits are reset.

    Returns parent's feature_id and child's feature_id
    If consistency check fails, returns None.
    '''
    left_feature = self.genome.features[left_id]
    right_feature = self.genome.features[right_id]
    if left_feature.type != "CDS" or right_feature.type != "CDS" or left_feature.seq_id != right_feature.seq_id:
        return None

    if left_feature.strand == right_feature.strand == 1:
        parent, child = left_feature, right_feature
        # forward strand: the stop codon is the last 3 bases of the parent
        stop_codon_location = FeatureLocation(start=parent.location.end - 3, end=parent.location.end, strand=1)
    elif left_feature.strand == right_feature.strand == -1:
        parent, child = right_feature, left_feature
        # reverse strand: the stop codon is the first 3 bases of the parent
        stop_codon_location = FeatureLocation(start=parent.location.start, end=parent.location.start + 3, strand=-1)
    else:
        return None

    concatenated_location = FeatureLocation(start=left_feature.location.start, end=right_feature.location.end,
                                            strand=left_feature.strand)
    seq_id = parent.seq_id
    whole_seq = self.genome.seq_records[seq_id]

    # The qualifier key was misspelled "trasl_table", so the lookup never
    # matched and table 11 was always used.
    transl_table = parent.qualifiers.get("transl_table", [11])[0]  # if not available, use translation table 11.
    extracted_seq = concatenated_location.extract(whole_seq.seq)
    translated_seq = str(extracted_seq.translate(table=transl_table).rstrip("*"))
    if translated_seq.count("*") == 1:
        stop_codon_pos = translated_seq.index("*") + 1  # 1-based residue position
    else:
        return None  # Only one stop codon must be included in aa.

    stop_codon = str(stop_codon_location.extract(whole_seq.seq))
    if stop_codon.upper() == "TGA":
        # opal > Selenocysteine, Sec, U
        # /transl_except=(pos:complement(5272379..5272381),aa:Sec)
        transl_except = "(pos:{},aa:Sec)".format(get_location_string(stop_codon_location))
        translated_seq = translated_seq.replace("*", "U")
        note_value = "codon on position {} is selenocysteine opal codon.".format(stop_codon_pos)
    elif stop_codon.upper() == "TAG":
        # amber > pyrrolysine, Pyl, O
        # /transl_except=(pos:213..215,aa:Pyl )
        transl_except = "(pos:{},aa:Pyl)".format(get_location_string(stop_codon_location))
        translated_seq = translated_seq.replace("*", "O")
        note_value = "codon on position {} is pyrrolysine amber codon.".format(stop_codon_pos)
    else:
        return None  # stop codon must be either of TGA/TAG

    # todo: Change this to hit object
    parent.location = concatenated_location
    parent.qualifiers["translation"] = [translated_seq]
    parent.qualifiers["transl_except"] = [transl_except]
    parent.qualifiers.setdefault("note", []).append(note_value)
    parent.primary_hit, parent.secondary_hits = None, []
    return parent.id, child.id
def check_genomewide(refseq, VERBOSE=0):
    '''Check the integrity of all genes in the genomewide consensus

    The single-exon genes gag, pol, env, vpr and vpu are checked first, then
    vif, then the two-exon genes tat and rev.  Each gene is located with
    locate_gene, its length is compared with the HXB2 gene, and its
    translation is checked for start, end and premature stops.  Several
    deviations are tolerated and only reported via print (see the inline
    comments).

    Parameters:
      refseq: the genomewide consensus; it is sliced, exposes .seq and is
              passed to FeatureLocation.extract (presumably a SeqRecord --
              confirm against callers)
      VERBOSE: verbosity level, forwarded to the helper checks

    Returns:
      True if every check passes, False at the first failed check.
    '''
    # Check single-exon genes
    # gene name -> maximal length difference passed to check_has_similar_length
    length_tolerance = {'gag': 30, 'pol': 30, 'env': 70, 'vpr': 15, 'vpu': 15}
    for genename, tol in length_tolerance.iteritems():
        (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start: end]
        gene = geneseq.seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes the gene ends a few nucleotides upstream, and there is a
            # frameshift mutation that screws up
            # Re-translate from start to the end of the sequence (whole codons
            # only) and look for the first stop codon.
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            # offset in nucleotides of that stop codon's end from the located end
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
            else:
                return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if (not check):
            # a failed start check is tolerated for vpu only
            if genename != 'vpu':
                return False
            else:
                print 'ERROR IN VPU STARTING CODON, CONTINUING!'

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes a gene is a bit longer
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            end_diff = start + (3 * end_new + 3) - end
            # accept a stop codon less than 90 nucleotides away in either direction
            if -90 < end_diff < 0:
                print genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            elif 0 < end_diff < 90:
                print genename.upper()+' ENDS '+str(end_new + 1 - (end - start) // 3)+' AMINO ACIDS DOWNSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            else:
                return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    # Vif is special because it can be longer than in HXB2
    genename = 'vif'
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print 'ERROR: '+genename+' not found in genomewide!'
        return False
    elif VERBOSE >= 3:
        print 'OK: start and end of '+genename+' found'

    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15)
    if not check:
        return False

    geneseq = refseq[start: end]
    gene = geneseq.seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    check = check_has_end(prot, genename, VERBOSE=0)
    if not check:
        # Vif tends to be a bit longer than in HXB2
        # try extending the gene by 1, 2 and 3 codons
        for nc in xrange(1, 4):
            gene_ext = refseq[start: end + 3 * nc].seq
            prot_ext = gene_ext.translate()
            check = check_has_end(prot_ext, genename, VERBOSE=0)
            if check:
                gene = gene_ext
                prot = prot_ext
                if VERBOSE:
                    print 'WARNING: '+genename+' actually ends '+str(nc)+' codons downstream'
                break
        else:
            # no extension produced a valid end
            print 'ERROR: '+genename+' does not end, not even slightly downstream'
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check 2-exon genes
    for genename_whole in ('tat', 'rev'):
        # first exon
        genename = genename_whole+'1'
        (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15)
        if not check:
            return False

        geneseq = refseq[start: end]
        # drop a trailing partial codon before translating the first exon
        geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon1 = start
        end_exon1 = end

        # second exon: searched only from 2000 nucleotides past the end of exon 1
        genename = genename_whole+'2'
        (start, end, start_found, end_found) = locate_gene(refseq[end_exon1 + 2000:], genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'

        # shift the coordinates found in the slice back to refseq coordinates
        start += end_exon1 + 2000
        end += end_exon1 + 2000

        # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions
        if genename == 'rev2':
            tol = 45
        else:
            tol = 15

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start: end]
        # skip the leading nucleotides so translation starts at the frame from get_frame
        frame = get_frame(geneseq, gene_HXB2, genename, VERBOSE=VERBOSE)
        geneseq = geneseq[frame:]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            if genename != 'rev2':
                return False
            else:
                # rev2 can end a bit early
                end_new = prot.rfind('*')
                if end_new != -1:
                    # accept only if fewer than 20 residues follow the last stop
                    if len(prot) - 1 - end_new < 20:
                        print 'REV2 ENDS '+str(len(prot) - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                        prot = prot[:end_new + 1]
                        end = start + frame + 3 * (end_new + 1)
                    else:
                        return False
                else:
                    # rev2 can also end quite a bit late
                    gene_new = refseq.seq[start:]
                    gene_new = gene_new[(end - start) % 3:]
                    gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
                    prot_new = gene_new.translate()
                    end_new = prot_new.find('*')
                    if (start + 3 * end_new) - end < 200:
                        print 'REV2 ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!'
                        prot = prot_new[:end_new + 1]
                        end = start + ((end - start) % 3) + 3 * (end_new + 1)
                    else:
                        return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon2 = start
        end_exon2 = end

        # whole gene: join both exons and run the full set of checks
        genename = genename_whole
        gene_HXB2 = get_gene_HXB2(genename)

        from Bio.SeqFeature import FeatureLocation
        gene_loc = FeatureLocation(start_exon1, end_exon1, strand=+1) + \
                   FeatureLocation(start_exon2, end_exon2, strand=+1)
        geneseq = gene_loc.extract(refseq)
        gene = geneseq.seq

        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
def extract_sequence(self, upstream=0, downstream=0):
    """Return this feature's sequence, optionally widened on both sides.

    ``upstream`` bases are added before ``self.location.start`` and
    ``downstream`` bases after ``self.location.end``, in forward-strand
    coordinates; the widened window is then extracted from
    ``self.chromosome.nucleic_sequence`` on ``self.strand``.  The window is
    clipped at the start of the chromosome.
    """
    # NOTE(review): the padding is applied in forward-strand coordinates even
    # for reverse-strand features -- confirm this is what callers expect.
    # Clamp at 0: a negative start would be read as an index from the end of
    # the sequence when slicing.
    window_start = max(self.location.start - upstream, 0)
    window_end = self.location.end + downstream
    location = FeatureLocation(window_start, window_end, self.strand)
    return location.extract(self.chromosome.nucleic_sequence)
# Classify feature ``f`` of ``record`` and append rows to SequenceDictionary.
# Each row is [name, sequence, kind, is_reverse_strand, 1-based start, end].
q = f.qualifiers
loc = f.location
start = loc.start
end = loc.end
s = loc.strand
if str(f.type).lower() == "rbs & cds":
    # Combined feature: the label is "<rbs>_<cds>"; split it into two rows.
    types = q['label'][0].split("_")
    # rewrite the label spelling "PhIF" (capital i) to "PhlF" (lowercase L)
    if types[1] == "PhIF":
        types[1] = "PhlF"
    rbs = pdict[types[0]]
    cds = pdict[types[1]]
    # The RBS takes the first len(rbs) bases along the feature's strand: the
    # low-coordinate end for strand 1, the high-coordinate end otherwise.
    r = FeatureLocation(start if s==1 else end-len(rbs), start+len(rbs) if s==1 else end ,strand=s)
    # The CDS takes the remainder of the feature.
    c = FeatureLocation(start+len(rbs) if s==1 else start, end if s==1 else end-len(rbs),strand=s)
    # Print whether each extracted part matches its entry in pdict.
    print(pdict[types[0]] == str((r.extract(record).seq)))
    print(pdict[types[1]] == str((c.extract(record).seq)))
    SequenceDictionary += [[types[0], str((r.extract(record).seq)), "rbs", (r.strand == -1), r.start + 1, r.end]]
    SequenceDictionary += [[types[1], str((c.extract(record).seq)), "cds", (c.strand == -1), c.start + 1, c.end]]
elif "scar" in q['label'][0].lower() and (end-start) <= 4 :
    # Scar: at most 4 bases long; its row name is looked up by sequence in scardict.
    scars = True
    label = q['label'][0]
    label = re.sub(r'\W+', '', label)
    SequenceDictionary += [[scardict[str((f.extract(record).seq))], str((f.extract(record).seq)), str(f.type).lower(), (loc.strand == -1), start+1, end+0]]
elif str(f.type).lower() in ["misc_feature"] and 'promoter' in re.sub(r'\W+', '', q['label'][0]):
    # misc_feature whose label (non-word characters removed) contains 'promoter'
    label = re.sub(r'\W+', '', q['label'][0])
    SequenceDictionary += [[label, str((f.extract(record).seq)), "promoter", (loc.strand == -1), start+1, end+0]]
elif str(f.type).lower() not in ["source", "primer", "primer_bind", "rep_origin", "misc_feature", "repressor"]:
    # Any other feature type outside the excluded list: only the cleaned
    # label is computed in the visible part of this branch.
    label = q['label'][0]
    label = re.sub(r'\W+', '', label)
def main():
    """
    This script extracts a gene sequence from a Genbank file and its promoter
    region. The name of the gene sequence to be extracted must be provided
    using the -name argument. If the gene name is stored in a GBK qualifier
    different than the "gene" qualifier (e.g. locus_tag) must be indicated
    using the -qual option.

    Output: a fasta file is created (geneName_outName.fna) with two sequences.
    The first sequence correspond to the gene (ORF) and the second to the
    promoter region (ORF +- 1000 bp)
    """
    # The docstring above doubles as the command-line help text.
    parser = ArgumentParser(description=main.__doc__)
    parser.add_argument("-gbk", dest="genbank", help="annotated genbank file", type=str)
    parser.add_argument("-name", dest="gene_name", help="name of the gene sequence", type=str)
    parser.add_argument("-qual", dest="tag",
                        help="gene/locustag qualifier of the gene", type=str, default="gene")
    parser.add_argument("-out", dest="out", help="name of the output e.g. spp/strain name", type=str)
    args = parser.parse_args()

    # "-qual locustag" selects the locus_tag qualifier; every other value
    # falls back to the gene qualifier.
    qualifier = "locus_tag" if args.tag == "locustag" else "gene"
    out_path = args.gene_name + "_" + args.out + ".fna"
    flank = 1000

    found = False
    # Parse genbank file
    for rec in SeqIO.parse(args.genbank, "genbank"):
        for feat in rec.features:
            if feat.type != "CDS":
                continue
            bases = feat.location.extract(rec.seq)
            if not gen_ok(bases):
                continue
            if qualifier not in feat.qualifiers:
                continue
            if feat.qualifiers[qualifier][0] != args.gene_name:
                continue

            found = True
            # Clip the flanked window to the record; a negative start would
            # be read as an index from the end of the sequence when slicing.
            start = max(int(feat.location.start) - flank, 0)
            end = min(int(feat.location.end) + flank, len(rec.seq))
            promoter_loc = FeatureLocation(start, end, strand=feat.location.strand)

            record = SeqRecord(Seq(str(bases)),
                               id=args.gene_name + "_" + args.out,
                               description="")
            record_promoter = SeqRecord(Seq(str(promoter_loc.extract(rec.seq))),
                                        id=args.gene_name + "+/-1000bp" + "_" + args.out,
                                        description="")

            # Append mode: several matching CDSs end up in the same file.
            with open(out_path, "a") as output:
                # write gene seq to file
                SeqIO.write(record, output, "fasta")
                # write gene seq + promoter to file
                SeqIO.write(record_promoter, output, "fasta")

    if not found:
        print("gene not found")
# For every EMBL genome, write each CDS whose gene qualifier equals the first
# command-line argument to results/<gene>.fna; repeated hits get a numeric
# suffix taken from the ``stored`` counter.
for f in in_files:
    cur_genome = SeqIO.parse(f, "embl")
    for record in cur_genome:
        for feat in record.features:
            if feat.type != 'CDS' or 'gene' not in feat.qualifiers:
                continue
            gene = feat.qualifiers['gene'][0]
            if gene != sys.argv[1]:
                continue

            s, e, strand = feat.location.start, feat.location.end, feat.location.strand
            # header: >gene,<1-based start>..<end>(<strand>),[<file>]
            header = "".join([
                '>', feat.qualifiers['gene'][0], ",",
                str(s + 1), "..", str(e),
                "(", str(strand), ")", ",",
                "[", f.replace("genomes/", ""), "]",
            ])
            flanked = FeatureLocation(s, e, strand)
            out_seq = flanked.extract(record.seq)

            fname = header[1:].split(',')[0] + ".fna"
            if fname not in stored:
                stored[fname] = 1
            else:
                # name already used: append the current count and bump it
                old = fname
                fname = fname.replace(".fna", "_" + str(stored[fname]) + ".fna")
                stored[old] += 1

            with open(os.path.join('results', fname), 'w') as out:
                out.write(header + '\n')
                out.write(str(out_seq) + '\n')
import os

# Write every mobile_element feature of the EMBL genomes, padded by
# ``flanking_region`` bases on each side, to its own FASTA file in results/.
in_files = glob.glob('genomes/*.embl')
flanking_region = 100
try:
    os.mkdir("results")
except OSError:
    print("a 'results' dir already exists")
    print("Overwriting")

# file name -> number of times it has been used so far
stored = {}
for f in in_files:
    cur_genome = SeqIO.parse(f, "embl")
    for record in cur_genome:
        for feat in record.features:
            if feat.type != 'mobile_element':
                continue
            s, e, strand = feat.location.start, feat.location.end, feat.location.strand
            # the element type is the text after the last ':' of the qualifier
            element_type = feat.qualifiers['mobile_element_type'][0].split(':')[-1]
            header = ('>' + element_type + "," + element_type
                      + ".." + str(s+1) + ".." + str(e)
                      + "(" + str(strand) + "),""100bp flanked,[EC958 IS]")
            flanked = FeatureLocation(s-flanking_region, e+flanking_region, strand)
            out_seq = flanked.extract(record.seq)

            # shorten the element type and make it safe to use as a file name
            fname = header[1:].split(',')[0]
            fname = fname.replace('unclassified', 'unc').replace('family', 'fam')
            fname = fname.replace('(', '').replace('partial', 'p').replace(')', '')
            fname = fname.replace(' ', '_').replace('/', '-').strip() + '.fna'

            if fname not in stored:
                stored[fname] = 1
            else:
                # name already used: append the current count and bump it
                old = fname
                fname = fname.replace(".fna", "_"+str(stored[fname])+".fna")
                stored[old] += 1

            with open(os.path.join('results', fname), 'w') as out:
                out.write(header+'\n')
                out.write(str(out_seq)+'\n')
def check_genomewide(refseq, VERBOSE=0):
    '''Check the integrity of all genes in the genomewide consensus.

    The checks run in three stages and the function returns False as soon
    as any of them fails:

    1. single-exon genes (gag, pol, env, vpr, vpu), each with its own
       length tolerance relative to the HXB2 gene;
    2. vif, which may extend up to three codons past the located end;
    3. the two-exon genes tat and rev: each exon is checked on its own and
       then the joined coding sequence is checked as a whole.

    Parameters:
       refseq: the genomewide consensus. It is sliced and its ``.seq`` is
          read, so it is assumed to be a Biopython SeqRecord (confirm
          against callers).
       VERBOSE: verbosity level, forwarded to the helper checks; at 3 or
          more a confirmation is printed for every located gene.

    Returns:
       True if every gene passes all checks, False otherwise.
    '''
    # Check single-exon genes
    length_tolerance = {'gag': 30, 'pol': 30, 'env': 70, 'vpr': 15, 'vpu': 15}
    for genename, tol in length_tolerance.items():
        (start, end,
         start_found, end_found) = locate_gene(refseq, genename,
                                               VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print('ERROR: ' + genename + ' not found in genomewide!')
            return False
        elif VERBOSE >= 3:
            print('OK: start and end of ' + genename + ' found')

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2),
                                         genename, VERBOSE=VERBOSE,
                                         maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start:end]
        gene = geneseq.seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes the gene ends a few nucleotides upstream, and there
            # is a frameshift mutation that screws up
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            # Signed distance (in nucleotides) between the first in-frame
            # stop and the located gene end.
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print(genename.upper() + ' ENDS ' + str(
                    (end - start) // 3 - end_new - 1) +
                    ' AMINO ACIDS UPSTREAM!')
                gene = gene_new[:3 * (end_new + 1)]
            else:
                return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            # A bad start codon is tolerated for vpu only.
            if genename != 'vpu':
                return False
            else:
                print('ERROR IN VPU STARTING CODON, CONTINUING!')

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes a gene is a bit longer
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print(genename.upper() + ' ENDS ' + str(
                    (end - start) // 3 - end_new - 1) +
                    ' AMINO ACIDS UPSTREAM!')
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            elif 0 < end_diff < 90:
                print(genename.upper() + ' ENDS ' + str(
                    end_new + 1 - (end - start) // 3) +
                    ' AMINO ACIDS DOWNSTREAM!')
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            else:
                return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    # Vif is special because it can be longer than in HXB2
    genename = 'vif'
    (start, end,
     start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print('ERROR: ' + genename + ' not found in genomewide!')
        return False
    elif VERBOSE >= 3:
        print('OK: start and end of ' + genename + ' found')

    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                     VERBOSE=VERBOSE, maxdiff=15)
    if not check:
        return False

    geneseq = refseq[start:end]
    gene = geneseq.seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Silent check first: a missing end is retried below before reporting.
    check = check_has_end(prot, genename, VERBOSE=0)
    if not check:
        # Vif tends to be a bit longer than in HXB2: try extending the gene
        # by one, two and three codons.
        for nc in range(1, 4):
            gene_ext = refseq[start:end + 3 * nc].seq
            prot_ext = gene_ext.translate()
            check = check_has_end(prot_ext, genename, VERBOSE=0)
            if check:
                gene = gene_ext
                prot = prot_ext
                if VERBOSE:
                    print('WARNING: ' + genename + ' actually ends ' +
                          str(nc) + ' codons downstream')
                break
        else:
            # No extension produced a proper end.
            print('ERROR: ' + genename +
                  ' does not end, not even slightly downstream')
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check 2-exon genes
    from Bio.SeqFeature import FeatureLocation
    for genename_whole in ('tat', 'rev'):
        # --- first exon ---
        genename = genename_whole + '1'
        (start, end,
         start_found, end_found) = locate_gene(refseq, genename,
                                               VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print('ERROR: ' + genename + ' not found in genomewide!')
            return False
        elif VERBOSE >= 3:
            print('OK: start and end of ' + genename + ' found')

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2),
                                         genename, VERBOSE=VERBOSE,
                                         maxdiff=15)
        if not check:
            return False

        geneseq = refseq[start:end]
        # Drop the trailing partial codon before translating the exon.
        geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon1 = start
        end_exon1 = end

        # --- second exon ---
        # It is searched only from 2000 nucleotides past the end of the
        # first exon; the coordinates are shifted back to refseq afterwards.
        genename = genename_whole + '2'
        (start, end,
         start_found, end_found) = locate_gene(refseq[end_exon1 + 2000:],
                                               genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print('ERROR: ' + genename + ' not found in genomewide!')
            return False
        elif VERBOSE >= 3:
            print('OK: start and end of ' + genename + ' found')

        start += end_exon1 + 2000
        end += end_exon1 + 2000

        # NOTE: rev2 overlaps with env gp41 and can have insertions or
        # deletions
        if genename == 'rev2':
            tol = 45
        else:
            tol = 15

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2),
                                         genename, VERBOSE=VERBOSE,
                                         maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start:end]
        frame = get_frame(geneseq, gene_HXB2, genename, VERBOSE=VERBOSE)
        geneseq = geneseq[frame:]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            if genename != 'rev2':
                return False
            else:
                # rev2 can end a bit early
                end_new = prot.rfind('*')
                if end_new != -1:
                    if len(prot) - 1 - end_new < 20:
                        print('REV2 ENDS ' + str(len(prot) - end_new - 1) +
                              ' AMINO ACIDS UPSTREAM!')
                        prot = prot[:end_new + 1]
                        end = start + frame + 3 * (end_new + 1)
                    else:
                        return False
                else:
                    # rev2 can also end quite a bit late
                    gene_new = refseq.seq[start:]
                    gene_new = gene_new[(end - start) % 3:]
                    gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
                    prot_new = gene_new.translate()
                    end_new = prot_new.find('*')
                    # find() returns -1 when there is no stop codon at all;
                    # that must count as a failure, not as a stop located
                    # just before the start.
                    if end_new != -1 and (start + 3 * end_new) - end < 200:
                        print('REV2 ENDS ' + str(end_new - len(prot) + 1) +
                              ' AMINO ACIDS DOWNSTREAM!')
                        prot = prot_new[:end_new + 1]
                        # The right-hand side still uses the old ``end``.
                        end = (start + ((end - start) % 3) +
                               3 * (end_new + 1))
                    else:
                        return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon2 = start
        end_exon2 = end

        # --- joined gene: exon 1 + exon 2 on the forward strand ---
        genename = genename_whole
        gene_HXB2 = get_gene_HXB2(genename)

        gene_loc = FeatureLocation(start_exon1, end_exon1, strand=+1) + \
                   FeatureLocation(start_exon2, end_exon2, strand=+1)
        geneseq = gene_loc.extract(refseq)
        gene = geneseq.seq

        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
def crispy_scan(
    haystack: List[SeqRecord],
    needle: SeqRecord,
    pam: str = "GG",
    unique_size: int = 13,
    full_size: int = 23,
    threads: int = -1,
) -> List[Dict[str, Union[str, int, List[int]]]]:
    """Scan both strands of ``needle`` for ``pam`` sites and describe each hit.

    For every PAM occurrence reported by the searcher, a window of
    ``full_size`` bases ending in the PAM is extracted from ``needle`` and
    turned into a JSON-friendly dict (see ``build_json_base``).

    Args:
        haystack: records used to build the comparison text that hits are
            counted against.
        needle: the record that is scanned for PAM sites.
        pam: the target motif handed to the searcher.
        unique_size: size of the window in front of the PAM used for
            counting repeats; must be at least 1.
        full_size: size of the reported window, PAM included; must be at
            least ``unique_size``.
        threads: forwarded unchanged to the searcher.

    Returns:
        One dict per reported window, ordered by ``all_hits`` and then by
        ``start``.

    Raises:
        ValueError: if ``unique_size`` is below 1 or ``full_size`` is below
            ``unique_size``.
    """
    if unique_size < 1:
        raise ValueError("unique size cannot be below 1")
    if full_size < unique_size:
        raise ValueError("full size cannot be below unique size")

    def build_json_base(location, seq_section, result
                        ) -> Dict[str, Union[str, int, List[int]]]:
        """Build the output dict for one window.

        The last three characters of ``seq_section`` are reported as the
        PAM and everything before them as the sequence.
        """
        base = {
            'start': location.start,
            'end': location.end,
            'strand': location.strand,
            'sequence': str(seq_section[:-3]),
            'pam': str(seq_section[-3:]),
            'all_hits': result,  # new to JSON, for handy sorting
            '0bpmm': result[0] - 1,  # remove self-hit
        }
        # add remaining mismatch info
        for i, val in enumerate(result[1:]):
            base['{}bpmm'.format(i + 1)] = val
        return base

    # set the size of the window to the unique size
    # and shift one back since in the previous system it skipped a leading N
    before_window = (-unique_size - 1, -1)

    final_result = []
    comparison_text = build_comparison_text(haystack, unique_size)

    for strand in [1, -1]:
        # The reverse strand is searched on the reverse complement.
        if strand == -1:
            searcher = Searcher(str(needle.seq.reverse_complement()))
        else:
            searcher = Searcher(str(needle.seq))

        results = searcher.find_repeat_counts(target=pam,
                                              before_window=before_window,
                                              other_text=comparison_text,
                                              threads=threads)
        for pam_start, result in sorted(results.items(), key=lambda x: x[1]):
            # set the window location, accounting for strand
            if strand == -1:
                # pam_start indexes the reverse complement; map it back to
                # forward-strand coordinates.
                start = len(needle.seq) - pam_start - len(pam)
                end = start + full_size
            else:
                start = pam_start - full_size + len(pam)
                end = pam_start + len(pam)

            # skip anything for which the full window shown would be truncated
            # NOTE(review): ``end`` is used as an exclusive bound below, so a
            # window with end == len(needle.seq) is skipped here although it
            # would fit - confirm whether that is intended.
            if start < 0 or end >= len(needle.seq):
                continue

            location = FeatureLocation(start, end, strand)
            seq = location.extract(needle.seq)
            final_result.append(build_json_base(location, seq, result))

    # order by lowest hits, then by start position
    final_result.sort(key=lambda x: (x["all_hits"], x["start"]))
    return final_result
def extract_sequence(self, start, end):
    """Return the part of ``self.nucleic_sequence`` between start and end.

    The span is described by a strand-less ``FeatureLocation`` and pulled
    out with its ``extract`` method.
    """
    return FeatureLocation(start, end).extract(self.nucleic_sequence)