def chgAlpha(self, newAlpha):
    """Re-type the stored sequence as DNA, RNA or protein.

    The matching IUPAC alphabet is stored in ``self.typ``, ``self.seq`` is
    rebuilt as a ``Seq`` carrying that alphabet, and ``self.checkAlpha()``
    is called afterwards.

    :param newAlpha: one of the strings ``'DNA'``, ``'RNA'`` or
        ``'protein'``.
    :raises NameError: for any other value.  Alphabet objects are *not*
        accepted: anything other than the three names has always been
        rejected here, and that behaviour is kept.
    """
    # Validate first so a bad argument fails before Biopython is imported.
    if newAlpha not in ("DNA", "RNA", "protein"):
        raise NameError("type not 'DNA', 'RNA', or 'protein'")

    from Bio.Seq import Seq
    from Bio.Alphabet import IUPAC

    alphabet_classes = {
        "DNA": IUPAC.IUPACUnambiguousDNA,
        # This branch previously built the DNA alphabet by mistake.
        "RNA": IUPAC.IUPACUnambiguousRNA,
        "protein": IUPAC.IUPACProtein,
    }
    alpha = alphabet_classes[newAlpha]()
    self.typ = alpha

    # str() is the replacement for the deprecated Seq.tostring().
    self.seq = Seq(str(self.seq), alpha)
    self.checkAlpha()
def cds_to_seqrecord(cds, parent_genome, gene_domains=None):
    """Creates a SeqRecord object from a Cds and its parent Genome.

    The record carries four kinds of features, in this order: a "source"
    feature, a "Protein" feature, a "CDS" feature and any region features
    returned by get_cds_seqrecord_regions().  The first three span
    0..cds.translation_length.

    :param cds: A populated Cds object.
    :type cds: Cds
    :param parent_genome: Populated parent Genome object of the Cds object.
    :type parent_genome: Genome
    :param gene_domains:
        Domain objects passed on to get_cds_seqrecord_regions().
        Defaults to an empty list.
    :type gene_domains: list
    :returns: Filled Biopython SeqRecord object.
    :rtype: SeqRecord
    """
    # Use a fresh list per call: a literal [] default would be one shared
    # object handed to the region helper on every call.
    if gene_domains is None:
        gene_domains = []

    record = SeqRecord(cds.translation)
    record.seq.alphabet = IUPAC.IUPACProtein()
    record.name = cds.id

    # Without a locus tag, fall back to an id derived from the Cds id.
    if cds.locus_tag == "" or cds.locus_tag is None:
        record.id = "DRAFT " + cds.id
    else:
        record.id = cds.locus_tag

    # Populates cds.seqfeature, which the description below reads.
    cds.set_seqfeature()

    source = f"{parent_genome.host_genus} phage {cds.genome_id}"
    source_feature = cds.create_seqfeature("source", 0,
                                           cds.translation_length, 1)
    source_feature.qualifiers["organism"] = [source]

    protein_feature = cds.create_seqfeature("Protein", 0,
                                            cds.translation_length, 1)

    cds_feature = cds.create_seqfeature("CDS", 0,
                                        cds.translation_length, 1)
    format_cds_seqrecord_CDS_feature(cds_feature, cds, parent_genome)

    record.features = [source_feature, protein_feature, cds_feature]
    record.features.extend(get_cds_seqrecord_regions(gene_domains, cds))

    record.description = (f"{cds.seqfeature.qualifiers['product'][0]} "
                          f"[{source}]")
    record.annotations = get_cds_seqrecord_annotations(cds, parent_genome)

    return record
# For every chain in the model, write a two-record FASTA file: the conserved
# reference sequence (lower-case letters masked as 'X') followed by the
# sequence read from the chain's own residues.
for chain in model:
    chain_id = chain.get_id()
    print("Working on chain %s." % chain_id)

    # One-letter codes and residue numbers, collected in parallel.
    seq = []
    position = []
    for residue in chain:
        res_id = residue.get_id()
        # Only residues whose id starts with a blank field are kept
        # (presumably the standard, non-hetero residues -- confirm).
        if res_id[0] != ' ':
            continue
        seq.append(three_to_one(residue.get_resname()))
        position.append(res_id[1])

    my_prot = Seq(''.join(seq), IUPAC.protein)

    # Reference sequence: every lower-case letter is replaced by 'X'.
    tmp_seq1 = SeqIO.read(conserved_fasta_file, "fasta",
                          IUPAC.IUPACProtein())
    my_x_seq = Seq(sub('[a-z]', 'X', str(tmp_seq1.seq)))

    seq1 = SeqRecord(my_x_seq, id=tmp_seq1.id, name="", description="")
    seq2 = SeqRecord(my_prot, id="Chain_" + chain_id, name="",
                     description="")
    myseqs = [seq1, seq2]

    # Output names share the "<prefix>_<chain id>" stem.
    chain_stem = prefix + "_" + chain_id
    fasta_filename = chain_stem + ".fasta"
    align_filename = chain_stem + ".align"
    SeqIO.write(myseqs, fasta_filename, "fasta")
input_file = args.input_file dataset_id = input_file.split("_")[0] contig_length_filter = "/hps/nobackup2/production/metagenomics/aalmeida/scripts/EMBL-EBI/filter_contigs_len.py" subprocess.call("%s -f %s -l 1" % (contig_length_filter, input_file), stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL, shell = True) new_name_search = re.search(r"(%s\w+)\.fna(\w+\.fasta)" % dataset_id, ",".join(os.listdir())) new_name = new_name_search.group(1) + new_name_search.group(2) os.rename(new_name_search.group(0), new_name) input_file = new_name predicted_viruses = virus_pred(input_file) SeqIO.write(predicted_viruses, "%s_viral_sequences.fna" % dataset_id, "fasta") input_file = "%s_viral_sequences.fna" % dataset_id hmmer_result = hmmer_domtbl(input_file) informative_df = ratio_evalue(hmmer_result) os.mkdir("%s_annotated_viral_sequences" % dataset_id) for contig in SeqIO.parse(input_file, "fasta", IUPAC.IUPACUnambiguousDNA()): for protein in SeqIO.parse("%s_viral_CDS.faa" % dataset_id, "fasta", IUPAC.IUPACProtein()): if contig.id in protein.id: protein_fields = protein.description.split(" # ") CDS_annotation = SeqFeature(FeatureLocation(int(protein_fields[1]), int(protein_fields[2])), type = "CDS", strand = int(protein_fields[3])) CDS_annotation.qualifiers["locus_tag"] = [protein.id] CDS_annotation.qualifiers["transl_table"] = [11] CDS_annotation.qualifiers["translation"] = [str(protein.seq)] if protein.id in list(informative_df["query"].values): if list(informative_df["query"].values).count(protein.id) > 1: best_hit = max(informative_df[informative_df["query"] == protein.id]["Abs_Evalue_exp"].items(), key = operator.itemgetter(1)) if best_hit[1] >= 10: CDS_annotation.qualifiers["result"] = ["high confidence"] else: CDS_annotation.qualifiers["result"] = ["low confidence"] CDS_annotation.qualifiers["taxon"] = [informative_df.loc[best_hit[0], "Taxon"], informative_df.loc[best_hit[0], "Abs_Evalue_exp"]] else: