def rename(query_dir, input_dir):
    """Rewrite every FASTA file of ``input_dir`` into ``query_dir`` under generated names.

    The entries of ``input_dir`` are visited in sorted order and numbered
    from 1.  Entry number ``i`` is written to ``query_dir`` as
    ``Genome_<i>.fa``, and its ``j``-th record (also counted from 1) is
    given the id ``Genome_<i>::Contig_<j>``.

    Returns:
        dict mapping ``files.replace_ext(<new file name>)`` to
        ``files.replace_ext(<original file name>)``.
    """
    name_lookup = {}
    genome_no = 0
    for original_name in sorted(os.listdir(input_dir)):
        genome_no += 1
        contigs = SeqIO.parse(files.joinpath(input_dir, original_name), "fasta")
        renamed = [
            seq.new_record(
                "Genome_{i}::Contig_{j}".format(i=genome_no, j=contig_no),
                str(contig.seq),
            )
            for contig_no, contig in enumerate(contigs, 1)
        ]
        target_name = "Genome_{i}.fa".format(i=genome_no)
        SeqIO.write(renamed, files.joinpath(query_dir, target_name), "fasta")
        name_lookup[files.replace_ext(target_name)] = files.replace_ext(original_name)
    return name_lookup
def reference_self_blastp(output_dir, freq):
    """Run the reference sequences through blastp against themselves.

    For every ``locus -> counter`` pair in ``freq`` the most common key of
    the counter is translated with ``translate(table=11)`` and wrapped in a
    record named after the locus.  The records are saved to
    ``<output_dir>/ref_seq.faa``, compiled into the database
    ``<output_dir>/ref_db`` and then queried against that same database.

    Returns:
        tuple ``(blastp_out_file, ref_length)`` where ``blastp_out_file`` is
        ``<output_dir>/ref_db.blastp.out`` and ``ref_length`` maps each
        record id to the length of its sequence.
    """
    faa_path = os.path.join(output_dir, "ref_seq.faa")
    db_path = os.path.join(output_dir, "ref_db")
    result_path = os.path.join(output_dir, "ref_db.blastp.out")

    reference_records = []
    for locus, counter in freq.items():
        top_allele = counter.most_common(1)[0][0]
        reference_records.append(
            seq.new_record(locus, top_allele.translate(table=11))
        )

    lengths = {}
    for record in reference_records:
        lengths[record.id] = len(record.seq)

    seq.save_records(reference_records, faa_path)
    seq.compile_blastpdb(faa_path, db_path)
    seq.query_blastpdb(faa_path, db_path, result_path, seq.BLAST_COLUMNS)
    return result_path, lengths
def make_ref_blastpdb(ref_db_file, database):
    """Build a blastp database from the reference allele of every locus.

    The reference peptides are read with a join of ``loci`` and ``alleles``
    on ``loci.ref_allele = alleles.allele_id``, written to the temporary
    file ``<ref_db_file>.fasta`` and compiled into ``ref_db_file`` with
    ``seq.compile_blastpdb``.

    Args:
        ref_db_file: Path of the blastp database to create; also the stem
            of the temporary FASTA file.
        database: Forwarded to ``db.from_sql`` as ``database=``.

    Returns:
        Whatever ``generate_allele_len`` returns for the reference records.
    """
    query = (
        "select loci.locus_id, alleles.peptide_seq "
        "from loci inner join alleles "
        "on loci.ref_allele = alleles.allele_id;"
    )
    refs = db.from_sql(query, database=database)
    ref_recs = [
        seq.new_record(row["locus_id"], row["peptide_seq"], seqtype="protein")
        for _, row in refs.iterrows()
    ]

    ref_fasta = ref_db_file + ".fasta"
    try:
        seq.save_records(ref_recs, ref_fasta)
        ref_len = generate_allele_len(ref_recs)
        seq.compile_blastpdb(ref_fasta, ref_db_file)
    finally:
        # The FASTA file is only an intermediate; remove it even when one of
        # the steps above fails so no stray file is left behind.  The
        # existence check keeps a failed save from masking the real error.
        if os.path.exists(ref_fasta):
            os.remove(ref_fasta)
    return ref_len
def blast_for_new_alleles(candidates, alleles, ref_db, temp_dir, ref_len):
    """Query candidate alleles against the reference blastp database.

    Each candidate is turned into a protein record built from
    ``alleles[cand][1]``, the records are saved to
    ``<temp_dir>/new_allele_candidates.fasta`` and queried against
    ``ref_db``; the output goes to
    ``<temp_dir>/new_allele_candidates.blastp.out``.  That output is passed
    through ``filter_duplicates`` (with ``identity=95``) and reduced to one
    row per ``qseqid``.

    Returns:
        list of ``(qseqid, sseqid)`` tuples, one per remaining row.
    """
    stem = "new_allele_candidates"
    fasta_path = os.path.join(temp_dir, stem + ".fasta")
    blast_path = os.path.join(temp_dir, "{}.blastp.out".format(stem))

    candidate_records = []
    for cand in candidates:
        candidate_records.append(
            seq.new_record(cand, alleles[cand][1], seqtype="protein")
        )
    seq.save_records(candidate_records, fasta_path)
    candidate_len = generate_allele_len(candidate_records)

    seq.query_blastpdb(fasta_path, ref_db, blast_path, seq.BLAST_COLUMNS)
    hits = filter_duplicates(blast_path, candidate_len, ref_len, identity=95)
    hits = hits.drop_duplicates("qseqid")

    pairs = []
    for _, hit in hits.iterrows():
        pairs.append((hit["qseqid"], hit["sseqid"]))
    return pairs
def save_locusfiles(freq, locus_dir):
    """Write one ``<locus>.fa`` file per locus into ``locus_dir``.

    ``freq`` maps each locus to a counter keyed by allele.  Every allele
    becomes a record whose id is ``operations.make_seqid(str(allele))``.
    """
    for locus, counter in freq.items():
        allele_records = []
        for allele in counter:
            seqid = operations.make_seqid(str(allele))
            allele_records.append(seq.new_record(seqid, allele))
        seq.save_records(allele_records, files.joinpath(locus_dir, locus + ".fa"))
def save_refseq(freq, refseq_file):
    """Pick the most common allele of each locus and write them as FASTA.

    ``freq`` maps each locus to a counter keyed by allele.  The selected
    sequences are written to ``refseq_file`` with ``str(locus)`` as the
    record id.

    Returns:
        dict mapping each locus to its most common allele.
    """
    refseqs = {}
    for locus, counter in freq.items():
        refseqs[locus] = counter.most_common(1)[0][0]

    reference_records = []
    for locus, sequence in refseqs.items():
        reference_records.append(seq.new_record(str(locus), sequence))

    SeqIO.write(reference_records, refseq_file, "fasta")
    return refseqs
def __write_new_format(self, source_file, sink_file):
    """Copy the FASTA records of ``source_file`` to ``sink_file`` with new ids.

    Records are numbered from 1 in file order; record ``j`` receives the id
    returned by ``self.newseqid(j)`` and keeps its sequence as a string.
    """
    renamed = [
        seq.new_record(self.newseqid(position), str(contig.seq))
        for position, contig in enumerate(SeqIO.parse(source_file, "fasta"), 1)
    ]
    SeqIO.write(renamed, sink_file, "fasta")