def find_genes(record, outdir):
    """Write one FASTA file per target gene-name fragment found in ``record``.

    For every fragment in the hard-coded list, each feature of type ``gene``
    whose stringified ``gene`` qualifier contains the fragment is extracted.
    The matches are sorted by id and written to
    ``outdir + fragment + '.fasta'`` (a file is written even when there are
    no matches).

    :param record: object exposing ``seq`` and ``features``.
    :param outdir: output prefix; it is concatenated directly with the file
        name, so it must already end with a path separator.
    """
    seq = record.seq
    unwanted_chars = '!@#$[]'
    for gene_name in ['pep', 'yej', 'omp', 'rim', 'pdf', 'sbm', 'asp', 'def']:
        genes = []
        for feature in record.features:
            if feature.type != 'gene' or 'gene' not in feature.qualifiers:
                continue
            gene_name_gb = str(feature.qualifiers['gene'])
            if gene_name not in gene_name_gb:
                continue
            start = feature.location.start
            end = feature.location.end
            strand = feature.location.strand
            if strand == 1:
                gene_seq = seq[start:end]
            elif strand == -1:
                # Feature coordinates refer to the forward strand, so slice
                # first and reverse-complement the slice.  Slicing the
                # reverse complement of the whole sequence with forward
                # coordinates picks an unrelated region.
                gene_seq = seq[start:end].reverse_complement()
            else:
                print('Error')
                continue
            # Portable replacement for the Python 2 only
            # ``str.translate(None, chars)`` deletion idiom.
            seq_name = ''.join(
                char for char in gene_name_gb if char not in unwanted_chars)
            genes.append(SeqIO.SeqRecord(gene_seq, id=seq_name))
        file_out = outdir + gene_name + '.fasta'
        genes.sort(key=lambda rec: str(rec.id))
        SeqIO.write(genes, file_out, "fasta")
def read(file):
    """Extract the first alignment of a report file and write it as FASTA.

    The file is scanned past the line containing
    ``"Maximal single base matches"``; the first following line whose first
    whitespace-separated token is ``"ALIGNMENT"`` is handed to ``readAln``.
    Two records are built from the result: sequences come from
    ``aln[1][4]`` and ``aln[1][5]``, ids from ``aln[0][4]`` and
    ``aln[0][9]``, and both share ``" ".join(aln[0])`` as description.
    They are written to ``args.out``, where ``args`` is looked up outside
    this function.

    :param file: path of the report to read.
    :raises IndexError: if the header marker is never found, or if no
        ``ALIGNMENT`` line follows it.
    """
    # Read everything up front and release the handle immediately; the
    # original left the file open.
    with open(file) as handle:
        lines = handle.readlines()

    idx = 0
    # get past header
    while True:
        line = lines[idx]
        idx += 1
        if "Maximal single base matches" in line:
            break

    aln = ()
    while idx < len(lines):
        tokens = lines[idx].split()
        # An "ALIGNMENT" token marks the start of an alignment section.
        if tokens and tokens[0] == "ALIGNMENT":
            aln = readAln(lines, idx)
            break
        idx += 1

    description = " ".join(aln[0])
    rec1 = SeqIO.SeqRecord(Seq(aln[1][4]), id=aln[0][4],
                           description=description)
    rec2 = SeqIO.SeqRecord(Seq(aln[1][5]), id=aln[0][9],
                           description=description)
    SeqIO.write([rec1, rec2], args.out, "fasta")
def parseFasta(name, nucleotides=False):
    """
    Parse a FASTA file into a Samples object.

    Every record is attached to the sample selected by ``getID(record)``:
    as a genome when ``nucleotides`` is true, otherwise as a ``Protein``.

    :param name: str, filename of the file we wish to parse
    :param nucleotides: boolean, indicates if nucleotides or proteins
    :return: Samples, the samples that are specified in the file
    """
    samples = Samples()
    with open(name) as handle:
        for entry in SeqIO.parse(handle, "fasta"):
            sample_id = getID(entry)

            if not nucleotides:
                # Protein record: name comes from the helper, origin too.
                protein_name = getProteinName(entry)
                protein_record = SeqIO.SeqRecord(
                    entry.seq,
                    id=entry.id,
                    name=protein_name,
                    description='',
                )
                source = getOrigin(entry)
                samples.getSample(sample_id).addProtein(
                    Protein(protein_name, protein_record, source))
                continue

            # Nucleotide record: id doubles as the record name.
            genome_record = SeqIO.SeqRecord(
                entry.seq,
                id=entry.id,
                name=entry.id,
                description='',
            )
            samples.getSample(sample_id).addGenome(genome_record)
    return samples
def simulateAmpliconReads(self,sequenceAbundanceCounter, prefix, readSize=76, coverage=10, reverseComplementAmplicon=False):
    """Write the amplicons to a temporary FASTA file and run the simulator.

    Each sequence is listed once per unit of its count, in rounds: the
    first round holds the sequences reaching the highest count, the next
    round those reaching one less, and so on down to 1.  The resulting
    list is written as FASTA (optionally reverse-complemented) and
    ``self.executable`` is invoked on it through ``os.system``.
    """
    top_entry = sequenceAbundanceCounter.most_common(1)
    print(top_entry)
    _, highest_count = top_entry[0]

    # The counter is not modified below, so rank it once up front.
    ranked = sequenceAbundanceCounter.most_common()
    sequences = []
    for threshold in range(highest_count, 0, -1):
        for sequence, count in ranked:
            if count < threshold:
                break
            sequences.append(sequence)

    bpythonSeqs = []
    for index, sequence in enumerate(sequences):
        amplicon = Seq(sequence)
        if reverseComplementAmplicon:
            amplicon = amplicon.reverse_complement()
        bpythonSeqs.append(
            SeqIO.SeqRecord(amplicon, '%s-%s' % (str(index), str(sequence))))

    fastaPath = getTempFileName('art') + '.fa'
    SeqIO.write(bpythonSeqs, fastaPath, "fasta")

    command = '%s -amp -i %s -l %s -f %s -o %s -sam -ss HS25' % (
        self.executable, fastaPath, readSize, coverage, prefix)
    os.system(command)
def combine_result(INS, DEL):
    """Turn insertion and deletion calls into a flat list of sequence records.

    ``INS`` and ``DEL`` are iterables of groups of entries.  Insertion
    entries must have exactly 8 fields and deletion entries exactly 7;
    others are skipped.  Field 5 becomes the record sequence and the
    remaining listed fields are joined with ``*`` into a key used as id,
    name and description.  Insertions come first in the returned list.
    """
    result = []

    # Insertion key layout: INS_chr_pos_len_#_seq_rc_dp
    insertion_key = "%s*%s*%d*%d*%s*%d*%d"
    for group in INS:
        for entry in group:
            if len(entry) == 8:
                label = insertion_key % (
                    entry[0], entry[1], entry[2], entry[3],
                    entry[4], entry[6], entry[7])
                record = SeqIO.SeqRecord(
                    seq='', id=label, name=label, description=label)
                record.seq = Seq(entry[5])
                result.append(record)
    # Drop the local reference before processing the deletions.
    del INS
    gc.collect()

    # Deletion key layout: DEL_chr_pos_len_seq_rc_dp
    deletion_key = "%s*%s*%d*%d*%d*%d"
    for group in DEL:
        for entry in group:
            if len(entry) == 7:
                label = deletion_key % (
                    entry[0], entry[1], entry[2], entry[3],
                    entry[4], entry[6])
                record = SeqIO.SeqRecord(
                    seq='', id=label, name=label, description=label)
                record.seq = Seq(entry[5])
                result.append(record)
    del DEL
    gc.collect()

    return result
def __init__(self, score, name1, start1, match_size1, strand1, size1, seq1, name2, start2, match_size2, strand2, size2, seq2):
    """Store one scored pair of aligned segments.

    Start, match size and size are converted with ``int``; each sequence
    string has trailing newlines removed and is wrapped in a record.
    """
    self.score = score

    # First segment.
    self.name1 = name1
    self.start1 = int(start1)  # origin-zero
    self.match_size1 = int(match_size1)
    self.strand1 = strand1
    self.size1 = int(size1)
    first_bases = seq1.rstrip('\n')
    self.seq1 = SeqIO.SeqRecord(Seq(first_bases))

    # Second segment.
    self.name2 = name2
    self.start2 = int(start2)  # origin-zero, orientation dependent
    self.match_size2 = int(match_size2)
    self.strand2 = strand2
    self.size2 = int(size2)
    second_bases = seq2.rstrip('\n')
    self.seq2 = SeqIO.SeqRecord(Seq(second_bases))
def extract_cds_and_protein_fasta_from_rna(fpath):
    """Write the CDS and protein sequences found in ``fpath`` to two FASTA files.

    The outputs sit next to the input, named after it without its
    extension plus ``_extracted_cds.fa`` / ``_extracted_protein.fa``.
    CDS records are identified by the RNA id and protein records by the
    protein id; both sequences are upper-cased.
    """
    stem, _ = os.path.splitext(fpath)
    cds_path = stem + '_extracted_cds.fa'
    protein_path = stem + '_extracted_protein.fa'

    with open(cds_path, 'w') as cds_out, open(protein_path, 'w') as prot_out:
        entries = iterate_ncbi_rna_cds_and_tranlation(fpath)
        for rec, gene_id, rna_id, prot_id, cds, prot in entries:
            cds_rec = SeqIO.SeqRecord(
                Seq.Seq(cds.upper()),
                id=rna_id,
                name='',
                description='gene_id=%s prot_id=%s' % (gene_id, prot_id),
            )
            prot_rec = SeqIO.SeqRecord(
                Seq.Seq(prot.upper()),
                id=prot_id,
                name='',
                description='gene_id=%s rna_id=%s' % (gene_id, rna_id),
            )
            SeqIO.write(cds_rec, cds_out, 'fasta')
            SeqIO.write(prot_rec, prot_out, 'fasta')
def get_window(genome, position, radius):
    """Return the window of ``radius`` bases around a CG dinucleotide.

    :param genome: mapping from sequence id to a sliceable sequence.
    :param position: ``(seqid, pos)`` pair; ``pos`` is the index of the C.
    :param radius: number of bases to include on each side of the CG.
    :return: record whose id is ``"<seqid>:<start>-<end>"``.
    :raises AssertionError: if the two bases at ``pos`` are not ``CG``.
    """
    seqid, pos = position
    assert genome[seqid][pos:pos + 2].upper() == 'CG'
    # Clamp at 0: a negative start would be interpreted as an offset from
    # the end of the sequence and return the wrong (usually empty) window.
    start = max(pos - radius, 0)
    # An ``end`` past the sequence end is simply truncated by slicing.
    end = pos + radius + 2
    seq = genome[seqid][start:end]
    return SeqIO.SeqRecord(seq=seq, id=f'{seqid}:{start}-{end}')
def main():
    """Convert a tab-separated repertoire table into a FASTA file.

    The input's first line is a header that must contain the columns
    ``"Clonal sequence(s)"`` and ``"Clone count"``.  Each following row
    becomes one record named ``cluster___<row index>___size___<count>``.
    Reading stops at the first empty line.
    """
    log = CreateLogger()
    params = ParseCommandLineParams(log)
    from Bio import Seq
    from Bio import SeqIO

    log.info("Reading reads from %s" % params.repertoire_path)
    records = []
    with open(params.repertoire_path) as input_file:
        # Strip the line terminator first; otherwise the last column name
        # (and the last field of every row) keeps a trailing newline.
        header = input_file.readline().rstrip("\r\n").split("\t")
        sequence_column = header.index("Clonal sequence(s)")
        size_column = header.index("Clone count")
        for cluster_index, raw_line in enumerate(input_file):
            line = raw_line.rstrip("\r\n")
            if not line:
                break
            info = line.split("\t")
            records.append(SeqIO.SeqRecord(
                seq=Seq.Seq(info[sequence_column]),
                id="cluster___%d___size___%d" % (
                    cluster_index, int(info[size_column])),
                description=""))
    log.info("Read %d reads" % len(records))

    log.info("Writing output")
    with smart_open(params.output_path, "w") as output_file:
        SeqIO.write(records, output_file, "fasta")
def AnalyzeRegion(options, RegionSequenceSource):
    """Evaluate translation windows for the region named in ``options``.

    Exits the process with status 1 when no region name is set.  Returns
    ``(0, 0)`` both when the source has no sequence for the region and
    after a completed analysis.
    """
    region_name = options.RegionName
    if not region_name:
        print("Region name undefined.")
        exit(1)

    sequences = LoadSequences(options, region_name)
    source_seq = RegionSequenceSource.fetchGeneSequence(region_name)
    if source_seq is None:
        return 0, 0

    TemplateProtein = SeqIO.SeqRecord(
        source_seq.translate(), id=region_name, description="")

    AllRegionSequences = []
    # Re-evaluate up to 100 times, stopping once every sequence agrees on
    # a single recommended window.
    for _ in range(100):
        evaluation = EvaluateAllSequencesAllTranslationWindows(
            options, TemplateProtein, sequences)
        AllRecommendedWindows, AllRegionSequences, TotalSequences = evaluation
        distinct_windows = set(AllRecommendedWindows)
        if len(distinct_windows) == 1:
            print("Found correct window.")
            break

    if options.WriteFiles:
        BuildOutputAlignments(
            options, region_name, AllRegionSequences, TemplateProtein)

    print("Rate for %s: %.2f%%" % (region_name, 0))
    return 0, 0
def writeFastaFile(sequences, fileName):
    '''
    Write a set of sequences to a fasta file.

    Each element of ``sequences`` is itself an iterable of parts; the
    parts are stringified and concatenated to form one record named
    ``seq_<index>``.  An element with no parts yields an empty sequence.

    Returns the name of the new file.
    '''
    utils.logMessage(
        "PrimerManager::writeFastaFile( )",
        "Writing {0} sequences to fasta file".format(len(sequences)))

    seqRecords = []
    for index, sequence in enumerate(sequences):
        # str.join replaces the reduce-based concatenation: ``reduce`` is
        # not a builtin on Python 3 and fails on an empty sequence.
        seqStr = "".join(str(part) for part in sequence)
        seqRecords.append(SeqIO.SeqRecord(
            Seq.Seq(seqStr, Alphabet.IUPAC.extended_dna),
            id="seq_{0}".format(index)))

    # Close the output handle deterministically.
    with open(fileName, "w") as handle:
        SeqIO.write(seqRecords, handle, "fasta")

    utils.logMessage("PrimerManager::writeFastaFile( )",
                     "writing fasta file complete")
    return fileName
def read_genes(marburg_genome=None):
    """Load per-genome gene coordinates and rebuild the global gene tables.

    Reads ``./Output/found_genes/<genome name>.csv`` for every genome,
    slices each genome sequence with the begin/end values found in
    columns 1 and 2, stores the resulting records in the global
    ``all_genes`` (keyed by gene name), resets the global
    ``edit_distance_matrices`` and finally calls ``global_align``.

    When ``marburg_genome`` is given it is first passed to
    ``align_and_find_genes`` and appended to the genomes processed.
    """
    global all_genes, edit_distance_matrices
    if marburg_genome:
        align_and_find_genes(marburg_genome)  # Align all genes in marburg genome
        genomes = ebolavirus_genomes + [marburg_genome]
    else:
        genomes = ebolavirus_genomes
    for gene in marburg_genes:  # For every gene (7 genes)
        # NOTE(review): ``i`` is reset for every gene and advanced once per
        # genome, so the CSV row read below follows the genome's position
        # in ``genomes`` rather than the gene -- confirm this matches the
        # row layout of the CSV files.
        i = 0
        genes = []
        for genome in genomes:  # For every genome in the list
            indices = pd.read_csv("./Output/found_genes/" + genome.name + ".csv", header=None)  # read .csv file
            begin_idx = int(indices.loc[i, 1])  # begin index taken from column 1 of row i
            end_idx = int(indices.loc[i, 2])  # end index taken from column 2 of row i
            new_record = SeqIO.SeqRecord(genome.seq[begin_idx: end_idx])  # Create SeqRecord for the slice
            new_record.name = genome.name
            genes.append(new_record)  # Append to gene list
            i += 1
        all_genes[gene.name] = genes  # Store the gene list in the all_genes dictionary
    # One square matrix per gene (7), sized by the number of genomes
    # compared: 6 with the extra genome, 5 without.
    if marburg_genome:
        edit_distance_matrices = [[[0 for i in range(6)] for j in range(6)] for k in range(7)]  # matrix for edit distances
    else:
        edit_distance_matrices = [[[0 for i in range(5)] for j in range(5)] for k in range(7)]  # matrix for edit distances
    if marburg_genome:
        global_align(with_marburg=2)
    else:
        global_align(with_marburg=1)
def run_hhblits_cns(self):
    """Generate a multiple sequence alignment with hhblits.

    Writes the upper-cased full protein sequence as the query FASTA,
    runs ``hhblits`` against the hard-coded database, then converts the
    a3m output to a2m and to a versioned FASTA with ``reformat.pl``.
    """
    full_sequence = Seq(self.protein.str_seq_full.upper())
    target = self.target
    version = self._MSA_FULL_VERSION

    query_record = SeqIO.SeqRecord(full_sequence, name=target, id=target)
    query = os.path.join(PATHS.msa, 'query', target + '_full.fasta')
    SeqIO.write(query_record, query, "fasta")

    a3m_path = os.path.join(PATHS.hhblits, 'a3m', target + '_full.a3m')
    a2m_path = os.path.join(PATHS.hhblits, 'a2m', target + '_full.a2m')
    fasta_path = os.path.join(
        PATHS.hhblits, 'fasta', target + '_full_v%s.fasta' % version)

    db_hh = '/cs/zbio/orzuk/projects/ContactMaps/data/MSA_Completion/hh/uniprot20_2016_02/uniprot20_2016_02'

    subprocess.run([
        'hhblits',
        '-i', query,
        '-d', db_hh,
        '-n', '3',
        '-e', '1e-3',
        '-maxfilt', '10000000000',
        '-neffmax', '20',
        '-nodiff',
        '-realign',
        '-realign_max', '10000000000',
        '-oa3m', a3m_path,
    ])

    # Two conversion passes: a3m -> a2m, then a2m -> fasta.
    for source, destination in ((a3m_path, a2m_path), (a2m_path, fasta_path)):
        subprocess.run(['reformat.pl', source, destination])
def generate_decoy_sequences(orig_seqs, decoy_prefix='r', decoy_type='reverse'):
    """
    Generate decoy BioSeq entries

    Decoy sequence can be conditionally altered to be reversed or shuffled
    Decoy IDs are prefixed with desired character

    :param orig_seqs: iterable of entries exposing ``seq``, ``id``,
        ``name`` and ``description``.
    :param decoy_prefix: string prepended to every original id.
    :param decoy_type: ``'reverse'`` or ``'shuffle'``.
    :return: list of decoy records, one per input entry, keeping the
        original name and description.
    :raises ValueError: if ``decoy_type`` is not recognised.
    """
    if decoy_type == 'reverse':
        def decoy_func(seq):
            return seq[::-1]
    elif decoy_type == 'shuffle':
        def decoy_func(seq):
            seq_list = list(seq)
            random.shuffle(seq_list)
            shuffled = ''.join(seq_list)
            if isinstance(seq, str):
                return shuffled
            # Rebuild the input's own sequence type so that both decoy
            # modes hand the same kind of object to the record; slicing
            # in 'reverse' mode already preserves it.
            return type(seq)(shuffled)
    else:
        raise ValueError('Unknown decoy type: {}'.format(decoy_type))

    decoy_fastas = []
    for orig_fasta in orig_seqs:
        decoy_seq = decoy_func(orig_fasta.seq)
        decoy_id = '{}{}'.format(decoy_prefix, orig_fasta.id)
        decoy_fastas.append(SeqIO.SeqRecord(
            seq=decoy_seq,
            id=decoy_id,
            name=orig_fasta.name,
            description=orig_fasta.description))
    return decoy_fastas
def _write_fasta(self):
    """Write the target's sequence to ``self.fasta_fname`` in FASTA format.

    The protein's sequence string is used when no family is set,
    otherwise the object's own one.  Nothing is written when the chosen
    string is ``None``.
    """
    if self._family is None:
        raw_sequence = self.protein.str_seq
    else:
        raw_sequence = self.str_seq

    if raw_sequence is None:
        return

    record = SeqIO.SeqRecord(
        Seq(raw_sequence.upper()), name=self.target, id=self.target)
    SeqIO.write(record, self.fasta_fname, "fasta")
def _run_hhblits(self):
    """Generate a multiple sequence alignment with hhblits.

    Does nothing when the protein has no sequence string.  Otherwise the
    upper-cased sequence is written as the query FASTA in the target's
    hhblits directory, ``hhblits`` is run against the ``scop40``
    database, and the a3m output is converted to a2m and then to a
    versioned FASTA with the bundled ``reformat.pl`` script.
    """
    if self.protein.str_seq is None:
        return

    seq = Seq(self.protein.str_seq.upper())
    target = self.target
    version = self._MSA_VERSION
    sequence = SeqIO.SeqRecord(seq, name=target, id=target)

    target_hhblits_path = get_target_hhblits_path(target)
    check_path(target_hhblits_path)
    query = os.path.join(target_hhblits_path, target + '.fasta')
    SeqIO.write(sequence, query, "fasta")

    output_hhblits = os.path.join(target_hhblits_path, target + '.a3m')
    output_reformat1 = os.path.join(target_hhblits_path, target + '.a2m')
    output_reformat2 = os.path.join(
        target_hhblits_path, target + '_v%s.fasta' % version)
    db_hh = os.path.join(PATHS.hhblits, "scop40")

    # Commands are passed as argument lists (no shell) so that paths
    # containing spaces or shell metacharacters reach the tools intact.
    hhblits_cmd = [
        'hhblits',
        '-i', query,
        '-d', db_hh,
        '-n', '3',
        '-e', '1e-3',
        '-maxfilt', '10000000000',
        '-neffmax', '20',
        '-nodiff',
        '-realign_max', '10000000000',
        '-oa3m', output_hhblits,
    ]
    subprocess.run(hhblits_cmd)

    reformat_script = os.path.join(PATHS.periscope, 'scripts', 'reformat.pl')
    subprocess.run(['perl', reformat_script, output_hhblits, output_reformat1])
    subprocess.run(['perl', reformat_script, output_reformat1, output_reformat2])
def Naive_assembler(file, threshold = 2):
    """Greedily merge reads by best pairwise local alignment.

    The reads are parsed from ``file`` using its extension as the format
    name.  In every round all pairs are aligned with ``local_align``;
    pairs are then merged with ``merge_seqs`` in decreasing score order,
    each read being used at most once per round.  Rounds repeat until one
    record is left or the best score falls below ``threshold``.

    :param file: path of the reads file.
    :param threshold: minimum alignment score for a pair to be merged.
    :return: list of remaining records (empty when the input is empty).
    """
    records = list(SeqIO.parse(file, format=file.split(".")[-1]))
    unable_to_merge = False
    # ``> 1`` rather than ``!= 1``: with no reads there are no alignments
    # to rank, and indexing the best one would raise IndexError.
    while len(records) > 1 and not unable_to_merge:
        print(len(records))
        # Renumber so ids are unique within this round.
        for number, record in enumerate(records, start=1):
            record.id = str(number)

        alignments = []
        for i, record1 in enumerate(records):
            for record2 in records[i + 1:]:
                score, alignment = local_align(
                    str(record1.seq), str(record2.seq))
                alignments.append([score, alignment, record1, record2])
        alignments.sort(key=lambda element: element[0], reverse=True)

        # Nothing will be merged this round, so this is the last one.
        if alignments[0][0] < threshold:
            unable_to_merge = True

        records_merged = set()
        new_reads = []
        for rank, (score, alignment, record1, record2) in enumerate(alignments):
            if score < threshold:
                break
            if record1.id in records_merged or record2.id in records_merged:
                continue
            new_reads.append(
                SeqIO.SeqRecord(id=str(rank), seq=merge_seqs(alignment)))
            records_merged.update((record1.id, record2.id))

        # Carry over the reads that took part in no merge.
        new_reads += [record for record in records
                      if record.id not in records_merged]
        records = new_reads
    return records
def parse_result(self, genome_path):
    """Yield one record per gene listed in the ``.fasta.lst`` result file.

    The result file is expected at ``genome_path + '.fasta.lst'``; when
    it does not exist nothing is yielded.  Gene rows are read between a
    line starting with ``' #'`` and the next blank or ``---`` line; the
    contig they belong to is taken from the preceding
    ``FASTA definition line``.  Columns 2 and 3 of a row give the 1-based
    left and right coordinates (``<`` / ``>`` markers are dropped).

    Once all lines are consumed, the result file, ``gms.log`` and
    ``GeneMark_hmm.mod`` are removed.
    """
    # Expand "~" once and use the expanded paths everywhere; the original
    # expanded only the existence check.
    genome_path = os.path.expanduser(genome_path)
    result_path = genome_path + '.fasta.lst'
    if not os.path.isfile(result_path):
        return

    with open(genome_path) as fasta_handle:
        contigs = {s.id: s.seq for s in SeqIO.parse(fasta_handle, 'fasta')}

    # Read everything first so the handle is not held open across yields.
    with open(result_path, 'r') as f:
        lines = f.readlines()

    reading_genes = False
    for line in lines:
        if line.startswith(' #'):
            reading_genes = True
            continue
        if line.startswith('---') or line.strip() == '':
            reading_genes = False
        if reading_genes:
            gene_sp = re.split(r'[\t ]+', line.strip())
            seq_id = contig_id + '_gene_' + gene_sp[0]
            l_index = int(gene_sp[2].replace('<', '')) - 1
            r_index = int(gene_sp[3].replace('>', ''))
            seq = contig_seq[l_index:r_index]
            yield SeqIO.SeqRecord(seq, id=seq_id, description='', name='')
        if line.startswith('FASTA definition line'):
            contig_id = line.strip().replace('FASTA definition line: ', '')
            contig_seq = contigs[contig_id]

    os.remove(result_path)
    os.remove('gms.log')
    os.remove('GeneMark_hmm.mod')
def add_isolates(dic_list, db_name):
    """Convert gzipped GenBank assemblies to FASTA and add them to a database.

    For every assembly, the records of the gzipped GenBank file at
    ``assembly['dest']`` are rewritten with ids of the form ``gi|<gi>``
    (each record must carry a ``gi`` annotation), written as FASTA into
    the current working directory and passed to ``kraken_add``.

    :param dic_list: list of dicts with at least ``organism`` and ``dest``.
    :param db_name: database name forwarded to ``kraken_add``.
    """
    genbank_suffix = '.gbff.gz'
    for assembly in dic_list:
        print("Adding {} to kraken stagging area.".format(
            assembly['organism']), file=sys.stderr)
        genbank_zip_file = assembly['dest']

        new_seqs = []
        with gzip.open(genbank_zip_file, 'rt') as fi:
            for s in SeqIO.parse(fi, 'genbank'):
                tmp = SeqIO.SeqRecord(s.seq)
                tmp.id = 'gi|{}'.format(s.annotations['gi'])
                tmp.description = s.description
                tmp.name = s.name
                new_seqs.append(tmp)

        # Remove the suffix as a suffix.  ``str.strip('gbff.gz')`` treats
        # its argument as a set of characters and also eats matching
        # characters at the start and end of the real file name.
        base_name = os.path.basename(genbank_zip_file)
        if base_name.endswith(genbank_suffix):
            base_name = base_name[:-len(genbank_suffix)]
        fa_file = os.path.join(os.getcwd(), base_name + ".fa")

        with open(fa_file, 'wt') as tmpf:
            SeqIO.write(new_seqs, tmpf, 'fasta')
        kraken_add(db_name, fa_file)
    print(
        "Added all {} assemblies to kraken stagging area. DB is ready to build"
        .format(len(dic_list)),
        file=sys.stderr)
def run(parameters):
    """Split every sequence of a FASTA file into consecutive chunks.

    :param parameters: dict with ``input`` (FASTA path, optionally
        gzipped), ``output`` (FASTA path to write) and ``size`` (divisor
        applied to each sequence length to obtain the chunk length).
    :return: ``(True, "All correct.")``.

    Chunks are named ``<record id>_<zero-padded chunk index>``.  When the
    length is not a multiple of the chunk length, a shorter final chunk
    holds the remainder.
    """
    message = "All correct."
    APPLOGGER.info("Reading file...")
    filekind = filetype.guess(parameters['input'])
    if filekind and filekind.extension in ['gz', 'GZ', 'gZ', 'Gz']:
        APPLOGGER.info("Running gzip...")
        handle = gzip.open(parameters['input'], "rt")
    else:
        handle = open(parameters['input'], "rt")

    APPLOGGER.info("Creating output file...")
    # Text mode: the FASTA writer emits str, which a binary handle rejects
    # on Python 3.  The ``with`` closes both files even on errors.
    with handle, open(parameters['output'], "w") as out:
        APPLOGGER.info("Parsing FASTA file...")
        for record in SeqIO.parse(handle, "fasta"):
            chunks = len(record.seq)
            # Floor division keeps the chunk length an int (``/`` gives a
            # float on Python 3, unusable for range and slicing); at
            # least 1 so that range() never gets a zero step.
            chunk_size = max(1, chunks // int(parameters['size']))
            print(chunks, chunk_size)
            subseqs = [
                record.seq[i:i + chunk_size]
                for i in range(0, chunks, chunk_size)
            ]
            num_digits = len(str(len(subseqs)))
            for index, subseq in enumerate(subseqs):
                seq = SeqIO.SeqRecord(
                    seq=Seq.Seq(subseq),
                    id="{0}_{1:0{2}d}".format(record.id, index, num_digits),
                    description="")
                SeqIO.write(seq, out, "fasta")
    APPLOGGER.info("Closing files...")
    return True, message
def process_pbp(isolate, protein_fasta, tpd_start, tpd_end, tpd_lab, results_csv, k):
    """Cut a region out of the first protein in ``protein_fasta`` and save it.

    The slice ``[tpd_start:tpd_end]`` of the first record is written as
    FASTA to ``<isolate>_<tpd_lab>.prot`` with id
    ``<isolate>_<tpd_lab>_TPD``.  Afterwards the input files are removed
    with ``rm`` unless the current GFF name matches one hard-coded file,
    and a progress percentage is printed.

    Relies on names defined outside this function (``bassio_nameo``,
    ``protein_csv``, ``sstart``, ``send``, ``gff_lines``).
    """
    parsed_proteins = list(SeqIO.parse(protein_fasta, format="fasta"))
    first_protein = parsed_proteins[0].seq

    region_text = str(first_protein[tpd_start:tpd_end])
    region_seq = Seq(region_text, generic_protein)

    label = isolate + "_" + tpd_lab
    region_record = SeqIO.SeqRecord(region_seq, id=label + "_TPD")
    with open(label + ".prot", "w+") as output_handle:
        SeqIO.write(region_record, output_handle, "fasta")

    # Top hit id with its first two characters dropped.
    top_id = str(results_csv.iloc[0, 1])[2:]

    if bassio_nameo == "22841_3#15.contigs_velvet.fa.gff":
        print(sstart, send)
        print(tpd_start, tpd_end)
    else:
        os.system("rm " + protein_fasta + " " + protein_csv)

    print("Generating CSV: %s%%" % round((k / len(gff_lines) * 100)))
def main() -> None:
    """Generate sample sequences and print them to stdout as FASTA.

    Each output sequence is the concatenation of one item produced by
    ``generate_sample`` and one produced by ``random_sample``; records
    are numbered from 0.  NumPy's global random state is seeded with
    ``--seed`` (default 42) before sampling.
    """
    parser = argparse.ArgumentParser()
    for range_flag in ("--p_i_range", "--p_ji_range"):
        parser.add_argument(range_flag, nargs=2, type=float, required=True)
    for count_flag in ("--n_seq", "--n_pairs", "--n_randoms"):
        parser.add_argument(count_flag, type=int, required=True)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    np.random.seed(args.seed)
    dependent = generate_sample(
        args.p_i_range, args.p_ji_range, args.n_seq, args.n_pairs)
    independent = random_sample(args.n_seq, args.n_randoms)

    records = []
    for index, (left, right) in enumerate(zip(dependent, independent)):
        records.append(SeqIO.SeqRecord(
            seq=Seq.Seq(left + right),
            id=str(index),
            description="",
        ))
    SeqIO.write(records, sys.stdout, "fasta")
def return_seq_record_with_features_from_genbank(r, part_id):
    """Build a sequence record, with features, from a response object.

    ``r.json()`` must provide ``sequence`` and a list of ``features``.
    Every feature is converted with ``add_feature`` from its first
    location entry plus its strand, name, type and id, then appended to
    the record's feature list.

    :param r: response-like object exposing ``json()``.
    :param part_id: used as both id and name of the record.
    :return: the populated record.
    """
    payload = r.json()
    alphabet = IUPACAmbiguousDNA()
    sequence_rec = SeqIO.SeqRecord(
        Seq(payload['sequence'], alphabet), id=part_id, name=part_id)

    for feature in payload['features']:
        # Only the first location of each feature is used.
        first_location = feature['locations'][0]
        sequence_rec.features.append(add_feature(
            sequence_rec=sequence_rec,
            start_postion=first_location['genbankStart'],
            end_position=first_location['end'],
            strand=feature['strand'],
            name=feature['name'],
            feature_type=feature['type'],
            feature_id=feature['id'],
        ))
    return sequence_rec
def find_upseq(loc, genome, n_up=1000, n_down=200):
    """Return a feature's region extended upstream and downstream.

    :param loc: indexable with id at 0, contig key at 1, strand at 2
        (``"+"`` or anything else for the reverse strand), and 1-based
        start and end at 3 and 4.
    :param genome: mapping from contig key to an object exposing ``seq``.
    :param n_up: bases added on the upstream side.
    :param n_down: bases added on the downstream side.
    :return: upper-cased record with id ``loc[0]``; reverse strand
        regions are reverse-complemented.
    """
    contig = genome[loc[1]]
    print(loc)
    if loc[2] == "+":
        left_flank, right_flank = n_up, n_down
    else:
        left_flank, right_flank = n_down, n_up
    # Clamp at 0: near the contig start a negative index would wrap
    # around to the end of the contig and give the wrong region.
    start = max(loc[3] - 1 - left_flank, 0)
    upseq = contig.seq[start:loc[4] + right_flank]
    if loc[2] != "+":
        upseq = upseq.reverse_complement()
    return SeqIO.SeqRecord(upseq.upper(), loc[0],
                           description=str(n_up)+"_bases_upstream")
def output(record):
    """Pass the reverse complement of ``record`` to ``_output``.

    The new record keeps the original id, name and description.  Its
    per-letter annotations are taken from the reverse-complemented
    record so that they stay aligned with the reversed sequence; the
    original attached them in forward order.

    :return: whatever ``_output`` returns.
    """
    flipped = record.reverse_complement()
    record = SeqIO.SeqRecord(
        flipped.seq,
        letter_annotations=flipped.letter_annotations,
        id=record.id,
        name=record.name,
        description=record.description)
    return _output(record)
def setUp(self):
    """Create the default FASTA record used by the tests."""
    default_fields = {
        "seq": "ACTGAAC",
        "id": "testid",
        "name": "testname",
        "description": "sequence_name | organism=testorganism | SO=chromosome",
    }
    self.default_fasta_record = SeqIO.SeqRecord(**default_fields)
def test_make_records_to_dictionary():
    """The records must come back keyed by their ids."""
    first_record = SeqIO.SeqRecord(
        Seq("ATGCTCGTAGCTGATCGA"),
        id="test1",
        name="test1",
        description="test record #1",
    )
    second_record = SeqIO.SeqRecord(
        Seq("GTGCTCGTAGCTGATCGA"),
        id="test2",
        name="test2",
        description="test record #2",
    )
    TEST_RECORD_LIST = [first_record, second_record]
    EXPECTED = {"test1": first_record, "test2": second_record}

    actual = process.make_records_to_dictionary(TEST_RECORD_LIST)

    assert EXPECTED == actual
def _dump_fasta(inferred_reference, description, file_path):
    """Write the inferred reference to ``file_path`` as a one-record FASTA.

    :param inferred_reference: iterable of strings, joined to form the
        sequence.
    :param description: description of the record; its id is empty.
    :param file_path: destination passed to the FASTA writer.
    """
    reference_text = ''.join(inferred_reference)
    reference_record = SeqIO.SeqRecord(
        Seq.Seq(reference_text, Seq.IUPAC.unambiguous_dna),
        '',
        description=description,
    )
    log.debug("Writing fasta reference:\n%s\n%s", file_path, description)
    SeqIO.write([reference_record], file_path, "fasta")
def read_gbk(genome: str) -> SeqIO.SeqRecord:
    """Read a GenBank file and concatenate all of its records into one.

    Starts from a blank record (empty sequence, id and name) and adds
    every parsed record to it in file order.
    """
    # Blank accumulator that each contig is added to in turn.
    blank_fields = dict(seq="", id="", name="", features=None)
    whole_record = SeqIO.SeqRecord(**blank_fields)

    contigs = SeqIO.parse(handle=genome, format='genbank')
    for contig in contigs:
        whole_record += contig
    return whole_record
def get_genome_subrecord(chromosome, loc1, loc2):
    """Create an upper-cased record of ``chromosome`` from loc1 to loc2.

    The most recently used chromosome is kept in the global
    ``recent_record`` and reloaded from ``../data/hg19/`` only when a
    different chromosome is requested.  The returned record's id is
    ``chr<chromosome>:<loc1>:<loc2>``.
    """
    global recent_record
    chromosome_id = 'chr{}'.format(chromosome)

    cache_is_stale = recent_record is None or recent_record.id != chromosome_id
    if cache_is_stale:
        recent_record = SeqIO.read(
            "../data/hg19/chr{}.fa".format(chromosome), "fasta")

    window = recent_record[loc1:loc2]
    bases = str(window.seq).upper()
    record_id = ":".join([chromosome_id, str(loc1), str(loc2)])
    return SeqIO.SeqRecord(Seq.Seq(bases), id=record_id)