from Bio import SeqIO


def extract_sequences(file, genes, buffer, prefix):

    # Since PF is fairly small, we can be greedy about how the FASTA entries
    # are processed and store them all in memory.
    contigs = SeqIO.to_dict(SeqIO.parse(file, "fasta"))

    for gene in genes:
        # Gene entries are pipe-delimited: source|start|stop|strand|id
        vals = gene.split('|')
        source = vals[0]
        id = vals[4]
        strand = vals[3]
        start = int(vals[1]) - buffer - 1  # correct for 0-based indexing
        stop = int(vals[2]) + buffer

        # The buffer may push the coordinates outside the contig, so clamp
        # start to the first base and stop to the sequence length.
        if start < 0:
            start = 0
        if stop > len(contigs[source].seq):
            stop = len(contigs[source].seq)

        # Generate both a buffered and unbuffered sequence. These files will
        # be exactly the same if buffer == 0.
        sequence1 = str(contigs[source].seq[start:stop].upper())
        sequence2 = str(contigs[source].seq[(int(vals[1]) - 1):int(vals[2])].upper())

        if strand == "-":  # if reverse strand, reverse complement the bases
            sequence1 = rev_comp(sequence1)
            sequence2 = rev_comp(sequence2)

        # Print out in standard FASTA format.
        write_fasta("{0}_buffered.fsa".format(prefix), id, sequence1)
        write_fasta("{0}_unbuffered.fsa".format(prefix), id, sequence2)
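# rev_comp and write_fasta are repo-local helpers defined elsewhere in the
# pipeline. Minimal sketches of what they are assumed to do (reverse
# complement a DNA string; append one FASTA entry to a file) follow; the
# real implementations may differ in detail.
def rev_comp(sequence):
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    return ''.join(complement.get(base, 'N') for base in reversed(sequence))


def write_fasta(outfile, header, sequence):
    with open(outfile, 'a') as out:  # append, so repeated calls build one file
        out.write(">{0}\n{1}\n".format(header, sequence))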
import argparse
import multiprocessing as mp
from shutil import copyfile

from Bio import SeqIO


def main():

    parser = argparse.ArgumentParser(
        description='Script to run TASR in a multithreaded fashion on individual sequences.')
    parser.add_argument('-t', type=str, required=True,
                        help='Location of the TASR exe (~/tasr_v1.6.2/TASR).')
    parser.add_argument('-d', type=int, required=True,
                        help='Number of threads to use.')
    parser.add_argument('-r', type=str, required=True,
                        help='Path to a file listing, one per line, the paths of the reads to be used in assembly.')
    parser.add_argument('-s', type=str, required=True,
                        help='Path to a FASTA file with the sequences that should be the targets for assembly.')
    parser.add_argument('-o', type=str, required=True,
                        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    seqs = SeqIO.to_dict(SeqIO.parse(args.s, "fasta"))

    pool = mp.Pool(args.d - 1)  # reserve one thread for the parent process
    jobs = []

    # Build a per-sequence working directory holding the target FASTA and the
    # read list, then queue a TASR run for each.
    for id in seqs:
        cur_dir = "{}/{}".format(args.o, id)
        cur_fsa = "{}/seq.fasta".format(cur_dir)
        cur_rds = "{}/reads.txt".format(cur_dir)

        make_directory(cur_dir)
        write_fasta(cur_fsa, id, str(seqs[id].seq))
        copyfile(args.r, cur_rds)

        jobs.append(pool.apply_async(run_tasr, (args.t, cur_fsa, cur_rds)))

    for job in jobs:  # get all the returns from the apply_async calls
        job.get()

    pool.close()  # tell the queue it's done getting new jobs
    pool.join()  # make sure these new jobs are all finished
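# run_tasr is another repo helper not shown here. A plausible sketch, simply
# mirroring the TASR invocation used by the iterative runner below (the
# -w/-u/-c flags are borrowed from that script and are not confirmed to match
# this helper exactly):
import subprocess

def run_tasr(tasr_exe, seq_file, reads_file):
    command = "{} -s {} -f {} -w 1 -u 1 -c 1".format(tasr_exe, seq_file, reads_file)
    subprocess.call(command.split())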
import argparse
import re


def main():

    parser = argparse.ArgumentParser(
        description='Script to assess the results of the base Targeted Assembly pipeline.')
    parser.add_argument('-remove', type=str, required=True,
                        help='Prefix of a strain to remove (e.g. 3D7), or multiple comma-separated strains (e.g. 3D7,7G8).')
    parser.add_argument('-original_fsa', type=str, required=True,
                        help='Path to the initial FASTA file generated by the pipeline.')
    parser.add_argument('-new_fsa', type=str, required=True,
                        help='Path where the filtered FASTA file should go.')
    args = parser.parse_args()

    nonrelevant_alleles = set()  # ignore these alleles
    contigs = {}  # final FASTA dict

    remove_us = []
    if ',' in args.remove:
        remove_us = args.remove.split(',')
    else:
        remove_us.append(args.remove)

    regex_for_contig_id = r">([a-zA-Z0-9_.]+)"

    with open(args.original_fsa, 'r') as fasta_in:
        # Iterate over the FASTA file and extract the entirety of each sequence.
        for line in fasta_in:
            line = line.rstrip()
            if line.startswith('>'):
                current_id = re.search(regex_for_contig_id, line).group(1)
                contigs[current_id] = ""
                # In addition to grabbing the entire header, note whether this
                # entry belongs to a strain that should be removed.
                prefix = line[1:].split('.')[0]
                if prefix in remove_us:
                    nonrelevant_alleles.add(current_id)
            else:
                contigs[current_id] += line  # add all the bases

    for allele in contigs:
        if allele not in nonrelevant_alleles:
            write_fasta(args.new_fsa, allele, contigs[allele])
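# To make the prefix logic concrete, a hypothetical header and the strain it
# yields (the ">STRAIN.locus..." shape is inferred from the split above, not
# confirmed elsewhere):
#   example_header = ">3D7.AMA1.1"
#   example_header[1:].split('.')[0]  # -> "3D7", checked against remove_us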
import argparse
import hashlib
import re

from Bio import SeqIO


def main():

    parser = argparse.ArgumentParser(
        description='Script to refine a FASTA file to not have duplicate sequences.')
    parser.add_argument('-input', type=str, required=True,
                        help='Path to input FASTA.')
    parser.add_argument('-output', type=str, required=True,
                        help='Path to output FASTA.')
    parser.add_argument('-conflicts', type=str, required=True,
                        help='Path to an output file listing the IDs where conflicts were found.')
    args = parser.parse_args()

    unique_dict = {}  # keys are seq hashes and values are locus IDs
    duplicate_entries = []
    duplicates, conflicts = (0 for i in range(2))

    # Just need to iterate through once since the reference was already
    # preferred as the first sequence in extract_sequences.py.
    for record in SeqIO.parse(args.input, "fasta"):
        id = record.id
        locus = re.search(r'\.([A-Za-z0-9_]+)\.?\d?', id).group(1)
        seq = str(record.seq)
        md5_seq = hashlib.md5(seq.encode('utf-8')).hexdigest()

        # Found the first instance of a sequence, write it out.
        if md5_seq not in unique_dict:
            unique_dict[md5_seq] = locus
            write_fasta(args.output, id, seq)
        else:
            # If we find an entry with the same locus, drop this new one.
            if locus == unique_dict[md5_seq]:
                duplicates += 1
            else:  # same sequence but different loci
                conflicts += 1
                duplicate_entries.append(id)

    print("Number of duplicates removed: {0}".format(duplicates))
    print("Number of conflict sequences removed: {0}".format(conflicts))

    with open(args.conflicts, 'w') as outfile:
        for dupe in duplicate_entries:
            outfile.write("{0}\n".format(dupe))
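# To make the locus regex concrete (the "STRAIN.LOCUS..." ID shape is an
# assumption carried over from the pipeline's other scripts):
#   example_id = "3D7.AMA1.1"  # hypothetical record ID
#   re.search(r'\.([A-Za-z0-9_]+)\.?\d?', example_id).group(1)  # -> "AMA1"
# so two records sharing a sequence hash and the locus "AMA1" count as
# duplicates, while a hash match across different loci counts as a conflict.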
import argparse
import os

from Bio import SeqIO


def main():

    parser = argparse.ArgumentParser(
        description='Script to perform needle alignment on like sequences across FASTA files.')
    parser.add_argument('-f', type=str, required=True,
                        help='Two paths to FASTA files, split by a comma.')
    parser.add_argument('-n', type=str, required=True,
                        help='Path to the EMBOSS needle executable (e.g. /path/to/packages/emboss/bin/needle).')
    parser.add_argument('-o', type=str, required=True,
                        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    first_seqs = SeqIO.to_dict(SeqIO.parse(args.f.split(',')[0], "fasta"))
    second_seqs = SeqIO.to_dict(SeqIO.parse(args.f.split(',')[1], "fasta"))

    first_map = build_sequence_map(first_seqs.keys())
    second_map = build_sequence_map(second_seqs.keys())

    for key in first_map:
        if key in second_map:
            cur_key_dir = "{}/{}".format(args.o, key)
            make_directory(cur_key_dir)

            # One of these lists should be of size one.
            for entry1_id in first_map[key]:
                for entry2_id in second_map[key]:
                    entry1 = first_seqs[entry1_id]  # get the Seq object
                    entry2 = second_seqs[entry2_id]
                    entry2.id = entry2.id.replace('|', '.')

                    entry1_file = "{}/{}.fsa".format(cur_key_dir, entry1.id)
                    entry2_file = "{}/{}.fsa".format(cur_key_dir, entry2.id)

                    if not os.path.isfile(entry1_file):
                        write_fasta(entry1_file, entry1.id, str(entry1.seq))
                    if not os.path.isfile(entry2_file):
                        write_fasta(entry2_file, entry2.id, str(entry2.seq))

                    run_needle(
                        args.n,
                        entry1_file,
                        entry2_file,
                        "{}/{}_WITH_{}.align.txt".format(cur_key_dir, entry1.id, entry2.id)
                    )

                    os.remove(entry1_file)  # sequences already stored in the inputs
                    os.remove(entry2_file)
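# build_sequence_map is a repo helper whose definition is not shown here. From
# its use above, it groups FASTA IDs by a shared key so that entries with the
# same key in both files get aligned pairwise. A minimal sketch, assuming the
# key is the locus field of "STRAIN.LOCUS..."-style IDs (an assumption, not
# the confirmed implementation):
def build_sequence_map(ids):
    seq_map = {}
    for seq_id in ids:
        key = seq_id.split('.')[1]  # assumed: locus is the second dot-delimited field
        seq_map.setdefault(key, []).append(seq_id)
    return seq_map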
import argparse
import subprocess
from shutil import copyfile

from Bio import SeqIO


def main():

    parser = argparse.ArgumentParser(
        description='Script to run TASR in an iterative fashion on individual sequences.')
    parser.add_argument('-t', type=str, required=True,
                        help='Location of the TASR exe (~/tasr_v1.6.2/TASR).')
    parser.add_argument('-r', type=str, required=True,
                        help='Path to a file listing, one per line, the paths of the reads to be used in assembly.')
    parser.add_argument('-s', type=str, required=True,
                        help='Path to a FASTA file with the sequences that should be the targets for assembly.')
    parser.add_argument('-o', type=str, required=True,
                        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    seqs = SeqIO.to_dict(SeqIO.parse(args.s, "fasta"))

    # Assemble each target one at a time in its own working directory.
    for id in seqs:
        cur_dir = "{}/{}".format(args.o, id)
        cur_fsa = "{}/seq.fasta".format(cur_dir)
        cur_rds = "{}/reads.txt".format(cur_dir)

        make_directory(cur_dir)
        write_fasta(cur_fsa, id, str(seqs[id].seq))
        copyfile(args.r, cur_rds)

        command = "{} -s {} -f {} -w 1 -u 1 -c 1".format(args.t, cur_fsa, cur_rds)
        subprocess.call(command.split())
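# Example invocation (the script name and all paths are placeholders; the
# flags are the ones defined by the parser above):
#   python iterative_tasr.py -t ~/tasr_v1.6.2/TASR -r read_paths.txt \
#       -s targets.fasta -o /path/to/out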
import os

from Bio import AlignIO


def align(out, allele, contig, aseq, bseq, f_or_r, assmb_type, emboss_tool):

    initial_align = "{0}/{1}.WITH.{2}.align.txt".format(out, allele, contig)

    call_emboss(emboss_tool, aseq, bseq, initial_align)

    a, b = (None for i in range(2))

    alignment = AlignIO.read(initial_align, "emboss")

    for sequence in alignment:
        if a is None:  # grab both sequences, first being the reference seq
            a = sequence.seq
        else:  # now grab the assembled seq
            b = sequence.seq

    # Once the two sequences are extracted, refine the alignment by trimming
    # the extended blank sequence on the outside of the match.
    if a is not None and b is not None:

        refined_align = "{0}/{1}.WITH.{2}.{3}.trimmed_align.txt".format(
            out, allele, contig, f_or_r)

        seqs = trim_extensions(a, b)

        a_trim = "{0}.a.trimmed".format(f_or_r)  # sequence header, file name makes distinction
        b_trim = "{0}.b.trimmed".format(f_or_r)

        # File names will be different since the alignments will be different.
        a_fsa = "{0}/{1}.WITH.{2}.{3}.a.fsa".format(out, allele, contig, f_or_r)
        b_fsa = "{0}/{1}.WITH.{2}.{3}.b.fsa".format(out, allele, contig, f_or_r)

        if 'needle' in emboss_tool:
            write_fasta(a_fsa, a_trim, seqs['a'])
            write_fasta(b_fsa, b_trim, seqs['b'])

            call_emboss(emboss_tool, a_fsa, b_fsa, refined_align)

            # No need to keep the initial align at this point as the trimmed
            # one should be better. If really needed, the original untrimmed
            # sequences can be used to manually re-run the needle alignment.
            os.remove(initial_align)

        elif 'water' in emboss_tool:
            write_fasta(a_fsa, a_trim, str(a).replace('-', ''))
            write_fasta(b_fsa, b_trim, str(b).replace('-', ''))

            os.rename(initial_align, refined_align)

        return seqs
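# trim_extensions is defined elsewhere in the pipeline; from its use above it
# returns a dict of trimmed 'a' (reference) and 'b' (assembled) sequences. A
# minimal sketch of one plausible behavior, dropping the alignment columns
# where the reference carries only end gaps, follows. This is an assumption,
# not the pipeline's confirmed logic:
def trim_extensions(a, b):
    a, b = str(a), str(b)
    start = len(a) - len(a.lstrip('-'))  # leading gap columns in the reference
    end = len(a.rstrip('-'))             # one past the last reference base
    return {'a': a[start:end].replace('-', ''),
            'b': b[start:end].replace('-', '')}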
import re


def extract_sequences(file, assembled, aligned, outfile):

    regex_for_contig_id = r">([a-zA-Z0-9_.]+)"

    # Since PF is fairly small, we can be greedy about how the FASTA entries
    # are processed and store them all in memory.
    contigs = {}
    not_assembled, not_aligned = (
        [] for i in range(2))  # note which IDs should be re-added at the end
    current_id = ""  # store the previous key for the bases to be assigned to

    with open(file, 'r') as fasta:
        # Iterate over the FASTA file and extract the entirety of each sequence.
        for line in fasta:
            line = line.rstrip()
            if line.startswith('>'):
                current_id = re.search(regex_for_contig_id, line).group(1)
                contigs[current_id] = ""

                # In addition to grabbing the entire header, check whether this
                # entry is needed later.
                locus = line.split('.')[1]
                if locus not in aligned:
                    # If it didn't align, it couldn't have assembled.
                    not_aligned.append(current_id)
                elif locus not in assembled:
                    not_assembled.append(current_id)
            else:
                contigs[current_id] += line  # add all the bases

    for allele in not_assembled:
        write_fasta(outfile, allele, contigs[allele])
    for allele in not_aligned:
        write_fasta(outfile, allele, contigs[allele])
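# As elsewhere in the pipeline, headers are assumed to follow the
# ">STRAIN.LOCUS..." convention, so line.split('.')[1] recovers the locus that
# is checked against the assembled/aligned sets; e.g. for a hypothetical
# header ">3D7.AMA1.1", the locus is "AMA1".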