def main():
    parser = argparse.ArgumentParser(
        description='Script to run TASR in a multithreaded fashion on individual sequences.')
    parser.add_argument('-t', type=str, required=True,
                        help='Location of the TASR exe (~/tasr_v1.6.2/TASR).')
    parser.add_argument('-d', type=int, required=True,
                        help='Number of threads to use.')
    parser.add_argument('-r', type=str, required=True,
                        help='Path to a file with paths to reads to be used in assembly on each line.')
    parser.add_argument('-s', type=str, required=True,
                        help='Path to a file with the sequences which should be the targets for assembly.')
    parser.add_argument('-o', type=str, required=True,
                        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    seqs = SeqIO.to_dict(SeqIO.parse(args.s, "fasta"))

    manager = mp.Manager()
    pool = mp.Pool(args.d - 1)
    jobs = []

    for id in seqs:
        cur_dir = "{}/{}".format(args.o, id)
        cur_fsa = "{}/seq.fasta".format(cur_dir)
        cur_rds = "{}/reads.txt".format(cur_dir)

        make_directory(cur_dir)
        write_fasta(cur_fsa, id, str(seqs[id].seq))
        copyfile(args.r, cur_rds)

        jobs.append(pool.apply_async(run_tasr, (args.t, cur_fsa, cur_rds)))

    for job in jobs:  # Get all the returns from the apply_async function
        job.get()

    pool.close()  # Tell the queue it's done getting new jobs
    pool.join()  # Make sure these new jobs are all finished
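# NOTE: run_tasr() is dispatched by the pool above but is not shown in this
# excerpt. A minimal sketch, assuming it wraps the same TASR command line used
# by the iterative runner further below (-s targets, -f reads list, -w 1 -u 1
# -c 1); the helper name matches the call above, but the flags are an assumption.
import subprocess


def run_tasr(tasr_exe, seq_fasta, reads_file):
    """Hypothetical worker: assemble a single target sequence with TASR."""
    command = "{} -s {} -f {} -w 1 -u 1 -c 1".format(tasr_exe, seq_fasta, reads_file)
    return subprocess.call(command.split())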
def main():
    parser = argparse.ArgumentParser(
        description='Script to map alleles across GFF3 files. Read the top of the file for more details.')
    parser.add_argument('--ea_input', '-eai', type=str, required=True,
                        help='Path to a TSV list for references and isolates.')
    parser.add_argument('--insert', '-i', type=int, required=False, default=0,
                        help='Insert size from SRA for the reads that will be used as input.')
    parser.add_argument('--gene_or_exon', '-ge', type=str, required=True,
                        help='Either "gene" or "exon" for which level of sequences to pull.')
    parser.add_argument('--out_dir', '-o', type=str, required=False, default='.',
                        help='Directory for where the output should go.')
    args = parser.parse_args()

    make_directory(args.out_dir)

    # dictionary where the key is the ID and the value is a list for ref/loc/coords
    allele_map = {}

    # Iterate over each reference/isolate
    with open(args.ea_input, 'r') as i:
        for entry in i:
            entry = entry.rstrip()
            vals = entry.split('\t')
            type = vals[0]
            gff3 = vals[1]
            name = vals[3]

            # Regardless of reference or isolate, all should be mapping to the same name
            # designated by the reference.
            allele_map = parse_gff3(gff3, allele_map, type, name, args.insert,
                                    args.gene_or_exon, args.out_dir)

    # Iterate over the final hash of lists and print out a TSV
    out = "{0}/ea_map.tsv".format(args.out_dir)  # write alongside the other outputs in out_dir
    with open(out, 'w') as o:
        for key, value in allele_map.items():
            vals = ('\t').join(value)
            line = "{0}\t{1}\n".format(key, vals)
            o.write(line)
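# NOTE: a hypothetical --ea_input row for illustration (tab-separated). Only
# columns 0 (reference/isolate type), 1 (GFF3 path), and 3 (name) are read by
# this script; the meaning of column 2 is an assumption here.
#
#   reference    /path/to/refA.gff3    /path/to/refA.fasta    refA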
def main():
    parser = argparse.ArgumentParser(
        description='Script to perform needle alignment on like sequences across FASTA files.')
    parser.add_argument('-f', type=str, required=True,
                        help='Two paths to FASTA files split by a comma.')
    parser.add_argument('-n', type=str, required=True,
                        help='Path to the EMBOSS needle executable (e.g. /path/to/packages/emboss/bin/needle).')
    parser.add_argument('-o', type=str, required=True,
                        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    first_seqs = SeqIO.to_dict(SeqIO.parse(args.f.split(',')[0], "fasta"))
    second_seqs = SeqIO.to_dict(SeqIO.parse(args.f.split(',')[1], "fasta"))

    first_map = build_sequence_map(first_seqs.keys())
    second_map = build_sequence_map(second_seqs.keys())

    for key in first_map:
        if key in second_map:
            cur_key_dir = "{}/{}".format(args.o, key)
            make_directory(cur_key_dir)

            # one of these lists should be of size one
            for entry1_id in first_map[key]:
                for entry2_id in second_map[key]:
                    entry1 = first_seqs[entry1_id]  # get the Seq object
                    entry2 = second_seqs[entry2_id]

                    entry2.id = entry2.id.replace('|', '.')

                    entry1_file = "{}/{}.fsa".format(cur_key_dir, entry1.id)
                    entry2_file = "{}/{}.fsa".format(cur_key_dir, entry2.id)

                    if not os.path.isfile(entry1_file):
                        write_fasta(entry1_file, entry1.id, str(entry1.seq))
                    if not os.path.isfile(entry2_file):
                        write_fasta(entry2_file, entry2.id, str(entry2.seq))

                    run_needle(
                        args.n,
                        entry1_file,
                        entry2_file,
                        "{}/{}_WITH_{}.align.txt".format(cur_key_dir, entry1.id, entry2.id))

                    os.remove(entry1_file)  # sequences already stored in inputs
                    os.remove(entry2_file)
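# NOTE: run_needle() is not shown in this excerpt. A minimal sketch, assuming it
# shells out to EMBOSS needle with default gap penalties; the flag names are
# standard EMBOSS options, but the penalty values and helper signature are
# assumptions.
import subprocess


def run_needle(needle_exe, a_fasta, b_fasta, out_file):
    """Hypothetical wrapper: global pairwise alignment of two FASTA files."""
    subprocess.call([
        needle_exe,
        '-asequence', a_fasta,
        '-bsequence', b_fasta,
        '-gapopen', '10',
        '-gapextend', '0.5',
        '-outfile', out_file,
    ])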
def main():
    parser = argparse.ArgumentParser(
        description='Script to run TASR in an iterative fashion on individual sequences.')
    parser.add_argument('-t', type=str, required=True,
                        help='Location of the TASR exe (~/tasr_v1.6.2/TASR).')
    parser.add_argument('-r', type=str, required=True,
                        help='Path to a file with paths to reads to be used in assembly on each line.')
    parser.add_argument('-s', type=str, required=True,
                        help='Path to a file with the sequences which should be the targets for assembly.')
    parser.add_argument('-o', type=str, required=True,
                        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    seqs = SeqIO.to_dict(SeqIO.parse(args.s, "fasta"))

    for id in seqs:
        cur_dir = "{}/{}".format(args.o, id)
        cur_fsa = "{}/seq.fasta".format(cur_dir)
        cur_rds = "{}/reads.txt".format(cur_dir)

        make_directory(cur_dir)
        write_fasta(cur_fsa, id, str(seqs[id].seq))
        copyfile(args.r, cur_rds)

        command = "{} -s {} -f {} -w 1 -u 1 -c 1".format(args.t, cur_fsa, cur_rds)
        subprocess.call(command.split())
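# NOTE: make_directory() and write_fasta() are used throughout these scripts but
# are not shown in this excerpt. Minimal sketches, assuming make_directory
# mirrors the errno.EEXIST handling seen in the grid-setup script below and
# write_fasta emits a single-record FASTA; both implementations are assumptions.
import errno
import os


def make_directory(path):
    """Hypothetical helper: create a directory, tolerating one that already exists."""
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise


def write_fasta(outfile, seq_id, seq):
    """Hypothetical helper: write one header/sequence pair to a FASTA file."""
    with open(outfile, 'w') as fsa:
        fsa.write(">{0}\n{1}\n".format(seq_id, seq))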
def main():
    parser = argparse.ArgumentParser(
        description='Script to generate EMBOSS Needle alignments given output from format_for_assembly.py.')
    parser.add_argument('--ea_map', '-eam', type=str, required=True,
                        help='Path to ea_map.tsv output from extract_alleles.py.')
    parser.add_argument('--assmb_map', '-am', type=str, required=True,
                        help='Path to *map.tsv output from format_for_assembly.py or assembly_verdict.py.')
    parser.add_argument('--cpus', '-c', type=int, required=True,
                        help='Number of cores to use.')
    parser.add_argument('--original_fsa', '-of', type=str, required=True,
                        help='Path to where the unbuffered FASTA from extract_sequences.py is.')
    parser.add_argument('--min_align_len', '-minl', type=float, required=False, default=1.0,
                        help='Optional minimum length ratio of an assembled sequence that should be aligned to. For instance, enter .1 to not align constructed sequences less than 10%% of the original sequence length. Default 1.0.')
    parser.add_argument('--max_align_len', '-maxl', type=int, required=False, default=75000,
                        help='Optional maximum length of an assembled sequence that should be aligned to. This is an integer, not a ratio like the min length. Useful to prevent OOM.')
    parser.add_argument('--assmb_path', '-asp', type=str, required=True,
                        help='Path to the directory preceding all the ref directories (e.g. for "/path/to/ref123" put "/path/to" as the input).')
    parser.add_argument('--assmb_type', '-at', type=str, required=True,
                        help='Either "SPAdes" or "HGA". Determines how many assembled sequences are aligned to.')
    parser.add_argument('--priority', '-p', type=str, required=False, default="",
                        help='If given, align only to sequences with this prefix (e.g. XYZ.11203981.1 would require "XYZ" as input). Useful when trying to reconstruct a particular sequence.')
    parser.add_argument('--align_path', '-alp', type=str, required=True,
                        help='Path to output directory for all these alignments.')
    parser.add_argument('--emboss_tool', '-e', type=str, required=True,
                        help='Path to the EMBOSS needle/water executable (e.g. /path/to/packages/emboss/bin/[needle|water]).')
    args = parser.parse_args()

    # First, extract the sequences from the reference file and
    # *STORE IN MEMORY* (be careful how big the reference genome is).
    # We need this to generate small FASTA files for Needle alignment.
    seq_dict = SeqIO.to_dict(SeqIO.parse(args.original_fsa, "fasta"))

    # In order to access these seqs efficiently, rebuild the DS created
    # in extract_alleles.py. This is a dictionary where the key is the
    # shared locus and the value is a list of all the mapped alleles.
    ref_dict = defaultdict(list)

    with open(args.ea_map, 'r') as loc_map:
        for line in loc_map:
            line = line.rstrip()
            ele = line.split('\t')
            locus = ele[0]

            # Need to handle the case where the reference locus is
            # split into multiple like ABC123.1,ABC123.2,etc.
            if '.' in locus:
                split_locus = locus.split('.')
                locus = split_locus[0]

            for j in range(1, len(ele)):
                allele_info = ele[j].split('|')
                allele = allele_info[4]
                ref_dict[locus].append(allele)

    manager = mp.Manager()
    q = manager.Queue()
    pool = mp.Pool(args.cpus)
    pool.apply_async(listener, (q, args.align_path))

    min_len = args.min_align_len
    max_len = args.max_align_len

    # Build a jobs array to make sure these all finish.
    jobs = []

    # Now that we can easily extract the sequences for alignment, iterate over
    # the directory name map file and perform alignments.
    with open(args.assmb_map, 'r') as dir_map:
        for line in dir_map:
            line = line.rstrip()
            ele = line.split('\t')
            locus = ele[0]  # reference/locus that maps to directory number
            loc_dir = ele[1]  # the directory number from assembly for grid submission

            out_dir = "{0}/{1}".format(args.align_path, locus)  # alignment output goes here
            make_directory(out_dir)

            # Split out the contigs if more than one is present and have to do
            # alignment of all refs to all contigs.
            contigs = ""

            if args.assmb_type == "SPAdes":
                contigs = "{0}/{1}/contigs.fasta".format(args.assmb_path, loc_dir)
                jobs.append(pool.apply_async(worker, (locus, contigs, ref_dict[locus],
                                                      seq_dict, out_dir, min_len, max_len, q,
                                                      args.assmb_type, args.priority,
                                                      args.emboss_tool)))
            else:
                contigs = "{0}/{1}/f_Scaffold.fasta".format(args.assmb_path, loc_dir)
                jobs.append(pool.apply_async(worker, (locus, contigs, ref_dict[locus],
                                                      seq_dict, out_dir, min_len, max_len, q,
                                                      args.assmb_type, args.priority,
                                                      args.emboss_tool)))

                contigs = "{0}/{1}/r_Scaffold.fasta".format(args.assmb_path, loc_dir)
                jobs.append(pool.apply_async(worker, (locus, contigs, ref_dict[locus],
                                                      seq_dict, out_dir, min_len, max_len, q,
                                                      args.assmb_type, args.priority,
                                                      args.emboss_tool)))

    # Get all the returns from the apply_async function.
    for job in jobs:
        job.get()

    q.put('stop')  # should be no more messages

    pool.close()  # Tell the queue it's done getting new jobs
    pool.join()  # Make sure these new jobs are all finished
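# NOTE: listener() and worker() are dispatched above but are not shown in this
# excerpt. A minimal sketch of the listener pattern only: a single process
# drains the shared queue and serializes worker messages to one file until it
# sees the 'stop' sentinel used above. The output filename is an assumption.
def listener(q, align_path):
    """Hypothetical queue listener collecting per-alignment results."""
    out_file = "{0}/ids_v_cov.tsv".format(align_path)  # assumed filename
    with open(out_file, 'a') as out:
        while True:
            msg = q.get()
            if msg == 'stop':  # sentinel pushed after all jobs complete
                break
            out.write(msg)
            out.flush()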
def main():
    parser = argparse.ArgumentParser(
        description='Script to set up for SPAdes assembly on a grid.')
    parser.add_argument('--ref_map', '-rm', type=str, required=True,
                        help='Path to *_ref_map.tsv output from analyze_bam.py.')
    parser.add_argument('--reads_dir', '-rd', type=str, required=True,
                        help='Path to the output directory where the FASTQs went, same as what was used for fastq_reads_to_fastq_alleles.py.')
    parser.add_argument('--assmb_path', '-ap', type=str, required=True,
                        help='Path to the directory in which to initialize directories for all the assembly output.')
    parser.add_argument('--outfile', '-o', type=str, required=True,
                        help='Path to the output map (maps the ref to the SGE ID).')
    args = parser.parse_args()

    ref_map = {}  # dict to hold the ref and its arbitrary ID starting at 1
    id = 1

    # First, rename the directories
    with open(args.ref_map, 'r') as infile:
        for line in infile:
            line = line.rstrip()
            ref = line.split('\t')  # really just want the first column which is the ref ID

            # Now rename the original directory so that it can be iterated over in a grid job.
            old_dir = "{0}/{1}".format(args.reads_dir, ref[0])
            new_dir = "{0}/{1}".format(args.reads_dir, id)

            try:
                os.rename(old_dir, new_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise
            else:
                ref_map[ref[0]] = id

                # Make a new output directory for all the SPAdes assembly files
                spades_out_dir = "{0}/{1}".format(args.assmb_path, id)
                make_directory(spades_out_dir)

                id += 1  # if no exception, it was renamed and need a new ID

    # Now generate a map to know which directories correlate to what IDs
    with open(args.outfile, 'w') as outfile:
        for k, v in ref_map.items():
            outfile.write("{0}\t{1}\n".format(k, v))
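# NOTE: a hypothetical before/after for one *_ref_map.tsv row whose first column
# is "locus123", to illustrate the renaming above (the names are made up):
#
#   reads_dir/locus123/  ->  reads_dir/1/   (renamed so a grid array task can use the numeric ID)
#   assmb_path/1/                           (created to hold that task's SPAdes output)
#
# and the --outfile map then records:  locus123<TAB>1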
def main():
    parser = argparse.ArgumentParser(
        description='Script to generate stats given output from analyze_bam.py and filter a set of paired-end FASTQ reads.')
    parser.add_argument('--ab_read_map', '-ab', type=str, required=True,
                        help='Path to *_read_map.tsv output from analyze_bam.py.')
    parser.add_argument('--fastq1', '-1', type=str, required=True,
                        help='Path to the first paired fastq.gz file.')
    parser.add_argument('--fastq2', '-2', type=str, required=True,
                        help='Path to the second paired fastq.gz file.')
    parser.add_argument('--filter', '-f', type=str, required=True,
                        help='Either "yes" or "no" for removing discrepancies + multi-locus mapping reads.')
    parser.add_argument('--paired_suffixes', '-ps', type=str, required=True,
                        help='Either "yes" or "no" for whether the reads are mapped with suffixes like .1 and .2 and should be assessed for concordance. This depends on the aligner. Check the *read_map.tsv file: if the first elements are per read pair there is no suffix; if they are per individual read, each read has a suffix. Answer accordingly.')
    parser.add_argument('--reads_dir', '-rd', type=str, required=True,
                        help='Path to the output directory where the FASTQs should go.')
    args = parser.parse_args()

    filter = args.filter
    output = args.reads_dir

    counts = {'single_map': 0, 'multi_map': 0, 'discrepancy': 0}  # count these stats as they are processed.

    # Establish three dicts:
    # first two dicts consist of one for each mate
    # third dict is the IDs that need to be mapped (checking based on if the user wants to filter)
    r1, r2, ids_to_keep = (defaultdict(list) for j in range(3))  # establish each mate dict as an empty list

    unique_refs = set()  # all loci seen; directories will be made for these below

    if args.paired_suffixes == 'yes':
        # This first iteration only cares about grabbing all mates and their reference alignment info
        with open(args.ab_read_map, 'r') as reads:
            for line in reads:
                line = line.rstrip()
                ele = line.split('\t')

                if ele[0][-1] == "1":  # read mate 1
                    for j in range(1, len(ele)):
                        alignment = ele[j].split('|')  # split the alignment data
                        ref = alignment[2].split('.')  # split the reference name
                        ref_loc = ref[1]  # grab just the base reference locus

                        # don't double up on references (possible if mapping to same locus from different samples)
                        if ref_loc not in r1[ele[0]]:
                            r1[ele[0]].append(ref_loc)
                else:  # read mate 2
                    for j in range(1, len(ele)):
                        alignment = ele[j].split('|')
                        ref = alignment[2].split('.')
                        ref_loc = ref[1]

                        if ref_loc not in r2[ele[0]]:
                            r2[ele[0]].append(ref_loc)

        shared_id = ""  # id in the format of ABC.123 for pairs ABC.123.1 + ABC.123.2
        checked_ids, ref_dirs = (set() for j in range(2))  # sets to speed up processing of R2 if already covered by R1

        # Now, iterate over each dict of mates and filter if required
        for read in r1:  # mate 1
            shared_id = read[:-2]
            mate_id = shared_id + ".2"

            # Generate stats regardless of filtering or not, can help the user decide if they should
            count_val = verify_alignment(r1[read], r2[mate_id])
            counts[count_val] += 1

            if filter == "yes" and count_val == "single_map":  # need to isolate reads that only map once
                # If a single map value, know that both reads share the same locus
                if not r1[read]:  # if R1 didn't map, means R2 did
                    ids_to_keep[shared_id].append(r2[mate_id][0])
                elif not r2[mate_id]:  # same as above, if R2 didn't map, means R1 did
                    ids_to_keep[shared_id].append(r1[read][0])
                else:  # else, they both mapped to the same locus and can use either value
                    ids_to_keep[shared_id].append(r1[read][0])

                unique_refs.add(ids_to_keep[shared_id][0])  # add the single locus

            else:  # no filter needed, add all distinct loci found per read
                for ref in r1[read]:
                    ids_to_keep[shared_id].append(ref)
                    unique_refs.add(ref)  # add all loci
                for ref in r2[mate_id]:
                    if ref not in ids_to_keep[shared_id]:  # make sure not to double up on loci across mates
                        ids_to_keep[shared_id].append(ref)
                        unique_refs.add(ref)  # add all loci

            checked_ids.add(shared_id)  # identify these as looked at before going into r2 dict

        for read in r2:  # mate 2, only check if the mate wasn't caught by the r1 dict
            shared_id = read[:-2]

            if shared_id not in checked_ids:  # if not checked using mate 1, verify now.
                mate_id = shared_id + ".1"

                count_val = verify_alignment(r1[mate_id], r2[read])
                counts[count_val] += 1

                # If we are here, the read was not found in R1. Thus, get loci strictly from R2.
                if filter == "yes" and count_val == "single_map":
                    ids_to_keep[shared_id].append(r2[read][0])
                    unique_refs.add(ids_to_keep[shared_id][0])  # add the single locus
                else:
                    # Again, was not found in R1 so we know all loci are from R2.
                    for ref in r2[read]:
                        ids_to_keep[shared_id].append(ref)
                        unique_refs.add(ref)  # add all loci

        # At this point, ids_to_keep now has a dictionary mapping all read IDs to loci that
        # they aligned to. This is all that's needed to build a set of directories that house
        # reads just mapping to those loci for use in assembly.
        r1, r2, checked_ids = (None for j in range(3))  # done with these, free up some memory

        # give the user some idea of how much they are potentially filtering out
        out_stats = output + ".stats"
        with open(out_stats, 'w') as stats_file:
            for k, v in counts.items():
                stats_file.write("{0} read-pairs have a {1}.\n".format(v, k))

    elif args.paired_suffixes == 'no':
        with open(args.ab_read_map, 'r') as reads:
            for line in reads:
                line = line.rstrip()
                ele = line.split('\t')

                for j in range(1, len(ele)):
                    alignment = ele[j].split('|')  # split the alignment data
                    ref = alignment[2].split('.')  # split the reference name
                    ref_loc = ref[1]  # grab just the base reference locus

                    # don't double up on references (possible if mapping to same locus from different samples)
                    if ref_loc not in ids_to_keep[ele[0]]:  # just one dict here since gsnap doesn't capture read suffix
                        ids_to_keep[ele[0]].append(ref_loc)
                        unique_refs.add(ref_loc)

    # Write out all the directories
    for ref in unique_refs:
        dir = "{0}/{1}".format(output, ref)
        make_directory(dir)

    # Regardless of filtering based on alignment single/multiple/discrepancies or not, still
    # need to filter all the FASTQ reads to just those that aligned to a gene region.
    filter_fastq(ids_to_keep, args.fastq1, args.fastq2, output)
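# NOTE: verify_alignment() is called above but is not shown in this excerpt. A
# minimal sketch consistent with how its three return values are used by the
# caller; the actual classification rules are an assumption.
def verify_alignment(r1_loci, r2_loci):
    """Hypothetical classifier for a read pair's set of aligned loci."""
    loci = set(r1_loci) | set(r2_loci)
    if len(loci) == 1:
        return 'single_map'  # both mates agree on one locus, or only one mate mapped
    if r1_loci and r2_loci and not (set(r1_loci) & set(r2_loci)):
        return 'discrepancy'  # mates map to entirely different loci
    return 'multi_map'  # at least one mate hits multiple loci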
def main():
    parser = argparse.ArgumentParser(
        description="Script to establish the necessary directory structures for the pipeline's output.")
    parser.add_argument('--workspace_location', '-wl', type=str, default='.',
                        help='Path to build directories at.')
    args = parser.parse_args()

    # directories for the grid to output logs and errors
    make_directory("{0}/grid_out".format(args.workspace_location))
    make_directory("{0}/grid_err".format(args.workspace_location))

    make_directory("{0}/gsnap_idx".format(args.workspace_location))  # stores GSNAP index
    make_directory("{0}/smalt_idx".format(args.workspace_location))  # stores SMALT index

    # Aligner 1
    make_directory("{0}/first_reads".format(args.workspace_location))  # individual read sets per locus
    make_directory("{0}/first_spades_assemblies".format(args.workspace_location))  # assembly method 1 results
    make_directory("{0}/first_hga_assemblies".format(args.workspace_location))  # assembly method 2 results
    make_directory("{0}/first_alignments".format(args.workspace_location))  # alignment results

    # Aligner 2
    make_directory("{0}/second_reads".format(args.workspace_location))
    make_directory("{0}/second_spades_assemblies".format(args.workspace_location))
    make_directory("{0}/second_hga_assemblies".format(args.workspace_location))
    make_directory("{0}/second_alignments".format(args.workspace_location))

    # Pull HGA and SB from their own repos; note these are modified from their
    # original implementations for this pipeline.
    hga_url = 'https://raw.githubusercontent.com/jmatsumura/Hierarchical-Genome-Assembly-HGA/master/HGA.py'
    with urllib.request.urlopen(hga_url) as response, \
            open("{0}/HGA.py".format(args.workspace_location), 'wb') as out_file:
        data = response.read()  # a `bytes` object
        out_file.write(data)

    sb_url = 'https://raw.githubusercontent.com/jmatsumura/Scaffold_builder/master/scaffold_builder.py'
    with urllib.request.urlopen(sb_url) as response, \
            open("{0}/scaffold_builder.py".format(args.workspace_location), 'wb') as out_file:
        data = response.read()  # a `bytes` object
        out_file.write(data)

    # "touch" these files
    open("{0}/first_ids_v_cov.tsv".format(args.workspace_location), 'w').close()
    open("{0}/second_ids_v_cov.tsv".format(args.workspace_location), 'w').close()