def generateFastas(input, index, Contigs, query):
    """Split a FASTA file into a query file and a reference file in one pass.

    Writes ``query_<index>.fa`` with the record whose header equals *query*
    and ``reference_<index>.fa`` with every record whose header appears in
    ``Contigs[index + 1:]`` or in the module-level ``keepers`` collection.

    Args:
        input: Path of the multi-FASTA file to read.
        index: Position of the query inside *Contigs*; also used to name the
            two output files.
        Contigs: Ordered sequence of contig headers.
        query: Header of the record that goes to the query file.
    """
    # A set gives O(1) membership tests; concatenating two lists forced a
    # linear scan for every record in the FASTA file.
    reference_ids = set(Contigs[index + 1:])
    reference_ids.update(keepers)
    # Files are opened in the same order as before: query, reference, input.
    with open('query_{}.fa'.format(index), 'w') as qFasta, \
            open('reference_{}.fa'.format(index), 'w') as rFasta, \
            open(input, 'r') as infile:
        for Id, Sequence in SimpleFastaParser(infile):
            if Id == query:
                qFasta.write('>%s\n%s\n' % (Id, softwrap(Sequence)))
            elif Id in reference_ids:
                rFasta.write('>%s\n%s\n' % (Id, softwrap(Sequence)))
def main(args):
    """Print the proteins of every mRNA model in a GFF3 annotation as FASTA.

    The annotation and genome are loaded with ``lib.gff2dict``; each entry of
    the returned mapping is expected to carry ``'type'``, ``'ids'`` and
    ``'protein'`` keys, as read below. Records are written to stdout with the
    header ``>transcript_id gene_key``.

    Args:
        args: Command-line argument strings handed to ``parse_args``.
    """
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        # Widen the help column so long option names stay on one line.
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='gff2prot.py',
        description='''Script to convert GFF3 and FASTA proteins.''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-g', '--gff3', required=True,
                        help='Genome annotation GFF3 format')
    parser.add_argument('-f', '--fasta', required=True,
                        help='Genome in FASTA format')
    parser.add_argument('--no_stop', action='store_true',
                        help='Dont print stop codon')
    args = parser.parse_args(args)

    # Load the annotation into a dictionary, starting from an empty one.
    Genes = lib.gff2dict(args.gff3, args.fasta, {})

    for gene_key, model in natsorted(list(Genes.items())):
        if model['type'] != 'mRNA':
            continue
        for position, transcript_id in enumerate(model['ids']):
            protein = model['protein'][position]
            if args.no_stop:
                # Drop trailing stop-codon asterisks.
                protein = protein.rstrip('*')
            sys.stdout.write('>%s %s\n%s\n' % (
                transcript_id, gene_key, lib.softwrap(protein)))
def SortRenameHeaders(input, basename, output, minlen=False):
    """Write a FASTA file sorted longest-first with sequential new headers.

    Records are renamed ``<basename>_<counter>`` with the counter starting at
    1 and advancing only for records that are actually written. If a new
    header would exceed 16 characters an error is printed and the process
    exits with status 1.

    Args:
        input: Path of the FASTA file to read.
        basename: Prefix for the new headers.
        output: Path of the FASTA file to write.
        minlen: When truthy, records shorter than ``int(minlen)`` are
            skipped; otherwise every record is written.
    """
    # Only length and sequence are needed; the original header is discarded.
    with open(input, 'r') as infile:
        records = [(len(sequence), sequence)
                   for _header, sequence in SimpleFastaParser(infile)]

    # Longest first; the sort is stable so ties keep their input order.
    records.sort(key=lambda rec: rec[0], reverse=True)

    counter = 1
    with open(output, 'w') as outfile:
        for length, seq in records:
            newName = '{:}_{:}'.format(basename, counter)
            if len(newName) > 16:
                print('Error. {:} fasta header too long. Choose a different --base name. NCBI/GenBank max is 16 characters'.format(newName))
                sys.exit(1)
            if minlen and length < int(minlen):
                continue
            outfile.write('>{:}\n{:}\n'.format(newName, softwrap(seq)))
            counter += 1
def create_partitions(fasta, genes, partition_list, proteins=False,
                      transcripts=False, repeats=False, num=50, tmpdir='.',
                      interval=2000, partitions=True, debug=False):
    """Create EVM partition intervals that do not split genes.

    Reads the ``gene`` rows of *genes*, writes them to a per-process BED-like
    file in *tmpdir*, and then writes per-contig (or per-partition)
    directories holding the sequence and the evidence falling in that range.
    One line per written directory goes to *partition_list*.

    Args:
        fasta: Genome FASTA path; indexed with ``SeqIO.index``.
        genes: Tab-delimited gene prediction file; rows whose third column is
            ``gene`` are used for locus boundaries.
        partition_list: Output path for the tab-delimited partition listing.
        proteins: Optional protein evidence file (falsy to skip).
        transcripts: Optional transcript evidence file (falsy to skip).
        repeats: Optional repeats file (falsy to skip).
        num: A contig is split only when it has more than this many merged
            loci.
        tmpdir: Working directory; created when missing.
        interval: Passed to ``getBreakPoint`` as ``gap``; one third of it is
            subtracted from the previous partition end to get the next start.
        partitions: When false, each gene-bearing contig is written whole.
        debug: Not referenced in this function body.

    Returns:
        dict keyed by contig name or ``<contig>_<start>-<end>`` partition
        name. Values are ``{'n': ...}`` and, for partitions, also
        ``{'chr': contig}``.
        NOTE(review): with ``partitions=False`` ``'n'`` is the sequence
        length, whereas in the partitioned branch it is a locus count --
        confirm that callers expect this.
    """
    # function to create EVM partition intervals that do not split genes
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)
    SeqRecords = SeqIO.index(fasta, 'fasta')
    # the process id keeps the intermediate file names unique per process
    PID = os.getpid()
    bedGenes = os.path.join(tmpdir, 'genes.{}.bed'.format(PID))
    superGenes = os.path.join(tmpdir, 'genes.{}.supergenes.bed'.format(PID))
    # presumably these helpers build interval lookups consumed by RangeFinder
    # below -- their definitions are not visible here, TODO confirm
    interGenes = gene_blocks_to_interlap(genes)
    if proteins:
        interProteins = exonerate_blocks_to_interlap(proteins)
    if transcripts:
        interTranscripts = blocks_to_interlap(transcripts)
    if repeats:
        interRepeats = blocks_to_interlap(repeats)
    # collect [contig, start, end, attributes, score, strand] per gene row
    Results = []
    with open(genes, 'r') as infile:
        for line in infile:
            if line.startswith('#') or line.startswith('\n'):
                continue
            line = line.rstrip()
            cols = line.split('\t')
            if cols[2] == 'gene':
                Results.append([
                    cols[0], int(cols[3]), int(cols[4]), cols[8], cols[5], cols[6]
                ])
    # sort the results by contig and position
    ChrGeneCounts = {}
    totalGeneCount = 0
    sortedResults = natsorted(Results, key=lambda x: (x[0], x[1]))
    # coordinates are written exactly as read from the genes file
    with open(bedGenes, 'w') as outfile:
        for x in sortedResults:
            totalGeneCount += 1
            outfile.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                x[0], x[1], x[2], x[3], x[4], x[5]))
            if not x[0] in ChrGeneCounts:
                ChrGeneCounts[x[0]] = 1
            else:
                ChrGeneCounts[x[0]] += 1
    ChrNoGenes = len(SeqRecords) - len(ChrGeneCounts)
    superGeneCount = 0
    lib.log.debug(
        '{:,} total contigs; skipping {:,} contigs with no genes'.format(
            len(SeqRecords), ChrNoGenes))
    if partitions:
        # now merge overlapping genes [strand] to get conservative locus boundaries
        cmd = ['bedtools', 'merge', '-s', '-i', bedGenes]
        # merged: contig -> list of (start, end, gap to previous merged end);
        # the first locus on a contig gets -1 as its gap
        merged = {}
        with open(superGenes, 'w') as outfile:
            for line in lib.execute(cmd):
                superGeneCount += 1
                line = line.rstrip()
                # only three-column output lines are accepted
                if line.count('\t') != 2:
                    lib.log.debug(
                        'Error parsing bedtools merge line:\n{}'.format(line))
                    continue
                chr, start, end = line.split('\t')
                outfile.write('{}\t{}\t{}\tSuperGene_{}\n'.format(
                    chr, start, end, superGeneCount))
                if chr not in merged:
                    merged[chr] = [(int(start), int(end), -1)]
                else:
                    diff = int(start) - merged[chr][-1][1]
                    merged[chr].append((int(start), int(end), diff))
        lib.log.debug(
            'Merged {} genes into {} supergenes with bedtools'.format(
                totalGeneCount, superGeneCount))
        # parse Results and get coordinates to partitions
        Partitions = {}
        Commands = {}
        for k, v in natsorted(merged.items()):
            if not k in ChrGeneCounts:  # no genes, so can safely skip
                continue
            Partitions[k] = []
            if len(v) > num:
                # split the contig into roughly equal chunks of merged loci
                chunks = math.ceil(len(v) / num)
                num_genes = int(round(len(v) / chunks))
                chunks = int(chunks)
                for i in range(chunks):
                    # the contig was already registered whole (no break point
                    # was found for an earlier chunk), so skip the rest
                    if k in Commands:
                        continue
                    # switch to a 1-based chunk number
                    i = i + 1
                    if i == 1:
                        start = 1
                    else:
                        # start a third of the interval before the previous
                        # partition end, so adjacent partitions overlap
                        phase = int(round(interval / 3))
                        if len(Partitions[k]) > 0:
                            start = Partitions[k][-1][1] - phase
                        else:
                            start = 1
                    loc = i * num_genes
                    if i == chunks:
                        # the last chunk always runs to the end of the contig
                        end = len(SeqRecords[k])
                    else:
                        if loc >= len(v):
                            end = len(SeqRecords[k])
                        else:
                            # try the reverse direction first, then forward
                            end = getBreakPoint(v,
                                                loc,
                                                direction='reverse',
                                                gap=interval)
                            if not end:
                                end = getBreakPoint(v,
                                                    loc,
                                                    direction='forward',
                                                    gap=interval)
                    if not end:
                        # no usable break point: register the whole contig
                        Commands[k] = {'n': len(v)}
                        lib.log.debug('{} --> {} bp'.format(
                            k, len(SeqRecords[k])))
                    else:
                        partLen = end - start
                        # partitions shorter than 10000 bp are not recorded
                        if partLen < 10000:
                            continue
                        Partitions[k].append((start, end))
                        partName = '{}_{}-{}'.format(k, start, end)
                        Commands[partName] = {'n': num_genes, 'chr': k}
                        lib.log.debug('{} --> {} bp'.format(partName, partLen))
            else:
                Commands[k] = {'n': len(v)}
                lib.log.debug('{} --> {} bp'.format(k, len(SeqRecords[k])))
        # now loop through partitions and write files for EVM
        with open(partition_list, 'w') as partout:
            for chr, p in natsorted(Partitions.items()):
                chrDir = os.path.join(tmpdir, chr)
                if not os.path.isdir(chrDir):
                    os.makedirs(chrDir)
                if len(p) == 0:
                    # unpartitioned contig: 'N' flag, whole sequence, and
                    # evidence over the full contig length
                    partout.write('{}\t{}\t{}\n'.format(
                        chr, os.path.abspath(chrDir), 'N'))
                    chrFasta = os.path.join(chrDir, os.path.basename(fasta))
                    with open(chrFasta, 'w') as fastaout:
                        fastaout.write('>{}\n{}\n'.format(
                            chr, lib.softwrap(str(SeqRecords[chr].seq))))
                    genePred = os.path.join(chrDir, os.path.basename(genes))
                    RangeFinder(interGenes, chr, 1, len(SeqRecords[chr]),
                                genePred)
                    if proteins:
                        protPred = os.path.join(chrDir,
                                                os.path.basename(proteins))
                        RangeFinder(interProteins, chr, 1,
                                    len(SeqRecords[chr]), protPred)
                    if transcripts:
                        tranPred = os.path.join(chrDir,
                                                os.path.basename(transcripts))
                        RangeFinder(interTranscripts, chr, 1,
                                    len(SeqRecords[chr]), tranPred)
                    if repeats:
                        repPred = os.path.join(chrDir,
                                               os.path.basename(repeats))
                        RangeFinder(interRepeats, chr, 1,
                                    len(SeqRecords[chr]), repPred)
                else:
                    for coords in p:
                        partDir = os.path.join(
                            chrDir,
                            '{}_{}-{}'.format(chr, coords[0], coords[1]))
                        if not os.path.isdir(partDir):
                            os.makedirs(partDir)
                        # partitioned contig: 'Y' flag plus partition dir
                        partout.write('{}\t{}\t{}\t{}\n'.format(
                            chr, os.path.abspath(chrDir), 'Y',
                            os.path.abspath(partDir)))
                        partFasta = os.path.join(partDir,
                                                 os.path.basename(fasta))
                        # coords are used as 1-based inclusive, hence the -1
                        # on the slice start
                        with open(partFasta, 'w') as fastaout:
                            fastaout.write('>{}\n{}\n'.format(
                                chr,
                                lib.softwrap(
                                    str(SeqRecords[chr].seq[coords[0] -
                                                            1:coords[1]]))))
                        # split genes GFF3
                        # NOTE(review): fixed file name here, while the
                        # whole-contig branches use the basename of `genes`
                        genePred = os.path.join(partDir,
                                                'gene_predictions.gff3')
                        RangeFinder(interGenes, chr, coords[0], coords[1],
                                    genePred)
                        if proteins:
                            protPred = os.path.join(
                                partDir, os.path.basename(proteins))
                            RangeFinder(interProteins, chr, coords[0],
                                        coords[1], protPred)
                        if transcripts:
                            tranPred = os.path.join(
                                partDir, os.path.basename(transcripts))
                            RangeFinder(interTranscripts, chr, coords[0],
                                        coords[1], tranPred)
                        if repeats:
                            repPred = os.path.join(partDir,
                                                   os.path.basename(repeats))
                            RangeFinder(interRepeats, chr, coords[0],
                                        coords[1], repPred)
    else:
        # no partitioning requested: one directory per gene-bearing contig
        Commands = {}
        with open(partition_list, 'w') as partout:
            for chr in SeqRecords:
                if not chr in ChrGeneCounts:  # no genes so skip
                    continue
                Commands[chr] = {'n': len(SeqRecords[chr])}
                chrDir = os.path.join(tmpdir, chr)
                if not os.path.isdir(chrDir):
                    os.makedirs(chrDir)
                partout.write('{}\t{}\t{}\n'.format(chr,
                                                    os.path.abspath(chrDir),
                                                    'N'))
                chrFasta = os.path.join(chrDir, os.path.basename(fasta))
                with open(chrFasta, 'w') as fastaout:
                    fastaout.write('>{}\n{}\n'.format(
                        chr, lib.softwrap(str(SeqRecords[chr].seq))))
                genePred = os.path.join(chrDir, os.path.basename(genes))
                RangeFinder(interGenes, chr, 1, len(SeqRecords[chr]),
                            genePred)
                if proteins:
                    protPred = os.path.join(chrDir,
                                            os.path.basename(proteins))
                    RangeFinder(interProteins, chr, 1, len(SeqRecords[chr]),
                                protPred)
                if transcripts:
                    tranPred = os.path.join(chrDir,
                                            os.path.basename(transcripts))
                    RangeFinder(interTranscripts, chr, 1,
                                len(SeqRecords[chr]), tranPred)
                if repeats:
                    repPred = os.path.join(chrDir, os.path.basename(repeats))
                    RangeFinder(interRepeats, chr, 1, len(SeqRecords[chr]),
                                repPred)
    return Commands
def main(args):
    """Remove short contigs that are duplicated elsewhere in an assembly.

    Parses the command line, publishes the run settings as module globals
    for the helper functions, aligns the candidate contigs in parallel and
    writes every contig that is kept to the output FASTA.

    Args:
        args: Command-line argument strings handed to ``parse_args``.
    """
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        # Widen the help column so long option names stay on one line.
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='contig_cleaner.py',
        usage="%(prog)s [options] -i genome.fa -o cleaned.fa",
        description='''Script that removes short scaffolds that are duplicated elsewhere.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True,
                        help='Multi-fasta genome file')
    parser.add_argument('-o', '--out', required=True,
                        help='Cleaned output (FASTA)')
    parser.add_argument('-p', '--pident', type=int, default=95,
                        help='percent identity of contig')
    parser.add_argument('-c', '--cov', type=int, default=95,
                        help='coverage of contig')
    parser.add_argument('-m', '--minlen', type=int, default=500,
                        help='Minimum length of contig')
    parser.add_argument('--cpus', default=2, type=int,
                        help='Number of CPUs to use')
    parser.add_argument('--exhaustive', action='store_true',
                        help='Compute every contig, else stop at N50')
    parser.add_argument('--debug', action='store_true',
                        help='Debug the output')
    args = parser.parse_args(args)

    # Module-level settings read by the helper functions.
    global GENOME, CPUS, PIDENT, COV, keepers, repeats
    GENOME = args.input
    CPUS = args.cpus
    PIDENT = args.pident
    COV = args.cov
    # Two independent lists. The previous ``([], ) * 2`` bound both names to
    # the same list object, so an append to one would show up in the other.
    keepers, repeats = [], []

    # run some checks of dependencies first
    CheckDependencies(['minimap2'])

    # calculate N50 of assembly
    n50 = calcN50(args.input)

    # Get the scaffolds to check plus the contigs kept outright. With
    # --exhaustive no N50 cutoff is passed, so every contig is checked.
    size_cutoff = False if args.exhaustive else n50
    scaffolds, keepers = Sortbysize(args.input, size_cutoff,
                                    minlen=args.minlen)

    separator = "-----------------------------------------------"
    print(separator)
    PassSize = len(scaffolds) + len(keepers)
    # Count the input once; the file does not change during the run.
    total_contigs = countfasta(args.input)
    print("{:,} input contigs, {:,} larger than {:,} bp, N50 is {:,} bp".format(
        total_contigs, PassSize, args.minlen, n50))
    if args.exhaustive:
        print("Checking duplication of {:,} contigs".format(len(scaffolds)))
    else:
        print("Checking duplication of {:,} contigs shorter than N50".format(
            len(scaffolds)))
    print(separator)

    # now generate pool and parallel process the list
    mp_output = multithread_aligning(scaffolds)
    for contig, is_duplicate in mp_output:
        if not is_duplicate:
            keepers.append(contig)
        else:
            repeats.append(contig)

    print(separator)
    print("{:,} input contigs; {:,} larger than {:} bp; {:,} duplicated; {:,} written to file".format(
        total_contigs, PassSize, args.minlen, len(repeats), len(keepers)))
    if args.debug:
        print("\nDuplicated contigs are:\n{:}\n".format(', '.join(repeats)))
        print("Contigs to keep are:\n{:}\n".format(', '.join(keepers)))

    # Finally write a new reference based on the keepers. A set makes the
    # per-record membership test O(1) instead of a list scan.
    keeper_ids = set(keepers)
    with open(args.out, 'w') as outfile, open(args.input, 'r') as infile:
        for title, sequence in SimpleFastaParser(infile):
            if title in keeper_ids:
                outfile.write('>{}\n{}\n'.format(title, softwrap(sequence)))