def splitfastas(infile, fastadir, filepathinfo, seqlengthfile):
    """Split a multi-sample fasta into one fasta file per sample.

    Records are grouped by sample name, taken as the part of the fasta id
    before the first '|' (fasta header format should be sample or
    sample|contig). Any description following the id is removed.

    Args:
        infile: open handle to the input fasta; it is passed straight to
            SimpleFastaParser and closed by this function.
        fastadir: output directory for the per-sample fasta files (created
            with mkdir -p).
        filepathinfo: path of the tsv written with one line per sample:
            sample, fastafilepath, blastdbfilepath.
        seqlengthfile: path of the tsv written with one line per record:
            record id, sequence length.

    Raises:
        AssertionError: if no records were read from infile (raised after the
            two tsv files have been created, as before).
    """
    from Bio import SeqIO
    import re, os
    from pythonmods import runsubprocess

    #first create splitfasta output directory
    runsubprocess(['mkdir -p %s' % fastadir], shell=True)

    #parse fasta and store in recorddict (sample -> list of (id, sequence))
    recorddict = {}
    try:
        for recordid, recordseq in SeqIO.FastaIO.SimpleFastaParser(infile):
            #remove description from fasta id if present
            recordid = re.sub(r'(\S+)(?: .*)?', r'\1', recordid).strip()
            #get sample name (fasta header format should be sample or sample|contig)
            sample = recordid.partition('|')[0]
            recorddict.setdefault(sample, []).append((recordid, recordseq))
    finally:
        #close the input handle even if parsing fails
        infile.close()

    #write records to splitfastas directory, split by sample; write seqlengths to seqlengths.tsv
    #context managers guarantee both tsv handles are closed if a write fails
    with open(filepathinfo, 'w') as f2, open(seqlengthfile, 'w') as f3:
        for sample, records in recorddict.items():
            fastafilepath = '%s/%s.fasta' % (fastadir, sample)
            blastdbfilepath = '%s_db' % os.path.splitext(fastafilepath)[0]
            f2.write('%s\t%s\t%s\n' % (sample, fastafilepath, blastdbfilepath))
            with open(fastafilepath, 'w') as output_handle:
                for recordid, recordseq in records:
                    f3.write('%s\t%s\n' % (recordid, len(recordseq)))
                    output_handle.write(">%s\n%s\n" % (recordid, recordseq))

    #explicit raise (same exception type and message as the former assert) so
    #the check is not stripped when python runs with -O
    if not recorddict:
        raise AssertionError('Error: no records detected from fasta file provided')
if attribute not in attributes: sys.exit( 'Harmonized attribute name: %s is invalid (not listed in attributenames.tsv)' % attribute) ###run Edirect commands if args.accessiontype == 'nucleotide': print('retrieving nucleotide accession metadata from NCBI') accessionsdf = pd.read_csv('%s' % str(args.accessions), header=None, sep='\t') accessions = accessionsdf.iloc[:, 0].tolist() runsubprocess(['mkdir -p %s' % outputpath], shell=True) f = open('%s/nucleotidemetadata.tsv' % outputpath, 'w') f.write( 'Accession\tCreateDate\tUpdateDate\tMoleculeType\tLength\tCompleteness\tSourceGenomeType\tSourceTaxon\tSourceTaxID\tAssemblyMethod\tGenomeCoverage\tSequencingTechnology\tAnnotationPipeline\tAnnotationMethod\tBioprojectAccession\tBiosampleAccession\tSRAAccession\tAssemblyAccession\tPubMedID\n' ) f.close() f = open('%s/missingaccessions.txt' % outputpath, 'w') f.close() accessionslen = len(accessions) chunklen = int(args.batchsize) runsubprocess([ 'econtact -email %s -tool nucleotidemetadatadownload' % str(args.emailaddress) ],
import sys, os, re
from pythonmods import runsubprocess

#Scans a directory for fasta files (.fa/.fasta/.fna, optionally gzipped) and
#writes a tsv with one line per sample: sample, filepath, blastdbpath.
#Positional arguments: dirpath filepathinfo blastdbdir blasttype

dirpath = sys.argv[1]  #args.sequences directory path
filepathinfo = sys.argv[2]
blastdbdir = sys.argv[3]  #actually where blastdbs are stored
blasttype = sys.argv[4]  #read from the command line but not used below
runsubprocess(['mkdir -p %s' % blastdbdir], shell=True)

directory = str(dirpath).rstrip('/')
#os.listdir returns entries in arbitrary order; sorting makes the output
#reproducible and means sample.fa is always seen before sample.fa.gz, so the
#duplicate check below always keeps the same file
dircontents = sorted(os.listdir(directory))
samples = set()
fastaextensions = {'.fa', '.fasta', '.fna'}

#the context manager closes the output file even if an error occurs mid-scan
with open(filepathinfo, 'w') as f2:
    for dircontent in dircontents:
        filepath = '%s/%s' % (directory, dircontent)
        if not os.path.isfile(filepath):
            continue
        #check for fasta files; ignore a trailing .gz when reading the extension
        if filepath.endswith('.gz'):
            namedpath = re.sub(r'\.gz$', '', filepath)
        else:
            namedpath = filepath
        sample, extension = os.path.splitext(os.path.basename(namedpath))
        if extension not in fastaextensions:
            continue
        if sample in samples:  #skip duplicates e.g. sample.fa and sample.fa.gz
            continue
        samples.add(sample)
        blastdbpath = '%s/%s_db' % (blastdbdir, sample)
        f2.write('%s\t%s\t%s\n' % (sample, filepath, blastdbpath))
inclusionpresent = 'inclusionabsent' inclusionarg = 'placeholder' elif args.annotationtxt_inclusion != None: inclusionpresent = 'commandline' inclusionarg = str(','.join(args.annotationtxt_inclusion)) else: inclusionpresent = 'filepath' inclusionarg = args.annotationtxt_inclusion_file if os.path.isfile(inclusionarg) == False: print('Error: %s is not a valid filepath' % inclusionarg) sys.exit() #handle filepaths to directory args.inputdir = str(args.inputdir).rstrip('/') runsubprocess(['mkdir -p %s' % outputpath], shell=True) if args.features == None: runsubprocess([ 'Rscript', '%s/genoplotr.R' % sourcedir, str(args.inputdir), str(','.join(args.syntax)), str(args.sequencelengths), str(args.comparisons), str(args.seg_plots), outputpath, str(args.comparisontype), str(args.main), str(args.main_pos), str(';'.join(args.sequencefills)), str(';'.join(args.sequenceoutlines)),
noblasthits=False

#refuse to overwrite an existing output directory
if os.path.exists(outputpath):
    sys.exit('Error: %s output directory already exists, delete directory and try again'%outputpath)

if args.sequences is not None:
    #paths of the intermediate files, all located under the output directory
    blasttype='allvallpairwise'
    blastdbdir='%s/blastdbs'%outputpath
    filepathinfo='%s/filepathinfo.tsv'%outputpath
    subjectsamples='%s/allsubjects.txt'%outputpath
    if fastafileinput in ('file','stdin'):
        #fasta file / stdin input: split by sample, build databases, then blast
        splitfastas(args.sequences,blastdbdir,filepathinfo,'%s/seqlengths.tsv'%outputpath)
        runsubprocess(['bash','%s/makeblastdbs.sh'%sourcedir,filepathinfo,str(args.threads),sourcedir])
        laterruntime=runtime()
        #print(laterruntime-startruntime, 'runtime; finished creating blast databases')
        print('finished creating blast databases')
        runsubprocess(
            [
                'bash','%s/runblast.sh'%sourcedir,outputpath,blastdbdir,filepathinfo,
                str(args.evalue),str(args.wordsize),str(args.task),str(args.cullinglimit),
                str(args.threads),str(args.bidirectionalblast),blasttype,
            ],
            preexec_fn='sigpipefix',
        )
        laterruntime=runtime()
        #print(laterruntime-startruntime, 'runtime; finished running blast')
        print('finished running blast')
    else:
        #any other input type: helper scripts collect file paths and sequence lengths
        runsubprocess(['python','%s/getdirpaths.py'%sourcedir,args.sequences,filepathinfo,blastdbdir,blasttype])
        runsubprocess(['python','%s/getseqlengths.py'%sourcedir,'%s/seqlengths.tsv'%outputpath,filepathinfo])
        runsubprocess(['bash','%s/makeblastdbs_editfastas.sh'%sourcedir,filepathinfo,str(args.threads),sourcedir])
        laterruntime=runtime()
        #print(laterruntime-startruntime, 'runtime; finished creating blast databases')
        print('finished creating blast databases')
        #filepathinfo is deliberately passed twice, exactly as in the original call
        runsubprocess(
            [
                'bash','%s/runblast_dirinput.sh'%sourcedir,outputpath,sourcedir,
                filepathinfo,filepathinfo,
                str(args.evalue),str(args.wordsize),str(args.task),str(args.cullinglimit),
                str(args.threads),str(args.bidirectionalblast),blasttype,
            ],
            preexec_fn='sigpipefix',
        )
#!/usr/bin/env python
import os, datetime
from Bio import SeqIO
from pythonmods import runsubprocess

#Clones plasmidfinder_db from bitbucket into ./databases/plasmidfinder_db and
#combines every .fsa file except enterobacteriaceae.fsa into gram_positive.fsa,
#prefixing each fasta id with the name of the file it came from.

sourcedir = os.path.dirname(os.path.abspath(__file__))
output_folder = './databases/plasmidfinder_db'

cmdArgs = ['mkdir -p %s' % output_folder]
runsubprocess(cmdArgs, shell=True)
cmdArgs = [
    'git clone https://bitbucket.org/genomicepidemiology/plasmidfinder_db.git ./databases/plasmidfinder_db'
]
runsubprocess(cmdArgs, shell=True)
print('Retrieved plasmidfinder_db from bitbucket')

#the listing is taken before gram_positive.fsa is created below
gramposfastas = []
for filename in os.listdir('./databases/plasmidfinder_db'):
    if filename.endswith('.fsa') and filename != 'enterobacteriaceae.fsa':
        gramposfastas.append(filename)

#combine gram-positive replicons into single gram-positive database
f2 = open('./databases/plasmidfinder_db/gram_positive.fsa', 'w')
for filename in gramposfastas:
    #drop the '.fsa' suffix; str.rstrip('.fsa') strips any trailing run of the
    #characters '.', 'f', 's', 'a' (e.g. 'Rep_trans.fsa' -> 'Rep_tran'), so it
    #must not be used to remove a suffix
    filestem = os.path.splitext(filename)[0]
    with open(os.path.join(output_folder, filename)) as f:
        for indx, seq_record in enumerate(SeqIO.parse(f, 'fasta')):
            fastaheader = str(seq_record.id)
            newfastaheader = '%s|%s' % (filestem, fastaheader)
            seq_record.id = newfastaheader
#command-line interface: convert prokka/genbank annotation files to the format used for visualisation
parser = argparse.ArgumentParser(description="ATCG: Alignment Based Tool for Comparative Genomics; get feature annotation files in correct format for visualisation.py",add_help=False)
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help='Show this help message and exit.')
parser.add_argument('-t', '--annotationtype', help='The type of annotation file that requires conversion to correct format (required)',choices=['prokka','genbank'],type=str,required=True)
parser.add_argument('-i', '--inputpath', help='The input directory (containing annotation files) or annotation file to be converted to correct format (required)',required=True)
parser.add_argument('-o', '--outdir', help='The output directory (required)',required=True)
parser.add_argument('-s', '--seqnames', help='A file containing the sequence names associated with the annotation file(s) in the first column (required if annotationtype is prokka)',required=False)
args = parser.parse_args()
outputpath=os.path.relpath(args.outdir, cwdir)

#--seqnames is mandatory for prokka input; sys.exit(message) writes the
#message to stderr and exits with a non-zero status, so calling scripts can
#detect the failure (print + sys.exit() exited with status 0)
if args.seqnames is None and args.annotationtype=='prokka':
    sys.exit('Error: if using prokka annotation file(s) as input, a file containing the associated sequence names (the original names, with no changes introduced by prokka) must be provided')

runsubprocess(['mkdir -p %s'%outputpath],shell=True)

#check if input is file or directory
if os.path.isfile(args.inputpath):
    inputpathtype='file'
elif os.path.isdir(args.inputpath):
    inputpathtype='directory'
else:
    sys.exit('Error: %s is not a file or directory'%args.inputpath)

if args.annotationtype=='prokka':
    if inputpathtype=='directory':
        #directory input: concatenated gff output is piped into fixprokkagff.py
        #NOTE(review): paths are interpolated into a shell string unquoted, so paths containing spaces or shell metacharacters will break this command
        runsubprocess(['bash %s/concatenateprokka.sh %s | python %s/fixprokkagff.py %s %s %s'%(sourcedir,str(args.inputpath),sourcedir,str(args.seqnames),outputpath,inputpathtype)],shell=True)
    else:
        #single file input: the annotation file path is passed as the last argument
        runsubprocess(['python', '%s/fixprokkagff.py'%sourcedir, str(args.seqnames),outputpath,inputpathtype,str(args.inputpath)])
#remaining command-line options
parser.add_argument(
    '-b','--besthits',
    help='Text file containing best hits or reciprocal best hits',
    required=False,
)
parser.add_argument(
    '-o','--out',
    help='Output directory (required)',
    required=True,
)
parser.add_argument(
    '-e','--evalue',
    help='BLAST e-value cutoff (default: 1e-6)',
    default=1e-6,
    type=float,
)
parser.add_argument(
    '-i','--pident',
    help='BLAST percent identity cutoff (default: 40)',
    default=40,
    type=int,
)
parser.add_argument(
    '-c','--qcovhsp',
    help='BLAST hsp query coverage cutoff (default: 80)',
    default=80,
    type=int,
)
parser.add_argument(
    '-t','--threads',
    help='Number of threads to use (default: 1)',
    default=1,
    type=int,
)
parser.add_argument(
    '--breakpoint',
    action='store_true',
    help='Calculate breakpoint distance statistics (default: do not calculate unless --besthits file is provided)',
)
args = parser.parse_args()
outputpath=os.path.relpath(args.out, cwdir)

#at least one of the two input modes must be selected
if args.sequences is None and args.besthits is None:
    parser.error('as input, you must either provide --sequences or --besthits')

if args.sequences is None:
    #user-provided best hits: create the output layout, then go straight to breakpoint distances
    rbhinput='userprovided'
    runsubprocess(['mkdir -p %s/blast'%outputpath],shell=True)
    runsubprocess(['mkdir -p %s/output'%outputpath],shell=True)
    runsubprocess(['Rscript','%s/getbreakpointdistance.R'%sourcedir,outputpath,str(args.threads),rbhinput,str(args.besthits)])
else:
    #sequences provided: run the helper scripts in order, ending with the reciprocal hits step
    runsubprocess(['python','%s/getproteins.py'%sourcedir,outputpath, str(args.sequences)])
    runsubprocess(['bash','%s/makeblastdbs.sh'%sourcedir,outputpath, str(args.threads), sourcedir])
    runsubprocess(['bash','%s/runblast.sh'%sourcedir,outputpath, str(args.evalue),str(args.threads)])
    runsubprocess(['bash','%s/reformatblast.sh'%sourcedir,outputpath,str(args.pident),str(args.qcovhsp)])
    runsubprocess(['Rscript','%s/getreciprocalhits.R'%sourcedir,outputpath])
    #breakpoint distances are only calculated on request in this mode
    if args.breakpoint:
        rbhinput='metamorth'
        runsubprocess(['Rscript','%s/getbreakpointdistance.R'%sourcedir,outputpath,str(args.threads),rbhinput])
elif rmlstdbexists == False: sys.exit( 'Error: the rMLST database must be installed first (see README)') else: sys.exit( 'Error: the PlasmidFinder database must be installed first (see README)' ) #check --sampleoutput flag used correctly if provided #if args.sampleoutput==True and args.contigsamples==None: # sys.exit('Error: --sampleoutput is only possible if the --contigsamples flag is provided, to specify sample groupings') if args.contigsamples != None: args.sampleoutput = True #always produce sample-level output if args.contigsamples is provided cmdArgs = ['mkdir -p %s' % outputpath] runsubprocess(cmdArgs, shell=True) ###retrieve accessions and sequences from NCBI if args.inhousesequences == None and args.restartwithsequences == False: if args.accessions == None: if args.datequery == None: datepresent = "absent" else: datepresent == "present" runsubprocess([ 'bash', '%s/downloadaccessions.sh' % sourcedir, datepresent, str(args.taxonomyquery), str(args.datequery), str(args.dbsource), outputpath ])