def mapReads(in_fastq, ref_fasta, out_dir, experiment):
    '''use mapPacBio.sh from bbmap to identify reference sequences matched by one or more PacBio reads with no substitutions (indels allowed)'''

    # mapPacBio path (first part gets path to folder running script)
    bbmap_pacbio = (os.path.dirname(
        os.path.realpath(__file__))) + '/bbmap_37_28/mapPacBio.sh'

    # get sample name from input file
    # need to strip off .gz and .fastq extensions sequentially
    sample_name = os.path.splitext(
        os.path.splitext(os.path.basename(in_fastq))[0])[0]
    print('Sample name: ' + sample_name)

    # create output genotyping folder if it doesn't exist
    sample_dir = utils.createOutputFolder(out_dir + '/genotyping/' + sample_name)

    # create bbmap command
    cmd = [
        bbmap_pacbio, 'in=' + in_fastq, 'ref=' + ref_fasta,
        'covstats=' + sample_dir + '/' + sample_name + '.covstats.tmp.txt',
        'outm=' + sample_dir + '/' + sample_name + '.mapped.bam',
        'outu=' + sample_dir + '/' + sample_name + '.unmapped.fastq.gz',
        'statsfile=' + sample_dir + '/' + sample_name + '.mapping_stats.txt',
        'subfilter=0', 'nzo=t', 'ambiguous=all', 'maxlen=1500', 'minid=0.9',
        'maxindel=10', 'minratio=0.8', 'twocolumn=t', 'ow=t'
    ]

    # print bbmap command
    status.printStatus(' '.join(cmd))

    # call bbmap
    # suppress stats output (saved to file, no need to clutter stderr)
    # FNULL = open(os.devnull, 'w')
    subprocess.call(cmd)
    # FNULL.close()

    # add descriptors to covstats output
    with open(sample_dir + '/' + sample_name + '.covstats.tmp.txt', 'r') as f:
        with open(sample_dir + '/' + sample_name + '.covstats.txt', 'w') as g:
            for idx, line in enumerate(f):
                # print header in first line, otherwise value of sample_name
                if idx == 0:
                    g.write('sample_name' + '\t' + line.rstrip('\n') + '\t' +
                            'ref_fasta\tanalysis_path\texperiment\n')
                else:
                    g.write(sample_name + '\t' + line.rstrip('\n') + '\t' +
                            ref_fasta + '\t' + out_dir + '\t' + experiment + '\n')

    # remove temporary covstats.tmp.txt file after covstats.txt with sample ID prepared
    if os.path.exists(sample_dir + '/' + sample_name + '.covstats.tmp.txt'):
        os.remove(sample_dir + '/' + sample_name + '.covstats.tmp.txt')

    # copy reference file to output folder
    copyfile(ref_fasta, out_dir + '/genotyping/' + os.path.basename(ref_fasta))

    # return covstats file
    return sample_dir + '/' + sample_name + '.covstats.txt'
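
# Illustrative invocation of mapReads (hypothetical paths, shown only as a sketch of the
# expected inputs and return value; not part of the original pipeline):
# covstats_path = mapReads(
#     in_fastq='/path/to/fastq/sample1.fastq.gz',
#     ref_fasta='/path/to/ipd-mhc.fasta',
#     out_dir='/path/to/out',
#     experiment='19070')
# covstats_path -> '/path/to/out/genotyping/sample1/sample1.covstats.txt'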
def mapReadsFolder(fastq_folder, ref_fasta, out_dir, experiment):
    '''map FASTQ reads to reference for all files in folder and make pivottable from results'''

    # create list to store covstats paths
    covstats = []

    # count number of fastq files that will be processed
    fastq_count = 0
    for filename in os.listdir(fastq_folder):
        if filename.endswith(".fastq.gz"):
            fastq_count += 1

    # run mapReads on FASTQ files in specified folder
    processed = 0
    for filename in os.listdir(fastq_folder):
        if filename.endswith(".fastq.gz"):
            # run mapReads for each file
            # return covstats file path - add to covstats list
            processed += 1
            status.printStatus('Genotyping FASTQ file ' + str(processed) + ' of ' + str(fastq_count))
            covstats.append(
                mapReads(fastq_folder + '/' + filename, ref_fasta, out_dir, experiment))

    status.printStatus('Make pivot table from: ' + ', '.join(covstats))

    # create pivottable
    pivotTable(covstats, out_dir)
def makeFastq(ccs_bam):
    '''use smrtlink bam2fastq to produce gzip compressed FASTQ file from CCS bam'''

    # smrtlink v6 binaries live in /slipstream/oc/pacbio/smrtlink_v6/smrtcmds/bin/
    # path to smrtlink bam2fastq
    smrtlink_bam2fastq_path = '/slipstream/oc/pacbio/smrtlink_v6/smrtcmds/bin/bam2fastq'

    # create fastq output file name
    ccs_basename = os.path.splitext(os.path.basename(ccs_bam))[0]
    fastq_output = os.path.dirname(ccs_bam) + '/' + ccs_basename
    print(fastq_output)

    # call bam2fastq
    cmd = [
        smrtlink_bam2fastq_path,
        ccs_bam,
        '-o',
        fastq_output,
    ]

    status.printStatus('bam2fastq command: ' + ' '.join(cmd))
    status.printStatus('bam2fastq processing of ' + ccs_bam + ' started')

    subprocess.call(cmd)

    status.printStatus('bam2fastq processing of ' + ccs_bam + ' completed')
    status.printStatus('gzip compressed FASTQ file saved to ' + fastq_output + '.fastq.gz')

    # return path to output fastq file
    return fastq_output + '.fastq.gz'
def importLabkey(df):
    '''import tabular genotypes into https://dholk.primate.wisc.edu/list/dho/gs/grid.view?listId=1630'''

    # make list of records from tabular dataframe
    labkey_data = df.to_dict('records')

    # add to labkey
    x = labkeyInteract.LabkeyInsertRows()
    x.serverContext('/dho/gs/')
    x.labkey_schema = 'lists'
    x.labkey_table = 'pacbio_genotypes'
    x.insertRows(x.labkey_schema, x.labkey_table, labkey_data)

    # log
    status.printStatus(
        str(len(labkey_data)) + ' sample genotypes added to dholk')
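
# Illustrative input for importLabkey (hypothetical column names and values; in the pipeline
# the dataframe comes from the genotyping pivot table):
# import pandas
# df = pandas.DataFrame([{'sample_name': 'OC123', 'allele': 'Mamu-A1*001:01', 'experiment': '19070'}])
# importLabkey(df)  # inserts one row into the lists.pacbio_genotypes table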
def makeCcs(subreads, out_dir, minPredictedAccuracy='0.9', minLength='1000', maxLength='1500'):
    '''use smrtlink ccs to produce consensus sequence'''

    # path to smrtlink ccs
    smrtlink_ccs_path = '/slipstream/SMRT4/SMRT/smrtcmds/bin/ccs'

    # check that subreads file exists
    if not os.path.exists(subreads):
        status.printStatus(
            'Error: Specified subread file does not exist. Check your file path and try again.'
        )
        return

    # filename of input file
    subreads_basename = os.path.splitext(os.path.basename(subreads))[0]
    print(subreads_basename)

    # create output directory if it doesn't exist
    utils.createOutputFolder(out_dir)

    # call ccs
    cmd = [
        smrtlink_ccs_path, '--minPredictedAccuracy', minPredictedAccuracy,
        '--minLength', minLength, '--maxLength', maxLength, subreads,
        out_dir + '/' + subreads_basename + '.ccs.bam'
    ]

    status.printStatus('CCS command: ' + ' '.join(cmd))
    status.printStatus('CCS processing of ' + subreads + ' started')

    subprocess.call(cmd)

    status.printStatus('CCS processing of ' + subreads + ' completed')
    status.printStatus('Output CCS file saved to ' + out_dir + '/' + subreads_basename + '.ccs.bam')

    # create fastq file
    fastq_path = makeFastq(out_dir + '/' + subreads_basename + '.ccs.bam')

    return fastq_path
def extractSequenceNames(gzip_fastq):
    '''convert FASTQ to FASTA and then extract sequence names to new file'''

    # path to reformat.sh, update as needed
    bbmap_reformat_sh = '/slipstream/oc/jrcanalysisdata/mhcAnalysis/bbmap/reformat.sh'

    # create temporary whitelist sequence path
    whitelist_sequences = gzip_fastq + '.whitelist.tmp.txt'
    print(whitelist_sequences)

    # create reformat.sh command to convert fastq to fasta
    cmd = [
        bbmap_reformat_sh, 'in=' + gzip_fastq,
        'out=' + whitelist_sequences + '.tmp.fasta'
    ]

    # print bbmap command
    status.printStatus(' '.join(cmd))

    # call reformat.sh
    subprocess.call(cmd)

    # need to remove trailing /ccs from FASTA file
    # use code from https://stackoverflow.com/questions/17140886/how-to-search-and-replace-text-in-a-file-using-python
    with fileinput.FileInput(whitelist_sequences + '.tmp.fasta', inplace=True) as file:
        for line in file:
            print(line.replace('/ccs', ''), end='')

    # extract sequence names to new file
    with open(whitelist_sequences, 'w') as the_file:
        for seq_record in SeqIO.parse(whitelist_sequences + '.tmp.fasta', "fasta"):
            the_file.write(seq_record.id + '\n')

    # return path to whitelist file of sequence names
    return whitelist_sequences
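
# Minimal sketch of the '/ccs' stripping performed above (hypothetical read name, for
# illustration only):
# >>> 'm54178_170519_124037/4391559/ccs'.replace('/ccs', '')
# 'm54178_170519_124037/4391559'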
def getSamples(pacbio_id):
    '''retrieve sample information from genotyping Samples table'''

    # get runId corresponding to pacbio_id
    runId = getRunId(pacbio_id)

    # get samples from specified PacBio run
    pacbio_samples = labkeyInteract.LabkeySelectRows()
    pacbio_samples.serverContext('dho/pacbio')
    pacbio_samples.set_filters('run_id', runId)
    result = pacbio_samples.selectRows(labkey_schema='genotyping', labkey_table='Samples')

    # log count of samples in pacbio_id
    status.printStatus(str(result['rowCount']) + ' samples detected in ' + pacbio_id)
    status.printStatus('Barcode configuration')

    # log information on each sample
    print('OC_ID\tForward Barcode\tReverse Barcode')

    # initialize samples dictionary
    samples = {}

    for i in result['rows']:
        # use oc_id if it exists, otherwise use animal_id to identify sample name
        if i['oc_animal_id'] is None:
            sample_name = i['animal_id']
        else:
            sample_name = i['oc_animal_id']

        # run normalizeBarcodes to create PacBio standard identifiers
        renamed_barcodes = normalizeBarcodes(i)

        # print samples
        print(sample_name + '\t' + renamed_barcodes[0] + '\t' + renamed_barcodes[1])

        # create dictionary with sample name and barcodes
        samples[sample_name] = [renamed_barcodes[0], renamed_barcodes[1]]

    return samples
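
# Illustrative return value (hypothetical sample names and barcode identifiers; the real
# values come from the dholk Samples table via normalizeBarcodes):
# getSamples('PacBio48')
# -> {'OC123': ['bc1001', 'bc1002'], '45678': ['bc1003', 'bc1004']}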
def getRunId(pacbio_id):
    '''inherit pacbio_id (e.g., PacBio48) from parent function and retrieve run identifier'''

    # get PacBio run ID for specified run
    # necessary because Samples table stores run_id as foreign key lookup to runs table

    # debug modification by JRC 09202018
    pacbio_run_id = labkeyInteract.LabkeySelectRows()
    pacbio_run_id.serverContext('dho/pacbio')
    pacbio_run_id.set_filters('pacbio_id', pacbio_id)
    result = pacbio_run_id.selectRows(labkey_schema='lists', labkey_table='runs')

    # debug modification by JRC 09202018
    print('result is')
    print(result)
    time.sleep(5)

    # extract run number from result
    runNumber = result['rows'][0]['run_num']

    # log whether pacbio_id corresponding to run_id is found
    if runNumber != '':
        status.printStatus(pacbio_id + ' found in dholk.primate.wisc.edu')

    return runNumber
def makeFastq(ccs_bam):
    '''use smrtlink bam2fastq to produce gzip compressed FASTQ file from CCS bam'''

    # path to smrtlink bam2fastq
    smrtlink_bam2fastq_path = '/slipstream/SMRT4/SMRT/smrtcmds/bin/bam2fastq'

    # create fastq output file name
    ccs_basename = os.path.splitext(os.path.basename(ccs_bam))[0]
    fastq_output = os.path.dirname(ccs_bam) + '/' + ccs_basename

    # call bam2fastq
    cmd = [
        smrtlink_bam2fastq_path,
        ccs_bam,
        '-o',
        fastq_output,
    ]

    status.printStatus('bam2fastq command: ' + ' '.join(cmd))
    status.printStatus('bam2fastq processing of ' + ccs_bam + ' started')

    subprocess.call(cmd)

    status.printStatus('bam2fastq processing of ' + ccs_bam + ' completed')
    status.printStatus('gzip compressed FASTQ file saved to ' + fastq_output + '.fastq.gz')

    # return path to output fastq file
    return fastq_output + '.fastq.gz'


# if __name__ == '__main__':
#     # if run directly from the command line
#
#     # command line parameters
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument("out_dir", help='Folder that will store all output files')
#     parser.add_argument("--subreads", required=True,
#                         help='Path to file of PacBio subreads. Will be converted to CCS file.')
#     parser.add_argument("--ccsMinAccuracy", required=False,
#                         help='Set minPredictedAccuracy (from 0-1) for retaining CCS reads. Default=0.9. Recommend 0.999 for de novo allele discovery.')
#     parser.add_argument("--ccsMinLength", required=False,
#                         help='Set minLength in bp for retaining CCS reads. Default=1000. Set to minimum expected amplicon size.')
#     parser.add_argument("--ccsMaxLength", required=False,
#                         help='Set maxLength in bp for retaining CCS reads. Default=1500. Set to maximum expected amplicon size.')
#     args = parser.parse_args()
#
#     # make output folder if it doesn't exist
#     utils.createOutputFolder(args.out_dir)
#
#     # configure logging to log.txt in the output folder
#     logging.basicConfig(filename=args.out_dir + '/log.txt', filemode='w', level=logging.DEBUG,
#                         format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S')
#
#     # run with command line parameters
#     d = {}
#     d['subreads'] = args.subreads
#     d['out_dir'] = args.out_dir
#     if args.ccsMinAccuracy is not None: d['minPredictedAccuracy'] = args.ccsMinAccuracy
#     if args.ccsMinLength is not None: d['minLength'] = args.ccsMinLength
#     if args.ccsMaxLength is not None: d['maxLength'] = args.ccsMaxLength
#
#     # log command line
#     status.printStatus('Command line statement: ' + ' '.join(sys.argv))
#
#     # run makeCcs function
#     makeCcs(**d)

# test invocation
# d = {}
# d['subreads'] = '/slipstream/pacbio/pacbio_raw/pacbio48/3_C01/m54178_170519_124037.subreads.bam'
# d['out_dir'] = '/slipstream/shared_data/19070/pacbio48-default-minQuality/11'
# d['minPredictedAccuracy'] = '0.9'
# d['minLength'] = '1000'
# d['maxLength'] = '1500'
#
# # log CCS parameters
# status.printStatus('CCS parameters: ' + str(d))
#
# # run makeCcs function
# makeCcs(**d)

# test bam2fastq
# makeFastq('/slipstream/shared_data/19070/pacbio48//20170604082124/ccs//m54178_170519_124037.subreads.ccs.bam')
env["TEXTTEST_TMP"] = os.path.join(options.rootDir, env["FILEPREFIX"]+"texttesttmp") env["TEXTTEST_HOME"] = os.path.join(options.rootDir, options.testsDir) if "SUMO_HOME" not in env: env["SUMO_HOME"] = os.path.join(os.path.dirname(__file__), '..', '..') shutil.rmtree(env["TEXTTEST_TMP"], True) if not os.path.exists(env["SUMO_REPORT"]): os.makedirs(env["SUMO_REPORT"]) for name in ["dfrouter", "duarouter", "jtrrouter", "netconvert", "netgenerate", "od2trips", "sumo", "polyconvert", "sumo-gui", "activitygen"]: binary = os.path.join(options.rootDir, options.binDir, name + programSuffix + ".exe") if name == "sumo-gui": if os.path.exists(binary): env["GUISIM_BINARY"] = binary elif os.path.exists(binary): env[name.upper()+"_BINARY"] = binary log = open(testLog, 'w') # provide more information than just the date: nameopt = " -name %sr%s" % (date.today().strftime("%d%b%y"), svnrev) if options.sumoExe == "meso": runInternalTests.runInternal(programSuffix, "-b "+env["FILEPREFIX"]+nameopt, log) else: subprocess.call("texttest.py -b "+env["FILEPREFIX"]+nameopt, stdout=log, stderr=subprocess.STDOUT, shell=True) subprocess.call("texttest.py -a sumo.gui -b "+env["FILEPREFIX"]+nameopt, stdout=log, stderr=subprocess.STDOUT, shell=True) subprocess.call("texttest.py -b "+env["FILEPREFIX"]+" -coll", stdout=log, stderr=subprocess.STDOUT, shell=True) ago = datetime.datetime.now() - datetime.timedelta(50) subprocess.call('texttest.py -s "batch.ArchiveRepository session='+env["FILEPREFIX"]+' before=%s"' % ago.strftime("%d%b%Y"), stdout=log, stderr=subprocess.STDOUT, shell=True) log.close() log = open(statusLog, 'w') status.printStatus(makeLog, makeAllLog, env["TEXTTEST_TMP"], env["SMTP_SERVER"], log) log.close()
def main(options, platform="x64"):
    env["FILEPREFIX"] = options.msvc_version + options.suffix + platform
    prefix = os.path.join(options.remoteDir, env["FILEPREFIX"])
    makeLog = prefix + "Release.log"
    makeAllLog = prefix + "Debug.log"
    testLog = prefix + "Test.log"
    testDebugLog = prefix + "DebugTest.log"
    statusLog = prefix + "status.log"
    log_handler = status.set_rotating_log(makeLog)

    status.killall(("", "D"), BINARIES)
    toClean = []
    for ext in ("*.exe", "*.ilk", "*.pdb", "*.py", "*.pyd", "*.dll",
                "*.lib", "*.exp", "*.jar", "*.manifest", "*.fmu"):
        toClean += glob.glob(os.path.join(SUMO_HOME, "bin", ext))
    toClean += glob.glob(os.path.join(SUMO_HOME, "tools", "lib*", "*lib*"))
    toClean += glob.glob(os.path.join(SUMO_HOME, "share", "*", "*"))
    for f in toClean:
        try:
            os.remove(f)
        except Exception:
            pass
    for d in (glob.glob(os.path.join(SUMO_HOME, "bin", "osgPlugins*")) +
              glob.glob(os.path.join(SUMO_HOME, "tools", "*.egg-info"))):
        shutil.rmtree(d, ignore_errors=True)
    for d in glob.glob(os.path.join(SUMO_HOME, "docs", "*")):
        if os.path.basename(d) in ('examples', 'javadoc', 'man', 'pydoc', 'tutorial', 'userdoc'):
            shutil.rmtree(d, ignore_errors=True)

    status.printLog("Running %s build using python %s." % (options.msvc_version, sys.version))
    gitrev = repositoryUpdate(options)
    generator = "Visual Studio " + ("12 2013" if options.msvc_version == "msvc12" else "16 2019")
    buildDir = generateCMake(generator, platform, options.suffix == "extra", options.python)
    ret = status.log_subprocess(["cmake", "--build", ".", "--config", "Release"], cwd=buildDir)
    status.log_subprocess(["cmake", "--build", ".", "--config", "Release", "--target", "lisum"], cwd=buildDir)
    status.log_subprocess(["cmake", "--build", ".", "--config", "Release", "--target", "userdoc", "examples"], cwd=buildDir)
    status.log_subprocess(["cmake", "--install", "."], cwd=buildDir)
    plat = platform.lower().replace("x", "win")
    if options.msvc_version != "msvc16":
        plat += options.msvc_version
    for d in glob.glob(os.path.join(buildDir, "sumo-*")):
        if os.path.isdir(d):
            installDir = d
    installBase = os.path.basename(installDir)
    binaryZip = os.path.join(buildDir, "sumo-%s%s-%s" % (plat, options.suffix, installBase[5:]))
    if ret == 0:
        try:
            for f in (glob.glob(os.path.join(SUMO_HOME, "*.md")) +
                      [os.path.join(SUMO_HOME, n) for n in ("AUTHORS", "ChangeLog", "LICENSE")]):
                shutil.copy(f, installDir)
            if options.suffix == "extra":
                shutil.copy(os.path.join(SUMO_HOME, "build", "wix", "gpl-2.0.txt"),
                            os.path.join(installDir, "LICENSE"))
            for f in glob.glob(os.path.join(SUMO_HOME, "bin", "*.jar")):
                shutil.copy(f, os.path.join(installDir, "bin"))
            shutil.copytree(os.path.join(SUMO_HOME, "docs"), os.path.join(installDir, "docs"),
                            ignore=shutil.ignore_patterns('web'))
            shutil.copy(os.path.join(buildDir, "src", "version.h"), os.path.join(installDir, "include"))
            status.printLog("Creating sumo.zip.")
            shutil.make_archive(binaryZip, 'zip', buildDir, installBase)
            shutil.copy(binaryZip + ".zip", options.remoteDir)
            status.printLog("Creating sumo.msi.")
            if options.suffix == "extra":
                wix.buildMSI(binaryZip + ".zip", binaryZip + ".msi",
                             license=os.path.join(SUMO_HOME, "build", "wix", "gpl-2.0.rtf"))
            else:
                wix.buildMSI(binaryZip + ".zip", binaryZip + ".msi")
            shutil.copy(binaryZip + ".msi", options.remoteDir)
        except Exception as ziperr:
            status.printLog("Warning: Could not zip to %s.zip (%s)!" % (binaryZip, ziperr))

    gameZip = os.path.join(buildDir, "sumo-game-%s%s-%s.zip" % (plat, options.suffix, installBase[5:]))
    status.printLog("Creating sumo-game.zip.")
    try:
        status.log_subprocess(["cmake", "--build", ".", "--target", "game"], cwd=buildDir)
        shutil.move(os.path.join(buildDir, "sumo-game.zip"), gameZip)
        shutil.copy(gameZip, options.remoteDir)
    except Exception as e:
        status.printLog("Warning: Could not create nightly sumo-game.zip! (%s)" % e)

    debug_handler = status.set_rotating_log(makeAllLog, log_handler)
    ret = status.log_subprocess(["cmake", "--build", ".", "--config", "Debug"], cwd=buildDir)
    if ret == 0:
        debugZip = os.path.join(buildDir, "sumo-%s%sDebug-%s.zip" % (plat, options.suffix, installBase[5:]))
        status.printLog("Creating sumoDebug.zip.")
        try:
            with zipfile.ZipFile(debugZip, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for ext in ("*D.exe", "*.dll", "*D.pdb"):
                    for f in glob.glob(os.path.join(SUMO_HOME, "bin", ext)):
                        zipf.write(f, os.path.join(installBase, "bin", os.path.basename(f)))
            shutil.copy(debugZip, options.remoteDir)
        except IOError as ziperr:
            status.printLog("Warning: Could not zip to %s (%s)!" % (debugZip, ziperr))

    log_handler = status.set_rotating_log(testLog, debug_handler)
    status.printLog("Running tests.")
    runTests(options, env, gitrev)
    with open(statusLog, 'w') as log:
        status.printStatus(makeLog, makeAllLog, env["SMTP_SERVER"], log, testLog=testLog)
    if not options.x64only:
        debug_handler = status.set_rotating_log(testDebugLog, log_handler)
        status.printLog("Running debug tests.")
        runTests(options, env, gitrev, "D")
        with open(prefix + "Dstatus.log", 'w') as log:
            status.printStatus(makeAllLog, testDebugLog, env["SMTP_SERVER"], log, testLog=testDebugLog)
#!/usr/bin/env python
import json, yaml, argparse, httpd, status

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Check status script')
    parser.add_argument('--json', const='json', default='print', dest='output_format',
                        action='store_const', help='return the status as json')
    parser.add_argument('--www', const=True, default=False, dest='start_httpd',
                        action='store_const', help='start a local webserver serving the status')
    args = parser.parse_args()

    if args.start_httpd:
        print('Starting webserver...')
        httpd.LocalStatusHttpd()
    else:
        if args.output_format == 'json':
            print(status.getServicesStatus())
        else:
            data = json.loads(status.getServicesStatus())
            for svc in data:
                print("{} ({}): {}".format(svc['service'], svc['port'],
                                           status.printStatus(svc['status'])))
def runLongAmpliconAnalysis(subreadsetXML, whitelistSequences, outputPrefix,
                            minLength='1000', maxLength='1500',
                            maxReads='20000', maxClusteringReads='5000'):
    '''run SMRT Link long amplicon analysis (LAA)'''

    # runs LAA to generate amplicon sequences from PacBio Sequel data
    # subreadsetXML can be from a single dataset, or merged datasets where new XML files are created using dataset create
    # whitelistSequences is a file containing sequences that will be analyzed by LAA, typically sequences from a single sample
    # defaults are set for typical MHC class I genotyping and should be adjusted depending on target
    # note: LAA default minLength=3000 will cause most of our analyses to fail, so minLength should almost always be set
    # increasing maxClusteringReads will allow more alleles to be detected at the expense of speed:
    #   the LAA default of 500 clustering reads runs each sample in ~2 minutes; the MHC class I default of 10000 takes ~30 minutes
    #   but detects more alleles. Setting even higher values like 100,000 clustering reads causes runtimes of several hours.
    # maxReads can be set very high to ensure that all reads are used to accurately define clusters. This doesn't significantly
    #   impact runtime.
    # use outputPrefix to specify the folder and prefix for output files
    #   e.g. '/slipstream/shared_data/19364/09/'
    #   e.g. '/slipstream/shared_data/19364/09/BM115.'

    # path to SMRT Link v6.0 LAA
    laa_path = '/slipstream/oc/pacbio/smrtlink_v6/smrtcmds/bin/laa'

    # create output folder if it doesn't exist
    utils.createOutputFolder(os.path.dirname(outputPrefix))

    # create laa command
    laa_cmd = [
        laa_path,
        '--whitelist=' + whitelistSequences,
        '--logFile=' + outputPrefix + '.log.txt',
        '--resultFile=' + outputPrefix + '.amplicon_analysis.fastq',
        '--junkFile=' + outputPrefix + '.amplicon_analysis_chimeras_noise.fastq',
        '--reportFile=' + outputPrefix + '.amplicon_analysis_summary.csv',
        '--inputReportFile=' + outputPrefix + '.amplicon_analysis_input.csv',
        '--subreadsReportPrefix=' + outputPrefix + '.amplicon_analysis_subreads',
        subreadsetXML
    ]
    print(laa_cmd)

    # alternative command with explicit length and read limits:
    # laa_cmd = [laa_path,
    #            '--minLength=' + minLength,
    #            '--maxLength=' + maxLength,
    #            '--maxReads=' + maxReads,
    #            '--maxClusteringReads=' + maxClusteringReads,
    #            '--whitelist=' + whitelistSequences,
    #            '--logFile=' + outputPrefix + '.log.txt',
    #            '--resultFile=' + outputPrefix + '.amplicon_analysis.fastq',
    #            '--junkFile=' + outputPrefix + '.amplicon_analysis_chimeras_noise.fastq',
    #            '--reportFile=' + outputPrefix + '.amplicon_analysis_summary.csv',
    #            '--inputReportFile=' + outputPrefix + '.amplicon_analysis_input.csv',
    #            '--subreadsReportPrefix=' + outputPrefix + '.amplicon_analysis_subreads',
    #            subreadsetXML]

    # print laa command
    status.printStatus(' '.join(laa_cmd))

    # call laa
    subprocess.call(laa_cmd)

    # return path to LAA fastq output
    return outputPrefix + '.amplicon_analysis.fastq'
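
# Illustrative invocation (hypothetical paths; a minimal sketch, not part of the original pipeline):
# laa_fastq = runLongAmpliconAnalysis(
#     subreadsetXML='/path/to/merged.subreadset.xml',
#     whitelistSequences='/path/to/sample1.whitelist.tmp.txt',
#     outputPrefix='/path/to/laa/sample1')
# laa_fastq -> '/path/to/laa/sample1.amplicon_analysis.fastq'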
def parseBarcodes(samples, input_ccs_fastq, out_dir):
    '''parse barcodes from gzip-compressed FASTQ of PacBio CCS reads'''

    # create output directory if it doesn't exist
    utils.createOutputFolder(out_dir)

    # create PacBio barcode dictionary to lookup against
    pacbioLookup = pacbioBarcodeDict()

    # create dictionary of sample IDs and barcode sequences
    searchDict = {}
    for seq_name, barcode_seqs in samples.items():
        searchDict[seq_name] = [pacbioLookup[barcode_seqs[0]], pacbioLookup[barcode_seqs[1]]]

    # open gzip-compressed FASTQ
    with gzip.open(input_ccs_fastq, "rt") as handle:
        # make dictionary to hold barcode-split seq records
        perBarcodeDict = {}

        # initialize dictionary with names of each sample
        for j in searchDict:
            perBarcodeDict[j] = []

        # log every 1000 sequences processed
        log_every_n = 1000

        # iterate through generator containing FASTQ sequences
        for idx, i in enumerate(SeqIO.parse(handle, "fastq")):
            # print status message every 1000 sequences processed
            if (idx % log_every_n) == 0:
                status.printStatus(str(idx) + ' FASTQ reads demultiplexed')

            # for each sequence, look for the presence of barcodes at the start and end
            for j in searchDict:
                # redo to use re.search to find barcodes not at very end of sequence
                # if i.seq.startswith(searchDict[j][0]) and i.seq.endswith(searchDict[j][1]):

                # regular expression to find barcodes in forward orientation
                prog = re.compile(searchDict[j][0] + '.*' + searchDict[j][1])

                # test if regular expression is found in sequence
                # need to cast i.seq to string to use re.search
                if prog.search(str(i.seq)):
                    # write matching barcodes to perBarcodeDict - store in memory
                    x = perBarcodeDict[j]
                    x.append(i)
                    perBarcodeDict[j] = x

                # handle inserts in the opposite orientation
                # create Biopython sequence objects containing barcode sequences
                forward_seq = Seq(searchDict[j][0])
                reverse_seq = Seq(searchDict[j][1])

                # reverse complement
                forward_seq_rc = forward_seq.reverse_complement()
                reverse_seq_rc = reverse_seq.reverse_complement()

                # find FASTQ sequences matching reverse complemented barcodes
                # if i.seq.startswith(forward_seq_rc) and i.seq.endswith(reverse_seq_rc):

                # because of the SMRTBell orientation, second barcode gets listed first in reverse complement orientation
                prog = re.compile(str(reverse_seq_rc) + '.*' + str(forward_seq_rc))

                # need to cast i.seq to string to use re.search
                if prog.search(str(i.seq)):
                    # store matches in dictionary
                    x = perBarcodeDict[j]
                    x.append(i)
                    perBarcodeDict[j] = x

    # write output files containing reads matching each barcode
    for i in perBarcodeDict:
        count = SeqIO.write(perBarcodeDict[i], out_dir + '/' + i + '.fastq', 'fastq')

        # compress fastq file and remove uncompressed version
        with open(out_dir + '/' + i + '.fastq', 'rb') as f_in:
            with gzip.open(out_dir + '/' + i + '.fastq.gz', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove(out_dir + '/' + i + '.fastq')  # remove uncompressed

        # log
        status.printStatus(str(count) + ' barcoded reads saved from sample ' + i)
        status.printStatus('gzip-compressed demultiplexed FASTQ file saved to ' + out_dir + '/' + i + '.fastq.gz')
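
# A minimal sketch of the orientation-aware barcode matching used above (hypothetical barcode
# and read sequences, for illustration only; real barcodes come from pacbioBarcodeDict()):
# import re
# from Bio.Seq import Seq
# fwd, rev = 'ACACGT', 'TGTGCA'                   # hypothetical barcode pair
# read = 'TGCACAAAGGTTCCACGTGT'                   # insert read in reverse-complement orientation
# print(bool(re.search(fwd + '.*' + rev, read)))  # forward orientation: False
# print(bool(re.search(str(Seq(rev).reverse_complement()) + '.*' +
#                      str(Seq(fwd).reverse_complement()), read)))  # reverse complement: True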
print("Warning: Could not create nightly sumo-game.zip! (%s)" % e, file=log) log.close() with open(makeAllLog, 'a') as log: subprocess.call(["cmake", "--build", ".", "--config", "Debug"], cwd=buildDir, stdout=log, stderr=subprocess.STDOUT) if sumoAllZip: try: debugZip = sumoAllZip.replace("-all-", "Debug-%s-" % env["FILEPREFIX"]) zipf = zipfile.ZipFile(debugZip, 'w', zipfile.ZIP_DEFLATED) debugDllPath = os.path.join(options.rootDir, "..", "debugDll") if platform == "x64": debugDllPath += "64" for dllPath in (os.path.join(options.rootDir, dllDir), debugDllPath): for f in glob.glob(os.path.join(dllPath, "*.dll")) + glob.glob(os.path.join(dllPath, "*", "*.dll")): zipf.write(f, os.path.join(binDir, f[len(dllPath) + 1:])) for f in (glob.glob(os.path.join(options.rootDir, options.binDir, "*D.exe")) + glob.glob(os.path.join(options.rootDir, options.binDir, "*D.pdb"))): zipf.write(f, os.path.join(binDir, os.path.basename(f))) zipf.close() except IOError as ziperr: (errno, strerror) = ziperr.args print("Warning: Could not zip to %s!" % binaryZip, file=log) print("I/O error(%s): %s" % (errno, strerror), file=log) runTests(options, env, gitrev, options.extended_tests and platform == "x64") with open(statusLog, 'w') as log: status.printStatus(makeLog, makeAllLog, env["SMTP_SERVER"], log) if options.extended_tests: runTests(options, env, gitrev, True, "D") with open(prefix + "Dstatus.log", 'w') as log: status.printStatus(makeAllLog, makeAllLog, env["SMTP_SERVER"], log)
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("out_dir", help='Folder that will store all output files')
parser.add_argument(
    "fastq_folder", help='Path to folder containing FASTQ files to genotype')
parser.add_argument(
    "ref_fasta", help='Path to reference FASTA file to map reads against')
parser.add_argument("experiment", help='Experiment number')
args = parser.parse_args()

# make output folder if it doesn't exist
utils.createOutputFolder(args.out_dir)

# configure logging to log.txt in the output folder
logging.basicConfig(filename=args.out_dir + '/log.txt', filemode='w', level=logging.DEBUG,
                    format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %I:%M:%S')

# log command line
status.printStatus('Command line statement: ' + ' '.join(sys.argv))

# map reads and summarize results
mapReadsFolder(args.fastq_folder, args.ref_fasta, args.out_dir, args.experiment)

# example invocation
# anaconda3/bin/python /slipstream/shared_data/pycharm/dhogal/19070/genotyping.py /slipstream/shared_data/19070/pacbio48-default-minQuality/16/ /slipstream/shared_data/19070/pacbio48-default-minQuality/12/fastq/ /slipstream/shared_data/19070/pacbio48-default-minQuality/ipd-mhc-20170523.fasta
                for f in glob.glob(os.path.join(dllPath, "*.dll")) + glob.glob(
                        os.path.join(dllPath, "*", "*.dll")):
                    zipf.write(f, os.path.join(binDir, f[len(dllPath) + 1:]))
            buildDir = os.path.dirname(os.path.join(options.rootDir, options.project))
            for f in glob.glob(os.path.join(options.rootDir, options.binDir, "*D.exe")):
                exe = os.path.basename(f)
                pdb = exe[:-3] + "pdb"
                zipf.write(f, os.path.join(binDir, exe))
                if platform == "x64":
                    pdbPath = os.path.join(buildDir, exe[:-5], "x64", "Debug", pdb)
                else:
                    pdbPath = os.path.join(buildDir, exe[:-5], "Debug", pdb)
                if os.path.exists(pdbPath):
                    zipf.write(pdbPath, os.path.join(binDir, pdb))
            zipf.close()
        except IOError as ziperr:
            (errno, strerror) = ziperr.args
            print("Warning: Could not zip to %s!" % binaryZip, file=log)
            print("I/O error(%s): %s" % (errno, strerror), file=log)
    runTests(options, env, gitrev)
    log = open(statusLog, 'w')
    status.printStatus(makeLog, makeAllLog, env["SMTP_SERVER"], log)
    log.close()
    runTests(options, env, gitrev, "D")
    log = open(prefix + "Dstatus.log", 'w')
    status.printStatus(makeAllLog, makeAllLog, env["SMTP_SERVER"], log)
    log.close()
def main():
    runLog.logger.info("Starting GCO.py")

    # Create an instance of the EasyGoPiGo3 class
    egpg = easygopigo3.EasyGoPiGo3(use_mutex=True)

    # Adjust GOPIGO3 CONSTANTS to my bot
    # (default EasyGoPiGo3.WHEEL_DIAMETER = 66.5 mm)
    myconfig.setParameters(egpg)
    ds = myDistSensor.init(egpg)
    tp = tiltpan.TiltPan(egpg)
    tp.tiltpan_center()

    dist_list_mm = []
    at_angle_list = []
    scan360speed = 150
    safe_distance = 20.32  # cm (8 inches) wheels to wall/object
    ds_to_wheels = 7  # cm distance sensor is 2.75 inches in front of wheels

    try:
        # spin 360 taking distance measurements
        print("\n360 degree scan at speed={}".format(scan360speed))
        dist_list_mm, at_angle_list = scan360.spin_and_scan(
            egpg, ds, tp, 360, speed=scan360speed)  # spin taking distance readings
        range_list_cm = [dist / 10 for dist in dist_list_mm]
        printmaps.view360(range_list_cm, at_angle_list)  # print view (all r positive, theta 0=left)
        print("Readings:{}".format(len(at_angle_list)))
        sleep(3)

        # spin to face closest object
        dist_to_target, scan_angle_to_target = closest_obj(range_list_cm, at_angle_list)
        angle_to_target = scan_angle_to_target - 90  # adjust for 0=left
        print("\nClosest object is {:.1f} cm at {:.0f} degrees".format(dist_to_target, angle_to_target))
        sleep(3)
        print("\nTurning {:.0f} at {} dps to face closest object".format(angle_to_target, egpg.get_speed()))
        egpg.turn_degrees(angle_to_target)
        sleep(3)

        # travel to point where wheels are 8 inches from object (will back up if too close)
        dist_to_guard_spot = dist_to_target + ds_to_wheels - safe_distance
        print("\nMoving {:.0f} cm to guard spot".format(dist_to_guard_spot))
        egpg.drive_cm(dist_to_guard_spot)
        sleep(3)

        # perform a 160 degree scan with obj in the center
        # spin 180 to face away from object
        print("\nTurning 180 to guard direction")
        egpg.turn_degrees(180)
        sleep(3)

        # loop
        # perform a quick 160 degree scan
        # if something gets closer, wag head and announce "I saw that."
        while True:
            dist_l, angl_l = servoscan.ds_map(ds, tp, num_of_readings=72)
            printmaps.view180(dist_l, angl_l, grid_width=80, units="cm", ignore_over=230)

            # turn distance sensor (eyes) to face closest object
            dist_to_closest, scan_angle_to_closest = closest_obj(dist_l, angl_l)
            angle_to_closest = scan_angle_to_closest  # - 90  # adjust for 0=left
            print("\nClosest object is {:.1f} cm at {:.0f} degrees".format(dist_to_closest, angle_to_closest))
            print("\nPointing {:.0f} to face closest object".format(angle_to_closest))
            tp.pan(angle_to_closest)
            sleep(2)
            status.printStatus(egpg, ds)
            sleep(30)
            tp.tiltpan_center()
            # status.batterySafetyCheck()

    except KeyboardInterrupt:  # the program gets interrupted by Ctrl+C on the keyboard
        egpg.stop()  # stop motors
        runLog.logger.info("Exiting GCO.py")
        print("Ctrl-C detected - Finishing up")
        egpg.stop()