def dbm():
    # Wipe any pre-existing SQLite test database and return a fresh DB_Manipulator
    # with newly created, empty tables
    db_file = re.search(
        'sqlite:///(.+)',
        preset_config['database']['SQLALCHEMY_DATABASE_URI']).group(1)
    os.remove(db_file)
    dbm = DB_Manipulator(config=preset_config, log=logger)
    dbm.create_tables()
    return dbm
class Reporter: def __init__( self, config, log, sampleinfo={}, name="", output="", collection=False ): self.db_pusher = DB_Manipulator(config, log) self.name = name self.collection = collection if output == "": self.output = os.getcwd() else: self.output = output + "/" self.config = config self.logger = log for k, v in config.items(): app.config[k] = v self.server = Process(target=app.run) self.attachments = list() self.filedict = dict() self.error = False self.dt = datetime.now() self.now = time.strftime( "{}.{}.{}_{}.{}.{}".format( self.dt.year, self.dt.month, self.dt.day, self.dt.hour, self.dt.minute, self.dt.second, ) ) self.sampleinfo = sampleinfo self.sample = None if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1: self.name = self.sampleinfo[0].get("CG_ID_project") self.sample = self.sampleinfo[0] for entry in self.sampleinfo: if entry.get("CG_ID_sample") == self.name: raise Exception( "Mixed projects in samples_info file. Do not know how to proceed" ) else: if isinstance(self.sampleinfo, list): self.sampleinfo = self.sampleinfo[0] self.name = self.sampleinfo.get("CG_ID_project") self.sample = self.sampleinfo def create_subfolders(self): os.makedirs("{0}/deliverables".format(self.config["folders"]["reports"]), exist_ok=True) os.makedirs("{0}/json".format(self.config["folders"]["reports"]), exist_ok=True) os.makedirs("{0}/analysis".format(self.config["folders"]["reports"]), exist_ok=True) def report(self, type="default", customer="all"): self.create_subfolders() if type in ["default", "typing", "qc"]: # Only typing and qc reports are version controlled self.gen_version(self.name) if type in ["default", "typing", "qc", "st_update"]: self.restart_web() if type == "default": self.gen_typing() self.gen_qc() self.gen_json(silent=True) self.gen_delivery() elif type == "typing": self.gen_typing() elif type == "qc": self.gen_qc() elif type == "st_update": self.gen_STtracker(customer) self.kill_flask() elif type in ["json_dump", "motif_overview"]: if type == "json_dump": self.gen_json() self.gen_delivery() elif type == "motif_overview": self.gen_motif(motif="resistance") self.gen_motif(motif="expec") else: raise Exception("Report function recieved invalid format") self.mail() #If no output dir is specified; Don't store report locally. Rely on e-mail if not self.output == "" or self.output == os.getcwd(): for k,v in self.filedict.items(): if v == "": os.remove(k) else: copyfile(k, v) def gen_version(self, name): self.db_pusher.get_report(name) self.db_pusher.set_report(name) def gen_STtracker(self, customer="all", silent=False): self.name = "Sequence Type Update" try: r = requests.get( "http://127.0.0.1:5000/microSALT/STtracker/{}".format(customer), allow_redirects=True, ) outname = "{}/ST_updates_{}.html".format(self.output, self.now) outfile = open(outname, "wb") outfile.write(r.content.decode("iso-8859-1").encode("utf8")) outfile.close() self.filedict[outname] = "" if not silent: self.attachments.append(outname) except Exception as e: self.logger.error( "Flask instance currently occupied. Possible rogue process. 
Retry command" ) self.error = True def gen_qc(self, silent=False): try: last_version = self.db_pusher.get_report(self.name).version except Exception as e: self.logger.error("Project {} does not exist".format(self.name)) self.kill_flask() sys.exit(-1) try: q = requests.get( "http://127.0.0.1:5000/microSALT/{}/qc".format(self.name), allow_redirects=True, ) outfile = "{}_QC_{}.html".format( self.sample.get("Customer_ID_project"), last_version ) local = "{}/{}".format(self.output, outfile) output = "{}/analysis/{}".format(self.config["folders"]["reports"], outfile) outfile = open(output, "wb") outfile.write(q.content.decode("iso-8859-1").encode("utf8")) outfile.close() if os.path.isfile(output): self.filedict[output] = local if not silent: self.attachments.append(output) except Exception as e: self.logger.error( "Flask instance currently occupied. Possible rogue process. Retry command" ) self.error = True def gen_typing(self, silent=False): try: last_version = self.db_pusher.get_report(self.name).version except Exception as e: self.logger.error("Project {} does not exist".format(self.name)) self.kill_flask() sys.exit(-1) try: r = requests.get( "http://127.0.0.1:5000/microSALT/{}/typing/all".format(self.name), allow_redirects=True, ) outfile = "{}_Typing_{}.html".format( self.sample.get("Customer_ID_project"), last_version ) local = "{}/{}".format(self.output, outfile) output = "{}/analysis/{}".format(self.config["folders"]["reports"], outfile) outfile = open(output, "wb") outfile.write(r.content.decode("iso-8859-1").encode("utf8")) outfile.close() if os.path.isfile(output): self.filedict[output] = local if not silent: self.attachments.append(output) except Exception as e: self.logger.error( "Flask instance currently occupied. Possible rogue process. Retry command" ) self.error = True def gen_motif(self, motif="resistance", silent=False): if motif not in ["resistance", "expec"]: self.logger.error("Invalid motif type specified for gen_motif function") if self.collection: sample_info = gen_collectiondata(self.name) else: sample_info = gen_reportdata(self.name) output = "{}/{}_{}_{}.csv".format(self.output, self.name, motif, self.now) # Load motif & gene names into dict motifdict = dict() for s in sample_info["samples"]: if motif == "resistance": for r in s.resistances: if ( not (r.resistance in motifdict.keys()) and r.threshold == "Passed" ): if r.resistance is None: r.resistance = "None" motifdict[r.resistance] = list() if ( r.threshold == "Passed" and not r.gene in motifdict[r.resistance] ): motifdict[r.resistance].append(r.gene) elif motif == "expec": for e in s.expacs: if ( not (e.virulence in motifdict.keys()) and e.threshold == "Passed" ): if e.virulence is None: e.virulence = "None" motifdict[e.virulence] = list() if e.threshold == "Passed" and not e.gene in motifdict[e.virulence]: motifdict[e.virulence].append(e.gene) for k, v in motifdict.items(): motifdict[k] = sorted(v) # Top 2 Header sepfix = "sep=," topline = "Identity {}% & Span {}%,,,".format( self.config["threshold"]["motif_id"], self.config["threshold"]["motif_span"] ) botline = "CG Sample ID,Sample ID,Organism,Sequence Type,Thresholds" for k in sorted(motifdict.keys()): genes = [""] * len(motifdict[k]) active_gene = k.replace(",", " &") if active_gene == "": active_gene = "Uncategorized hits" geneholder = ",".join(genes) topline += ",,{}{}".format(active_gene, geneholder) resnames = ",".join(sorted(motifdict[k])) botline += ",,{}".format(resnames) try: excel = open(output, "w+") excel.write("{}\n".format(sepfix)) 
excel.write("{}\n".format(topline)) excel.write("{}\n".format(botline)) # Create each individual row past the 2nd, per iteration for s in sample_info["samples"]: rowdict = dict() pref = "{},{},{},{},{}".format( s.CG_ID_sample, s.Customer_ID_sample, s.organism, s.ST_status.replace(",", ";"), s.threshold, ) # Load single sample if motif == "resistance": for r in s.resistances: if ( not (r.resistance in rowdict.keys()) and r.threshold == "Passed" ): rowdict[r.resistance] = dict() if ( r.threshold == "Passed" and not r.gene in rowdict[r.resistance] ): rowdict[r.resistance][r.gene] = r.identity elif motif == "expec": for e in s.expacs: if ( not (e.virulence in rowdict.keys()) and e.threshold == "Passed" ): rowdict[e.virulence] = dict() if ( e.threshold == "Passed" and not e.gene in rowdict[e.virulence] ): rowdict[e.virulence][e.gene] = e.identity # Compare single sample to all hits = "" for res in sorted(motifdict.keys()): if res in rowdict.keys(): hits += ",1" for gen in sorted(motifdict[res]): hits += "," if gen in rowdict[res].keys(): # UPD: Change this to identity of hit hits += "{}".format(rowdict[res][gen]) else: hits += "0" else: # Commas eq to res + gen length hits += ",0,0" pad = ["0"] * len(motifdict[res]) hits += ",".join(pad) excel.write("{}{}\n".format(pref, hits)) excel.close() if os.path.isfile(output): self.filedict[output] = "" if not silent: self.attachments.append(output) except FileNotFoundError as e: self.logger.error( "Gen_motif unable to produce excel file. Path {} does not exist".format( os.path.basename(output) ) ) def gen_delivery(self): deliv = dict() deliv['files'] = list() last_version = self.db_pusher.get_report(self.name).version output = "{}/deliverables/{}_deliverables.yaml".format(self.config["folders"]["reports"], self.sample.get("Customer_ID_project")) local = "{}/{}_deliverables.yaml".format(self.output, self.sample.get("Customer_ID_project")) #Project-wide #Sampleinfo deliv['files'].append({'format':'json','id':self.sample.get("Customer_ID_project"), 'path':"{}/sampleinfo.json".format(self.output), 'path_index':'~','step':'analysis','tag':'sampleinfo'}) #QC report deliv['files'].append({'format':'html','id':self.sample.get("Customer_ID_project"), 'path':"{}/{}_QC_{}.html".format(self.output, self.sample.get("Customer_ID_project"), last_version), 'path_index':'~','step':'result_aggregation','tag':'microsalt-qc'}) #Typing report deliv['files'].append({'format':'html','id':self.sample.get("Customer_ID_project"), 'path':"{}/{}_Typing_{}.html".format(self.output, self.sample.get("Customer_ID_project"), last_version), 'path_index':'~','step':'result_aggregation','tag':'microsalt-type'}) #Json (vogue) report deliv['files'].append({'format':'json','id':self.sample.get("Customer_ID_project"), 'path':"{}/{}.json".format(self.output, self.sample.get("CG_ID_project")), 'path_index':'~','step':'result_aggregation','tag':'microsalt-json'}) #Settings dump deliv['files'].append({'format':'txt','id':self.sample.get("Customer_ID_project"), 'path':"{}/config.log".format(self.output), 'path_index':'~','step':'analysis','tag':'runtime-settings'}) #Sample-wide #Single sample if self.sampleinfo == self.sample: hklist = list() hklist.append(self.sampleinfo) resultsdir = self.output #Project else: hklist = self.sampleinfo for s in hklist: if len(hklist) > 1: resultsdir = os.path.join(self.output, s["CG_ID_sample"]) #Contig/Assembly file deliv['files'].append({'format':'fasta','id':s["CG_ID_sample"], 'path':"{0}/assembly/{1}_trimmed_contigs.fasta".format(resultsdir, 
s["CG_ID_sample"]), 'path_index':'~','step':'assembly','tag':'assembly'}) #Concat trimmed reads forwards deliv['files'].append({'format':'fastq','id':s["CG_ID_sample"], 'path':"{0}/trimmed/{1}_trim_front_pair.fastq.gz".format(resultsdir, s["CG_ID_sample"]), 'path_index':'~','step':'concatination','tag':'trimmed-forward-reads'}) #Concat trimmed reads reverse deliv['files'].append({'format':'fastq','id':s["CG_ID_sample"], 'path':"{0}/trimmed/{1}_trim_rev_pair.fastq.gz".format(resultsdir, s["CG_ID_sample"]), 'path_index':'~','step':'concatination','tag':'trimmed-reverse-reads'}) #Concat trimmed reads unpaired deliv['files'].append({'format':'fastq','id':s["CG_ID_sample"], 'path':"{0}/trimmed/{1}_trim_unpair.fastq.gz".format(resultsdir, s["CG_ID_sample"]), 'path_index':'~','step':'concatination','tag':'trimmed-unpaired-reads'}) #Slurm dump deliv['files'].append({'format':'txt','id':s["CG_ID_sample"], 'path':"{0}/slurm_{1}.log".format(resultsdir, s["CG_ID_sample"]), 'path_index':'~','step':'analysis','tag':'logfile'}) #Quast (assembly) qc report deliv['files'].append({'format':'tsv','id':s["CG_ID_sample"], 'path':"{0}/assembly/quast/{1}_report.tsv".format(resultsdir, s["CG_ID_sample"]), 'path_index':'~','step':'assembly','tag':'quast-results'}) #Alignment (bam, sorted) deliv['files'].append({'format':'bam','id':s["CG_ID_sample"], 'path':"{0}/alignment/{1}_{2}.bam_sort".format(resultsdir, s["CG_ID_sample"], s["reference"]), 'path_index':'~','step':'alignment','tag':'reference-alignment-sorted'}) #Alignment (bam, sorted, deduplicated) deliv['files'].append({'format':'bam','id':s["CG_ID_sample"], 'path':"{0}/alignment/{1}_{2}.bam_sort_rmdup".format(resultsdir, s["CG_ID_sample"], s["reference"]), 'path_index':'~','step':'alignment','tag':'reference-alignment-deduplicated'}) #Picard insert size stats deliv['files'].append({'format':'meta','id':s["CG_ID_sample"], 'path':"{0}/alignment/{1}_{2}.stats.ins".format(resultsdir, s["CG_ID_sample"], s["reference"]), 'path_index':'~','step':'insertsize_calc','tag':'picard-insertsize'}) with open(output, 'w') as delivfile: documents = yaml.dump(deliv, delivfile) with open(output, 'r') as delivfile: postfix = delivfile.read() postfix = postfix.replace("'~'", "~") with open(output, 'w') as delivfile: delivfile.write(postfix) if os.path.isfile(output): self.filedict[output] = local def gen_json(self, silent=False): report = dict() local = "{}/{}.json".format(self.output, self.name) output = "{}/json/{}.json".format(self.config["folders"]["reports"], self.name) sample_info = gen_reportdata(self.name) analyses = [ "blast_pubmlst", "quast_assembly", "blast_resfinder_resistence", "picard_markduplicate", "microsalt_samtools_stats", ] for s in sample_info["samples"]: t = dict() # Since some apps are too basic to filter irrelevant non-standard values.. 
t["ST_status"] = ( "" if s.ST_status is None or s.ST_status != str(s.ST) else s.ST_status ) t["threshold"] = ( "" if s.threshold is None or s.threshold not in ["Passed", "Failed"] else s.threshold ) t["genome_length"] = ( "" if s.genome_length is None or s.genome_length < 1 else s.genome_length ) t["reference_length"] = ( "" if s.reference_length is None or s.reference_length < 1 else s.reference_length ) t["gc_percentage"] = ( "" if s.gc_percentage is None or s.gc_percentage < 0.1 else str(s.gc_percentage) ) t["n50"] = "" if s.n50 is None or s.n50 < 1 else s.n50 t["contigs"] = "" if s.contigs is None or s.contigs < 1 else s.contigs t["insert_size"] = ( "" if s.insert_size is None or s.insert_size < 1 else s.insert_size ) t["duplication_rate"] = ( "" if s.duplication_rate is None or s.duplication_rate < 0.1 else s.duplication_rate ) t["total_reads"] = ( "" if s.total_reads is None or s.total_reads < 1 else s.total_reads ) t["mapped_rate"] = ( "" if s.mapped_rate is None or s.mapped_rate < 0.1 else s.mapped_rate ) t["average_coverage"] = ( "" if s.average_coverage is None or s.average_coverage < 0.1 else s.average_coverage ) t["coverage_10x"] = ( "" if s.coverage_10x is None or s.coverage_10x < 0.1 else s.coverage_10x ) t["coverage_30x"] = ( "" if s.coverage_30x is None or s.coverage_30x < 0.1 else s.coverage_30x ) t["coverage_50x"] = ( "" if s.coverage_50x is None or s.coverage_50x < 0.1 else s.coverage_50x ) t["coverage_100x"] = ( "" if s.coverage_100x is None or s.coverage_100x < 0.1 else s.coverage_100x ) report[s.CG_ID_sample] = dict() for a in analyses: if a == "blast_resfinder_resistence": report[s.CG_ID_sample][a] = list() else: report[s.CG_ID_sample][a] = dict() report[s.CG_ID_sample]["blast_pubmlst"] = { "sequence_type": t["ST_status"], "thresholds": t["threshold"], } report[s.CG_ID_sample]["quast_assembly"] = { "estimated_genome_length": t["genome_length"], "gc_percentage": t["gc_percentage"], "n50": t["n50"], "necessary_contigs": t["contigs"], } report[s.CG_ID_sample]["picard_markduplicate"] = { "insert_size": t["insert_size"], "duplication_rate": t["duplication_rate"], } report[s.CG_ID_sample]["microsalt_samtools_stats"] = { "total_reads": t["total_reads"], "mapped_rate": t["mapped_rate"], "average_coverage": t["average_coverage"], "coverage_10x": t["coverage_10x"], "coverage_30x": t["coverage_30x"], "coverage_50x": t["coverage_50x"], "coverage_100x": t["coverage_100x"], } for r in s.resistances: if ( not (r.gene in report[s.CG_ID_sample]["blast_resfinder_resistence"]) and r.threshold == "Passed" ): report[s.CG_ID_sample]["blast_resfinder_resistence"].append(r.gene) # json.dumps(report) #Dumps the json directly try: with open(output, "w") as outfile: json.dump(report, outfile) if os.path.isfile(output): self.filedict[output] = local if not silent: self.attachments.append(output) except FileNotFoundError as e: self.logger.error( "Gen_json unable to produce json file. 
Path {} does not exist".format( os.path.basename(output) ) ) def mail(self): msg = MIMEMultipart() if not self.error and self.attachments: msg["Subject"] = "{} ({}) Reports".format( self.name, self.attachments[0].split("_")[0] ) else: msg["Subject"] = "{} Failed Generating Report".format(self.name) sender = socket.gethostname() sender_fixed = "{}.com".format(os.path.splitext(sender)[0]) msg["From"] = sender_fixed msg["To"] = self.config["regex"]["mail_recipient"] if not self.error: for file in self.attachments: part = MIMEApplication(open(file).read()) part.add_header( "Content-Disposition", 'attachment; filename="%s"' % os.path.basename(file), ) msg.attach(part) s = smtplib.SMTP("localhost") s.connect() s.sendmail(msg["From"], msg["To"], msg.as_string()) s.quit() self.logger.info( "Mail containing report sent to {} from {}".format(msg["To"], msg["From"]) ) def start_web(self): self.server.start() self.logger.info("Started webserver on http://127.0.0.1:5000/") # Hinders requests before server goes up time.sleep(0.15) def kill_flask(self): self.server.terminate() self.server.join() self.logger.info("Closed webserver on http://127.0.0.1:5000/") def restart_web(self): try: self.kill_flask() except Exception as e: pass self.start_web()
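# Usage sketch (not part of the original module): one plausible way to drive the
# Reporter class above for a project. The config path, logger name and sampleinfo
# path are illustrative assumptions; the constructor arguments and the report()
# "type" values are taken directly from the class definition.
import json
import logging

if __name__ == "__main__":
    sketch_logger = logging.getLogger("reporter-sketch")    # assumed logger
    sketch_config = json.load(open("config.json"))          # assumed config path
    sketch_samples = json.load(open("sampleinfo.json"))     # assumed sampleinfo dump

    reporter = Reporter(sketch_config, sketch_logger, sampleinfo=sketch_samples, output="/tmp/reports")
    # report(type="default") restarts the embedded Flask app, renders the typing and
    # QC views, writes the json/deliverables files and mails the attachments.
    reporter.report(type="default")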
class Referencer: def __init__(self, config, log, sampleinfo={}, force=False): self.config = config self.logger = log self.db_access = DB_Manipulator(config, log) self.updated = list() # Fetch names of existing refs self.refs = self.db_access.profiles organisms = self.refs.keys() self.organisms = [*organisms] self.force = force self.sampleinfo = sampleinfo self.sample = None if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1: self.name = self.sampleinfo[0].get("CG_ID_project") self.sample = self.sampleinfo[0] for entry in self.sampleinfo: if entry.get("CG_ID_sample") == self.name: raise Exception( "Mixed projects in samples_info file. Do not know how to proceed" ) else: if isinstance(self.sampleinfo, list): self.sampleinfo = self.sampleinfo[0] self.name = self.sampleinfo.get("CG_ID_sample") self.sample = self.sampleinfo def identify_new(self, cg_id="", project=False): """ Automatically downloads pubMLST & NCBI organisms not already downloaded """ neworgs = list() newrefs = list() try: if not isinstance(self.sampleinfo, list): samples = [self.sampleinfo] else: samples = self.sampleinfo for entry in samples: org = entry.get("organism") ref = self.organism2reference(org) if ref not in self.organisms and org not in neworgs: neworgs.append(org) if (not "{}.fasta".format(entry.get("reference")) in os.listdir( self.config["folders"]["genomes"]) and not entry.get("reference") in newrefs): newrefs.append(entry.get("reference")) for org in neworgs: self.add_pubmlst(org) for org in newrefs: self.download_ncbi(org) except Exception as e: self.logger.error( "Reference update function failed prematurely. Review immediately" ) def update_refs(self): """Updates all references. Order is important, since no object is updated twice""" # Updates self.fetch_pubmlst(self.force) self.fetch_external(self.force) self.fetch_resistances(self.force) # Reindexes self.index_db(os.path.dirname(self.config["folders"]["expec"]), ".fsa") def index_db(self, full_dir, suffix): """Check for indexation, makeblastdb job if not enough of them.""" reindexation = False files = os.listdir(full_dir) sufx_files = glob.glob("{}/*{}".format(full_dir, suffix)) # List of source files for file in sufx_files: subsuf = "\{}$".format(suffix) base = re.sub(subsuf, "", file) bases = 0 newer = 0 for elem in files: # Number of files with same base name (7) if os.path.basename(base) == elem[:elem.rfind(".")]: bases = bases + 1 # Number of index files fresher than source (6) if (os.stat(file).st_mtime < os.stat("{}/{}".format( full_dir, elem)).st_mtime): newer = newer + 1 # 7 for parse_seqids, 4 for not. 
if not (bases == 7 or newer == 6) and not (bases == 4 and newer == 3): reindexation = True try: # Resistence files if ".fsa" in suffix: bash_cmd = "makeblastdb -in {}/{} -dbtype nucl -out {}".format( full_dir, os.path.basename(file), os.path.basename(base)) # MLST locis else: bash_cmd = "makeblastdb -in {}/{} -dbtype nucl -parse_seqids -out {}".format( full_dir, os.path.basename(file), os.path.basename(base)) proc = subprocess.Popen(bash_cmd.split(), cwd=full_dir, stdout=subprocess.PIPE) output, error = proc.communicate() except Exception as e: self.logger.error( "Unable to index requested target {} in {}".format( file, full_dir)) if reindexation: self.logger.info("Re-indexed contents of {}".format(full_dir)) def fetch_external(self, force=False): """ Updates reference for data that IS ONLY LINKED to pubMLST """ prefix = "https://pubmlst.org" query = urllib.request.urlopen("{}/data/".format(prefix)) soup = BeautifulSoup(query, "html.parser") tr_sub = soup.find_all("tr", class_="td1") tr_sub = tr_sub + soup.find_all("tr", class_="td2") # Only search every other instance iterator = iter(tr_sub) unfound = True try: while unfound: entry = iterator.__next__() # Gather general info from first object sample = entry.get_text().split("\n") organ = sample[1].lower().replace(" ", "_") # In order to get ecoli #1 if "escherichia_coli" in organ and "#1" in organ: organ = organ[:-2] currver = self.db_access.get_version( "profile_{}".format(organ)) profile_no = re.search(r"\d+", sample[2]).group(0) if (organ in self.organisms and organ.replace("_", " ") not in self.updated and (int(profile_no.replace("-", "")) > int( currver.replace("-", "")) or force)): # Download definition files st_link = prefix + entry.find_all("a")[1]["href"] output = "{}/{}".format(self.config["folders"]["profiles"], organ) urllib.request.urlretrieve(st_link, output) # Update database self.db_access.upd_rec( {"name": "profile_{}".format(organ)}, "Versions", {"version": profile_no}, ) self.db_access.reload_profiletable(organ) # Gather loci from second object entry = iterator.__next__() # Clear existing directory and download allele files out = "{}/{}".format(self.config["folders"]["references"], organ) shutil.rmtree(out) os.makedirs(out) for loci in entry.find_all("a"): loci = loci["href"] lociname = os.path.basename(os.path.normpath(loci)) input = prefix + loci urllib.request.urlretrieve( input, "{}/{}".format(out, lociname)) # Create new indexes self.index_db(out, ".tfa") else: iterator.__next__() except StopIteration: pass def resync(self, type="", sample="", ignore=False): """Manipulates samples that have an internal ST that differs from pubMLST ST""" if type == "list": # Add single sample support later self.db_access.list_unresolved() elif type == "overwrite": if ignore: self.db_access.rm_novel(sample=sample) else: self.db_access.sync_novel(overwrite=True, sample=sample) else: self.db_access.sync_novel(overwrite=False, sample=sample) def fetch_resistances(self, force=False): cwd = os.getcwd() url = "https://bitbucket.org/genomicepidemiology/resfinder_db.git" hiddensrc = "{}/.resfinder_db".format( self.config["folders"]["resistances"]) wipeIndex = False if not os.path.exists(hiddensrc) or len(os.listdir(hiddensrc)) == 0: self.logger.info("resFinder database not found. 
Caching..") if not os.path.exists(hiddensrc): os.makedirs(hiddensrc) cmd = "git clone {} --quiet".format(url) process = subprocess.Popen( cmd.split(), cwd=self.config["folders"]["resistances"], stdout=subprocess.PIPE, ) output, error = process.communicate() os.rename( "{}/resfinder_db".format( self.config["folders"]["resistances"]), hiddensrc, ) wipeIndex = True else: if not wipeIndex: actual = os.listdir(self.config["folders"]["resistances"]) for file in os.listdir(hiddensrc): if file not in actual and (".fsa" in file): self.logger.info( "resFinder database files corrupted. Syncing...") wipeIndex = True break cmd = "git pull origin master" process = subprocess.Popen( cmd.split(), cwd=hiddensrc, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, ) output, error = process.communicate() if not "Already up-to-date." in str(output): self.logger.info( "Remote resFinder database updated. Syncing...") wipeIndex = True else: self.logger.info( "Cached resFinder database identical to remote.") # Actual update of resistance folder if wipeIndex: for file in os.listdir(hiddensrc): if os.path.isfile("{}/{}".format(hiddensrc, file)): # Copy fresh shutil.copy( "{}/{}".format(hiddensrc, file), self.config["folders"]["resistances"], ) # Double checks indexation is current. self.index_db(self.config["folders"]["resistances"], ".fsa") def existing_organisms(self): """ Returns list of all organisms currently added """ return self.organisms def organism2reference(self, normal_organism_name): """Finds which reference contains the same words as the organism and returns it in a format for database calls. Returns empty string if none found""" orgs = os.listdir(self.config["folders"]["references"]) organism = re.split(r"\W+", normal_organism_name.lower()) try: refs = 0 for target in orgs: hit = 0 for piece in organism: if len(piece) == 1: if target.startswith(piece): hit += 1 else: if piece in target: hit += 1 # For when people misspell the strain in the orderform elif piece == "pneumonsiae" and "pneumoniae" in target: hit += 1 else: break if hit == len(organism): return target except Exception as e: self.logger.warn( "Unable to find existing reference for {}, strain {} has no reference match\nSource: {}" .format(organism, normal_organism_name, e)) def download_ncbi(self, reference): """ Checks available references, downloads from NCBI if not present """ try: DEVNULL = open(os.devnull, "wb") Entrez.email = "*****@*****.**" record = Entrez.efetch(db="nucleotide", id=reference, rettype="fasta", retmod="text") sequence = record.read() output = "{}/{}.fasta".format(self.config["folders"]["genomes"], reference) with open(output, "w") as f: f.write(sequence) bwaindex = "bwa index {}".format(output) proc = subprocess.Popen( bwaindex.split(), cwd=self.config["folders"]["genomes"], stdout=DEVNULL, stderr=DEVNULL, ) out, err = proc.communicate() samindex = "samtools faidx {}".format(output) proc = subprocess.Popen( samindex.split(), cwd=self.config["folders"]["genomes"], stdout=DEVNULL, stderr=DEVNULL, ) out, err = proc.communicate() self.logger.info("Downloaded reference {}".format(reference)) except Exception as e: self.logger.warning( "Unable to download genome '{}' from NCBI".format(reference)) def add_pubmlst(self, organism): """ Checks pubmlst for references of given organism and downloads them """ # Organism must be in binomial format and only resolve to one hit errorg = organism try: organism = organism.lower().replace(".", " ") if organism.replace(" ", "_") in self.organisms and not self.force: self.logger.info( 
"Organism {} already stored in microSALT".format(organism)) return db_query = self.query_pubmlst() # Doublecheck organism name is correct and unique orgparts = organism.split(" ") counter = 0.0 for item in db_query: for subtype in item["databases"]: missingPart = False for part in orgparts: if len(part) == 1: if not subtype["description"].lower().startswith( part): missingPart = True else: if not part in subtype["description"].lower(): missingPart = True if not missingPart: # Seqdef always appear after isolates, so this is fine seqdef_url = subtype["href"] desc = subtype["description"] counter += 1.0 self.logger.info( "Located pubMLST hit {} for sample".format(desc)) if counter > 2.0: raise Exception( "Reference '{}' resolved to {} organisms. Please be more stringent" .format(errorg, int(counter / 2))) elif counter < 1.0: # add external raise Exception( "Unable to find requested organism '{}' in pubMLST database" .format(errorg)) else: truename = desc.lower().split(" ") truename = "{}_{}".format(truename[0], truename[1]) self.download_pubmlst(truename, seqdef_url) # Update organism list self.refs = self.db_access.profiles self.logger.info("Created table profile_{}".format(truename)) except Exception as e: self.logger.warning(e.args[0]) def query_pubmlst(self): """ Returns a json object containing all organisms available via pubmlst.org """ # Example request URI: http://rest.pubmlst.org/db/pubmlst_neisseria_seqdef/schemes/1/profiles_csv seqdef_url = dict() databases = "http://rest.pubmlst.org/db" db_req = urllib.request.Request(databases) with urllib.request.urlopen(db_req) as response: db_query = json.loads(response.read().decode("utf-8")) return db_query def download_pubmlst(self, organism, subtype_href, force=False): """ Downloads ST and loci for a given organism stored on pubMLST if it is more recent. 
Returns update date """ organism = organism.lower().replace(" ", "_") # Pull version ver_req = urllib.request.Request( "{}/schemes/1/profiles".format(subtype_href)) with urllib.request.urlopen(ver_req) as response: ver_query = json.loads(response.read().decode("utf-8")) currver = self.db_access.get_version("profile_{}".format(organism)) if (int(ver_query["last_updated"].replace("-", "")) <= int( currver.replace("-", "")) and not force): # self.logger.info("Profile for {} already at latest version".format(organism.replace('_' ,' ').capitalize())) return currver # Pull ST file st_target = "{}/{}".format(self.config["folders"]["profiles"], organism) input = "{}/schemes/1/profiles_csv".format(subtype_href) urllib.request.urlretrieve(input, st_target) # Pull locus files loci_input = "{}/schemes/1".format(subtype_href) loci_req = urllib.request.Request(loci_input) with urllib.request.urlopen(loci_req) as response: loci_query = json.loads(response.read().decode("utf-8")) output = "{}/{}".format(self.config["folders"]["references"], organism) try: if os.path.isdir(output): shutil.rmtree(output) except FileNotFoundError as e: pass os.makedirs(output) for locipath in loci_query["loci"]: loci = os.path.basename(os.path.normpath(locipath)) urllib.request.urlretrieve("{}/alleles_fasta".format(locipath), "{}/{}.tfa".format(output, loci)) # Create new indexes self.index_db(output, ".tfa") def external_version(self, organism, subtype_href): ver_req = urllib.request.Request( "{}/schemes/1/profiles".format(subtype_href)) with urllib.request.urlopen(ver_req) as response: ver_query = json.loads(response.read().decode("utf-8")) return ver_query["last_updated"] def fetch_pubmlst(self, force=False): """ Updates reference for data that is stored on pubMLST """ seqdef_url = dict() db_query = self.query_pubmlst() # Fetch seqdef locations for item in db_query: for subtype in item["databases"]: for name in self.organisms: if name.replace("_", " ") in subtype["description"].lower(): # Seqdef always appear after isolates, so this is fine self.updated.append(name.replace("_", " ")) seqdef_url[name] = subtype["href"] for key, val in seqdef_url.items(): internal_ver = self.db_access.get_version("profile_{}".format(key)) external_ver = self.external_version(key, val) if internal_ver < external_ver: self.logger.info( "pubMLST reference for {} updated to {} from {}".format( key.replace("_", " ").capitalize(), external_ver, internal_ver)) self.download_pubmlst(key, val, force) self.db_access.upd_rec( {"name": "profile_{}".format(key)}, "Versions", {"version": external_ver}, ) self.db_access.reload_profiletable(key)
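# Usage sketch (not part of the original module): refreshing cached references with
# the Referencer class above. The config and sampleinfo file paths and the logger
# name are assumptions for illustration; update_refs(), identify_new() and
# existing_organisms() are the entry points defined in the class itself.
import json
import logging

def refresh_references(config_path="config.json", sampleinfo_path="sampleinfo.json"):
    # Both file paths are assumed placeholders
    config = json.load(open(config_path))
    sampleinfo = json.load(open(sampleinfo_path))
    referencer = Referencer(config, logging.getLogger("referencer-sketch"), sampleinfo=sampleinfo)
    referencer.identify_new(project=True)   # download pubMLST/NCBI references not yet cached
    referencer.update_refs()                # resync pubMLST profiles, loci files and the resFinder db
    return referencer.existing_organisms()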
class Job_Creator: def __init__(self, config, log, sampleinfo={}, run_settings={}): self.config = config self.logger = log self.batchfile = "/tmp/batchfile.sbatch" self.filelist = list() if isinstance(run_settings.get("input"), list): self.filelist = run_settings.get("input") run_settings["input"] = "/tmp/" self.run_settings = run_settings self.indir = os.path.abspath(run_settings.get("input", "/tmp/")) self.trimmed = run_settings.get("trimmed", True) self.qc_only = run_settings.get("qc_only", False) self.careful = run_settings.get("careful", True) self.pool = run_settings.get("pool", []) self.finishdir = run_settings.get("finishdir", "") self.sampleinfo = sampleinfo self.sample = None if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1: self.name = self.sampleinfo[0].get("CG_ID_project") self.sample = self.sampleinfo[0] for entry in self.sampleinfo: if entry.get("CG_ID_sample") == self.name: raise Exception( "Mixed projects in samples_info file. Do not know how to proceed" ) else: if isinstance(self.sampleinfo, list): self.sampleinfo = self.sampleinfo[0] self.name = self.sampleinfo.get("CG_ID_sample") self.sample = self.sampleinfo # If timestamp is provided. Use it as analysis time. Else use current time if run_settings.get("timestamp") is not None: self.now = run_settings.get("timestamp") temp = run_settings.get("timestamp").replace("_", ".").split(".") self.dt = datetime( int(temp[0]), int(temp[1]), int(temp[2]), int(temp[3]), int(temp[4]), int(temp[5]), ) else: self.dt = datetime.now() self.now = time.strftime("{}.{}.{}_{}.{}.{}".format( self.dt.year, self.dt.month, self.dt.day, self.dt.hour, self.dt.minute, self.dt.second, )) if run_settings.get("finishdir") is None: self.finishdir = "{}/{}_{}".format(config["folders"]["results"], self.name, self.now) self.db_pusher = DB_Manipulator(config, log) self.concat_files = dict() self.ref_resolver = Referencer(config, log) def get_sbatch(self): """ Returns sbatchfile, slightly superflous""" return self.batchfile def get_headerargs(self): headerline = "-A {} -p {} -n {} -t {} -J {}_{} --qos {} --output {}/slurm_{}.log".format( self.config["slurm_header"]["project"], self.config["slurm_header"]["type"], self.config["slurm_header"]["threads"], self.config["slurm_header"]["time"], self.config["slurm_header"]["job_prefix"], self.name, self.config["slurm_header"]["qos"], self.finishdir, self.name, ) return headerline def verify_fastq(self): """ Uses arg indir to return a dict of PE fastq tuples fulfilling naming convention """ verified_files = list() files = os.listdir(self.indir) if files == []: raise Exception("Directory {} lacks fastq files.".format( self.indir)) for file in files: file_match = re.match(self.config["regex"]["file_pattern"], file) if file_match: # Check that symlinks resolve path = "{}/{}".format(self.indir, file) if os.path.islink(path): if not os.path.exists(os.readlink(path)): raise Exception( "Some fastq files are unresolved symlinks in directory {}." 
.format(self.indir)) # Make sure both mates exist if (file_match[1] == "1" or file_match[1] == "2" or file_match[1] == "forward" or file_match[1] == "reverse"): if file_match[1] == "forward" or file_match[1] == "reverse": pairno = "forward" if "forward" in file_match[1]: pairno = "reverse" pairname = file_match[0].replace(file_match[1], pairno) else: pairno = 2 - 1 % int(file_match[1]) # 1->2, 2->1 # Construct mate name pairname = "{}{}{}".format( file_match.string[:file_match.end(1) - 1], pairno, file_match.string[file_match.end(1):file_match.end( )], ) if pairname in files: files.pop(files.index(pairname)) verified_files.append(file_match[0]) verified_files.append(pairname) else: raise Exception( "Some fastq files have no mate in directory {}.". format(self.indir)) if verified_files == []: raise Exception( "No files in directory {} match file_pattern '{}'.".format( self.indir, self.config["regex"]["file_pattern"])) # Warn about file sizes for vfile in verified_files: try: bsize = os.stat("{}/{}".format(self.indir, vfile)).st_size bsize = bsize >> 20 if bsize > 1000: self.logger.warning( "Input fastq {} exceeds 1000MB".format(vfile)) except Exception as e: self.logger.warning( "Unable to verify size of input file {}/{}".format( self.indir, vfile)) # Warn about invalid fastq files for vfile in verified_files: f = gzip.open("{}/{}".format(self.indir, vfile), "r") lines = f.read().splitlines() if len(lines) < 2 or not "+" in str(lines[-2]): self.logger.warning( "Input fastq {} does not seem to end properly".format( vfile)) return sorted(verified_files) def create_assemblysection(self): batchfile = open(self.batchfile, "a+") # memory is actually 128 per node regardless of cores. batchfile.write("# Spades assembly\n") if self.trimmed: trimline = "-s {}".format(self.concat_files["i"]) else: trimline = "" if self.careful: careline = "--careful" else: careline = "" batchfile.write( "spades.py --threads {} {} --memory {} -o {}/assembly -1 {} -2 {} {}\n" .format( self.config["slurm_header"]["threads"], careline, 8 * int(self.config["slurm_header"]["threads"]), self.finishdir, self.concat_files["f"], self.concat_files["r"], trimline, )) batchfile.write( "mv {0}/assembly/contigs.fasta {0}/assembly/{1}_contigs.fasta\n". format(self.finishdir, self.name)) batchfile.write( "sed -n '/NODE_1000_/q;p' {0}/assembly/{1}_contigs.fasta > {0}/assembly/{1}_trimmed_contigs.fasta\n" .format(self.finishdir, self.name)) # batchfile.write("##Input cleanup\n") # batchfile.write("rm -r {}/trimmed\n".format(self.finishdir)) batchfile.write("\n\n") batchfile.close() def blast_subset(self, name, search_string): # Create run file_list = glob.glob(search_string) batchfile = open(self.batchfile, "a+") batchfile.write("mkdir {}/blast_search/{}\n".format( self.finishdir, name)) blast_format = '"7 stitle sstrand qaccver saccver pident evalue bitscore qstart qend sstart send length"' if len(file_list) > 1: for ref in file_list: if re.search(r"(\w+(?:\-\w+)*)\.\w+", os.path.basename(ref)) is None: self.logger.error( "File {} does not match typical format. 
Consider deleting and redownloading" ) else: ref_nosuf = re.search(r"(\w+(?:\-\w+)*)\.\w+", os.path.basename(ref)).group(1) batchfile.write("# BLAST {} search for {}, {}\n".format( name, self.sample.get("organism"), ref_nosuf)) if name == "mlst": batchfile.write( "blastn -db {}/{} -query {}/assembly/{}_contigs.fasta -out {}/blast_search/{}/loci_query_{}.txt -task megablast -num_threads {} -outfmt {}\n" .format( os.path.dirname(ref), ref_nosuf, self.finishdir, self.name, self.finishdir, name, ref_nosuf, self.config["slurm_header"]["threads"], blast_format, )) else: batchfile.write( "blastn -db {}/{} -query {}/assembly/{}_contigs.fasta -out {}/blast_search/{}/{}.txt -task megablast -num_threads {} -outfmt {}\n" .format( os.path.dirname(ref), ref_nosuf, self.finishdir, self.name, self.finishdir, name, ref_nosuf, self.config["slurm_header"]["threads"], blast_format, )) elif len(file_list) == 1: ref_nosuf = re.search(r"(\w+(?:\-\w+)*)\.\w+", os.path.basename(file_list[0])).group(1) batchfile.write("## BLAST {} search in {}\n".format( name, self.sample.get("organism").replace("_", " ").capitalize())) batchfile.write( "blastn -db {}/{} -query {}/assembly/{}_contigs.fasta -out {}/blast_search/{}/{}.txt -task megablast -num_threads {} -outfmt {}\n" .format( os.path.dirname(search_string), ref_nosuf, self.finishdir, self.name, self.finishdir, name, ref_nosuf, self.config["slurm_header"]["threads"], blast_format, )) batchfile.write("\n") batchfile.close() def create_variantsection(self): """ Creates a job for variant calling based on local alignment """ ref = "{}/{}.fasta".format(self.config["folders"]["genomes"], self.sample.get("reference")) localdir = "{}/alignment".format(self.finishdir) outbase = "{}/{}_{}".format(localdir, self.name, self.sample.get("reference")) # Create run batchfile = open(self.batchfile, "a+") batchfile.write("# Variant calling based on local alignment\n") batchfile.write("mkdir {}\n".format(localdir)) batchfile.write("## Alignment & Deduplication\n") batchfile.write("bwa mem -M -t {} {} {} {} > {}.sam\n".format( self.config["slurm_header"]["threads"], ref, self.concat_files["f"], self.concat_files["r"], outbase, )) batchfile.write( "samtools view --threads {} -b -o {}.bam -T {} {}.sam\n".format( self.config["slurm_header"]["threads"], outbase, ref, outbase)) batchfile.write( "samtools sort --threads {} -o {}.bam_sort {}.bam\n".format( self.config["slurm_header"]["threads"], outbase, outbase)) batchfile.write( "picard MarkDuplicates I={}.bam_sort O={}.bam_sort_rmdup M={}.stats.dup REMOVE_DUPLICATES=true\n" .format(outbase, outbase, outbase)) batchfile.write("samtools index {}.bam_sort_rmdup\n".format(outbase)) batchfile.write( "samtools idxstats {}.bam_sort_rmdup &> {}.stats.ref\n".format( outbase, outbase)) # Removal of temp aligment files batchfile.write("rm {}.bam {}.sam\n".format(outbase, outbase)) batchfile.write("## Primary stats generation\n") # Insert stats, dedupped batchfile.write( "picard CollectInsertSizeMetrics I={}.bam_sort_rmdup O={}.stats.ins H={}.hist.ins\n" .format(outbase, outbase, outbase)) # Coverage batchfile.write( "samtools stats --coverage 1,10000,1 {}.bam_sort_rmdup |grep ^COV | cut -f 2- &> {}.stats.cov\n" .format(outbase, outbase)) # Mapped rate, no dedup,dedup in MWGS (trimming has no effect)! batchfile.write( "samtools flagstat {}.bam_sort &> {}.stats.map\n".format( outbase, outbase)) # Total reads, no dedup,dedup in MWGS (trimming has no effect)! 
batchfile.write( "samtools view -c {}.bam_sort &> {}.stats.raw\n".format( outbase, outbase)) batchfile.write("\n\n") batchfile.close() def create_preprocsection(self): """Concatinates data, possibly trims it, then makes the unstranded reads usable""" forward = list() reverse = list() for root, dirs, files in os.walk(self.config["folders"]["adapters"]): if not "NexteraPE-PE.fa" in files: self.logger.error( "Adapters folder at {} does not contain NexteraPE-PE.fa. Review paths.yml" ) else: break trimdir = "{}/trimmed".format(self.finishdir) files = self.verify_fastq() batchfile = open(self.batchfile, "a+") batchfile.write("#Trimmomatic section\n") batchfile.write("mkdir {}\n".format(trimdir)) batchfile.write("##Pre-concatination\n") for file in files: fullfile = "{}/{}".format(self.indir, file) # Even indexes = Forward if not files.index(file) % 2: forward.append(fullfile) elif files.index(file) % 2: reverse.append(fullfile) outfile = files[0].split("_")[0] self.concat_files["f"] = "{}/trimmed/{}_forward_reads.fastq.gz".format( self.finishdir, self.name) self.concat_files["r"] = "{}/trimmed/{}_reverse_reads.fastq.gz".format( self.finishdir, self.name) batchfile.write("cat {} > {}\n".format(" ".join(forward), self.concat_files.get("f"))) batchfile.write("cat {} > {}\n".format(" ".join(reverse), self.concat_files.get("r"))) if self.trimmed: fp = "{}/{}_trim_front_pair.fastq.gz".format(trimdir, outfile) fu = "{}/{}_trim_front_unpair.fastq.gz".format(trimdir, outfile) rp = "{}/{}_trim_rev_pair.fastq.gz".format(trimdir, outfile) ru = "{}/{}_trim_rev_unpair.fastq.gz".format(trimdir, outfile) batchfile.write("##Trimming section\n") batchfile.write( "trimmomatic PE -threads {} -phred33 {} {} {} {} {} {}\ ILLUMINACLIP:{}/NexteraPE-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36\n" .format( self.config["slurm_header"]["threads"], self.concat_files.get("f"), self.concat_files.get("r"), fp, fu, rp, ru, self.config["folders"]["adapters"], )) batchfile.write("## Interlaced trimmed files\n") self.concat_files["f"] = fp self.concat_files["r"] = rp self.concat_files["i"] = "{}/{}_trim_unpair.fastq.gz".format( trimdir, outfile) batchfile.write("cat {} >> {}\n".format(" ".join( [fu, ru]), self.concat_files.get("i"))) batchfile.write("\n") batchfile.close() def create_assemblystats_section(self): batchfile = open(self.batchfile, "a+") batchfile.write("# QUAST QC metrics\n") batchfile.write("mkdir {}/assembly/quast\n".format(self.finishdir)) batchfile.write( "quast.py {}/assembly/{}_contigs.fasta -o {}/assembly/quast\n". 
format(self.finishdir, self.name, self.finishdir)) batchfile.write( "mv {}/assembly/quast/report.tsv {}/assembly/quast/{}_report.tsv\n\n" .format(self.finishdir, self.finishdir, self.name)) batchfile.close() def create_snpsection(self): snplist = self.filelist.copy() batchfile = open(self.batchfile, "a+") name = "" # VCFTools filters: vcffilter = "--minQ 30 --thin 50 --minDP 3 --min-meanDP 20" # BCFTools filters: bcffilter = "GL[0]<-500 & GL[1]=0 & QR/RO>30 & QA/AO>30 & QUAL>5000 & ODDS>1100 & GQ>140 & DP>100 & MQM>59 & SAP<15 & PAIRED>0.9 & EPP>3" for item in snplist: if item.count("/") >= 2: name = item.split("/")[-2] if "_" in name: name = name.split("_")[0] batchfile.write("# Basecalling for sample {}\n".format(name)) ref = "{}/{}.fasta".format(self.config["folders"]["genomes"], self.sample.get("reference")) outbase = "{}/{}_{}".format(item, name, self.sample.get("reference")) batchfile.write( "samtools view -h -q 1 -F 4 -F 256 {}.bam_sort_rmdup | grep -v XA:Z | grep -v SA:Z| samtools view -b - > {}/{}.unique\n" .format(outbase, self.finishdir, name)) batchfile.write( "freebayes -= --pvar 0.7 -j -J --standard-filters -C 6 --min-coverage 30 --ploidy 1 -f {} -b {}/{}.unique -v {}/{}.vcf\n" .format(ref, self.finishdir, name, self.finishdir, name)) batchfile.write( "bcftools view {}/{}.vcf -o {}/{}.bcf.gz -O b --exclude-uncalled --types snps\n" .format(self.finishdir, name, self.finishdir, name)) batchfile.write("bcftools index {}/{}.bcf.gz\n".format( self.finishdir, name)) batchfile.write("\n") batchfile.write( "vcftools --bcf {}/{}.bcf.gz {} --remove-filtered-all --recode-INFO-all --recode-bcf --out {}/{}\n" .format(self.finishdir, name, vcffilter, self.finishdir, name)) batchfile.write( 'bcftools view {}/{}.recode.bcf -i "{}" -o {}/{}.recode.bcf.gz -O b --exclude-uncalled --types snps\n' .format(self.finishdir, name, bcffilter, self.finishdir, name)) batchfile.write("bcftools index {}/{}.recode.bcf.gz\n\n".format( self.finishdir, name)) batchfile.write("# SNP pair-wise distance\n") batchfile.write("touch {}/stats.out\n".format(self.finishdir)) while len(snplist) > 1: nameOne = "" nameTwo = "" top = snplist.pop(0) if top.count("/") >= 2: nameOne = top.split("/")[-2] if "_" in nameOne: nameOne = nameOne.split("_")[0] for entry in snplist: if entry.count("/") >= 2: nameTwo = entry.split("/")[-2] if "_" in nameTwo: nameTwo = nameTwo.split("_")[0] pair = "{}_{}".format(nameOne, nameTwo) batchfile.write( "bcftools isec {}/{}.recode.bcf.gz {}/{}.recode.bcf.gz -n=1 -c all -p {}/tmp -O b\n" .format(self.finishdir, nameOne, self.finishdir, nameTwo, self.finishdir)) batchfile.write( "bcftools merge -O b -o {}/{}.bcf.gz --force-samples {}/tmp/0000.bcf {}/tmp/0001.bcf\n" .format(self.finishdir, pair, self.finishdir, self.finishdir)) batchfile.write("bcftools index {}/{}.bcf.gz\n".format( self.finishdir, pair)) batchfile.write( "echo {} $( bcftools stats {}/{}.bcf.gz |grep SNPs: | cut -d $'\\t' -f4 ) >> {}/stats.out\n" .format(pair, self.finishdir, pair, self.finishdir)) batchfile.write("\n") batchfile.close() def create_collection(self): """Creates collection entry in database""" if self.db_pusher.exists("Collections", {"ID_collection": self.name}): self.db_pusher.purge_rec(name=self.name, type="Collections") for sample in self.pool: self.db_pusher.add_rec( { "ID_collection": self.name, "CG_ID_sample": sample }, "Collections") addedprojs = list() for sample in self.pool: proj = re.search(r"(\w+)A(?:\w+)", sample).group(1) if proj not in addedprojs: self.create_project(proj) addedprojs.append(proj) 
def create_project(self, name): """Creates project in database""" proj_col = dict() proj_col["CG_ID_project"] = name proj_col["Customer_ID_project"] = self.sample.get( "Customer_ID_project") proj_col["Customer_ID"] = self.sample.get("Customer_ID") self.db_pusher.add_rec(proj_col, "Projects") self.db_pusher.upd_rec({"CG_ID_project": name}, "Projects", proj_col) def create_sample(self, name): """Creates sample in database""" try: sample_col = self.db_pusher.get_columns("Samples") sample_col["CG_ID_sample"] = self.sample.get("CG_ID_sample") sample_col["CG_ID_project"] = self.sample.get("CG_ID_project") sample_col["Customer_ID_sample"] = self.sample.get( "Customer_ID_sample") sample_col["reference_genome"] = self.sample.get("reference") sample_col["reference_length"] = self.sample.get( "reference_length") sample_col["date_analysis"] = self.dt sample_col["organism"] = self.sample.get("organism") sample_col["application_tag"] = self.sample.get("application_tag") sample_col["priority"] = self.sample.get("priority") sample_col["date_arrival"] = datetime.strptime( self.sample.get("date_arrival"), "%Y-%m-%d %H:%M:%S") sample_col["date_sequencing"] = datetime.strptime( self.sample.get("date_sequencing"), "%Y-%m-%d %H:%M:%S") sample_col["date_libprep"] = datetime.strptime( self.sample.get("date_libprep"), "%Y-%m-%d %H:%M:%S") sample_col["method_libprep"] = self.sample.get("method_libprep") sample_col["method_sequencing"] = self.sample.get( "method_sequencing") # self.db_pusher.purge_rec(sample_col['CG_ID_sample'], 'sample') self.db_pusher.add_rec(sample_col, "Samples") except Exception as e: self.logger.error("Unable to add sample {} to database".format( self.name)) def project_job(self, single_sample=False): if "dry" in self.config and self.config["dry"] == True: dry = True else: dry = False jobarray = list() if not os.path.exists(self.finishdir): os.makedirs(self.finishdir) # Loads project level info. try: if single_sample: self.create_project(self.sample.get("CG_ID_project")) elif self.pool: self.create_collection() else: self.create_project(self.name) except Exception as e: self.logger.error( "LIMS interaction failed. Unable to read/write project {}". 
format(self.name)) # Writes the job creation sbatch if single_sample: try: self.sample_job() headerargs = self.get_headerargs() outfile = self.get_sbatch() bash_cmd = "sbatch {} {}".format(headerargs, outfile) if not dry and outfile != "": samproc = subprocess.Popen(bash_cmd.split(), stdout=subprocess.PIPE) output, error = samproc.communicate() jobno = re.search(r"(\d+)", str(output)).group(0) jobarray.append(jobno) else: self.logger.info("Suppressed command: {}".format(bash_cmd)) except Exception as e: self.logger.error("Unable to analyze single sample {}".format( self.name)) else: for ldir in glob.glob("{}/*/".format(self.indir)): ldir = os.path.basename(os.path.normpath(ldir)) try: sample_in = "{}/{}".format(self.indir, ldir) sample_out = "{}/{}".format(self.finishdir, ldir) linkedjson = None local_sampleinfo = [ p for p in self.sampleinfo if p["CG_ID_sample"] == ldir ] if local_sampleinfo == []: raise Exception( "Sample {} has no counterpart in json file".format( ldir)) else: local_sampleinfo = local_sampleinfo[0] sample_settings = dict(self.run_settings) sample_settings["input"] = sample_in sample_settings["finishdir"] = sample_out sample_settings["timestamp"] = self.now sample_instance = Job_Creator( config=self.config, log=self.logger, sampleinfo=local_sampleinfo, run_settings=sample_settings, ) sample_instance.sample_job() headerargs = sample_instance.get_headerargs() outfile = "" if os.path.isfile(sample_instance.get_sbatch()): outfile = sample_instance.get_sbatch() bash_cmd = "sbatch {} {}".format(headerargs, outfile) if not dry and outfile != "": projproc = subprocess.Popen(bash_cmd.split(), stdout=subprocess.PIPE) output, error = projproc.communicate() jobno = re.search(r"(\d+)", str(output)).group(0) jobarray.append(jobno) else: self.logger.info( "Suppressed command: {}".format(bash_cmd)) except Exception as e: pass if not dry: self.finish_job(jobarray, single_sample) def finish_job(self, joblist, single_sample=False): """ Uploads data and sends an email once all analysis jobs are complete. 
""" report = "default" if self.qc_only: report = "qc" custom_conf = "" if "config_path" in self.config: custom_conf = "--config {}".format(self.config["config_path"]) process = subprocess.Popen("id -un".split(), stdout=subprocess.PIPE) user, error = process.communicate() user = str(user).replace(".", " ").title() # if not os.path.exists(self.finishdir): # os.makedirs(self.finishdir) startfile = "{}/run_started.out".format(self.finishdir) configfile = "{}/config.log".format(self.finishdir) mailfile = "{}/mailjob.sh".format(self.finishdir) samplefile = "{}/sampleinfo.json".format(self.finishdir) with open(samplefile, "w+") as outfile: json.dump(self.sampleinfo, outfile) sb = open(startfile, "w+") cb = open(configfile, "w+") mb = open(mailfile, "w+") sb.write("#!/usr/bin/env bash\n") sb.close() configout = self.config.copy() if "genologics" in configout: del configout["genologics"] cb.write("ANALYSIS STARTED BY: {}\n".format(user)) cb.write(json.dumps(configout, indent=2, separators=(",", ":"))) cb.close() mb.write("#!/usr/bin/env bash\n\n") mb.write( "#Uploading of results to database and production of report\n") if "MICROSALT_CONFIG" in os.environ: mb.write("export MICROSALT_CONFIG={}\n".format( os.environ["MICROSALT_CONFIG"])) mb.write("source activate $CONDA_DEFAULT_ENV\n") mb.write( "microSALT utils finish {0}/sampleinfo.json --input {0} --email {1} --report {2} {3}\n" .format( self.finishdir, self.config["regex"]["mail_recipient"], report, custom_conf, )) mb.write("touch {}/run_complete.out".format(self.finishdir)) mb.close() massagedJobs = list() final = ":".join(joblist) # Create subtracker if more than 50 samples maxlen = 50 if len(joblist) > maxlen: i = 1 while i <= len(joblist): if i + maxlen < len(joblist): massagedJobs.append(":".join(joblist[i - 1:i + maxlen - 1])) else: massagedJobs.append(":".join(joblist[i - 1:-1])) i += maxlen for entry in massagedJobs: if massagedJobs.index(entry) < len(massagedJobs) - 1: head = "-A {} -p core -n 1 -t 00:00:10 -J {}_{}_SUBTRACKER --qos {} --dependency=afterany:{}".format( self.config["slurm_header"]["project"], self.config["slurm_header"]["job_prefix"], self.name, self.config["slurm_header"]["qos"], entry, ) bash_cmd = "sbatch {} {}".format(head, startfile) mailproc = subprocess.Popen(bash_cmd.split(), stdout=subprocess.PIPE) output, error = mailproc.communicate() jobno = re.search(r"(\d+)", str(output)).group(0) massagedJobs[massagedJobs.index(entry) + 1] += ":{}".format(jobno) else: final = entry break head = "-A {} -p core -n 1 -t 6:00:00 -J {}_{}_MAILJOB --qos {} --open-mode append --dependency=afterany:{} --output {}".format( self.config["slurm_header"]["project"], self.config["slurm_header"]["job_prefix"], self.name, self.config["slurm_header"]["qos"], final, self.config["folders"]["log_file"], self.config["regex"]["mail_recipient"], ) bash_cmd = "sbatch {} {}".format(head, mailfile) mailproc = subprocess.Popen(bash_cmd.split(), stdout=subprocess.PIPE) output, error = mailproc.communicate() try: jobno = str(re.search(r"(\d+)", str(output)).group(0)) joblist.append(jobno) except Exception as e: self.logger.info("Unable to grab SLURMID for {0}".format( self.name)) try: #Generates file with all slurm ids slurmname = "{}_slurm_ids.yaml".format(self.name) slurmreport_storedir = Path(self.config["folders"]["reports"], "trailblazer", slurmname) slurmreport_workdir = Path(self.finishdir, slurmname) yaml.safe_dump(data={"jobs": [str(job) for job in joblist]}, stream=open(slurmreport_workdir, "w")) shutil.copyfile(slurmreport_workdir, 
slurmreport_storedir) self.logger.info( "Saved Trailblazer slurm report file to %s and %s", slurmreport_storedir, slurmreport_workdir, ) except Exception as e: self.logger.info( "Unable to generate Trailblazer slurm report file") def sample_job(self): """ Writes necessary sbatch job for each individual sample """ try: if not os.path.exists(self.finishdir): os.makedirs(self.finishdir) try: # This is one job self.batchfile = "{}/runfile.sbatch".format(self.finishdir) batchfile = open(self.batchfile, "w+") batchfile.write("#!/bin/sh\n\n") batchfile.write("mkdir -p {}\n".format(self.finishdir)) batchfile.close() self.create_preprocsection() self.create_variantsection() if not self.qc_only: self.create_assemblysection() self.create_assemblystats_section() self.create_blast_search() batchfile = open(self.batchfile, "a+") batchfile.close() self.logger.info( "Created runfile for sample {} in folder {}".format( self.name, self.finishdir)) except Exception as e: raise try: self.create_sample(self.name) except Exception as e: self.logger.error( "Unable to access LIMS info for sample {}".format( self.name)) except Exception as e: self.logger.error( "Unable to create job for sample {}\nSource: {}".format( self.name, str(e))) shutil.rmtree(self.finishdir, ignore_errors=True) raise def create_blast_search(self): reforganism = self.ref_resolver.organism2reference( self.sample.get("organism")) self.batchfile = "{}/runfile.sbatch".format(self.finishdir) batchfile = open(self.batchfile, "a+") batchfile.write("mkdir -p {}/blast_search\n".format(self.finishdir)) batchfile.close() self.blast_subset( "mlst", "{}/{}/*.tfa".format(self.config["folders"]["references"], reforganism), ) self.blast_subset( "resistance", "{}/*.fsa".format(self.config["folders"]["resistances"])) if reforganism == "escherichia_coli": ss = "{}/*{}".format( os.path.dirname(self.config["folders"]["expec"]), os.path.splitext(self.config["folders"]["expec"])[1], ) self.blast_subset("expec", ss) def snp_job(self): """ Writes a SNP calling job for a set of samples """ if not os.path.exists(self.finishdir): os.makedirs(self.finishdir) self.batchfile = "{}/runfile.sbatch".format(self.finishdir) batchfile = open(self.batchfile, "w+") batchfile.write("#!/usr/bin/env bash\n") batchfile.write("mkdir -p {}\n\n".format(self.finishdir)) batchfile.close() self.create_snpsection() batchfile = open(self.batchfile, "a+") batchfile.close() headerline = ( "-A {} -p {} -n 1 -t 24:00:00 -J {}_{} --qos {} --output {}/slurm_{}.log" .format( self.config["slurm_header"]["project"], self.config["slurm_header"]["type"], self.config["slurm_header"]["job_prefix"], self.name, self.config["slurm_header"]["qos"], self.finishdir, self.name, )) outfile = self.get_sbatch() bash_cmd = "sbatch {} {}".format(headerline, outfile) samproc = subprocess.Popen(bash_cmd.split(), stdout=subprocess.PIPE) output, error = samproc.communicate()
def dbm(): db_file = re.search('sqlite:///(.+)', preset_config['database']['SQLALCHEMY_DATABASE_URI']).group(1) dbm = DB_Manipulator(config=preset_config,log=logger) dbm.create_tables() for antry in unpack_db_json('sampleinfo_projects.json'): dbm.add_rec(antry, 'Projects') for entry in unpack_db_json('sampleinfo_mlst.json'): dbm.add_rec(entry, 'Seq_types') for bentry in unpack_db_json('sampleinfo_resistance.json'): dbm.add_rec(bentry, 'Resistances') for centry in unpack_db_json('sampleinfo_expec.json'): dbm.add_rec(centry, 'Expacs') for dentry in unpack_db_json('sampleinfo_reports.json'): dbm.add_rec(dentry, 'Reports') return dbm
class Scraper: def __init__(self, config, log, sampleinfo={}, input=""): self.config = config self.logger = log self.db_pusher = DB_Manipulator(config, log) self.referencer = Referencer(config, log) self.job_fallback = Job_Creator(config=config, log=log, sampleinfo=sampleinfo) self.infolder = os.path.abspath(input) self.sampledir = "" # Since all scraped folders are generated by Job_Creator, datestring is automatically provided in last folder last_folder = self.infolder.split("/")[-1] self.name = last_folder.split("_")[0] self.sampleinfo = sampleinfo self.sample = None if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1: self.name = self.sampleinfo[0].get("CG_ID_project") self.sample = self.sampleinfo[0] for entry in self.sampleinfo: if entry.get("CG_ID_sample") == self.name: raise Exception( "Mixed projects in samples_info file. Do not know how to proceed" ) else: if isinstance(self.sampleinfo, list): self.sampleinfo = self.sampleinfo[0] self.name = self.sampleinfo.get("CG_ID_sample") self.sample = self.sampleinfo self.gene2resistance = self.load_resistances() def scrape_project(self, project=None): """Scrapes a project folder for information""" if project is None: project = self.name self.db_pusher.purge_rec(project, "Projects") if not self.db_pusher.exists("Projects", {"CG_ID_project": project}): self.logger.warning("Replacing project {}".format(project)) self.job_fallback.create_project(project) # Scrape order matters a lot! for dir in os.listdir(self.infolder): subdir = "{}/{}".format(self.infolder, dir) local_param = [ p for p in self.sampleinfo if p["CG_ID_sample"] == dir ] if local_param != []: local_param = local_param[0] sample_scraper = Scraper( config=self.config, log=self.logger, sampleinfo=local_param, input=subdir, ) sample_scraper.scrape_sample() else: self.logger.warning( "Skipping {} due to lacking info in sample_json file". format(dir)) def scrape_sample(self, sample=None): """Scrapes a sample folder for information""" if sample is None: sample = self.name self.db_pusher.purge_rec(sample, "Samples") if not self.db_pusher.exists( "Projects", {"CG_ID_project": self.sample.get("CG_ID_project")}): self.logger.warning("Replacing project {}".format( self.sample.get("CG_ID_project"))) self.job_fallback.create_project(self.sample.get("CG_ID_project")) if not self.db_pusher.exists("Samples", {"CG_ID_sample": sample}): self.logger.warning("Replacing sample {}".format(sample)) self.job_fallback.create_sample(sample) # Scrape order matters a lot! 
self.sampledir = self.infolder self.scrape_blast(type="seq_type") self.scrape_blast(type="resistance") if (self.referencer.organism2reference( self.sample.get("organism")) == "escherichia_coli"): self.scrape_blast(type="expec") self.scrape_alignment() self.scrape_quast() def scrape_quast(self, filename=""): """Scrapes a quast report for assembly information""" if filename == "": filename = "{}/assembly/quast/report.tsv".format(self.sampledir) quast = dict() try: with open(filename, "r") as infile: for line in infile: lsplit = line.rstrip().split("\t") if lsplit[0] == "# contigs": quast["contigs"] = int(lsplit[1]) elif lsplit[0] == "Total length": quast["genome_length"] = int(lsplit[1]) elif lsplit[0] == "GC (%)": quast["gc_percentage"] = float(lsplit[1]) elif lsplit[0] == "N50": quast["n50"] = int(lsplit[1]) self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples", quast) self.logger.debug("Project {} recieved quast stats: {}".format( self.name, quast)) except Exception as e: self.logger.warning( "Cannot generate quast statistics for {}".format(self.name)) def get_locilengths(self, foldername, suffix): """ Generate a dict of length for any given loci """ # Create dict with full name as key, associated nucleotides as value. alleles = dict() finalalleles = dict() for file in os.listdir(foldername): if file.endswith(suffix): lastallele = "" f = open("{}/{}".format(foldername, file), "r") for row in f: if ">" in row: lastallele = row.strip() alleles[lastallele] = "" else: alleles[lastallele] = alleles[lastallele] + row.strip() f.close() for k, v in alleles.items(): finalalleles[k] = len(v) return finalalleles def scrape_blast(self, type="", file_list=[]): hypo = list() type2db = type.capitalize() + "s" if type == "expec": type2db = "Expacs" if file_list == []: if type == "seq_type": file_list = glob.glob("{}/blast_search/mlst/*".format( self.sampledir)) else: file_list = glob.glob("{}/blast_search/{}/*".format( self.sampledir, type)) organism = self.referencer.organism2reference( self.sample.get("organism")) if organism: self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples", {"organism": organism}) res_cols = self.db_pusher.get_columns("{}".format(type2db)) try: old_ref = "" for file in file_list: filename = os.path.basename(file).rsplit( ".", 1)[0] # Removes suffix if filename == "lactam": filename = "beta-lactam" if type == "resistance": ref_folder = self.config["folders"]["resistances"] suffix = "fsa" elif type == "expec": ref_folder = os.path.dirname( self.config["folders"]["expec"]) suffix = os.path.basename( self.config["folders"]["expec"]).rsplit(".", 1)[1] elif type == "seq_type": ref_folder = "{}/{}".format( self.config["folders"]["references"], organism) suffix = "tfa" locilengths = self.get_locilengths(ref_folder, suffix) with open("{}".format(file), "r") as sample: for line in sample: # Ignore commented fields if not line[0] == "#": elem_list = line.rstrip().split("\t") if not elem_list[1] == "N/A": hypo.append(dict()) hypo[-1]["CG_ID_sample"] = self.name hypo[-1]["identity"] = elem_list[4] hypo[-1]["evalue"] = elem_list[5] hypo[-1]["bitscore"] = elem_list[6] if int(elem_list[7]) < int(elem_list[8]): hypo[-1]["contig_start"] = int( elem_list[7]) hypo[-1]["contig_end"] = int(elem_list[8]) else: hypo[-1]["contig_start"] = int( elem_list[8]) hypo[-1]["contig_end"] = int(elem_list[7]) hypo[-1]["subject_length"] = int(elem_list[11]) if type == "resistance": hypo[-1]["instance"] = filename partials = re.search( r"(?:\>)*(.+)_(\d+){1,3}(?:_(.+))*", elem_list[3], ) 
hypo[-1]["reference"] = partials.group(3) hypo[-1]["gene"] = partials.group(1) if hypo[-1][ "gene"] in self.gene2resistance.keys( ): hypo[-1][ "resistance"] = self.gene2resistance[ hypo[-1]["gene"]] else: hypo[-1]["{}".format(type)] = hypo[-1][ "instance"].capitalize() hypo[-1]["span"] = ( float(hypo[-1]["subject_length"]) / locilengths[">{}".format( elem_list[3])]) elif type == "expec": hypo[-1]["instance"] = filename # Thanks, precompiled list standards if ">" in elem_list[3]: partials = re.search( r">*(\w+_\w+\.*\w+).+\((\w+)\).+\((\w+)\)_(\w+)_\[.+\]", elem_list[3], ) else: partials = re.search( r"(\w+)\(gb\|\w+\)_\((\S+)\)_(.+)_\[(\S+)_.+\]_\[\S+\]", elem_list[3], ) if not partials: partials = re.search( r"(\w+\.*\w+)\:*\w*_*(?:\(\w+\-\w+\))*_\((\w+)\)_([^[]+)\[\S+\]", elem_list[3], ) # NC/Protein reference hypo[-1]["reference"] = partials.group(1) # Full gene name hypo[-1]["gene"] = partials.group(2) # More generic group hypo[-1]["instance"] = partials.group( 3).strip("_") # Description if len(partials.groups()) >= 4: hypo[-1]["virulence"] = ( partials.group(4).replace( "_", " ").capitalize()) else: hypo[-1]["virulence"] = "" hypo[-1]["span"] = ( float(hypo[-1]["subject_length"]) / locilengths[">{}".format( elem_list[3])]) elif type == "seq_type": partials = re.search( r"(.+)_(\d+){1,3}(?:_(\w+))*", elem_list[3]) hypo[-1]["loci"] = partials.group(1) hypo[-1]["allele"] = int(partials.group(2)) hypo[-1]["span"] = ( float(hypo[-1]["subject_length"]) / locilengths[">{}".format( elem_list[3])]) # split elem 2 into contig node_NO, length, cov nodeinfo = elem_list[2].split("_") hypo[-1]["contig_name"] = "{}_{}".format( nodeinfo[0], nodeinfo[1]) hypo[-1]["contig_length"] = int(nodeinfo[3]) hypo[-1]["contig_coverage"] = nodeinfo[5] self.logger.debug( "scrape_blast scrape loop hit") self.logger.info("{} candidate {} hits found".format( len(hypo), type2db)) except Exception as e: self.logger.error("Unable to process the pattern of {}".format( str(e))) # Cleanup of overlapping hits if type == "seq_type": identifier = "loci" elif type == "resistance" or type == "expec": identifier = "gene" ind = 0 while ind < len(hypo) - 1: targ = ind + 1 while targ < len(hypo): ignore = False if (hypo[ind]["contig_name"] == hypo[targ]["contig_name"] or hypo[ind][identifier] == hypo[targ][identifier]): # Overlapping or shared gene if ((hypo[ind].get("contig_start") >= hypo[targ].get("contig_start") and hypo[ind].get("contig_start") <= hypo[targ].get("contig_end")) or (hypo[ind].get("contig_end") >= hypo[targ].get("contig_start") and hypo[ind].get("contig_end") <= hypo[targ].get("contig_end")) or (hypo[ind].get(identifier) == hypo[targ].get(identifier))): # Rightmost is worse if float(hypo[ind].get("identity")) * ( 1 - abs(1 - hypo[ind].get("span"))) > float( hypo[targ].get("identity")) * ( 1 - abs(1 - hypo[targ].get("span"))): del hypo[targ] ignore = True # Leftmost is worse elif float(hypo[ind].get("identity")) * ( 1 - abs(1 - hypo[ind].get("span"))) < float( hypo[targ].get("identity")) * ( 1 - abs(1 - hypo[targ].get("span"))): del hypo[ind] targ = ind + 1 ignore = True # Identical identity and span, seperating based on contig coverage else: # Rightmost is worse if float( hypo[ind].get("contig_coverage")) >= float( hypo[targ].get("contig_coverage")): del hypo[targ] ignore = True # Leftmost is worse elif float( hypo[ind].get("contig_coverage")) < float( hypo[targ].get("contig_coverage")): del hypo[ind] targ = ind + 1 ignore = True if not ignore: targ += 1 else: pass ind += 1 self.logger.info( "{} {} hits were 
added after removing overlaps and duplicate hits". format(len(hypo), type)) for hit in hypo: self.logger.debug("Kept {}:{} with span {} and id {}".format( hit.get("loci"), hit.get("allele"), hit.get("span"), hit.get("identity"), )) self.db_pusher.add_rec(hit, "{}".format(type2db)) if type == "seq_type": try: ST = self.db_pusher.alleles2st(self.name) self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples", {"ST": ST}) self.logger.info("Sample {} received ST {}".format( self.name, ST)) except Exception as e: self.logger.warning( "Unable to type sample {} due to data value '{}'".format( self.name, str(e))) def load_resistances(self): """Legacy function, loads common resistance names for genes from notes file""" conversions = dict() try: with open("{}/notes.txt".format( self.config["folders"]["resistances"])) as fh: for line in fh: if "#" not in line: line = line.split(":") cropped = re.sub(" resistance", "", line[1]) conversions[line[0]] = cropped # Workaround for case issues conversions[line[0].lower()] = cropped except Exception as e: self.logger.error( "Unable to initialize trivial names for resistances ({})". format(e)) return conversions def scrape_alignment(self, file_list=[]): """Scrapes a single alignment result""" if file_list == []: file_list = glob.glob("{}/alignment/*.stats.*".format( self.sampledir)) ins_list = list() cov_dict = dict() align_dict = dict() align_dict["reference_genome"] = self.sample.get("reference") # Reading file_list = glob.glob("{}/alignment/*.stats.*".format(self.sampledir)) map_rate = 0.0 median_ins = 0 ref_len = 0.0 tot_reads = 0 tot_map = 0 duprate = 0.0 for file in file_list: with open(file, "r") as fh: type = file.split(".")[-1] for line in fh.readlines(): lsplit = line.rstrip().split("\t") if type == "raw": try: tot_reads = int(lsplit[0]) except Exception as e: pass elif type == "ins": if len(lsplit) >= 18 and lsplit[-12] in ["FF", "FR"]: try: median_ins = int(lsplit[0]) except Exception as e: pass elif type == "cov": cov_dict[lsplit[1]] = int(lsplit[2]) elif type == "ref": if lsplit[0] != "*" and len(lsplit) >= 2: ref_len = ref_len + int(lsplit[1]) elif type == "dup": if lsplit[0] == "Unknown Library": try: duprate = float(lsplit[8]) except Exception as e: duprate = -1.0 elif type == "map": dsplit = line.rstrip().split(" ") if len(dsplit) >= 5 and dsplit[4] == "total": tot_map = int(dsplit[0]) elif len(dsplit) >= 4 and dsplit[3] == "mapped": if tot_map > 0: map_rate = int(dsplit[0]) / float(tot_map) # Mangling sumz, plus10, plus30, plus50, plus100, total = 0, 0, 0, 0, 0, 0 for k, v in cov_dict.items(): sumz += int(k) * v total += v if int(k) > 10: plus10 += v if int(k) > 30: plus30 += v if int(k) > 50: plus50 += v if int(k) > 100: plus100 += v if total > 0: align_dict["coverage_10x"] = plus10 / float(ref_len) align_dict["coverage_30x"] = plus30 / float(ref_len) align_dict["coverage_50x"] = plus50 / float(ref_len) align_dict["coverage_100x"] = plus100 / float(ref_len) else: align_dict["coverage_10x"] = 0.0 align_dict["coverage_30x"] = 0.0 align_dict["coverage_50x"] = 0.0 align_dict["coverage_100x"] = 0.0 align_dict["mapped_rate"] = map_rate align_dict["insert_size"] = median_ins if ref_len > 0: align_dict["duplication_rate"] = duprate align_dict["average_coverage"] = sumz / float(ref_len) else: align_dict["duplication_rate"] = 0.0 align_dict["average_coverage"] = 0.0 align_dict["total_reads"] = tot_reads self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples", align_dict)