def __init__(self, config, log, sampleinfo=None, input=""):
    """Set up a scraper over one project folder or one sample folder.

    Args:
        config: Parsed pipeline configuration dict.
        log: Logger instance used for all diagnostics.
        sampleinfo: Either a dict describing one sample or a list of such
            dicts describing a project. Defaults to an empty dict.
            (Was a mutable default argument; normalized via a None sentinel.)
        input: Path to the folder to scrape; basename encodes the name.
    """
    # Avoid the shared-mutable-default pitfall while keeping the old
    # effective default of {}.
    if sampleinfo is None:
        sampleinfo = {}
    self.config = config
    self.logger = log
    self.db_pusher = DB_Manipulator(config, log)
    self.referencer = Referencer(config, log)
    self.job_fallback = Job_Creator(config=config, log=log, sampleinfo=sampleinfo)
    self.infolder = os.path.abspath(input)
    self.sampledir = ""
    # Since all scraped folders are generated by Job_Creator, datestring is
    # automatically provided in last folder
    last_folder = self.infolder.split("/")[-1]
    self.name = last_folder.split("_")[0]
    self.sampleinfo = sampleinfo
    self.sample = None
    if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
        # Project-level scrape: all entries must belong to the same project.
        self.name = self.sampleinfo[0].get("CG_ID_project")
        self.sample = self.sampleinfo[0]
        for entry in self.sampleinfo:
            # BUGFIX: previously compared entry.get("CG_ID_sample") == self.name,
            # which compares a sample id to a project id and (almost) never
            # triggers. The intent, per the message, is to reject mixed projects.
            if entry.get("CG_ID_project") != self.name:
                raise Exception(
                    "Mixed projects in samples_info file. Do not know how to proceed"
                )
    else:
        # Single-sample scrape: unwrap a one-element list if given.
        if isinstance(self.sampleinfo, list):
            self.sampleinfo = self.sampleinfo[0]
        self.name = self.sampleinfo.get("CG_ID_sample")
        self.sample = self.sampleinfo
    self.gene2resistance = self.load_resistances()
def test_project_job(subproc, testdata):
    """project_job should run end-to-end with subprocess calls mocked out."""
    # Stand-in process whose communicate() yields canned output
    fake_proc = mock.Mock()
    fake_proc.configure_mock(**{'communicate.return_value': ('output 123456789', 'error')})
    subproc.return_value = fake_proc
    settings = {'pool': ["AAA1234A1", "AAA1234A2"], 'input': '/tmp/AAA1234'}
    creator = Job_Creator(config=preset_config,
                          log=logger,
                          sampleinfo=testdata,
                          run_settings=settings)
    creator.project_job()
def test_verify_fastq(gopen, stat, listdir, testdata):
    """verify_fastq should report a non-empty file list for a mocked fastq dir."""
    # Fake directory listing of gzipped fastq reads
    # NOTE(review): the L2_2 entry appears twice (no L2_1) — confirm intentional
    listdir.return_value = [
        "ACC6438A3_HVMHWDSXX_L1_1.fastq.gz",
        "ACC6438A3_HVMHWDSXX_L1_2.fastq.gz",
        "ACC6438A3_HVMHWDSXX_L2_2.fastq.gz",
        "ACC6438A3_HVMHWDSXX_L2_2.fastq.gz",
    ]
    # Every stat() call reports a non-zero file size
    fake_stat = mock.MagicMock()
    fake_stat.st_size = 2000
    stat.return_value = fake_stat
    creator = Job_Creator(run_settings={'input': '/tmp/'},
                          config=preset_config,
                          log=logger,
                          sampleinfo=testdata)
    found = creator.verify_fastq()
    assert len(found) > 0
def test_create_snpsection(subproc, testdata):
    """snp_job should write a SNP pair-wise distance section to its sbatch file."""
    # Mock subprocess so no external tools actually run
    fake_proc = mock.Mock()
    fake_proc.configure_mock(**{'communicate.return_value': ('output 123456789', 'error')})
    subproc.return_value = fake_proc
    testdata = [testdata[0]]
    creator = Job_Creator(run_settings={'input': ['AAA1234A1', 'AAA1234A2']},
                          config=preset_config,
                          log=logger,
                          sampleinfo=testdata)
    creator.snp_job()
    sbatch = open(creator.get_sbatch(), 'r')
    hits = [row for row in sbatch.readlines() if "# SNP pair-wise distance" in row]
    assert len(hits) > 0
def analyse(ctx, sampleinfo_file, input, config, dry, email, skip_update,
            force_update, untrimmed, uncareful):
    """Sequence analysis, typing and resistance identification"""
    # Run section
    pool = []
    # CLI flags arrive inverted; normalize once and reuse below.
    # (Previously these locals were computed but never used — run_settings
    # re-derived `not untrimmed` / `not uncareful` inline.)
    trimmed = not untrimmed
    careful = not uncareful
    set_cli_config(config)
    ctx.obj["config"]["regex"]["mail_recipient"] = email
    ctx.obj["config"]["dry"] = dry
    if not os.path.isdir(input):
        click.echo(
            "ERROR - Sequence data folder {} does not exist.".format(input))
        ctx.abort()
    # Each subdirectory of the input folder is treated as one sample
    for subfolder in os.listdir(input):
        if os.path.isdir("{}/{}".format(input, subfolder)):
            pool.append(subfolder)
    run_settings = {
        "input": input,
        "dry": dry,
        "email": email,
        "skip_update": skip_update,
        "trimmed": trimmed,
        "careful": careful,
        "pool": pool,
    }
    # Samples section
    sampleinfo = review_sampleinfo(sampleinfo_file)
    run_creator = Job_Creator(
        config=ctx.obj["config"],
        log=ctx.obj["log"],
        sampleinfo=sampleinfo,
        run_settings=run_settings,
    )
    ext_refs = Referencer(config=ctx.obj["config"],
                          log=ctx.obj["log"],
                          sampleinfo=sampleinfo,
                          force=force_update)
    click.echo("INFO - Checking versions of references..")
    try:
        if not skip_update:
            ext_refs.identify_new(project=True)
            ext_refs.update_refs()
            click.echo("INFO - Version check done. Creating sbatch jobs")
        else:
            click.echo("INFO - Skipping version check.")
    except Exception as e:
        # Best-effort: a failed reference update should not block analysis
        click.echo("{}".format(e))
    # Multi-sample runs get a project job; a single sample gets the
    # single_sample layout; an empty sheet aborts.
    if len(sampleinfo) > 1:
        run_creator.project_job()
    elif len(sampleinfo) == 1:
        run_creator.project_job(single_sample=True)
    else:
        ctx.abort()
    done()
def test_blast_subset(glob_search, research, testdata):
    """blast_subset should emit blastn commands into the generated sbatch file."""
    creator = Job_Creator(run_settings={'input': '/tmp/'},
                          config=preset_config,
                          log=logger,
                          sampleinfo=testdata)
    # re.search mock: .group delegates to the shared fake_search helper
    fake_match = mock.MagicMock()
    fake_match.group = fake_search
    research.return_value = fake_match
    glob_search.return_value = ["/a/a/a", "/a/a/b", "/a/a/c"]
    creator.blast_subset('mlst', '/tmp/*')
    creator.blast_subset('other', '/tmp/*')
    sbatch = open(creator.get_sbatch(), 'r')
    hits = [row for row in sbatch.readlines() if "blastn -db" in row]
    assert len(hits) > 0
class Scraper:
    """Walks analysis output folders and pushes parsed results to the database.

    A Scraper instance is bound either to a whole project folder (list of
    sample dicts in ``sampleinfo``) or to a single sample folder (one dict).
    Results from blast searches, alignments and quast reports are parsed and
    upserted through ``DB_Manipulator``.
    """

    def __init__(self, config, log, sampleinfo={}, input=""):
        # NOTE(review): sampleinfo={} is a mutable default argument; it is
        # never mutated here so this is benign, but a None sentinel is safer.
        self.config = config
        self.logger = log
        self.db_pusher = DB_Manipulator(config, log)
        self.referencer = Referencer(config, log)
        self.job_fallback = Job_Creator(config=config, log=log, sampleinfo=sampleinfo)
        self.infolder = os.path.abspath(input)
        self.sampledir = ""
        # Since all scraped folders are generated by Job_Creator, datestring is
        # automatically provided in last folder
        last_folder = self.infolder.split("/")[-1]
        self.name = last_folder.split("_")[0]
        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            # Project-level scrape: name comes from the first entry's project id
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                # NOTE(review): this compares a sample id to the project id, so
                # the guard can essentially never fire; the message suggests the
                # intent was `entry.get("CG_ID_project") != self.name` — confirm.
                if entry.get("CG_ID_sample") == self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            # Single-sample scrape: unwrap a one-element list if provided
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo
        self.gene2resistance = self.load_resistances()

    def scrape_project(self, project=None):
        """Scrapes a project folder for information.

        Recreates the project record, then spawns a child Scraper per sample
        subfolder that has a matching entry in sampleinfo.
        """
        if project is None:
            project = self.name
        self.db_pusher.purge_rec(project, "Projects")
        if not self.db_pusher.exists("Projects", {"CG_ID_project": project}):
            self.logger.warning("Replacing project {}".format(project))
            self.job_fallback.create_project(project)
        # Scrape order matters a lot!
        for dir in os.listdir(self.infolder):
            subdir = "{}/{}".format(self.infolder, dir)
            # Find the sampleinfo entry whose sample id matches the folder name
            local_param = [
                p for p in self.sampleinfo if p["CG_ID_sample"] == dir
            ]
            if local_param != []:
                local_param = local_param[0]
                sample_scraper = Scraper(
                    config=self.config,
                    log=self.logger,
                    sampleinfo=local_param,
                    input=subdir,
                )
                sample_scraper.scrape_sample()
            else:
                self.logger.warning(
                    "Skipping {} due to lacking info in sample_json file".
                    format(dir))

    def scrape_sample(self, sample=None):
        """Scrapes a sample folder for information.

        Ensures the parent project and the sample record exist, then runs the
        individual scrapers. Order of the scrape calls is significant.
        """
        if sample is None:
            sample = self.name
        self.db_pusher.purge_rec(sample, "Samples")
        if not self.db_pusher.exists(
                "Projects", {"CG_ID_project": self.sample.get("CG_ID_project")}):
            self.logger.warning("Replacing project {}".format(
                self.sample.get("CG_ID_project")))
            self.job_fallback.create_project(self.sample.get("CG_ID_project"))
        if not self.db_pusher.exists("Samples", {"CG_ID_sample": sample}):
            self.logger.warning("Replacing sample {}".format(sample))
            self.job_fallback.create_sample(sample)
        # Scrape order matters a lot!
        self.sampledir = self.infolder
        self.scrape_blast(type="seq_type")
        self.scrape_blast(type="resistance")
        # ExPEC virulence scraping only applies to E. coli samples
        if (self.referencer.organism2reference(
                self.sample.get("organism")) == "escherichia_coli"):
            self.scrape_blast(type="expec")
        self.scrape_alignment()
        self.scrape_quast()

    def scrape_quast(self, filename=""):
        """Scrapes a quast report for assembly information.

        Parses the tab-separated report.tsv and updates the sample record with
        contig count, genome length, GC percentage and N50.
        """
        if filename == "":
            filename = "{}/assembly/quast/report.tsv".format(self.sampledir)
        quast = dict()
        try:
            with open(filename, "r") as infile:
                for line in infile:
                    lsplit = line.rstrip().split("\t")
                    if lsplit[0] == "# contigs":
                        quast["contigs"] = int(lsplit[1])
                    elif lsplit[0] == "Total length":
                        quast["genome_length"] = int(lsplit[1])
                    elif lsplit[0] == "GC (%)":
                        quast["gc_percentage"] = float(lsplit[1])
                    elif lsplit[0] == "N50":
                        quast["n50"] = int(lsplit[1])
            self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples", quast)
            self.logger.debug("Project {} recieved quast stats: {}".format(
                self.name, quast))
        except Exception as e:
            # Best-effort: a missing/garbled report only produces a warning
            self.logger.warning(
                "Cannot generate quast statistics for {}".format(self.name))

    def get_locilengths(self, foldername, suffix):
        """ Generate a dict of length for any given loci """
        # Create dict with full name as key, associated nucleotides as value.
        alleles = dict()
        finalalleles = dict()
        for file in os.listdir(foldername):
            if file.endswith(suffix):
                lastallele = ""
                f = open("{}/{}".format(foldername, file), "r")
                # FASTA-style parse: header lines (">") start a new allele,
                # other lines extend the current allele's sequence
                for row in f:
                    if ">" in row:
                        lastallele = row.strip()
                        alleles[lastallele] = ""
                    else:
                        alleles[lastallele] = alleles[lastallele] + row.strip()
                f.close()
        # Reduce sequences to their lengths
        for k, v in alleles.items():
            finalalleles[k] = len(v)
        return finalalleles

    def scrape_blast(self, type="", file_list=[]):
        """Parse blast result files of one type and push the kept hits.

        type is one of "seq_type", "resistance" or "expec"; it selects the
        reference folder, the header regex and the destination table.
        NOTE(review): `type` shadows the builtin and file_list=[] is a mutable
        default (read-only here, so benign).
        """
        hypo = list()
        # Destination table name: "Seq_types"/"Resistances", except expec
        type2db = type.capitalize() + "s"
        if type == "expec":
            type2db = "Expacs"
        if file_list == []:
            if type == "seq_type":
                file_list = glob.glob("{}/blast_search/mlst/*".format(
                    self.sampledir))
            else:
                file_list = glob.glob("{}/blast_search/{}/*".format(
                    self.sampledir, type))
        organism = self.referencer.organism2reference(
            self.sample.get("organism"))
        if organism:
            self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                   {"organism": organism})
        res_cols = self.db_pusher.get_columns("{}".format(type2db))
        try:
            old_ref = ""
            for file in file_list:
                filename = os.path.basename(file).rsplit(
                    ".", 1)[0]  # Removes suffix
                if filename == "lactam":
                    filename = "beta-lactam"
                # Pick reference folder + file suffix per scrape type
                if type == "resistance":
                    ref_folder = self.config["folders"]["resistances"]
                    suffix = "fsa"
                elif type == "expec":
                    ref_folder = os.path.dirname(
                        self.config["folders"]["expec"])
                    suffix = os.path.basename(
                        self.config["folders"]["expec"]).rsplit(".", 1)[1]
                elif type == "seq_type":
                    ref_folder = "{}/{}".format(
                        self.config["folders"]["references"], organism)
                    suffix = "tfa"
                locilengths = self.get_locilengths(ref_folder, suffix)
                with open("{}".format(file), "r") as sample:
                    for line in sample:
                        # Ignore commented fields
                        if not line[0] == "#":
                            elem_list = line.rstrip().split("\t")
                            if not elem_list[1] == "N/A":
                                hypo.append(dict())
                                hypo[-1]["CG_ID_sample"] = self.name
                                hypo[-1]["identity"] = elem_list[4]
                                hypo[-1]["evalue"] = elem_list[5]
                                hypo[-1]["bitscore"] = elem_list[6]
                                # Normalize start/end so start <= end
                                if int(elem_list[7]) < int(elem_list[8]):
                                    hypo[-1]["contig_start"] = int(
                                        elem_list[7])
                                    hypo[-1]["contig_end"] = int(elem_list[8])
                                else:
                                    hypo[-1]["contig_start"] = int(
                                        elem_list[8])
                                    hypo[-1]["contig_end"] = int(elem_list[7])
                                hypo[-1]["subject_length"] = int(elem_list[11])
                                if type == "resistance":
                                    hypo[-1]["instance"] = filename
                                    # Header shape: gene_allele[_reference]
                                    partials = re.search(
                                        r"(?:\>)*(.+)_(\d+){1,3}(?:_(.+))*",
                                        elem_list[3],
                                    )
                                    hypo[-1]["reference"] = partials.group(3)
                                    hypo[-1]["gene"] = partials.group(1)
                                    if hypo[-1][
                                            "gene"] in self.gene2resistance.keys(
                                            ):
                                        hypo[-1][
                                            "resistance"] = self.gene2resistance[
                                                hypo[-1]["gene"]]
                                    else:
                                        hypo[-1]["{}".format(type)] = hypo[-1][
                                            "instance"].capitalize()
                                    # span = hit length relative to full locus
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])
                                elif type == "expec":
                                    hypo[-1]["instance"] = filename
                                    # Thanks, precompiled list standards
                                    if ">" in elem_list[3]:
                                        partials = re.search(
                                            r">*(\w+_\w+\.*\w+).+\((\w+)\).+\((\w+)\)_(\w+)_\[.+\]",
                                            elem_list[3],
                                        )
                                    else:
                                        partials = re.search(
                                            r"(\w+)\(gb\|\w+\)_\((\S+)\)_(.+)_\[(\S+)_.+\]_\[\S+\]",
                                            elem_list[3],
                                        )
                                        if not partials:
                                            partials = re.search(
                                                r"(\w+\.*\w+)\:*\w*_*(?:\(\w+\-\w+\))*_\((\w+)\)_([^[]+)\[\S+\]",
                                                elem_list[3],
                                            )
                                    # NC/Protein reference
                                    hypo[-1]["reference"] = partials.group(1)
                                    # Full gene name
                                    hypo[-1]["gene"] = partials.group(2)
                                    # More generic group
                                    hypo[-1]["instance"] = partials.group(
                                        3).strip("_")
                                    # Description
                                    if len(partials.groups()) >= 4:
                                        hypo[-1]["virulence"] = (
                                            partials.group(4).replace(
                                                "_", " ").capitalize())
                                    else:
                                        hypo[-1]["virulence"] = ""
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])
                                elif type == "seq_type":
                                    # Header shape: locus_allele
                                    partials = re.search(
                                        r"(.+)_(\d+){1,3}(?:_(\w+))*",
                                        elem_list[3])
                                    hypo[-1]["loci"] = partials.group(1)
                                    hypo[-1]["allele"] = int(partials.group(2))
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])
                                # split elem 2 into contig node_NO, length, cov
                                nodeinfo = elem_list[2].split("_")
                                hypo[-1]["contig_name"] = "{}_{}".format(
                                    nodeinfo[0], nodeinfo[1])
                                hypo[-1]["contig_length"] = int(nodeinfo[3])
                                hypo[-1]["contig_coverage"] = nodeinfo[5]
                                self.logger.debug(
                                    "scrape_blast scrape loop hit")
            self.logger.info("{} candidate {} hits found".format(
                len(hypo), type2db))
        except Exception as e:
            self.logger.error("Unable to process the pattern of {}".format(
                str(e)))
        # Cleanup of overlapping hits
        if type == "seq_type":
            identifier = "loci"
        elif type == "resistance" or type == "expec":
            identifier = "gene"
        # Pairwise comparison; the weaker of any overlapping/duplicate pair is
        # dropped, scored by identity weighted by closeness of span to 1.
        ind = 0
        while ind < len(hypo) - 1:
            targ = ind + 1
            while targ < len(hypo):
                ignore = False
                if (hypo[ind]["contig_name"] == hypo[targ]["contig_name"]
                        or hypo[ind][identifier] == hypo[targ][identifier]):
                    # Overlapping or shared gene
                    if ((hypo[ind].get("contig_start") >=
                         hypo[targ].get("contig_start")
                         and hypo[ind].get("contig_start") <=
                         hypo[targ].get("contig_end"))
                            or (hypo[ind].get("contig_end") >=
                                hypo[targ].get("contig_start")
                                and hypo[ind].get("contig_end") <=
                                hypo[targ].get("contig_end"))
                            or (hypo[ind].get(identifier) ==
                                hypo[targ].get(identifier))):
                        # Rightmost is worse
                        if float(hypo[ind].get("identity")) * (
                                1 - abs(1 - hypo[ind].get("span"))) > float(
                                    hypo[targ].get("identity")) * (
                                        1 - abs(1 - hypo[targ].get("span"))):
                            del hypo[targ]
                            ignore = True
                        # Leftmost is worse
                        elif float(hypo[ind].get("identity")) * (
                                1 - abs(1 - hypo[ind].get("span"))) < float(
                                    hypo[targ].get("identity")) * (
                                        1 - abs(1 - hypo[targ].get("span"))):
                            del hypo[ind]
                            targ = ind + 1
                            ignore = True
                        # Identical identity and span, seperating based on contig coverage
                        else:
                            # Rightmost is worse
                            if float(
                                    hypo[ind].get("contig_coverage")) >= float(
                                        hypo[targ].get("contig_coverage")):
                                del hypo[targ]
                                ignore = True
                            # Leftmost is worse
                            elif float(
                                    hypo[ind].get("contig_coverage")) < float(
                                        hypo[targ].get("contig_coverage")):
                                del hypo[ind]
                                targ = ind + 1
                                ignore = True
                # Only advance targ when nothing was deleted at this position
                if not ignore:
                    targ += 1
                else:
                    pass
            ind += 1
        self.logger.info(
            "{} {} hits were added after removing overlaps and duplicate hits".
            format(len(hypo), type))
        # Persist the surviving hits
        for hit in hypo:
            self.logger.debug("Kept {}:{} with span {} and id {}".format(
                hit.get("loci"),
                hit.get("allele"),
                hit.get("span"),
                hit.get("identity"),
            ))
            self.db_pusher.add_rec(hit, "{}".format(type2db))
        # For MLST scrapes, derive and store the sequence type
        if type == "seq_type":
            try:
                ST = self.db_pusher.alleles2st(self.name)
                self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                       {"ST": ST})
                self.logger.info("Sample {} received ST {}".format(
                    self.name, ST))
            except Exception as e:
                self.logger.warning(
                    "Unable to type sample {} due to data value '{}'".format(
                        self.name, str(e)))

    def load_resistances(self):
        """Legacy function, loads common resistance names for genes from notes file"""
        conversions = dict()
        try:
            with open("{}/notes.txt".format(
                    self.config["folders"]["resistances"])) as fh:
                # Each non-comment line has the form "gene:... resistance"
                for line in fh:
                    if "#" not in line:
                        line = line.split(":")
                        cropped = re.sub(" resistance", "", line[1])
                        conversions[line[0]] = cropped
                        # Workaround for case issues
                        conversions[line[0].lower()] = cropped
        except Exception as e:
            self.logger.error(
                "Unable to initialize trivial names for resistances ({})".
                format(e))
        return conversions

    def scrape_alignment(self, file_list=[]):
        """Scrapes a single alignment result"""
        if file_list == []:
            file_list = glob.glob("{}/alignment/*.stats.*".format(
                self.sampledir))
        ins_list = list()
        cov_dict = dict()
        align_dict = dict()
        align_dict["reference_genome"] = self.sample.get("reference")
        # Reading
        # NOTE(review): this re-glob unconditionally overwrites any file_list
        # passed by the caller, making the parameter dead — confirm intent.
        file_list = glob.glob("{}/alignment/*.stats.*".format(self.sampledir))
        map_rate = 0.0
        median_ins = 0
        ref_len = 0.0
        tot_reads = 0
        tot_map = 0
        duprate = 0.0
        # Stats file type is encoded in the final filename extension
        for file in file_list:
            with open(file, "r") as fh:
                type = file.split(".")[-1]
                for line in fh.readlines():
                    lsplit = line.rstrip().split("\t")
                    if type == "raw":
                        try:
                            tot_reads = int(lsplit[0])
                        except Exception as e:
                            pass
                    elif type == "ins":
                        # Insert-size row for properly oriented pairs only
                        if len(lsplit) >= 18 and lsplit[-12] in ["FF", "FR"]:
                            try:
                                median_ins = int(lsplit[0])
                            except Exception as e:
                                pass
                    elif type == "cov":
                        cov_dict[lsplit[1]] = int(lsplit[2])
                    elif type == "ref":
                        if lsplit[0] != "*" and len(lsplit) >= 2:
                            ref_len = ref_len + int(lsplit[1])
                    elif type == "dup":
                        if lsplit[0] == "Unknown Library":
                            try:
                                duprate = float(lsplit[8])
                            except Exception as e:
                                # Sentinel: duplication rate unparsable
                                duprate = -1.0
                    elif type == "map":
                        dsplit = line.rstrip().split(" ")
                        if len(dsplit) >= 5 and dsplit[4] == "total":
                            tot_map = int(dsplit[0])
                        elif len(dsplit) >= 4 and dsplit[3] == "mapped":
                            if tot_map > 0:
                                map_rate = int(dsplit[0]) / float(tot_map)
        # Mangling
        # Accumulate coverage histogram into totals and >10x/30x/50x/100x bins
        sumz, plus10, plus30, plus50, plus100, total = 0, 0, 0, 0, 0, 0
        for k, v in cov_dict.items():
            sumz += int(k) * v
            total += v
            if int(k) > 10:
                plus10 += v
            if int(k) > 30:
                plus30 += v
            if int(k) > 50:
                plus50 += v
            if int(k) > 100:
                plus100 += v
        if total > 0:
            align_dict["coverage_10x"] = plus10 / float(ref_len)
            align_dict["coverage_30x"] = plus30 / float(ref_len)
            align_dict["coverage_50x"] = plus50 / float(ref_len)
            align_dict["coverage_100x"] = plus100 / float(ref_len)
        else:
            align_dict["coverage_10x"] = 0.0
            align_dict["coverage_30x"] = 0.0
            align_dict["coverage_50x"] = 0.0
            align_dict["coverage_100x"] = 0.0
        align_dict["mapped_rate"] = map_rate
        align_dict["insert_size"] = median_ins
        if ref_len > 0:
            align_dict["duplication_rate"] = duprate
            align_dict["average_coverage"] = sumz / float(ref_len)
        else:
            align_dict["duplication_rate"] = 0.0
            align_dict["average_coverage"] = 0.0
        align_dict["total_reads"] = tot_reads
        self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                               align_dict)