Example #1
    def __init__(self, config, log, sampleinfo={}, force=False):
        self.config = config
        self.logger = log
        self.db_access = DB_Manipulator(config, log)
        self.updated = list()
        # Fetch names of existing refs
        self.refs = self.db_access.profiles
        organisms = self.refs.keys()
        self.organisms = [*organisms]
        self.force = force

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo
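The block above normalises `sampleinfo` so that `self.sample` is always a single dict and `self.name` is either a project ID (multi-sample list) or a sample ID (single sample). A standalone sketch of that dispatch, with made-up IDs:

def resolve_name(sampleinfo):
    # Mirrors the constructor: a list with more than one entry is project
    # scope, anything else is single-sample scope. IDs below are hypothetical.
    if isinstance(sampleinfo, list) and len(sampleinfo) > 1:
        name = sampleinfo[0].get("CG_ID_project")
        if any(e.get("CG_ID_project") != name for e in sampleinfo):
            raise Exception("Mixed projects in samples_info file")
        return name, sampleinfo[0]
    if isinstance(sampleinfo, list):
        sampleinfo = sampleinfo[0]
    return sampleinfo.get("CG_ID_sample"), sampleinfo

print(resolve_name([{"CG_ID_project": "ACC1234", "CG_ID_sample": "ACC1234A1"},
                    {"CG_ID_project": "ACC1234", "CG_ID_sample": "ACC1234A2"}]))
# ('ACC1234', {'CG_ID_project': 'ACC1234', 'CG_ID_sample': 'ACC1234A1'})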
Example #2
    def __init__(self, config, log, sampleinfo={}, input=""):
        self.config = config
        self.logger = log
        self.db_pusher = DB_Manipulator(config, log)
        self.referencer = Referencer(config, log)
        self.job_fallback = Job_Creator(config=config,
                                        log=log,
                                        sampleinfo=sampleinfo)
        self.infolder = os.path.abspath(input)
        self.sampledir = ""

        # All scraped folders are generated by Job_Creator, so the datestring is always part of the last folder name
        last_folder = self.infolder.split("/")[-1]
        self.name = last_folder.split("_")[0]

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

        self.gene2resistance = self.load_resistances()
Example #3
def dbm():
    db_file = re.search(
        'sqlite:///(.+)',
        preset_config['database']['SQLALCHEMY_DATABASE_URI']).group(1)
    # Start from a clean slate; the database file may not exist on a first run
    if os.path.exists(db_file):
        os.remove(db_file)
    dbm = DB_Manipulator(config=preset_config, log=logger)
    dbm.create_tables()
    return dbm
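This helper reads like a pytest fixture body with the decorator stripped. A hedged sketch of how it would typically be wired up, assuming `preset_config`, `logger`, and `DB_Manipulator` come from the test module's imports:

import os
import re

import pytest

@pytest.fixture
def dbm():
    # Same body as above: wipe the SQLite file, then rebuild the schema
    db_file = re.search(
        'sqlite:///(.+)',
        preset_config['database']['SQLALCHEMY_DATABASE_URI']).group(1)
    if os.path.exists(db_file):
        os.remove(db_file)
    dbm = DB_Manipulator(config=preset_config, log=logger)
    dbm.create_tables()
    return dbm

def test_fresh_schema(dbm):
    # Every test receives a wiped database with freshly created tables
    assert dbm.profiles is not None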
Example #4
    def __init__(self,
                 config,
                 log,
                 sampleinfo={},
                 name="",
                 output="",
                 collection=False):
        self.db_pusher = DB_Manipulator(config, log)
        self.name = name
        self.collection = collection
        if output == "":
            self.output = os.getcwd()
        else:
            self.output = output + "/"
        self.config = config
        self.logger = log
        for k, v in config.items():
            app.config[k] = v
        self.server = Process(target=app.run)
        self.attachments = list()
        self.filelist = list()
        self.error = False
        self.dt = datetime.now()
        self.now = "{}.{}.{}_{}.{}.{}".format(
            self.dt.year,
            self.dt.month,
            self.dt.day,
            self.dt.hour,
            self.dt.minute,
            self.dt.second,
        )

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_project")
            self.sample = self.sampleinfo
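Note that the timestamp components are interpolated without zero-padding, so the result differs from what a strftime pattern would give. A quick illustration:

from datetime import datetime

dt = datetime(2024, 3, 7, 9, 5, 1)
print("{}.{}.{}_{}.{}.{}".format(dt.year, dt.month, dt.day,
                                 dt.hour, dt.minute, dt.second))
# 2024.3.7_9.5.1
print(dt.strftime("%Y.%m.%d_%H.%M.%S"))
# 2024.03.07_09.05.01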
Example #5
class Reporter:
    def __init__(
        self, config, log, sampleinfo={}, name="", output="", collection=False
    ):
        self.db_pusher = DB_Manipulator(config, log)
        self.name = name
        self.collection = collection
        if output == "":
            self.output = os.getcwd()
        else:
            self.output = output + "/"
        self.config = config
        self.logger = log
        for k, v in config.items():
            app.config[k] = v
        self.server = Process(target=app.run)
        self.attachments = list()
        self.filedict = dict()
        self.error = False
        self.dt = datetime.now()
        self.now = "{}.{}.{}_{}.{}.{}".format(
            self.dt.year,
            self.dt.month,
            self.dt.day,
            self.dt.hour,
            self.dt.minute,
            self.dt.second,
        )

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_project")
            self.sample = self.sampleinfo

    def create_subfolders(self):
        os.makedirs("{0}/deliverables".format(self.config["folders"]["reports"]), exist_ok=True)
        os.makedirs("{0}/json".format(self.config["folders"]["reports"]), exist_ok=True)
        os.makedirs("{0}/analysis".format(self.config["folders"]["reports"]), exist_ok=True)

    def report(self, type="default", customer="all"):
        self.create_subfolders()
        if type in ["default", "typing", "qc"]:
            # Only typing and qc reports are version controlled
            self.gen_version(self.name)
        if type in ["default", "typing", "qc", "st_update"]:
            self.restart_web()
            if type == "default":
                self.gen_typing()
                self.gen_qc()
                self.gen_json(silent=True)
                self.gen_delivery()
            elif type == "typing":
                self.gen_typing()
            elif type == "qc":
                self.gen_qc()
            elif type == "st_update":
                self.gen_STtracker(customer)
            self.kill_flask()
        elif type in ["json_dump", "motif_overview"]:
            if type == "json_dump":
                self.gen_json()
                self.gen_delivery()
            elif type == "motif_overview":
                self.gen_motif(motif="resistance")
                self.gen_motif(motif="expec")
        else:
            raise Exception("Report function recieved invalid format")
        self.mail()
        # If no output dir was specified, don't store the report locally; rely on e-mail
        if self.output not in ("", os.getcwd()):
            for k, v in self.filedict.items():
                if v == "":
                    os.remove(k)
                else:
                    copyfile(k, v)

    def gen_version(self, name):
        self.db_pusher.get_report(name)
        self.db_pusher.set_report(name)

    def gen_STtracker(self, customer="all", silent=False):
        self.name = "Sequence Type Update"
        try:
            r = requests.get(
                "http://127.0.0.1:5000/microSALT/STtracker/{}".format(customer),
                allow_redirects=True,
            )
            outname = "{}/ST_updates_{}.html".format(self.output, self.now)
            with open(outname, "wb") as outfile:
                outfile.write(r.content.decode("iso-8859-1").encode("utf8"))
            self.filedict[outname] = ""
            if not silent:
                self.attachments.append(outname)
        except Exception as e:
            self.logger.error(
                "Flask instance currently occupied. Possible rogue process. Retry command"
            )
            self.error = True

    def gen_qc(self, silent=False):
        try:
            last_version = self.db_pusher.get_report(self.name).version
        except Exception as e:
            self.logger.error("Project {} does not exist".format(self.name))
            self.kill_flask()
            sys.exit(-1)
        try:
            q = requests.get(
                "http://127.0.0.1:5000/microSALT/{}/qc".format(self.name),
                allow_redirects=True,
            )
            outfile = "{}_QC_{}.html".format(
                self.sample.get("Customer_ID_project"), last_version
            )
            local = "{}/{}".format(self.output, outfile)
            output = "{}/analysis/{}".format(self.config["folders"]["reports"], outfile)

            outfile = open(output, "wb")
            outfile.write(q.content.decode("iso-8859-1").encode("utf8"))
            outfile.close()

            if os.path.isfile(output):
                self.filedict[output] = local
                if not silent:
                    self.attachments.append(output)
        except Exception as e:
            self.logger.error(
                "Flask instance currently occupied. Possible rogue process. Retry command"
            )
            self.error = True

    def gen_typing(self, silent=False):
        try:
            last_version = self.db_pusher.get_report(self.name).version
        except Exception as e:
            self.logger.error("Project {} does not exist".format(self.name))
            self.kill_flask()
            sys.exit(-1)
        try:
            r = requests.get(
                "http://127.0.0.1:5000/microSALT/{}/typing/all".format(self.name),
                allow_redirects=True,
            )
            outfile = "{}_Typing_{}.html".format(
                self.sample.get("Customer_ID_project"), last_version
            )
            local = "{}/{}".format(self.output, outfile)
            output = "{}/analysis/{}".format(self.config["folders"]["reports"], outfile)

            outfile = open(output, "wb")
            outfile.write(r.content.decode("iso-8859-1").encode("utf8"))
            outfile.close()

            if os.path.isfile(output):
                self.filedict[output] = local
                if not silent:
                    self.attachments.append(output)
        except Exception as e:
            self.logger.error(
                "Flask instance currently occupied. Possible rogue process. Retry command"
            )
            self.error = True

    def gen_motif(self, motif="resistance", silent=False):
        if motif not in ["resistance", "expec"]:
            self.logger.error("Invalid motif type specified for gen_motif function")
            return
        if self.collection:
            sample_info = gen_collectiondata(self.name)
        else:
            sample_info = gen_reportdata(self.name)
        output = "{}/{}_{}_{}.csv".format(self.output, self.name, motif, self.now)

        # Load motif & gene names into dict
        motifdict = dict()
        for s in sample_info["samples"]:
            if motif == "resistance":
                for r in s.resistances:
                    if (
                        not (r.resistance in motifdict.keys())
                        and r.threshold == "Passed"
                    ):
                        if r.resistance is None:
                            r.resistance = "None"
                        motifdict[r.resistance] = list()
                    if (
                        r.threshold == "Passed"
                        and not r.gene in motifdict[r.resistance]
                    ):
                        motifdict[r.resistance].append(r.gene)
            elif motif == "expec":
                for e in s.expacs:
                    if (
                        not (e.virulence in motifdict.keys())
                        and e.threshold == "Passed"
                    ):
                        if e.virulence is None:
                            e.virulence = "None"
                        motifdict[e.virulence] = list()
                    if e.threshold == "Passed" and not e.gene in motifdict[e.virulence]:
                        motifdict[e.virulence].append(e.gene)
        for k, v in motifdict.items():
            motifdict[k] = sorted(v)

        # Top 2 Header
        sepfix = "sep=,"
        topline = "Identity {}% & Span {}%,,,".format(
            self.config["threshold"]["motif_id"], self.config["threshold"]["motif_span"]
        )
        botline = "CG Sample ID,Sample ID,Organism,Sequence Type,Thresholds"
        for k in sorted(motifdict.keys()):
            genes = [""] * len(motifdict[k])
            active_gene = k.replace(",", " &")
            if active_gene == "":
                active_gene = "Uncategorized hits"
            geneholder = ",".join(genes)
            topline += ",,{}{}".format(active_gene, geneholder)
            resnames = ",".join(sorted(motifdict[k]))
            botline += ",,{}".format(resnames)

        try:
            excel = open(output, "w+")
            excel.write("{}\n".format(sepfix))
            excel.write("{}\n".format(topline))
            excel.write("{}\n".format(botline))

            # Create each individual row past the 2nd, per iteration
            for s in sample_info["samples"]:
                rowdict = dict()
                pref = "{},{},{},{},{}".format(
                    s.CG_ID_sample,
                    s.Customer_ID_sample,
                    s.organism,
                    s.ST_status.replace(",", ";"),
                    s.threshold,
                )
                # Load single sample
                if motif == "resistance":
                    for r in s.resistances:
                        if (
                            not (r.resistance in rowdict.keys())
                            and r.threshold == "Passed"
                        ):
                            rowdict[r.resistance] = dict()
                        if (
                            r.threshold == "Passed"
                            and not r.gene in rowdict[r.resistance]
                        ):
                            rowdict[r.resistance][r.gene] = r.identity
                elif motif == "expec":
                    for e in s.expacs:
                        if (
                            not (e.virulence in rowdict.keys())
                            and e.threshold == "Passed"
                        ):
                            rowdict[e.virulence] = dict()
                        if (
                            e.threshold == "Passed"
                            and not e.gene in rowdict[e.virulence]
                        ):
                            rowdict[e.virulence][e.gene] = e.identity
                # Compare single sample to all
                hits = ""
                for res in sorted(motifdict.keys()):
                    if res in rowdict.keys():
                        hits += ",1"
                        for gen in sorted(motifdict[res]):
                            hits += ","
                            if gen in rowdict[res].keys():
                                # UPD: Change this to identity of hit
                                hits += "{}".format(rowdict[res][gen])
                            else:
                                hits += "0"
                    else:
                        # One presence column plus one zero per gene in this motif
                        hits += ",0"
                        pad = ["0"] * len(motifdict[res])
                        hits += "," + ",".join(pad)

                excel.write("{}{}\n".format(pref, hits))

            excel.close()
            if os.path.isfile(output):
                self.filedict[output] = ""
                if not silent:
                    self.attachments.append(output)
        except FileNotFoundError as e:
            self.logger.error(
                "Gen_motif unable to produce excel file. Path {} does not exist".format(
                    os.path.dirname(output)
                )
            )

    def gen_delivery(self):
        deliv = dict()
        deliv['files'] = list()
        last_version = self.db_pusher.get_report(self.name).version
        output = "{}/deliverables/{}_deliverables.yaml".format(self.config["folders"]["reports"], self.sample.get("Customer_ID_project"))
        local = "{}/{}_deliverables.yaml".format(self.output, self.sample.get("Customer_ID_project"))

        #Project-wide
        #Sampleinfo
        deliv['files'].append({'format':'json','id':self.sample.get("Customer_ID_project"),
                               'path':"{}/sampleinfo.json".format(self.output),
                               'path_index':'~','step':'analysis','tag':'sampleinfo'})
        #QC report
        deliv['files'].append({'format':'html','id':self.sample.get("Customer_ID_project"),
                               'path':"{}/{}_QC_{}.html".format(self.output, self.sample.get("Customer_ID_project"), last_version),
                               'path_index':'~','step':'result_aggregation','tag':'microsalt-qc'})
        #Typing report
        deliv['files'].append({'format':'html','id':self.sample.get("Customer_ID_project"),
                               'path':"{}/{}_Typing_{}.html".format(self.output, self.sample.get("Customer_ID_project"), last_version),
                               'path_index':'~','step':'result_aggregation','tag':'microsalt-type'})
        #Json (vogue) report
        deliv['files'].append({'format':'json','id':self.sample.get("Customer_ID_project"),
                               'path':"{}/{}.json".format(self.output, self.sample.get("CG_ID_project")),
                               'path_index':'~','step':'result_aggregation','tag':'microsalt-json'})
        #Settings dump
        deliv['files'].append({'format':'txt','id':self.sample.get("Customer_ID_project"),
                               'path':"{}/config.log".format(self.output),
                               'path_index':'~','step':'analysis','tag':'runtime-settings'})

        #Sample-wide
        #Single sample
        if self.sampleinfo == self.sample:
            hklist = list()
            hklist.append(self.sampleinfo)
            resultsdir = self.output
        #Project
        else:
            hklist = self.sampleinfo

        for s in hklist:
            if len(hklist) > 1:
                resultsdir = os.path.join(self.output, s["CG_ID_sample"])
            #Contig/Assembly file
            deliv['files'].append({'format':'fasta','id':s["CG_ID_sample"],
                                   'path':"{0}/assembly/{1}_trimmed_contigs.fasta".format(resultsdir, s["CG_ID_sample"]),
                                   'path_index':'~','step':'assembly','tag':'assembly'})
            #Concat trimmed reads forwards
            deliv['files'].append({'format':'fastq','id':s["CG_ID_sample"],
                                   'path':"{0}/trimmed/{1}_trim_front_pair.fastq.gz".format(resultsdir, s["CG_ID_sample"]),
                                   'path_index':'~','step':'concatination','tag':'trimmed-forward-reads'}) 
            #Concat trimmed reads reverse
            deliv['files'].append({'format':'fastq','id':s["CG_ID_sample"],
                                   'path':"{0}/trimmed/{1}_trim_rev_pair.fastq.gz".format(resultsdir, s["CG_ID_sample"]),
                                   'path_index':'~','step':'concatination','tag':'trimmed-reverse-reads'})
            #Concat trimmed reads unpaired
            deliv['files'].append({'format':'fastq','id':s["CG_ID_sample"],
                                   'path':"{0}/trimmed/{1}_trim_unpair.fastq.gz".format(resultsdir, s["CG_ID_sample"]),
                                   'path_index':'~','step':'concatination','tag':'trimmed-unpaired-reads'})            
            #Slurm dump
            deliv['files'].append({'format':'txt','id':s["CG_ID_sample"],
                                   'path':"{0}/slurm_{1}.log".format(resultsdir, s["CG_ID_sample"]),
                                   'path_index':'~','step':'analysis','tag':'logfile'})
            #Quast (assembly) qc report
            deliv['files'].append({'format':'tsv','id':s["CG_ID_sample"],
                                   'path':"{0}/assembly/quast/{1}_report.tsv".format(resultsdir, s["CG_ID_sample"]),
                                   'path_index':'~','step':'assembly','tag':'quast-results'})
            #Alignment (bam, sorted)
            deliv['files'].append({'format':'bam','id':s["CG_ID_sample"],
                                   'path':"{0}/alignment/{1}_{2}.bam_sort".format(resultsdir, s["CG_ID_sample"], s["reference"]),
                                   'path_index':'~','step':'alignment','tag':'reference-alignment-sorted'})
            #Alignment (bam, sorted, deduplicated)
            deliv['files'].append({'format':'bam','id':s["CG_ID_sample"],
                                   'path':"{0}/alignment/{1}_{2}.bam_sort_rmdup".format(resultsdir, s["CG_ID_sample"], s["reference"]),
                                   'path_index':'~','step':'alignment','tag':'reference-alignment-deduplicated'})
            #Picard insert size stats
            deliv['files'].append({'format':'meta','id':s["CG_ID_sample"],
                                   'path':"{0}/alignment/{1}_{2}.stats.ins".format(resultsdir, s["CG_ID_sample"], s["reference"]),
                                   'path_index':'~','step':'insertsize_calc','tag':'picard-insertsize'})


        with open(output, 'w') as delivfile:
            documents = yaml.dump(deliv, delivfile)

        with open(output, 'r') as delivfile:
            postfix = delivfile.read()
        postfix = postfix.replace("'~'", "~")

        with open(output, 'w') as delivfile:
            delivfile.write(postfix)

        if os.path.isfile(output):
            self.filedict[output] = local


    def gen_json(self, silent=False):
        report = dict()
        local = "{}/{}.json".format(self.output, self.name)
        output = "{}/json/{}.json".format(self.config["folders"]["reports"], self.name)

        sample_info = gen_reportdata(self.name)
        analyses = [
            "blast_pubmlst",
            "quast_assembly",
            "blast_resfinder_resistence",
            "picard_markduplicate",
            "microsalt_samtools_stats",
        ]
        for s in sample_info["samples"]:
            t = dict()

            # Some downstream apps don't filter out-of-range values, so blank them here
            t["ST_status"] = (
                "" if s.ST_status is None or s.ST_status != str(s.ST) else s.ST_status
            )
            t["threshold"] = (
                ""
                if s.threshold is None or s.threshold not in ["Passed", "Failed"]
                else s.threshold
            )
            t["genome_length"] = (
                ""
                if s.genome_length is None or s.genome_length < 1
                else s.genome_length
            )
            t["reference_length"] = (
                ""
                if s.reference_length is None or s.reference_length < 1
                else s.reference_length
            )
            t["gc_percentage"] = (
                ""
                if s.gc_percentage is None or s.gc_percentage < 0.1
                else str(s.gc_percentage)
            )
            t["n50"] = "" if s.n50 is None or s.n50 < 1 else s.n50
            t["contigs"] = "" if s.contigs is None or s.contigs < 1 else s.contigs
            t["insert_size"] = (
                "" if s.insert_size is None or s.insert_size < 1 else s.insert_size
            )
            t["duplication_rate"] = (
                ""
                if s.duplication_rate is None or s.duplication_rate < 0.1
                else s.duplication_rate
            )
            t["total_reads"] = (
                "" if s.total_reads is None or s.total_reads < 1 else s.total_reads
            )
            t["mapped_rate"] = (
                "" if s.mapped_rate is None or s.mapped_rate < 0.1 else s.mapped_rate
            )
            t["average_coverage"] = (
                ""
                if s.average_coverage is None or s.average_coverage < 0.1
                else s.average_coverage
            )
            t["coverage_10x"] = (
                "" if s.coverage_10x is None or s.coverage_10x < 0.1 else s.coverage_10x
            )
            t["coverage_30x"] = (
                "" if s.coverage_30x is None or s.coverage_30x < 0.1 else s.coverage_30x
            )
            t["coverage_50x"] = (
                "" if s.coverage_50x is None or s.coverage_50x < 0.1 else s.coverage_50x
            )
            t["coverage_100x"] = (
                ""
                if s.coverage_100x is None or s.coverage_100x < 0.1
                else s.coverage_100x
            )

            report[s.CG_ID_sample] = dict()
            for a in analyses:
                if a == "blast_resfinder_resistence":
                    report[s.CG_ID_sample][a] = list()
                else:
                    report[s.CG_ID_sample][a] = dict()

            report[s.CG_ID_sample]["blast_pubmlst"] = {
                "sequence_type": t["ST_status"],
                "thresholds": t["threshold"],
            }
            report[s.CG_ID_sample]["quast_assembly"] = {
                "estimated_genome_length": t["genome_length"],
                "gc_percentage": t["gc_percentage"],
                "n50": t["n50"],
                "necessary_contigs": t["contigs"],
            }
            report[s.CG_ID_sample]["picard_markduplicate"] = {
                "insert_size": t["insert_size"],
                "duplication_rate": t["duplication_rate"],
            }
            report[s.CG_ID_sample]["microsalt_samtools_stats"] = {
                "total_reads": t["total_reads"],
                "mapped_rate": t["mapped_rate"],
                "average_coverage": t["average_coverage"],
                "coverage_10x": t["coverage_10x"],
                "coverage_30x": t["coverage_30x"],
                "coverage_50x": t["coverage_50x"],
                "coverage_100x": t["coverage_100x"],
            }

            for r in s.resistances:
                if (
                    not (r.gene in report[s.CG_ID_sample]["blast_resfinder_resistence"])
                    and r.threshold == "Passed"
                ):
                    report[s.CG_ID_sample]["blast_resfinder_resistence"].append(r.gene)

        try:
            with open(output, "w") as outfile:
                json.dump(report, outfile)

            if os.path.isfile(output):
                self.filedict[output] = local
                if not silent:
                    self.attachments.append(output)
        except FileNotFoundError as e:
            self.logger.error(
                "Gen_json unable to produce json file. Path {} does not exist".format(
                    os.path.dirname(output)
                )
            )
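    # For orientation, the serialised report is keyed on sample ID with one
    # block per analysis; all values below are made up:
    #
    #   {"ACC1234A1": {
    #       "blast_pubmlst": {"sequence_type": "131", "thresholds": "Passed"},
    #       "quast_assembly": {"estimated_genome_length": 5200000,
    #                          "gc_percentage": "50.5", "n50": 150000,
    #                          "necessary_contigs": 120},
    #       "blast_resfinder_resistence": ["blaTEM-1B"],
    #       "picard_markduplicate": {"insert_size": 350, "duplication_rate": 0.12},
    #       "microsalt_samtools_stats": {"total_reads": 4000000,
    #                                    "mapped_rate": 0.98, ...}}}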

    def mail(self):
        msg = MIMEMultipart()
        if not self.error and self.attachments:
            msg["Subject"] = "{} ({}) Reports".format(
                self.name, self.attachments[0].split("_")[0]
            )
        else:
            msg["Subject"] = "{} Failed Generating Report".format(self.name)

        sender = socket.gethostname()
        sender_fixed = "{}.com".format(os.path.splitext(sender)[0])
        msg["From"] = sender_fixed

        msg["To"] = self.config["regex"]["mail_recipient"]

        if not self.error:
            for file in self.attachments:
                # MIMEApplication expects bytes, so read the attachment in binary mode
                part = MIMEApplication(open(file, "rb").read())
                part.add_header(
                    "Content-Disposition",
                    'attachment; filename="%s"' % os.path.basename(file),
                )
                msg.attach(part)

        s = smtplib.SMTP("localhost")  # connects on instantiation
        s.sendmail(msg["From"], msg["To"], msg.as_string())
        s.quit()
        self.logger.info(
            "Mail containing report sent to {} from {}".format(msg["To"], msg["From"])
        )

    def start_web(self):
        self.server.start()
        self.logger.info("Started webserver on http://127.0.0.1:5000/")
        # Give the server a moment to come up before the first request
        time.sleep(0.15)

    def kill_flask(self):
        self.server.terminate()
        self.server.join()
        self.logger.info("Closed webserver on http://127.0.0.1:5000/")

    def restart_web(self):
        try:
            self.kill_flask()
        except Exception as e:
            pass
        # A Process can only be started once; hand start_web() a fresh one
        self.server = Process(target=app.run)
        self.start_web()
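A multiprocessing.Process can only be started once, which is why restart_web() creates a fresh Process before starting the server again. The harness pattern in miniature, as a sketch assuming a bare Flask app:

from multiprocessing import Process
import time

from flask import Flask

app = Flask(__name__)

server = Process(target=app.run)   # app.run serves http://127.0.0.1:5000/ by default
server.start()
time.sleep(0.15)                   # same grace period start_web() uses
server.terminate()
server.join()
server = Process(target=app.run)   # a terminated Process cannot be start()ed again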
Example #6
class Referencer:
    def __init__(self, config, log, sampleinfo={}, force=False):
        self.config = config
        self.logger = log
        self.db_access = DB_Manipulator(config, log)
        self.updated = list()
        # Fetch names of existing refs
        self.refs = self.db_access.profiles
        organisms = self.refs.keys()
        self.organisms = [*organisms]
        self.force = force

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

    def identify_new(self, cg_id="", project=False):
        """ Automatically downloads pubMLST & NCBI organisms not already downloaded """
        neworgs = list()
        newrefs = list()

        try:
            if not isinstance(self.sampleinfo, list):
                samples = [self.sampleinfo]
            else:
                samples = self.sampleinfo

            for entry in samples:
                org = entry.get("organism")
                ref = self.organism2reference(org)
                if ref not in self.organisms and org not in neworgs:
                    neworgs.append(org)
                if (not "{}.fasta".format(entry.get("reference")) in os.listdir(
                        self.config["folders"]["genomes"])
                        and not entry.get("reference") in newrefs):
                    newrefs.append(entry.get("reference"))

            for org in neworgs:
                self.add_pubmlst(org)
            for org in newrefs:
                self.download_ncbi(org)
        except Exception as e:
            self.logger.error(
                "Reference update function failed prematurely ({}). Review immediately".format(e)
            )

    def update_refs(self):
        """Updates all references. Order is important, since no object is updated twice"""
        # Updates
        self.fetch_pubmlst(self.force)
        self.fetch_external(self.force)
        self.fetch_resistances(self.force)

        # Reindexes
        self.index_db(os.path.dirname(self.config["folders"]["expec"]), ".fsa")

    def index_db(self, full_dir, suffix):
        """Check for indexation, makeblastdb job if not enough of them."""
        reindexation = False
        files = os.listdir(full_dir)
        sufx_files = glob.glob("{}/*{}".format(full_dir,
                                               suffix))  # List of source files
        for file in sufx_files:
            subsuf = "{}$".format(re.escape(suffix))
            base = re.sub(subsuf, "", file)

            bases = 0
            newer = 0
            for elem in files:
                # Number of files with same base name (7)
                if os.path.basename(base) == elem[:elem.rfind(".")]:
                    bases = bases + 1
                    # Number of index files fresher than source (6)
                    if (os.stat(file).st_mtime < os.stat("{}/{}".format(
                            full_dir, elem)).st_mtime):
                        newer = newer + 1
            # makeblastdb with -parse_seqids leaves 6 index files (7 counting the
            # source); a plain run leaves 3 (4 counting the source)
            if not (bases == 7 or newer == 6) and not (bases == 4
                                                       and newer == 3):
                reindexation = True
                try:
                    # Resistence files
                    if ".fsa" in suffix:
                        bash_cmd = "makeblastdb -in {}/{} -dbtype nucl -out {}".format(
                            full_dir, os.path.basename(file),
                            os.path.basename(base))
                    # MLST locis
                    else:
                        bash_cmd = "makeblastdb -in {}/{} -dbtype nucl -parse_seqids -out {}".format(
                            full_dir, os.path.basename(file),
                            os.path.basename(base))
                    proc = subprocess.Popen(bash_cmd.split(),
                                            cwd=full_dir,
                                            stdout=subprocess.PIPE)
                    output, error = proc.communicate()
                except Exception as e:
                    self.logger.error(
                        "Unable to index requested target {} in {}".format(
                            file, full_dir))
        if reindexation:
            self.logger.info("Re-indexed contents of {}".format(full_dir))

    def fetch_external(self, force=False):
        """ Updates reference for data that IS ONLY LINKED to pubMLST """
        prefix = "https://pubmlst.org"
        query = urllib.request.urlopen("{}/data/".format(prefix))
        soup = BeautifulSoup(query, "html.parser")
        tr_sub = soup.find_all("tr", class_="td1")
        tr_sub = tr_sub + soup.find_all("tr", class_="td2")

        # Only search every other instance
        iterator = iter(tr_sub)
        unfound = True
        try:
            while unfound:
                entry = iterator.__next__()
                # Gather general info from first object
                sample = entry.get_text().split("\n")
                organ = sample[1].lower().replace(" ", "_")
                # In order to get ecoli #1
                if "escherichia_coli" in organ and "#1" in organ:
                    organ = organ[:-2]
                currver = self.db_access.get_version(
                    "profile_{}".format(organ))
                profile_no = re.search(r"\d+", sample[2]).group(0)
                if (organ in self.organisms
                        and organ.replace("_", " ") not in self.updated
                        and (int(profile_no.replace("-", "")) > int(
                            currver.replace("-", "")) or force)):
                    # Download definition files
                    st_link = prefix + entry.find_all("a")[1]["href"]
                    output = "{}/{}".format(self.config["folders"]["profiles"],
                                            organ)
                    urllib.request.urlretrieve(st_link, output)
                    # Update database
                    self.db_access.upd_rec(
                        {"name": "profile_{}".format(organ)},
                        "Versions",
                        {"version": profile_no},
                    )
                    self.db_access.reload_profiletable(organ)
                    # Gather loci from second object
                    entry = iterator.__next__()
                    # Clear existing directory and download allele files
                    out = "{}/{}".format(self.config["folders"]["references"],
                                         organ)
                    shutil.rmtree(out)
                    os.makedirs(out)
                    for loci in entry.find_all("a"):
                        loci = loci["href"]
                        lociname = os.path.basename(os.path.normpath(loci))
                        input = prefix + loci
                        urllib.request.urlretrieve(
                            input, "{}/{}".format(out, lociname))
                    # Create new indexes
                    self.index_db(out, ".tfa")
                else:
                    iterator.__next__()
        except StopIteration:
            pass

    def resync(self, type="", sample="", ignore=False):
        """Manipulates samples that have an internal ST that differs from pubMLST ST"""
        if type == "list":
            # Add single sample support later
            self.db_access.list_unresolved()
        elif type == "overwrite":
            if ignore:
                self.db_access.rm_novel(sample=sample)
            else:
                self.db_access.sync_novel(overwrite=True, sample=sample)
        else:
            self.db_access.sync_novel(overwrite=False, sample=sample)

    def fetch_resistances(self, force=False):
        cwd = os.getcwd()
        url = "https://bitbucket.org/genomicepidemiology/resfinder_db.git"
        hiddensrc = "{}/.resfinder_db".format(
            self.config["folders"]["resistances"])
        wipeIndex = False

        if not os.path.exists(hiddensrc) or len(os.listdir(hiddensrc)) == 0:
            self.logger.info("resFinder database not found. Caching..")
            if not os.path.exists(hiddensrc):
                os.makedirs(hiddensrc)
            cmd = "git clone {} --quiet".format(url)
            process = subprocess.Popen(
                cmd.split(),
                cwd=self.config["folders"]["resistances"],
                stdout=subprocess.PIPE,
            )
            output, error = process.communicate()
            os.rename(
                "{}/resfinder_db".format(
                    self.config["folders"]["resistances"]),
                hiddensrc,
            )
            wipeIndex = True
        else:
            # wipeIndex is necessarily False in this branch, so no extra guard is needed
            actual = os.listdir(self.config["folders"]["resistances"])

            for file in os.listdir(hiddensrc):
                if file not in actual and (".fsa" in file):
                    self.logger.info(
                        "resFinder database files corrupted. Syncing...")
                    wipeIndex = True
                    break

            cmd = "git pull origin master"
            process = subprocess.Popen(
                cmd.split(),
                cwd=hiddensrc,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
            output, error = process.communicate()
            if "Already up-to-date." not in str(output):
                self.logger.info(
                    "Remote resFinder database updated. Syncing...")
                wipeIndex = True
            else:
                self.logger.info(
                    "Cached resFinder database identical to remote.")

        # Actual update of resistance folder
        if wipeIndex:
            for file in os.listdir(hiddensrc):
                if os.path.isfile("{}/{}".format(hiddensrc, file)):
                    # Copy fresh
                    shutil.copy(
                        "{}/{}".format(hiddensrc, file),
                        self.config["folders"]["resistances"],
                    )

        # Double checks indexation is current.
        self.index_db(self.config["folders"]["resistances"], ".fsa")

    def existing_organisms(self):
        """ Returns list of all organisms currently added """
        return self.organisms

    def organism2reference(self, normal_organism_name):
        """Finds which reference contains the same words as the organism
       and returns it in a format for database calls. Returns empty string if none found"""
        orgs = os.listdir(self.config["folders"]["references"])
        organism = re.split(r"\W+", normal_organism_name.lower())
        try:
            refs = 0
            for target in orgs:
                hit = 0
                for piece in organism:
                    if len(piece) == 1:
                        if target.startswith(piece):
                            hit += 1
                    else:
                        if piece in target:
                            hit += 1
                        # For when people misspell the strain in the orderform
                        elif piece == "pneumonsiae" and "pneumoniae" in target:
                            hit += 1
                        else:
                            break
                if hit == len(organism):
                    return target
        except Exception as e:
            self.logger.warning(
                "Unable to find existing reference for {}, strain {} has no reference match\nSource: {}"
                .format(organism, normal_organism_name, e))
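    # The matcher tokenises the free-text organism name on non-word characters:
    #   re.split(r"\W+", "Escherichia coli".lower())  ->  ['escherichia', 'coli']
    #   re.split(r"\W+", "E. coli".lower())           ->  ['e', 'coli']
    # Single-letter tokens such as 'e' are matched via the startswith() branch above.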

    def download_ncbi(self, reference):
        """ Checks available references, downloads from NCBI if not present """
        try:
            DEVNULL = open(os.devnull, "wb")
            Entrez.email = "*****@*****.**"
            record = Entrez.efetch(db="nucleotide",
                                   id=reference,
                                   rettype="fasta",
                                   retmode="text")
            sequence = record.read()
            output = "{}/{}.fasta".format(self.config["folders"]["genomes"],
                                          reference)
            with open(output, "w") as f:
                f.write(sequence)
            bwaindex = "bwa index {}".format(output)
            proc = subprocess.Popen(
                bwaindex.split(),
                cwd=self.config["folders"]["genomes"],
                stdout=DEVNULL,
                stderr=DEVNULL,
            )
            out, err = proc.communicate()
            samindex = "samtools faidx {}".format(output)
            proc = subprocess.Popen(
                samindex.split(),
                cwd=self.config["folders"]["genomes"],
                stdout=DEVNULL,
                stderr=DEVNULL,
            )
            out, err = proc.communicate()
            self.logger.info("Downloaded reference {}".format(reference))
        except Exception as e:
            self.logger.warning(
                "Unable to download genome '{}' from NCBI".format(reference))

    def add_pubmlst(self, organism):
        """ Checks pubmlst for references of given organism and downloads them """
        # Organism must be in binomial format and only resolve to one hit
        errorg = organism
        try:
            organism = organism.lower().replace(".", " ")
            if organism.replace(" ", "_") in self.organisms and not self.force:
                self.logger.info(
                    "Organism {} already stored in microSALT".format(organism))
                return
            db_query = self.query_pubmlst()

            # Doublecheck organism name is correct and unique
            orgparts = organism.split(" ")
            counter = 0.0
            for item in db_query:
                for subtype in item["databases"]:
                    missingPart = False
                    for part in orgparts:
                        if len(part) == 1:
                            if not subtype["description"].lower().startswith(
                                    part):
                                missingPart = True
                        else:
                            if not part in subtype["description"].lower():
                                missingPart = True
                    if not missingPart:
                        # Seqdef always appears after isolates, so this is fine
                        seqdef_url = subtype["href"]
                        desc = subtype["description"]
                        counter += 1.0
                        self.logger.info(
                            "Located pubMLST hit {} for sample".format(desc))
            if counter > 2.0:
                raise Exception(
                    "Reference '{}' resolved to {} organisms. Please be more stringent"
                    .format(errorg, int(counter / 2)))
            elif counter < 1.0:
                # add external
                raise Exception(
                    "Unable to find requested organism '{}' in pubMLST database"
                    .format(errorg))
            else:
                truename = desc.lower().split(" ")
                truename = "{}_{}".format(truename[0], truename[1])
                self.download_pubmlst(truename, seqdef_url)
                # Update organism list
                self.refs = self.db_access.profiles
                self.logger.info("Created table profile_{}".format(truename))
        except Exception as e:
            self.logger.warning(e.args[0])

    def query_pubmlst(self):
        """ Returns a json object containing all organisms available via pubmlst.org """
        # Example request URI: http://rest.pubmlst.org/db/pubmlst_neisseria_seqdef/schemes/1/profiles_csv
        seqdef_url = dict()
        databases = "http://rest.pubmlst.org/db"
        db_req = urllib.request.Request(databases)
        with urllib.request.urlopen(db_req) as response:
            db_query = json.loads(response.read().decode("utf-8"))
        return db_query
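    # The returned JSON is a list of database groups, each carrying a
    # 'databases' list of {'description': ..., 'href': ...} entries -- the
    # shape consumed by add_pubmlst() and fetch_pubmlst() above. A hedged
    # sketch that lists the seqdef endpoints:
    #
    #   for item in db_query:
    #       for subtype in item["databases"]:
    #           if subtype["href"].endswith("seqdef"):  # assumption about href naming
    #               print(subtype["description"], "->", subtype["href"])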

    def download_pubmlst(self, organism, subtype_href, force=False):
        """ Downloads ST and loci for a given organism stored on pubMLST if it is more recent. Returns update date """
        organism = organism.lower().replace(" ", "_")

        # Pull version
        ver_req = urllib.request.Request(
            "{}/schemes/1/profiles".format(subtype_href))
        with urllib.request.urlopen(ver_req) as response:
            ver_query = json.loads(response.read().decode("utf-8"))
        currver = self.db_access.get_version("profile_{}".format(organism))
        if (int(ver_query["last_updated"].replace("-", "")) <= int(
                currver.replace("-", "")) and not force):
            # self.logger.info("Profile for {} already at latest version".format(organism.replace('_' ,' ').capitalize()))
            return currver

        # Pull ST file
        st_target = "{}/{}".format(self.config["folders"]["profiles"],
                                   organism)
        input = "{}/schemes/1/profiles_csv".format(subtype_href)
        urllib.request.urlretrieve(input, st_target)
        # Pull locus files
        loci_input = "{}/schemes/1".format(subtype_href)
        loci_req = urllib.request.Request(loci_input)
        with urllib.request.urlopen(loci_req) as response:
            loci_query = json.loads(response.read().decode("utf-8"))

        output = "{}/{}".format(self.config["folders"]["references"], organism)

        try:
            if os.path.isdir(output):
                shutil.rmtree(output)
        except FileNotFoundError as e:
            pass
        os.makedirs(output)

        for locipath in loci_query["loci"]:
            loci = os.path.basename(os.path.normpath(locipath))
            urllib.request.urlretrieve("{}/alleles_fasta".format(locipath),
                                       "{}/{}.tfa".format(output, loci))
        # Create new indexes
        self.index_db(output, ".tfa")

    def external_version(self, organism, subtype_href):
        ver_req = urllib.request.Request(
            "{}/schemes/1/profiles".format(subtype_href))
        with urllib.request.urlopen(ver_req) as response:
            ver_query = json.loads(response.read().decode("utf-8"))
        return ver_query["last_updated"]

    def fetch_pubmlst(self, force=False):
        """ Updates reference for data that is stored on pubMLST """
        seqdef_url = dict()
        db_query = self.query_pubmlst()

        # Fetch seqdef locations
        for item in db_query:
            for subtype in item["databases"]:
                for name in self.organisms:
                    if name.replace("_",
                                    " ") in subtype["description"].lower():
                        # Seqdef always appears after isolates, so this is fine
                        self.updated.append(name.replace("_", " "))
                        seqdef_url[name] = subtype["href"]

        for key, val in seqdef_url.items():
            internal_ver = self.db_access.get_version("profile_{}".format(key))
            external_ver = self.external_version(key, val)
            if internal_ver < external_ver:
                self.logger.info(
                    "pubMLST reference for {} updated to {} from {}".format(
                        key.replace("_", " ").capitalize(), external_ver,
                        internal_ver))
                self.download_pubmlst(key, val, force)
                self.db_access.upd_rec(
                    {"name": "profile_{}".format(key)},
                    "Versions",
                    {"version": external_ver},
                )
                self.db_access.reload_profiletable(key)
Example #7
    def __init__(self, config, log, sampleinfo={}, run_settings={}):
        self.config = config
        self.logger = log
        self.batchfile = "/tmp/batchfile.sbatch"

        self.filelist = list()
        if isinstance(run_settings.get("input"), list):
            self.filelist = run_settings.get("input")
            run_settings["input"] = "/tmp/"

        self.run_settings = run_settings
        self.indir = os.path.abspath(run_settings.get("input", "/tmp/"))
        self.trimmed = run_settings.get("trimmed", True)
        self.qc_only = run_settings.get("qc_only", False)
        self.careful = run_settings.get("careful", True)
        self.pool = run_settings.get("pool", [])
        self.finishdir = run_settings.get("finishdir", "")

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

        # If a timestamp is provided, use it as the analysis time; otherwise use the current time
        if run_settings.get("timestamp") is not None:
            self.now = run_settings.get("timestamp")
            temp = run_settings.get("timestamp").replace("_", ".").split(".")
            self.dt = datetime(
                int(temp[0]),
                int(temp[1]),
                int(temp[2]),
                int(temp[3]),
                int(temp[4]),
                int(temp[5]),
            )
        else:
            self.dt = datetime.now()
            self.now = "{}.{}.{}_{}.{}.{}".format(
                self.dt.year,
                self.dt.month,
                self.dt.day,
                self.dt.hour,
                self.dt.minute,
                self.dt.second,
            )

        if run_settings.get("finishdir") is None:
            self.finishdir = "{}/{}_{}".format(config["folders"]["results"],
                                               self.name, self.now)
        self.db_pusher = DB_Manipulator(config, log)
        self.concat_files = dict()
        self.ref_resolver = Referencer(config, log)
Example #8
class Job_Creator:
    def __init__(self, config, log, sampleinfo={}, run_settings={}):
        self.config = config
        self.logger = log
        self.batchfile = "/tmp/batchfile.sbatch"

        self.filelist = list()
        if isinstance(run_settings.get("input"), list):
            self.filelist = run_settings.get("input")
            run_settings["input"] = "/tmp/"

        self.run_settings = run_settings
        self.indir = os.path.abspath(run_settings.get("input", "/tmp/"))
        self.trimmed = run_settings.get("trimmed", True)
        self.qc_only = run_settings.get("qc_only", False)
        self.careful = run_settings.get("careful", True)
        self.pool = run_settings.get("pool", [])
        self.finishdir = run_settings.get("finishdir", "")

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

        # If a timestamp is provided, use it as the analysis time; otherwise use the current time
        if run_settings.get("timestamp") is not None:
            self.now = run_settings.get("timestamp")
            temp = run_settings.get("timestamp").replace("_", ".").split(".")
            self.dt = datetime(
                int(temp[0]),
                int(temp[1]),
                int(temp[2]),
                int(temp[3]),
                int(temp[4]),
                int(temp[5]),
            )
        else:
            self.dt = datetime.now()
            self.now = "{}.{}.{}_{}.{}.{}".format(
                self.dt.year,
                self.dt.month,
                self.dt.day,
                self.dt.hour,
                self.dt.minute,
                self.dt.second,
            )

        if run_settings.get("finishdir") is None:
            self.finishdir = "{}/{}_{}".format(config["folders"]["results"],
                                               self.name, self.now)
        self.db_pusher = DB_Manipulator(config, log)
        self.concat_files = dict()
        self.ref_resolver = Referencer(config, log)

    def get_sbatch(self):
        """ Returns sbatchfile, slightly superflous"""
        return self.batchfile

    def get_headerargs(self):
        headerline = "-A {} -p {} -n {} -t {} -J {}_{} --qos {} --output {}/slurm_{}.log".format(
            self.config["slurm_header"]["project"],
            self.config["slurm_header"]["type"],
            self.config["slurm_header"]["threads"],
            self.config["slurm_header"]["time"],
            self.config["slurm_header"]["job_prefix"],
            self.name,
            self.config["slurm_header"]["qos"],
            self.finishdir,
            self.name,
        )
        return headerline

    def verify_fastq(self):
        """ Uses arg indir to return a dict of PE fastq tuples fulfilling naming convention """
        verified_files = list()
        files = os.listdir(self.indir)
        if files == []:
            raise Exception("Directory {} lacks fastq files.".format(
                self.indir))
        for file in files:
            file_match = re.match(self.config["regex"]["file_pattern"], file)
            if file_match:
                # Check that symlinks resolve
                path = "{}/{}".format(self.indir, file)
                if os.path.islink(path):
                    if not os.path.exists(os.readlink(path)):
                        raise Exception(
                            "Some fastq files are unresolved symlinks in directory {}."
                            .format(self.indir))

                # Make sure both mates exist
                if (file_match[1] == "1" or file_match[1] == "2"
                        or file_match[1] == "forward"
                        or file_match[1] == "reverse"):
                    if file_match[1] == "forward" or file_match[1] == "reverse":
                        pairno = "forward"
                        if "forward" in file_match[1]:
                            pairno = "reverse"
                        pairname = file_match[0].replace(file_match[1], pairno)
                    else:
                        pairno = 2 - 1 % int(file_match[1])  # 1->2, 2->1
                        # Construct mate name
                        pairname = "{}{}{}".format(
                            file_match.string[:file_match.end(1) - 1],
                            pairno,
                            file_match.string[file_match.end(1):file_match.end(
                            )],
                        )
                    if pairname in files:
                        files.pop(files.index(pairname))
                        verified_files.append(file_match[0])
                        verified_files.append(pairname)
                else:
                    raise Exception(
                        "Some fastq files have no mate in directory {}.".
                        format(self.indir))
        if verified_files == []:
            raise Exception(
                "No files in directory {} match file_pattern '{}'.".format(
                    self.indir, self.config["regex"]["file_pattern"]))

        # Warn about file sizes
        for vfile in verified_files:
            try:
                bsize = os.stat("{}/{}".format(self.indir, vfile)).st_size
                bsize = bsize >> 20
                if bsize > 1000:
                    self.logger.warning(
                        "Input fastq {} exceeds 1000MB".format(vfile))
            except Exception as e:
                self.logger.warning(
                    "Unable to verify size of input file {}/{}".format(
                        self.indir, vfile))

        # Warn about invalid fastq files
        for vfile in verified_files:
            with gzip.open("{}/{}".format(self.indir, vfile), "rt") as f:
                lines = f.read().splitlines()
            if len(lines) < 2 or "+" not in lines[-2]:
                self.logger.warning(
                    "Input fastq {} does not seem to end properly".format(
                        vfile))
        return sorted(verified_files)

    def create_assemblysection(self):
        batchfile = open(self.batchfile, "a+")
        # memory is actually 128GB per node regardless of cores.
        batchfile.write("# Spades assembly\n")
        if self.trimmed:
            trimline = "-s {}".format(self.concat_files["i"])
        else:
            trimline = ""
        if self.careful:
            careline = "--careful"
        else:
            careline = ""

        batchfile.write(
            "spades.py --threads {} {} --memory {} -o {}/assembly -1 {} -2 {} {}\n"
            .format(
                self.config["slurm_header"]["threads"],
                careline,
                8 * int(self.config["slurm_header"]["threads"]),
                self.finishdir,
                self.concat_files["f"],
                self.concat_files["r"],
                trimline,
            ))

        batchfile.write(
            "mv {0}/assembly/contigs.fasta {0}/assembly/{1}_contigs.fasta\n".
            format(self.finishdir, self.name))
        batchfile.write(
            "sed -n '/NODE_1000_/q;p' {0}/assembly/{1}_contigs.fasta > {0}/assembly/{1}_trimmed_contigs.fasta\n"
            .format(self.finishdir, self.name))
        # batchfile.write("##Input cleanup\n")
        # batchfile.write("rm -r {}/trimmed\n".format(self.finishdir))
        batchfile.write("\n\n")
        batchfile.close()

    def blast_subset(self, name, search_string):
        # Create run
        file_list = glob.glob(search_string)
        batchfile = open(self.batchfile, "a+")
        batchfile.write("mkdir {}/blast_search/{}\n".format(
            self.finishdir, name))
        blast_format = '"7 stitle sstrand qaccver saccver pident evalue bitscore qstart qend sstart send length"'
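        # outfmt 7 gives commented, tab-separated rows; this column order is
        # exactly what scrape_blast later indexes into (pident at 4, evalue at
        # 5, bitscore at 6, qstart/qend at 7/8, length at 11)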

        if len(file_list) > 1:
            for ref in file_list:
                ref_match = re.search(r"(\w+(?:\-\w+)*)\.\w+",
                                      os.path.basename(ref))
                if ref_match is None:
                    self.logger.error(
                        "File {} does not match typical format. Consider deleting and redownloading"
                        .format(ref))
                    continue
                ref_nosuf = ref_match.group(1)
                batchfile.write("# BLAST {} search for {}, {}\n".format(
                    name, self.sample.get("organism"), ref_nosuf))
                # MLST loci queries get a distinguishing filename prefix
                if name == "mlst":
                    outname = "loci_query_{}".format(ref_nosuf)
                else:
                    outname = ref_nosuf
                batchfile.write(
                    "blastn -db {}/{} -query {}/assembly/{}_contigs.fasta -out {}/blast_search/{}/{}.txt -task megablast -num_threads {} -outfmt {}\n"
                    .format(
                        os.path.dirname(ref),
                        ref_nosuf,
                        self.finishdir,
                        self.name,
                        self.finishdir,
                        name,
                        outname,
                        self.config["slurm_header"]["threads"],
                        blast_format,
                    ))
        elif len(file_list) == 1:
            ref_nosuf = re.search(r"(\w+(?:\-\w+)*)\.\w+",
                                  os.path.basename(file_list[0])).group(1)
            batchfile.write("## BLAST {} search in {}\n".format(
                name,
                self.sample.get("organism").replace("_", " ").capitalize()))
            batchfile.write(
                "blastn -db {}/{}  -query {}/assembly/{}_contigs.fasta -out {}/blast_search/{}/{}.txt -task megablast -num_threads {} -outfmt {}\n"
                .format(
                    os.path.dirname(search_string),
                    ref_nosuf,
                    self.finishdir,
                    self.name,
                    self.finishdir,
                    name,
                    ref_nosuf,
                    self.config["slurm_header"]["threads"],
                    blast_format,
                ))
        batchfile.write("\n")
        batchfile.close()

    def create_variantsection(self):
        """ Creates a job for variant calling based on local alignment """
        ref = "{}/{}.fasta".format(self.config["folders"]["genomes"],
                                   self.sample.get("reference"))
        localdir = "{}/alignment".format(self.finishdir)
        outbase = "{}/{}_{}".format(localdir, self.name,
                                    self.sample.get("reference"))

        # Create run
        batchfile = open(self.batchfile, "a+")
        batchfile.write("# Variant calling based on local alignment\n")
        batchfile.write("mkdir {}\n".format(localdir))

        batchfile.write("## Alignment & Deduplication\n")
        batchfile.write("bwa mem -M -t {} {} {} {} > {}.sam\n".format(
            self.config["slurm_header"]["threads"],
            ref,
            self.concat_files["f"],
            self.concat_files["r"],
            outbase,
        ))
        batchfile.write(
            "samtools view --threads {} -b -o {}.bam -T {} {}.sam\n".format(
                self.config["slurm_header"]["threads"], outbase, ref, outbase))
        batchfile.write(
            "samtools sort --threads {} -o {}.bam_sort {}.bam\n".format(
                self.config["slurm_header"]["threads"], outbase, outbase))
        batchfile.write(
            "picard MarkDuplicates I={}.bam_sort O={}.bam_sort_rmdup M={}.stats.dup REMOVE_DUPLICATES=true\n"
            .format(outbase, outbase, outbase))
        batchfile.write("samtools index {}.bam_sort_rmdup\n".format(outbase))
        batchfile.write(
            "samtools idxstats {}.bam_sort_rmdup &> {}.stats.ref\n".format(
                outbase, outbase))
        # Removal of temp alignment files
        batchfile.write("rm {}.bam {}.sam\n".format(outbase, outbase))

        batchfile.write("## Primary stats generation\n")
        # Insert stats, dedupped
        batchfile.write(
            "picard CollectInsertSizeMetrics I={}.bam_sort_rmdup O={}.stats.ins H={}.hist.ins\n"
            .format(outbase, outbase, outbase))
        # Coverage
        batchfile.write(
            "samtools stats --coverage 1,10000,1 {}.bam_sort_rmdup |grep ^COV | cut -f 2- &> {}.stats.cov\n"
            .format(outbase, outbase))
        # Mapped rate, no dedup,dedup in MWGS (trimming has no effect)!
        batchfile.write(
            "samtools flagstat {}.bam_sort &> {}.stats.map\n".format(
                outbase, outbase))
        # Total reads, no dedup,dedup in MWGS (trimming has no effect)!
        batchfile.write(
            "samtools view -c {}.bam_sort &> {}.stats.raw\n".format(
                outbase, outbase))

        batchfile.write("\n\n")
        batchfile.close()

    def create_preprocsection(self):
        """Concatinates data, possibly trims it, then makes the unstranded reads usable"""
        forward = list()
        reverse = list()
        for root, dirs, files in os.walk(self.config["folders"]["adapters"]):
            if "NexteraPE-PE.fa" not in files:
                self.logger.error(
                    "Adapters folder at {} does not contain NexteraPE-PE.fa. Review paths.yml"
                    .format(self.config["folders"]["adapters"]))
            else:
                break
        trimdir = "{}/trimmed".format(self.finishdir)
        files = self.verify_fastq()
        batchfile = open(self.batchfile, "a+")
        batchfile.write("#Trimmomatic section\n")
        batchfile.write("mkdir {}\n".format(trimdir))

        batchfile.write("##Pre-concatination\n")
        for index, file in enumerate(files):
            fullfile = "{}/{}".format(self.indir, file)
            # Even indexes = forward reads, odd indexes = reverse reads
            if index % 2 == 0:
                forward.append(fullfile)
            else:
                reverse.append(fullfile)
        outfile = files[0].split("_")[0]

        self.concat_files["f"] = "{}/trimmed/{}_forward_reads.fastq.gz".format(
            self.finishdir, self.name)
        self.concat_files["r"] = "{}/trimmed/{}_reverse_reads.fastq.gz".format(
            self.finishdir, self.name)
        batchfile.write("cat {} > {}\n".format(" ".join(forward),
                                               self.concat_files.get("f")))
        batchfile.write("cat {} > {}\n".format(" ".join(reverse),
                                               self.concat_files.get("r")))

        if self.trimmed:
            fp = "{}/{}_trim_front_pair.fastq.gz".format(trimdir, outfile)
            fu = "{}/{}_trim_front_unpair.fastq.gz".format(trimdir, outfile)
            rp = "{}/{}_trim_rev_pair.fastq.gz".format(trimdir, outfile)
            ru = "{}/{}_trim_rev_unpair.fastq.gz".format(trimdir, outfile)
            batchfile.write("##Trimming section\n")
            batchfile.write(
                "trimmomatic PE -threads {} -phred33 {} {} {} {} {} {}\
      ILLUMINACLIP:{}/NexteraPE-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36\n"
                .format(
                    self.config["slurm_header"]["threads"],
                    self.concat_files.get("f"),
                    self.concat_files.get("r"),
                    fp,
                    fu,
                    rp,
                    ru,
                    self.config["folders"]["adapters"],
                ))

            batchfile.write("## Interlaced trimmed files\n")
            self.concat_files["f"] = fp
            self.concat_files["r"] = rp
            self.concat_files["i"] = "{}/{}_trim_unpair.fastq.gz".format(
                trimdir, outfile)

            batchfile.write("cat {} >> {}\n".format(" ".join(
                [fu, ru]), self.concat_files.get("i")))
        batchfile.write("\n")
        batchfile.close()

    def create_assemblystats_section(self):
        batchfile = open(self.batchfile, "a+")
        batchfile.write("# QUAST QC metrics\n")
        batchfile.write("mkdir {}/assembly/quast\n".format(self.finishdir))
        batchfile.write(
            "quast.py {}/assembly/{}_contigs.fasta -o {}/assembly/quast\n".
            format(self.finishdir, self.name, self.finishdir))
        batchfile.write(
            "mv {}/assembly/quast/report.tsv {}/assembly/quast/{}_report.tsv\n\n"
            .format(self.finishdir, self.finishdir, self.name))
        batchfile.close()

    def create_snpsection(self):
        snplist = self.filelist.copy()
        batchfile = open(self.batchfile, "a+")
        name = ""

        # VCFTools filters:
        vcffilter = "--minQ 30 --thin 50 --minDP 3 --min-meanDP 20"
        # BCFTools filters:
        bcffilter = "GL[0]<-500 & GL[1]=0 & QR/RO>30 & QA/AO>30 & QUAL>5000 & ODDS>1100 & GQ>140 & DP>100 & MQM>59 & SAP<15 & PAIRED>0.9 & EPP>3"

        for item in snplist:
            if item.count("/") >= 2:
                name = item.split("/")[-2]
            if "_" in name:
                name = name.split("_")[0]
            batchfile.write("# Basecalling for sample {}\n".format(name))
            ref = "{}/{}.fasta".format(self.config["folders"]["genomes"],
                                       self.sample.get("reference"))
            outbase = "{}/{}_{}".format(item, name,
                                        self.sample.get("reference"))
            batchfile.write(
                "samtools view -h -q 1 -F 4 -F 256 {}.bam_sort_rmdup | grep -v XA:Z | grep -v SA:Z| samtools view -b - > {}/{}.unique\n"
                .format(outbase, self.finishdir, name))
            batchfile.write(
                "freebayes -= --pvar 0.7 -j -J --standard-filters -C 6 --min-coverage 30 --ploidy 1 -f {} -b {}/{}.unique -v {}/{}.vcf\n"
                .format(ref, self.finishdir, name, self.finishdir, name))
            batchfile.write(
                "bcftools view {}/{}.vcf -o {}/{}.bcf.gz -O b --exclude-uncalled --types snps\n"
                .format(self.finishdir, name, self.finishdir, name))
            batchfile.write("bcftools index {}/{}.bcf.gz\n".format(
                self.finishdir, name))
            batchfile.write("\n")

            batchfile.write(
                "vcftools --bcf {}/{}.bcf.gz {} --remove-filtered-all --recode-INFO-all --recode-bcf --out {}/{}\n"
                .format(self.finishdir, name, vcffilter, self.finishdir, name))
            batchfile.write(
                'bcftools view {}/{}.recode.bcf -i "{}" -o {}/{}.recode.bcf.gz -O b --exclude-uncalled --types snps\n'
                .format(self.finishdir, name, bcffilter, self.finishdir, name))
            batchfile.write("bcftools index {}/{}.recode.bcf.gz\n\n".format(
                self.finishdir, name))

        batchfile.write("# SNP pair-wise distance\n")
        batchfile.write("touch {}/stats.out\n".format(self.finishdir))
        while len(snplist) > 1:
            nameOne = ""
            nameTwo = ""
            top = snplist.pop(0)
            if top.count("/") >= 2:
                nameOne = top.split("/")[-2]
            if "_" in nameOne:
                nameOne = nameOne.split("_")[0]
            for entry in snplist:
                if entry.count("/") >= 2:
                    nameTwo = entry.split("/")[-2]
                if "_" in nameTwo:
                    nameTwo = nameTwo.split("_")[0]

                pair = "{}_{}".format(nameOne, nameTwo)
                batchfile.write(
                    "bcftools isec {}/{}.recode.bcf.gz {}/{}.recode.bcf.gz -n=1 -c all -p {}/tmp -O b\n"
                    .format(self.finishdir, nameOne, self.finishdir, nameTwo,
                            self.finishdir))
                batchfile.write(
                    "bcftools merge -O b -o {}/{}.bcf.gz --force-samples {}/tmp/0000.bcf {}/tmp/0001.bcf\n"
                    .format(self.finishdir, pair, self.finishdir,
                            self.finishdir))
                batchfile.write("bcftools index {}/{}.bcf.gz\n".format(
                    self.finishdir, pair))

                batchfile.write(
                    "echo {} $( bcftools stats {}/{}.bcf.gz |grep SNPs: | cut -d $'\\t' -f4 ) >> {}/stats.out\n"
                    .format(pair, self.finishdir, pair, self.finishdir))
                batchfile.write("\n")
        batchfile.close()

    def create_collection(self):
        """Creates collection entry in database"""
        if self.db_pusher.exists("Collections", {"ID_collection": self.name}):
            self.db_pusher.purge_rec(name=self.name, type="Collections")
            for sample in self.pool:
                self.db_pusher.add_rec(
                    {
                        "ID_collection": self.name,
                        "CG_ID_sample": sample
                    }, "Collections")

        addedprojs = list()
        for sample in self.pool:
            proj = re.search(r"(\w+)A(?:\w+)", sample).group(1)
            if proj not in addedprojs:
                self.create_project(proj)
                addedprojs.append(proj)

    def create_project(self, name):
        """Creates project in database"""
        proj_col = dict()
        proj_col["CG_ID_project"] = name
        proj_col["Customer_ID_project"] = self.sample.get(
            "Customer_ID_project")
        proj_col["Customer_ID"] = self.sample.get("Customer_ID")
        self.db_pusher.add_rec(proj_col, "Projects")
        self.db_pusher.upd_rec({"CG_ID_project": name}, "Projects", proj_col)

    def create_sample(self, name):
        """Creates sample in database"""
        try:
            sample_col = self.db_pusher.get_columns("Samples")
            sample_col["CG_ID_sample"] = self.sample.get("CG_ID_sample")
            sample_col["CG_ID_project"] = self.sample.get("CG_ID_project")
            sample_col["Customer_ID_sample"] = self.sample.get(
                "Customer_ID_sample")
            sample_col["reference_genome"] = self.sample.get("reference")
            sample_col["reference_length"] = self.sample.get(
                "reference_length")
            sample_col["date_analysis"] = self.dt
            sample_col["organism"] = self.sample.get("organism")
            sample_col["application_tag"] = self.sample.get("application_tag")
            sample_col["priority"] = self.sample.get("priority")
            sample_col["date_arrival"] = datetime.strptime(
                self.sample.get("date_arrival"), "%Y-%m-%d %H:%M:%S")
            sample_col["date_sequencing"] = datetime.strptime(
                self.sample.get("date_sequencing"), "%Y-%m-%d %H:%M:%S")
            sample_col["date_libprep"] = datetime.strptime(
                self.sample.get("date_libprep"), "%Y-%m-%d %H:%M:%S")
            sample_col["method_libprep"] = self.sample.get("method_libprep")
            sample_col["method_sequencing"] = self.sample.get(
                "method_sequencing")
            # self.db_pusher.purge_rec(sample_col['CG_ID_sample'], 'sample')
            self.db_pusher.add_rec(sample_col, "Samples")
        except Exception as e:
            self.logger.error(
                "Unable to add sample {} to database: {}".format(
                    self.name, str(e)))

    def project_job(self, single_sample=False):
        if "dry" in self.config and self.config["dry"] == True:
            dry = True
        else:
            dry = False
        jobarray = list()
        if not os.path.exists(self.finishdir):
            os.makedirs(self.finishdir)
        # Loads project level info.
        try:
            if single_sample:
                self.create_project(self.sample.get("CG_ID_project"))
            elif self.pool:
                self.create_collection()
            else:
                self.create_project(self.name)
        except Exception as e:
            self.logger.error(
                "LIMS interaction failed. Unable to read/write project {}".
                format(self.name))
        # Writes the job creation sbatch
        if single_sample:
            try:
                self.sample_job()
                headerargs = self.get_headerargs()
                outfile = self.get_sbatch()
                bash_cmd = "sbatch {} {}".format(headerargs, outfile)
                if not dry and outfile != "":
                    samproc = subprocess.Popen(bash_cmd.split(),
                                               stdout=subprocess.PIPE)
                    output, error = samproc.communicate()
                    jobno = re.search(r"(\d+)", str(output)).group(0)
                    jobarray.append(jobno)
                else:
                    self.logger.info("Suppressed command: {}".format(bash_cmd))
            except Exception as e:
                self.logger.error("Unable to analyze single sample {}".format(
                    self.name))
        else:
            for ldir in glob.glob("{}/*/".format(self.indir)):
                ldir = os.path.basename(os.path.normpath(ldir))
                try:
                    sample_in = "{}/{}".format(self.indir, ldir)
                    sample_out = "{}/{}".format(self.finishdir, ldir)
                    linkedjson = None
                    local_sampleinfo = [
                        p for p in self.sampleinfo if p["CG_ID_sample"] == ldir
                    ]
                    if local_sampleinfo == []:
                        raise Exception(
                            "Sample {} has no counterpart in json file".format(
                                ldir))
                    else:
                        local_sampleinfo = local_sampleinfo[0]
                    sample_settings = dict(self.run_settings)
                    sample_settings["input"] = sample_in
                    sample_settings["finishdir"] = sample_out
                    sample_settings["timestamp"] = self.now
                    sample_instance = Job_Creator(
                        config=self.config,
                        log=self.logger,
                        sampleinfo=local_sampleinfo,
                        run_settings=sample_settings,
                    )
                    sample_instance.sample_job()
                    headerargs = sample_instance.get_headerargs()
                    outfile = ""
                    if os.path.isfile(sample_instance.get_sbatch()):
                        outfile = sample_instance.get_sbatch()
                    bash_cmd = "sbatch {} {}".format(headerargs, outfile)
                    if not dry and outfile != "":
                        projproc = subprocess.Popen(bash_cmd.split(),
                                                    stdout=subprocess.PIPE)
                        output, error = projproc.communicate()
                        jobno = re.search(r"(\d+)", str(output)).group(0)
                        jobarray.append(jobno)
                    else:
                        self.logger.info(
                            "Suppressed command: {}".format(bash_cmd))
                except Exception as e:
                    self.logger.error(
                        "Unable to analyze sample {}: {}".format(
                            ldir, str(e)))
        if not dry:
            self.finish_job(jobarray, single_sample)

    def finish_job(self, joblist, single_sample=False):
        """ Uploads data and sends an email once all analysis jobs are complete. """
        report = "default"
        if self.qc_only:
            report = "qc"
        custom_conf = ""
        if "config_path" in self.config:
            custom_conf = "--config {}".format(self.config["config_path"])

        process = subprocess.Popen("id -un".split(), stdout=subprocess.PIPE)
        user, error = process.communicate()
        user = str(user).replace(".", " ").title()
        # if not os.path.exists(self.finishdir):
        #  os.makedirs(self.finishdir)

        startfile = "{}/run_started.out".format(self.finishdir)
        configfile = "{}/config.log".format(self.finishdir)
        mailfile = "{}/mailjob.sh".format(self.finishdir)
        samplefile = "{}/sampleinfo.json".format(self.finishdir)
        with open(samplefile, "w+") as outfile:
            json.dump(self.sampleinfo, outfile)

        sb = open(startfile, "w+")
        cb = open(configfile, "w+")
        mb = open(mailfile, "w+")

        sb.write("#!/usr/bin/env bash\n")
        sb.close()
        configout = self.config.copy()
        if "genologics" in configout:
            del configout["genologics"]
        cb.write("ANALYSIS STARTED BY: {}\n".format(user))
        cb.write(json.dumps(configout, indent=2, separators=(",", ":")))
        cb.close()
        mb.write("#!/usr/bin/env bash\n\n")
        mb.write(
            "#Uploading of results to database and production of report\n")
        if "MICROSALT_CONFIG" in os.environ:
            mb.write("export MICROSALT_CONFIG={}\n".format(
                os.environ["MICROSALT_CONFIG"]))
        mb.write("source activate $CONDA_DEFAULT_ENV\n")

        mb.write(
            "microSALT utils finish {0}/sampleinfo.json --input {0} --email {1} --report {2} {3}\n"
            .format(
                self.finishdir,
                self.config["regex"]["mail_recipient"],
                report,
                custom_conf,
            ))
        mb.write("touch {}/run_complete.out".format(self.finishdir))
        mb.close()

        massagedJobs = list()
        final = ":".join(joblist)
        # Create subtracker if more than 50 samples
        maxlen = 50
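        # Chunks of maxlen job ids keep each sbatch --dependency list short;
        # each chunk's subtracker job id is chained onto the next chunk below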
        if len(joblist) > maxlen:
            i = 1
            while i <= len(joblist):
                if i + maxlen < len(joblist):
                    massagedJobs.append(":".join(joblist[i - 1:i + maxlen -
                                                         1]))
                else:
                    massagedJobs.append(":".join(joblist[i - 1:-1]))
                i += maxlen
            for entry in massagedJobs:
                if massagedJobs.index(entry) < len(massagedJobs) - 1:
                    head = "-A {} -p core -n 1 -t 00:00:10 -J {}_{}_SUBTRACKER --qos {} --dependency=afterany:{}".format(
                        self.config["slurm_header"]["project"],
                        self.config["slurm_header"]["job_prefix"],
                        self.name,
                        self.config["slurm_header"]["qos"],
                        entry,
                    )
                    bash_cmd = "sbatch {} {}".format(head, startfile)
                    mailproc = subprocess.Popen(bash_cmd.split(),
                                                stdout=subprocess.PIPE)
                    output, error = mailproc.communicate()
                    jobno = re.search(r"(\d+)", str(output)).group(0)
                    massagedJobs[massagedJobs.index(entry) +
                                 1] += ":{}".format(jobno)
                else:
                    final = entry
                    break

        head = "-A {} -p core -n 1 -t 6:00:00 -J {}_{}_MAILJOB --qos {} --open-mode append --dependency=afterany:{} --output {}".format(
            self.config["slurm_header"]["project"],
            self.config["slurm_header"]["job_prefix"],
            self.name,
            self.config["slurm_header"]["qos"],
            final,
            self.config["folders"]["log_file"],
            self.config["regex"]["mail_recipient"],
        )
        bash_cmd = "sbatch {} {}".format(head, mailfile)
        mailproc = subprocess.Popen(bash_cmd.split(), stdout=subprocess.PIPE)
        output, error = mailproc.communicate()

        try:
            jobno = str(re.search(r"(\d+)", str(output)).group(0))
            joblist.append(jobno)
        except Exception as e:
            self.logger.info("Unable to grab SLURMID for {0}".format(
                self.name))

        try:
            # Generates a file with all slurm ids
            slurmname = "{}_slurm_ids.yaml".format(self.name)
            slurmreport_storedir = Path(self.config["folders"]["reports"],
                                        "trailblazer", slurmname)
            slurmreport_workdir = Path(self.finishdir, slurmname)
            with open(slurmreport_workdir, "w") as slurmstream:
                yaml.safe_dump(data={"jobs": [str(job) for job in joblist]},
                               stream=slurmstream)
            shutil.copyfile(slurmreport_workdir, slurmreport_storedir)
            self.logger.info(
                "Saved Trailblazer slurm report file to %s and %s",
                slurmreport_storedir,
                slurmreport_workdir,
            )
        except Exception as e:
            self.logger.info(
                "Unable to generate Trailblazer slurm report file")

    def sample_job(self):
        """ Writes necessary sbatch job for each individual sample """
        try:
            if not os.path.exists(self.finishdir):
                os.makedirs(self.finishdir)
            try:
                # This is one job
                self.batchfile = "{}/runfile.sbatch".format(self.finishdir)
                batchfile = open(self.batchfile, "w+")
                batchfile.write("#!/bin/sh\n\n")
                batchfile.write("mkdir -p {}\n".format(self.finishdir))
                batchfile.close()
                self.create_preprocsection()
                self.create_variantsection()
                if not self.qc_only:
                    self.create_assemblysection()
                    self.create_assemblystats_section()
                    self.create_blast_search()
                batchfile = open(self.batchfile, "a+")
                batchfile.close()

                self.logger.info(
                    "Created runfile for sample {} in folder {}".format(
                        self.name, self.finishdir))
            except Exception as e:
                raise
            try:
                self.create_sample(self.name)
            except Exception as e:
                self.logger.error(
                    "Unable to access LIMS info for sample {}".format(
                        self.name))
        except Exception as e:
            self.logger.error(
                "Unable to create job for sample {}\nSource: {}".format(
                    self.name, str(e)))
            shutil.rmtree(self.finishdir, ignore_errors=True)
            raise

    def create_blast_search(self):
        reforganism = self.ref_resolver.organism2reference(
            self.sample.get("organism"))
        self.batchfile = "{}/runfile.sbatch".format(self.finishdir)
        batchfile = open(self.batchfile, "a+")
        batchfile.write("mkdir -p {}/blast_search\n".format(self.finishdir))
        batchfile.close()
        self.blast_subset(
            "mlst",
            "{}/{}/*.tfa".format(self.config["folders"]["references"],
                                 reforganism),
        )
        self.blast_subset(
            "resistance",
            "{}/*.fsa".format(self.config["folders"]["resistances"]))
        if reforganism == "escherichia_coli":
            ss = "{}/*{}".format(
                os.path.dirname(self.config["folders"]["expec"]),
                os.path.splitext(self.config["folders"]["expec"])[1],
            )
            self.blast_subset("expec", ss)

    def snp_job(self):
        """ Writes a SNP calling job for a set of samples """
        if not os.path.exists(self.finishdir):
            os.makedirs(self.finishdir)

        self.batchfile = "{}/runfile.sbatch".format(self.finishdir)
        batchfile = open(self.batchfile, "w+")
        batchfile.write("#!/usr/bin/env bash\n")
        batchfile.write("mkdir -p {}\n\n".format(self.finishdir))
        batchfile.close()

        self.create_snpsection()
        batchfile = open(self.batchfile, "a+")
        batchfile.close()

        headerline = (
            "-A {} -p {} -n 1 -t 24:00:00 -J {}_{} --qos {} --output {}/slurm_{}.log"
            .format(
                self.config["slurm_header"]["project"],
                self.config["slurm_header"]["type"],
                self.config["slurm_header"]["job_prefix"],
                self.name,
                self.config["slurm_header"]["qos"],
                self.finishdir,
                self.name,
            ))
        outfile = self.get_sbatch()
        bash_cmd = "sbatch {} {}".format(headerline, outfile)
        samproc = subprocess.Popen(bash_cmd.split(), stdout=subprocess.PIPE)
        output, error = samproc.communicate()
def dbm():
    db_file = re.search(
        'sqlite:///(.+)',
        preset_config['database']['SQLALCHEMY_DATABASE_URI']).group(1)
    # Start from a clean database file (assumes a previous one may linger)
    if os.path.exists(db_file):
        os.remove(db_file)
    dbm = DB_Manipulator(config=preset_config, log=logger)
    dbm.create_tables()

    for entry in unpack_db_json('sampleinfo_projects.json'):
        dbm.add_rec(entry, 'Projects')
    for entry in unpack_db_json('sampleinfo_mlst.json'):
        dbm.add_rec(entry, 'Seq_types')
    for entry in unpack_db_json('sampleinfo_resistance.json'):
        dbm.add_rec(entry, 'Resistances')
    for entry in unpack_db_json('sampleinfo_expec.json'):
        dbm.add_rec(entry, 'Expacs')
    for entry in unpack_db_json('sampleinfo_reports.json'):
        dbm.add_rec(entry, 'Reports')
    return dbm
Example #10
0
class Scraper:
    def __init__(self, config, log, sampleinfo={}, input=""):
        self.config = config
        self.logger = log
        self.db_pusher = DB_Manipulator(config, log)
        self.referencer = Referencer(config, log)
        self.job_fallback = Job_Creator(config=config,
                                        log=log,
                                        sampleinfo=sampleinfo)
        self.infolder = os.path.abspath(input)
        self.sampledir = ""

        # Since all scraped folders are generated by Job_Creator, datestring is automatically provided in last folder
        last_folder = self.infolder.split("/")[-1]
        self.name = last_folder.split("_")[0]

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_sample") == self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

        self.gene2resistance = self.load_resistances()

    def scrape_project(self, project=None):
        """Scrapes a project folder for information"""
        if project is None:
            project = self.name
        self.db_pusher.purge_rec(project, "Projects")
        if not self.db_pusher.exists("Projects", {"CG_ID_project": project}):
            self.logger.warning("Replacing project {}".format(project))
            self.job_fallback.create_project(project)

        # Scrape order matters a lot!
        for dir in os.listdir(self.infolder):
            subdir = "{}/{}".format(self.infolder, dir)
            local_param = [
                p for p in self.sampleinfo if p["CG_ID_sample"] == dir
            ]
            if local_param != []:
                local_param = local_param[0]
                sample_scraper = Scraper(
                    config=self.config,
                    log=self.logger,
                    sampleinfo=local_param,
                    input=subdir,
                )
                sample_scraper.scrape_sample()
            else:
                self.logger.warning(
                    "Skipping {} due to lacking info in sample_json file".
                    format(dir))

    def scrape_sample(self, sample=None):
        """Scrapes a sample folder for information"""
        if sample is None:
            sample = self.name
        self.db_pusher.purge_rec(sample, "Samples")

        if not self.db_pusher.exists(
                "Projects",
            {"CG_ID_project": self.sample.get("CG_ID_project")}):
            self.logger.warning("Replacing project {}".format(
                self.sample.get("CG_ID_project")))
            self.job_fallback.create_project(self.sample.get("CG_ID_project"))

        if not self.db_pusher.exists("Samples", {"CG_ID_sample": sample}):
            self.logger.warning("Replacing sample {}".format(sample))
            self.job_fallback.create_sample(sample)

        # Scrape order matters a lot!
        self.sampledir = self.infolder
        self.scrape_blast(type="seq_type")
        self.scrape_blast(type="resistance")
        if (self.referencer.organism2reference(
                self.sample.get("organism")) == "escherichia_coli"):
            self.scrape_blast(type="expec")
        self.scrape_alignment()
        self.scrape_quast()

    def scrape_quast(self, filename=""):
        """Scrapes a quast report for assembly information"""
        if filename == "":
            filename = "{}/assembly/quast/report.tsv".format(self.sampledir)
        quast = dict()
        try:
            with open(filename, "r") as infile:
                for line in infile:
                    lsplit = line.rstrip().split("\t")
                    if lsplit[0] == "# contigs":
                        quast["contigs"] = int(lsplit[1])
                    elif lsplit[0] == "Total length":
                        quast["genome_length"] = int(lsplit[1])
                    elif lsplit[0] == "GC (%)":
                        quast["gc_percentage"] = float(lsplit[1])
                    elif lsplit[0] == "N50":
                        quast["n50"] = int(lsplit[1])

            self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                   quast)
            self.logger.debug("Project {} recieved quast stats: {}".format(
                self.name, quast))
        except Exception as e:
            self.logger.warning(
                "Cannot generate quast statistics for {}".format(self.name))

    def get_locilengths(self, foldername, suffix):
        """ Generates a dict of sequence lengths for all loci in a reference folder """
        # Create dict with full fasta header as key, concatenated nucleotides as value.
        alleles = dict()
        finalalleles = dict()
        for file in os.listdir(foldername):
            if file.endswith(suffix):
                lastallele = ""
                with open("{}/{}".format(foldername, file), "r") as f:
                    for row in f:
                        if ">" in row:
                            lastallele = row.strip()
                            alleles[lastallele] = ""
                        else:
                            alleles[lastallele] += row.strip()
        for k, v in alleles.items():
            finalalleles[k] = len(v)
        return finalalleles

    def scrape_blast(self, type="", file_list=[]):
        hypo = list()
        type2db = type.capitalize() + "s"
        if type == "expec":
            type2db = "Expacs"

        if file_list == []:
            if type == "seq_type":
                file_list = glob.glob("{}/blast_search/mlst/*".format(
                    self.sampledir))
            else:
                file_list = glob.glob("{}/blast_search/{}/*".format(
                    self.sampledir, type))

        organism = self.referencer.organism2reference(
            self.sample.get("organism"))
        if organism:
            self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                   {"organism": organism})
        res_cols = self.db_pusher.get_columns("{}".format(type2db))

        try:
            old_ref = ""
            for file in file_list:
                filename = os.path.basename(file).rsplit(
                    ".", 1)[0]  # Removes suffix
                if filename == "lactam":
                    filename = "beta-lactam"
                if type == "resistance":
                    ref_folder = self.config["folders"]["resistances"]
                    suffix = "fsa"
                elif type == "expec":
                    ref_folder = os.path.dirname(
                        self.config["folders"]["expec"])
                    suffix = os.path.basename(
                        self.config["folders"]["expec"]).rsplit(".", 1)[1]
                elif type == "seq_type":
                    ref_folder = "{}/{}".format(
                        self.config["folders"]["references"], organism)
                    suffix = "tfa"
                locilengths = self.get_locilengths(ref_folder, suffix)

                with open("{}".format(file), "r") as sample:
                    for line in sample:
                        # Ignore commented fields
                        if not line[0] == "#":

                            elem_list = line.rstrip().split("\t")
                            if not elem_list[1] == "N/A":
                                hypo.append(dict())
                                hypo[-1]["CG_ID_sample"] = self.name
                                hypo[-1]["identity"] = elem_list[4]
                                hypo[-1]["evalue"] = elem_list[5]
                                hypo[-1]["bitscore"] = elem_list[6]
                                if int(elem_list[7]) < int(elem_list[8]):
                                    hypo[-1]["contig_start"] = int(
                                        elem_list[7])
                                    hypo[-1]["contig_end"] = int(elem_list[8])
                                else:
                                    hypo[-1]["contig_start"] = int(
                                        elem_list[8])
                                    hypo[-1]["contig_end"] = int(elem_list[7])
                                hypo[-1]["subject_length"] = int(elem_list[11])

                                if type == "resistance":
                                    hypo[-1]["instance"] = filename
                                    partials = re.search(
                                        r"(?:\>)*(.+)_(\d+){1,3}(?:_(.+))*",
                                        elem_list[3],
                                    )
                                    hypo[-1]["reference"] = partials.group(3)
                                    hypo[-1]["gene"] = partials.group(1)
                                    gene = hypo[-1]["gene"]
                                    if gene in self.gene2resistance:
                                        hypo[-1]["resistance"] = (
                                            self.gene2resistance[gene])
                                    else:
                                        hypo[-1]["resistance"] = (
                                            hypo[-1]["instance"].capitalize())
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])
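                                    # span is hit length over full locus
                                    # length, so ~1.0 means the gene was
                                    # recovered full-length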

                                elif type == "expec":
                                    hypo[-1]["instance"] = filename
                                    # Thanks, precompiled list standards
                                    if ">" in elem_list[3]:
                                        partials = re.search(
                                            r">*(\w+_\w+\.*\w+).+\((\w+)\).+\((\w+)\)_(\w+)_\[.+\]",
                                            elem_list[3],
                                        )
                                    else:
                                        partials = re.search(
                                            r"(\w+)\(gb\|\w+\)_\((\S+)\)_(.+)_\[(\S+)_.+\]_\[\S+\]",
                                            elem_list[3],
                                        )
                                    if not partials:
                                        partials = re.search(
                                            r"(\w+\.*\w+)\:*\w*_*(?:\(\w+\-\w+\))*_\((\w+)\)_([^[]+)\[\S+\]",
                                            elem_list[3],
                                        )
                                    # NC/Protein reference
                                    hypo[-1]["reference"] = partials.group(1)
                                    # Full gene name
                                    hypo[-1]["gene"] = partials.group(2)
                                    # More generic group
                                    hypo[-1]["instance"] = partials.group(
                                        3).strip("_")
                                    # Description
                                    if len(partials.groups()) >= 4:
                                        hypo[-1]["virulence"] = (
                                            partials.group(4).replace(
                                                "_", " ").capitalize())
                                    else:
                                        hypo[-1]["virulence"] = ""
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])

                                elif type == "seq_type":
                                    partials = re.search(
                                        r"(.+)_(\d+){1,3}(?:_(\w+))*",
                                        elem_list[3])
                                    hypo[-1]["loci"] = partials.group(1)
                                    hypo[-1]["allele"] = int(partials.group(2))
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])

                                # split elem 2 into contig node_NO, length, cov
                                nodeinfo = elem_list[2].split("_")
                                hypo[-1]["contig_name"] = "{}_{}".format(
                                    nodeinfo[0], nodeinfo[1])
                                hypo[-1]["contig_length"] = int(nodeinfo[3])
                                hypo[-1]["contig_coverage"] = nodeinfo[5]
                                self.logger.debug(
                                    "scrape_blast scrape loop hit")
            self.logger.info("{} candidate {} hits found".format(
                len(hypo), type2db))
        except Exception as e:
            self.logger.error(
                "Unable to process blast hit pattern: {}".format(str(e)))

        # Cleanup of overlapping hits
        if type == "seq_type":
            identifier = "loci"
        elif type == "resistance" or type == "expec":
            identifier = "gene"
        ind = 0
        while ind < len(hypo) - 1:
            targ = ind + 1
            while targ < len(hypo):
                ignore = False
                if (hypo[ind]["contig_name"] == hypo[targ]["contig_name"]
                        or hypo[ind][identifier] == hypo[targ][identifier]):
                    # Overlapping or shared gene
                    if ((hypo[ind].get("contig_start") >=
                         hypo[targ].get("contig_start")
                         and hypo[ind].get("contig_start") <=
                         hypo[targ].get("contig_end"))
                            or (hypo[ind].get("contig_end") >=
                                hypo[targ].get("contig_start")
                                and hypo[ind].get("contig_end") <=
                                hypo[targ].get("contig_end"))
                            or (hypo[ind].get(identifier)
                                == hypo[targ].get(identifier))):
                        # Rightmost is worse
                        if float(hypo[ind].get("identity")) * (
                                1 - abs(1 - hypo[ind].get("span"))) > float(
                                    hypo[targ].get("identity")) * (
                                        1 - abs(1 - hypo[targ].get("span"))):
                            del hypo[targ]
                            ignore = True
                        # Leftmost is worse
                        elif float(hypo[ind].get("identity")) * (
                                1 - abs(1 - hypo[ind].get("span"))) < float(
                                    hypo[targ].get("identity")) * (
                                        1 - abs(1 - hypo[targ].get("span"))):
                            del hypo[ind]
                            targ = ind + 1
                            ignore = True
                        # Identical identity and span; separate based on contig coverage
                        else:
                            # Rightmost is worse
                            if float(
                                    hypo[ind].get("contig_coverage")) >= float(
                                        hypo[targ].get("contig_coverage")):
                                del hypo[targ]
                                ignore = True
                            # Leftmost is worse
                            elif float(
                                    hypo[ind].get("contig_coverage")) < float(
                                        hypo[targ].get("contig_coverage")):
                                del hypo[ind]
                                targ = ind + 1
                                ignore = True
                if not ignore:
                    targ += 1
            ind += 1

        self.logger.info(
            "{} {} hits were added after removing overlaps and duplicate hits".
            format(len(hypo), type))
        for hit in hypo:
            self.logger.debug("Kept {}:{} with span {} and id {}".format(
                hit.get("loci"),
                hit.get("allele"),
                hit.get("span"),
                hit.get("identity"),
            ))
            self.db_pusher.add_rec(hit, "{}".format(type2db))

        if type == "seq_type":
            try:
                ST = self.db_pusher.alleles2st(self.name)
                self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                       {"ST": ST})
                self.logger.info("Sample {} received ST {}".format(
                    self.name, ST))
            except Exception as e:
                self.logger.warning(
                    "Unable to type sample {} due to data value '{}'".format(
                        self.name, str(e)))

    def load_resistances(self):
        """Legacy function, loads common resistance names for genes from notes file"""
        conversions = dict()
        try:
            with open("{}/notes.txt".format(
                    self.config["folders"]["resistances"])) as fh:
                for line in fh:
                    if "#" not in line:
                        line = line.split(":")
                        cropped = re.sub(" resistance", "", line[1])
                        conversions[line[0]] = cropped
                        # Workaround for case issues
                        conversions[line[0].lower()] = cropped
        except Exception as e:
            self.logger.error(
                "Unable to initialize trivial names for resistances ({})".
                format(e))
        return conversions

    def scrape_alignment(self, file_list=[]):
        """Scrapes a single alignment result"""
        if file_list == []:
            file_list = glob.glob("{}/alignment/*.stats.*".format(
                self.sampledir))
        ins_list = list()
        cov_dict = dict()
        align_dict = dict()
        align_dict["reference_genome"] = self.sample.get("reference")

        # Reading
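        # Each stats suffix maps back to its producer in create_variantsection:
        # .raw = samtools view -c, .ins = picard CollectInsertSizeMetrics,
        # .cov = samtools stats COV rows, .ref = samtools idxstats,
        # .dup = picard MarkDuplicates, .map = samtools flagstat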
        file_list = glob.glob("{}/alignment/*.stats.*".format(self.sampledir))
        map_rate = 0.0
        median_ins = 0
        ref_len = 0.0
        tot_reads = 0
        tot_map = 0
        duprate = 0.0
        for file in file_list:
            with open(file, "r") as fh:
                type = file.split(".")[-1]
                for line in fh.readlines():
                    lsplit = line.rstrip().split("\t")
                    if type == "raw":
                        try:
                            tot_reads = int(lsplit[0])
                        except Exception as e:
                            pass
                    elif type == "ins":
                        if len(lsplit) >= 18 and lsplit[-12] in ["FF", "FR"]:
                            try:
                                median_ins = int(lsplit[0])
                            except Exception as e:
                                pass
                    elif type == "cov":
                        cov_dict[lsplit[1]] = int(lsplit[2])
                    elif type == "ref":
                        if lsplit[0] != "*" and len(lsplit) >= 2:
                            ref_len = ref_len + int(lsplit[1])
                    elif type == "dup":
                        if lsplit[0] == "Unknown Library":
                            try:
                                duprate = float(lsplit[8])
                            except Exception as e:
                                duprate = -1.0
                    elif type == "map":
                        dsplit = line.rstrip().split(" ")
                        if len(dsplit) >= 5 and dsplit[4] == "total":
                            tot_map = int(dsplit[0])
                        elif len(dsplit) >= 4 and dsplit[3] == "mapped":
                            if tot_map > 0:
                                map_rate = int(dsplit[0]) / float(tot_map)

        # Mangling
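        # cov_dict maps depth -> bases observed at that depth; e.g. a
        # hypothetical {10: 500, 35: 1500} gives plus30 = 1500 and
        # average coverage = (10*500 + 35*1500) / ref_len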
        sumz, plus10, plus30, plus50, plus100, total = 0, 0, 0, 0, 0, 0
        for k, v in cov_dict.items():
            sumz += int(k) * v
            total += v
            if int(k) > 10:
                plus10 += v
            if int(k) > 30:
                plus30 += v
            if int(k) > 50:
                plus50 += v
            if int(k) > 100:
                plus100 += v
        if total > 0 and ref_len > 0:
            align_dict["coverage_10x"] = plus10 / float(ref_len)
            align_dict["coverage_30x"] = plus30 / float(ref_len)
            align_dict["coverage_50x"] = plus50 / float(ref_len)
            align_dict["coverage_100x"] = plus100 / float(ref_len)
        else:
            align_dict["coverage_10x"] = 0.0
            align_dict["coverage_30x"] = 0.0
            align_dict["coverage_50x"] = 0.0
            align_dict["coverage_100x"] = 0.0

        align_dict["mapped_rate"] = map_rate
        align_dict["insert_size"] = median_ins
        if ref_len > 0:
            align_dict["duplication_rate"] = duprate
            align_dict["average_coverage"] = sumz / float(ref_len)
        else:
            align_dict["duplication_rate"] = 0.0
            align_dict["average_coverage"] = 0.0
        align_dict["total_reads"] = tot_reads
        self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                               align_dict)