Example #1
0
    def __init__(self, config, log, sampleinfo=None, input=""):
        """Initialize scraper state for a project or sample folder.

        Args:
            config: application configuration mapping.
            log: logger instance.
            sampleinfo: dict for a single sample, or a list of dicts for a
                whole project. Defaults to empty; None is normalized to {}.
            input: path of the folder to scrape.

        Raises:
            Exception: when a multi-sample info list mixes several projects.
        """
        # Avoid the shared-mutable-default pitfall (was `sampleinfo={}`).
        if sampleinfo is None:
            sampleinfo = {}
        self.config = config
        self.logger = log
        self.db_pusher = DB_Manipulator(config, log)
        self.referencer = Referencer(config, log)
        self.job_fallback = Job_Creator(config=config,
                                        log=log,
                                        sampleinfo=sampleinfo)
        self.infolder = os.path.abspath(input)
        self.sampledir = ""

        # Since all scraped folders are generated by Job_Creator, datestring
        # is automatically provided in the last folder.
        last_folder = self.infolder.split("/")[-1]
        self.name = last_folder.split("_")[0]

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            # Project-level info: self.name becomes the shared project ID and
            # every entry must belong to that project.
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                # BUGFIX: compare each entry's project ID to the first one.
                # The original tested `CG_ID_sample == <project id>`, which
                # essentially never fires, so mixed projects went undetected.
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            # Single-sample info (possibly wrapped in a one-element list).
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

        self.gene2resistance = self.load_resistances()
Example #2
0
def test_project_job(subproc, testdata):
    """Smoke-test project_job with the subprocess layer mocked out."""
    # Fake the subprocess so no external command actually runs.
    fake_proc = mock.Mock()
    fake_proc.configure_mock(
        **{'communicate.return_value': ('output 123456789', 'error')})
    subproc.return_value = fake_proc

    settings = {
        'pool': ["AAA1234A1", "AAA1234A2"],
        'input': '/tmp/AAA1234'
    }
    creator = Job_Creator(config=preset_config,
                          log=logger,
                          sampleinfo=testdata,
                          run_settings=settings)
    creator.project_job()
Example #3
0
def test_verify_fastq(gopen, stat, listdir, testdata):
    """verify_fastq should find entries in a mocked fastq folder."""
    # Pretend the input folder holds paired-end fastq files.
    fastq_names = [
        "ACC6438A3_HVMHWDSXX_L1_1.fastq.gz",
        "ACC6438A3_HVMHWDSXX_L1_2.fastq.gz",
        "ACC6438A3_HVMHWDSXX_L2_2.fastq.gz",
        "ACC6438A3_HVMHWDSXX_L2_2.fastq.gz"
    ]
    listdir.return_value = fastq_names

    # Fake os.stat so every file appears non-empty.
    fake_stat = mock.MagicMock()
    fake_stat.st_size = 2000
    stat.return_value = fake_stat

    creator = Job_Creator(run_settings={'input': '/tmp/'},
                          config=preset_config,
                          log=logger,
                          sampleinfo=testdata)
    found = creator.verify_fastq()
    assert len(found) > 0
Example #4
0
def test_create_snpsection(subproc, testdata):
    """snp_job should write an sbatch script containing the SNP distance step."""
    #Sets up subprocess mocking
    process_mock = mock.Mock()
    attrs = {'communicate.return_value': ('output 123456789', 'error')}
    process_mock.configure_mock(**attrs)
    subproc.return_value = process_mock

    testdata = [testdata[0]]
    jc = Job_Creator(run_settings={'input': ['AAA1234A1', 'AAA1234A2']},
                     config=preset_config,
                     log=logger,
                     sampleinfo=testdata)
    jc.snp_job()
    # Use a context manager: the original leaked the file handle.
    with open(jc.get_sbatch(), 'r') as outfile:
        count = sum(1 for x in outfile if "# SNP pair-wise distance" in x)
    assert count > 0
Example #5
0
def analyse(ctx, sampleinfo_file, input, config, dry, email, skip_update,
            force_update, untrimmed, uncareful):
    """Sequence analysis, typing and resistance identification"""
    # Run section
    trimmed = not untrimmed
    careful = not uncareful
    set_cli_config(config)
    ctx.obj["config"]["regex"]["mail_recipient"] = email
    ctx.obj["config"]["dry"] = dry
    if not os.path.isdir(input):
        click.echo(
            "ERROR - Sequence data folder {} does not exist.".format(input))
        ctx.abort()
    # Every immediate subdirectory of the input folder joins the pool.
    pool = [
        subfolder for subfolder in os.listdir(input)
        if os.path.isdir("{}/{}".format(input, subfolder))
    ]

    run_settings = {
        "input": input,
        "dry": dry,
        "email": email,
        "skip_update": skip_update,
        # Reuse the locals computed above (the original recomputed the
        # negations here and left `trimmed`/`careful` unused).
        "trimmed": trimmed,
        "careful": careful,
        "pool": pool,
    }

    # Samples section
    sampleinfo = review_sampleinfo(sampleinfo_file)
    run_creator = Job_Creator(
        config=ctx.obj["config"],
        log=ctx.obj["log"],
        sampleinfo=sampleinfo,
        run_settings=run_settings,
    )

    ext_refs = Referencer(config=ctx.obj["config"],
                          log=ctx.obj["log"],
                          sampleinfo=sampleinfo,
                          force=force_update)
    click.echo("INFO - Checking versions of references..")
    try:
        if not skip_update:
            ext_refs.identify_new(project=True)
            ext_refs.update_refs()
            click.echo("INFO - Version check done. Creating sbatch jobs")
        else:
            click.echo("INFO - Skipping version check.")
    except Exception as e:
        # Reference updates are best-effort; report the error and continue.
        click.echo("{}".format(e))
    # Multiple samples -> project job; exactly one -> single-sample job;
    # none -> abort.
    if len(sampleinfo) > 1:
        run_creator.project_job()
    elif len(sampleinfo) == 1:
        run_creator.project_job(single_sample=True)
    else:
        ctx.abort()

    done()
Example #6
0
def test_blast_subset(glob_search, research, testdata):
    """blast_subset should emit at least one blastn command in the sbatch file."""
    jc = Job_Creator(run_settings={'input': '/tmp/'},
                     config=preset_config,
                     log=logger,
                     sampleinfo=testdata)
    # Mock out regex searching and file globbing.
    researcha = mock.MagicMock()
    researcha.group = fake_search
    research.return_value = researcha
    glob_search.return_value = ["/a/a/a", "/a/a/b", "/a/a/c"]

    jc.blast_subset('mlst', '/tmp/*')
    jc.blast_subset('other', '/tmp/*')
    # Use a context manager: the original leaked the file handle.
    with open(jc.get_sbatch(), 'r') as outfile:
        count = sum(1 for x in outfile if "blastn -db" in x)
    assert count > 0
Example #7
0
class Scraper:
    def __init__(self, config, log, sampleinfo=None, input=""):
        """Initialize scraper state for a project or sample folder.

        Args:
            config: application configuration mapping.
            log: logger instance.
            sampleinfo: dict for a single sample, or a list of dicts for a
                whole project. Defaults to empty; None is normalized to {}.
            input: path of the folder to scrape.

        Raises:
            Exception: when a multi-sample info list mixes several projects.
        """
        # Avoid the shared-mutable-default pitfall (was `sampleinfo={}`).
        if sampleinfo is None:
            sampleinfo = {}
        self.config = config
        self.logger = log
        self.db_pusher = DB_Manipulator(config, log)
        self.referencer = Referencer(config, log)
        self.job_fallback = Job_Creator(config=config,
                                        log=log,
                                        sampleinfo=sampleinfo)
        self.infolder = os.path.abspath(input)
        self.sampledir = ""

        # Since all scraped folders are generated by Job_Creator, datestring
        # is automatically provided in the last folder.
        last_folder = self.infolder.split("/")[-1]
        self.name = last_folder.split("_")[0]

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            # Project-level info: self.name becomes the shared project ID and
            # every entry must belong to that project.
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                # BUGFIX: compare each entry's project ID to the first one.
                # The original tested `CG_ID_sample == <project id>`, which
                # essentially never fires, so mixed projects went undetected.
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            # Single-sample info (possibly wrapped in a one-element list).
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

        self.gene2resistance = self.load_resistances()

    def scrape_project(self, project=None):
        """Scrapes a project folder for information"""
        if project is None:
            project = self.name
        self.db_pusher.purge_rec(project, "Projects")
        if not self.db_pusher.exists("Projects", {"CG_ID_project": project}):
            self.logger.warning("Replacing project {}".format(project))
            self.job_fallback.create_project(project)

        # Scrape order matters a lot!
        for dir in os.listdir(self.infolder):
            subdir = "{}/{}".format(self.infolder, dir)
            local_param = [
                p for p in self.sampleinfo if p["CG_ID_sample"] == dir
            ]
            if local_param != []:
                local_param = local_param[0]
                sample_scraper = Scraper(
                    config=self.config,
                    log=self.logger,
                    sampleinfo=local_param,
                    input=subdir,
                )
                sample_scraper.scrape_sample()
            else:
                self.logger.warning(
                    "Skipping {} due to lacking info in sample_json file".
                    format(dir))

    def scrape_sample(self, sample=None):
        """Scrapes a sample folder for information"""
        if sample is None:
            sample = self.name
        self.db_pusher.purge_rec(sample, "Samples")

        if not self.db_pusher.exists(
                "Projects",
            {"CG_ID_project": self.sample.get("CG_ID_project")}):
            self.logger.warning("Replacing project {}".format(
                self.sample.get("CG_ID_project")))
            self.job_fallback.create_project(self.sample.get("CG_ID_project"))

        if not self.db_pusher.exists("Samples", {"CG_ID_sample": sample}):
            self.logger.warning("Replacing sample {}".format(sample))
            self.job_fallback.create_sample(sample)

        # Scrape order matters a lot!
        self.sampledir = self.infolder
        self.scrape_blast(type="seq_type")
        self.scrape_blast(type="resistance")
        if (self.referencer.organism2reference(
                self.sample.get("organism")) == "escherichia_coli"):
            self.scrape_blast(type="expec")
        self.scrape_alignment()
        self.scrape_quast()

    def scrape_quast(self, filename=""):
        """Scrapes a quast report for assembly information"""
        if filename == "":
            filename = "{}/assembly/quast/report.tsv".format(self.sampledir)
        quast = dict()
        try:
            with open(filename, "r") as infile:
                for line in infile:
                    lsplit = line.rstrip().split("\t")
                    if lsplit[0] == "# contigs":
                        quast["contigs"] = int(lsplit[1])
                    elif lsplit[0] == "Total length":
                        quast["genome_length"] = int(lsplit[1])
                    elif lsplit[0] == "GC (%)":
                        quast["gc_percentage"] = float(lsplit[1])
                    elif lsplit[0] == "N50":
                        quast["n50"] = int(lsplit[1])

            self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                   quast)
            self.logger.debug("Project {} recieved quast stats: {}".format(
                self.name, quast))
        except Exception as e:
            self.logger.warning(
                "Cannot generate quast statistics for {}".format(self.name))

    def get_locilengths(self, foldername, suffix):
        """Generate a dict of length for any given loci.

        Scans every file in *foldername* ending with *suffix* (FASTA-like),
        concatenates the sequence lines under each '>' header line, and
        returns {header: sequence length}.
        """
        # Full header as key, concatenated nucleotide string as value.
        sequences = dict()
        for entry in os.listdir(foldername):
            if not entry.endswith(suffix):
                continue
            current = ""
            # Context manager guarantees the handle is closed even on error
            # (the original opened/closed manually and could leak it).
            with open("{}/{}".format(foldername, entry), "r") as fh:
                for row in fh:
                    if ">" in row:
                        current = row.strip()
                        sequences[current] = ""
                    else:
                        sequences[current] = sequences[current] + row.strip()
        # Computing lengths once at the end is equivalent to the original's
        # per-file recomputation, since keys accumulate across files.
        return {header: len(seq) for header, seq in sequences.items()}

    def scrape_blast(self, type="", file_list=[]):
        """Scrape BLAST tabular output of one category into the database.

        Parses hit rows into "hypothesis" dicts, prunes overlapping and
        duplicate hits, stores the survivors, and for "seq_type" also
        derives and stores the sample's sequence type (ST).

        Args:
            type: "seq_type", "resistance" or "expec"; selects the input
                files, the reference loci folder and the target DB table.
            file_list: explicit result files; when empty, files are globbed
                from the sample's blast_search folder.

        NOTE(review): `file_list=[]` is a mutable default argument; it is
        reassigned but never mutated here, so behavior is unaffected.
        """
        hypo = list()
        # Table name: e.g. "resistance" -> "Resistances"; "expec" maps
        # irregularly to "Expacs".
        type2db = type.capitalize() + "s"
        if type == "expec":
            type2db = "Expacs"

        if file_list == []:
            # seq_type results live under the mlst folder; other types use a
            # folder named after the type itself.
            if type == "seq_type":
                file_list = glob.glob("{}/blast_search/mlst/*".format(
                    self.sampledir))
            else:
                file_list = glob.glob("{}/blast_search/{}/*".format(
                    self.sampledir, type))

        # Record the normalized organism name on the sample when resolvable.
        organism = self.referencer.organism2reference(
            self.sample.get("organism"))
        if organism:
            self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                   {"organism": organism})
        res_cols = self.db_pusher.get_columns("{}".format(type2db))

        try:
            old_ref = ""
            for file in file_list:
                filename = os.path.basename(file).rsplit(
                    ".", 1)[0]  # Removes suffix
                if filename == "lactam":
                    filename = "beta-lactam"
                # Pick the reference loci folder and FASTA suffix per type;
                # loci lengths are needed below to compute each hit's span.
                if type == "resistance":
                    ref_folder = self.config["folders"]["resistances"]
                    suffix = "fsa"
                elif type == "expec":
                    ref_folder = os.path.dirname(
                        self.config["folders"]["expec"])
                    suffix = os.path.basename(
                        self.config["folders"]["expec"]).rsplit(".", 1)[1]
                elif type == "seq_type":
                    ref_folder = "{}/{}".format(
                        self.config["folders"]["references"], organism)
                    suffix = "tfa"
                locilengths = self.get_locilengths(ref_folder, suffix)

                with open("{}".format(file), "r") as sample:
                    for line in sample:
                        # Ignore commented fields
                        if not line[0] == "#":

                            # Tab-separated BLAST hit row; skip "N/A" hits.
                            elem_list = line.rstrip().split("\t")
                            if not elem_list[1] == "N/A":
                                hypo.append(dict())
                                hypo[-1]["CG_ID_sample"] = self.name
                                hypo[-1]["identity"] = elem_list[4]
                                hypo[-1]["evalue"] = elem_list[5]
                                hypo[-1]["bitscore"] = elem_list[6]
                                # Normalize so contig_start <= contig_end.
                                if int(elem_list[7]) < int(elem_list[8]):
                                    hypo[-1]["contig_start"] = int(
                                        elem_list[7])
                                    hypo[-1]["contig_end"] = int(elem_list[8])
                                else:
                                    hypo[-1]["contig_start"] = int(
                                        elem_list[8])
                                    hypo[-1]["contig_end"] = int(elem_list[7])
                                hypo[-1]["subject_length"] = int(elem_list[11])

                                if type == "resistance":
                                    # Subject header: <gene>_<allele>[_<ref>]
                                    hypo[-1]["instance"] = filename
                                    partials = re.search(
                                        r"(?:\>)*(.+)_(\d+){1,3}(?:_(.+))*",
                                        elem_list[3],
                                    )
                                    hypo[-1]["reference"] = partials.group(3)
                                    hypo[-1]["gene"] = partials.group(1)
                                    if hypo[-1][
                                            "gene"] in self.gene2resistance.keys(
                                            ):
                                        hypo[-1][
                                            "resistance"] = self.gene2resistance[
                                                hypo[-1]["gene"]]
                                    else:
                                        hypo[-1]["{}".format(type)] = hypo[-1][
                                            "instance"].capitalize()
                                    # Span = hit length relative to the full
                                    # reference locus length.
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])

                                elif type == "expec":
                                    hypo[-1]["instance"] = filename
                                    # Thanks, precompiled list standards
                                    # Several header formats exist; fall
                                    # through the regexes until one matches.
                                    if ">" in elem_list[3]:
                                        partials = re.search(
                                            r">*(\w+_\w+\.*\w+).+\((\w+)\).+\((\w+)\)_(\w+)_\[.+\]",
                                            elem_list[3],
                                        )
                                    else:
                                        partials = re.search(
                                            r"(\w+)\(gb\|\w+\)_\((\S+)\)_(.+)_\[(\S+)_.+\]_\[\S+\]",
                                            elem_list[3],
                                        )
                                    if not partials:
                                        partials = re.search(
                                            r"(\w+\.*\w+)\:*\w*_*(?:\(\w+\-\w+\))*_\((\w+)\)_([^[]+)\[\S+\]",
                                            elem_list[3],
                                        )
                                    # NC/Protein reference
                                    hypo[-1]["reference"] = partials.group(1)
                                    # Full gene name
                                    hypo[-1]["gene"] = partials.group(2)
                                    # More generic group
                                    hypo[-1]["instance"] = partials.group(
                                        3).strip("_")
                                    # Description
                                    if len(partials.groups()) >= 4:
                                        hypo[-1]["virulence"] = (
                                            partials.group(4).replace(
                                                "_", " ").capitalize())
                                    else:
                                        hypo[-1]["virulence"] = ""
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])

                                elif type == "seq_type":
                                    # Subject header: <loci>_<allele number>
                                    partials = re.search(
                                        r"(.+)_(\d+){1,3}(?:_(\w+))*",
                                        elem_list[3])
                                    hypo[-1]["loci"] = partials.group(1)
                                    hypo[-1]["allele"] = int(partials.group(2))
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])

                                # split elem 2 into contig node_NO, length, cov
                                nodeinfo = elem_list[2].split("_")
                                hypo[-1]["contig_name"] = "{}_{}".format(
                                    nodeinfo[0], nodeinfo[1])
                                hypo[-1]["contig_length"] = int(nodeinfo[3])
                                hypo[-1]["contig_coverage"] = nodeinfo[5]
                                self.logger.debug(
                                    "scrape_blast scrape loop hit")
            self.logger.info("{} candidate {} hits found".format(
                len(hypo), type2db))
        except Exception as e:
            self.logger.error("Unable to process the pattern of {}".format(
                str(e)))

        # Cleanup of overlapping hits
        # NOTE(review): `identifier` is only assigned for the three known
        # types; an unexpected `type` would raise NameError below.
        if type == "seq_type":
            identifier = "loci"
        elif type == "resistance" or type == "expec":
            identifier = "gene"
        # Pairwise compare hits; when two hits overlap on a contig or share
        # the same gene/loci, keep the one with the better identity weighted
        # by span closeness to 1, breaking ties on contig coverage.
        ind = 0
        while ind < len(hypo) - 1:
            targ = ind + 1
            while targ < len(hypo):
                ignore = False
                if (hypo[ind]["contig_name"] == hypo[targ]["contig_name"]
                        or hypo[ind][identifier] == hypo[targ][identifier]):
                    # Overlapping or shared gene
                    if ((hypo[ind].get("contig_start") >=
                         hypo[targ].get("contig_start")
                         and hypo[ind].get("contig_start") <=
                         hypo[targ].get("contig_end"))
                            or (hypo[ind].get("contig_end") >=
                                hypo[targ].get("contig_start")
                                and hypo[ind].get("contig_end") <=
                                hypo[targ].get("contig_end"))
                            or (hypo[ind].get(identifier)
                                == hypo[targ].get(identifier))):
                        # Rightmost is worse
                        if float(hypo[ind].get("identity")) * (
                                1 - abs(1 - hypo[ind].get("span"))) > float(
                                    hypo[targ].get("identity")) * (
                                        1 - abs(1 - hypo[targ].get("span"))):
                            del hypo[targ]
                            ignore = True
                        # Leftmost is worse
                        elif float(hypo[ind].get("identity")) * (
                                1 - abs(1 - hypo[ind].get("span"))) < float(
                                    hypo[targ].get("identity")) * (
                                        1 - abs(1 - hypo[targ].get("span"))):
                            del hypo[ind]
                            targ = ind + 1
                            ignore = True
                        # Identical identity and span, seperating based on contig coverage
                        else:
                            # Rightmost is worse
                            if float(
                                    hypo[ind].get("contig_coverage")) >= float(
                                        hypo[targ].get("contig_coverage")):
                                del hypo[targ]
                                ignore = True
                            # Leftmost is worse
                            elif float(
                                    hypo[ind].get("contig_coverage")) < float(
                                        hypo[targ].get("contig_coverage")):
                                del hypo[ind]
                                targ = ind + 1
                                ignore = True
                if not ignore:
                    targ += 1
                else:
                    pass
            ind += 1

        self.logger.info(
            "{} {} hits were added after removing overlaps and duplicate hits".
            format(len(hypo), type))
        for hit in hypo:
            self.logger.debug("Kept {}:{} with span {} and id {}".format(
                hit.get("loci"),
                hit.get("allele"),
                hit.get("span"),
                hit.get("identity"),
            ))
            self.db_pusher.add_rec(hit, "{}".format(type2db))

        # Derive the sequence type from the stored alleles.
        if type == "seq_type":
            try:
                ST = self.db_pusher.alleles2st(self.name)
                self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                       {"ST": ST})
                self.logger.info("Sample {} received ST {}".format(
                    self.name, ST))
            except Exception as e:
                self.logger.warning(
                    "Unable to type sample {} due to data value '{}'".format(
                        self.name, str(e)))

    def load_resistances(self):
        """Legacy function, loads common resistance names for genes from notes file"""
        conversions = dict()
        try:
            with open("{}/notes.txt".format(
                    self.config["folders"]["resistances"])) as fh:
                for line in fh:
                    if "#" not in line:
                        line = line.split(":")
                        cropped = re.sub(" resistance", "", line[1])
                        conversions[line[0]] = cropped
                        # Workaround for case issues
                        conversions[line[0].lower()] = cropped
        except Exception as e:
            self.logger.error(
                "Unable to initialize trivial names for resistances ({})".
                format(e))
        return conversions

    def scrape_alignment(self, file_list=[]):
        """Scrapes a single alignment result"""
        if file_list == []:
            file_list = glob.glob("{}/alignment/*.stats.*".format(
                self.sampledir))
        ins_list = list()
        cov_dict = dict()
        align_dict = dict()
        align_dict["reference_genome"] = self.sample.get("reference")

        # Reading
        file_list = glob.glob("{}/alignment/*.stats.*".format(self.sampledir))
        map_rate = 0.0
        median_ins = 0
        ref_len = 0.0
        tot_reads = 0
        tot_map = 0
        duprate = 0.0
        for file in file_list:
            with open(file, "r") as fh:
                type = file.split(".")[-1]
                for line in fh.readlines():
                    lsplit = line.rstrip().split("\t")
                    if type == "raw":
                        try:
                            tot_reads = int(lsplit[0])
                        except Exception as e:
                            pass
                    elif type == "ins":
                        if len(lsplit) >= 18 and lsplit[-12] in ["FF", "FR"]:
                            try:
                                median_ins = int(lsplit[0])
                            except Exception as e:
                                pass
                    elif type == "cov":
                        cov_dict[lsplit[1]] = int(lsplit[2])
                    elif type == "ref":
                        if lsplit[0] != "*" and len(lsplit) >= 2:
                            ref_len = ref_len + int(lsplit[1])
                    elif type == "dup":
                        if lsplit[0] == "Unknown Library":
                            try:
                                duprate = float(lsplit[8])
                            except Exception as e:
                                duprate = -1.0
                    elif type == "map":
                        dsplit = line.rstrip().split(" ")
                        if len(dsplit) >= 5 and dsplit[4] == "total":
                            tot_map = int(dsplit[0])
                        elif len(dsplit) >= 4 and dsplit[3] == "mapped":
                            if tot_map > 0:
                                map_rate = int(dsplit[0]) / float(tot_map)

        # Mangling
        sumz, plus10, plus30, plus50, plus100, total = 0, 0, 0, 0, 0, 0
        for k, v in cov_dict.items():
            sumz += int(k) * v
            total += v
            if int(k) > 10:
                plus10 += v
            if int(k) > 30:
                plus30 += v
            if int(k) > 50:
                plus50 += v
            if int(k) > 100:
                plus100 += v
        if total > 0:
            align_dict["coverage_10x"] = plus10 / float(ref_len)
            align_dict["coverage_30x"] = plus30 / float(ref_len)
            align_dict["coverage_50x"] = plus50 / float(ref_len)
            align_dict["coverage_100x"] = plus100 / float(ref_len)
        else:
            align_dict["coverage_10x"] = 0.0
            align_dict["coverage_30x"] = 0.0
            align_dict["coverage_50x"] = 0.0
            align_dict["coverage_100x"] = 0.0

        align_dict["mapped_rate"] = map_rate
        align_dict["insert_size"] = median_ins
        if ref_len > 0:
            align_dict["duplication_rate"] = duprate
            align_dict["average_coverage"] = sumz / float(ref_len)
        else:
            align_dict["duplication_rate"] = 0.0
            align_dict["average_coverage"] = 0.0
        align_dict["total_reads"] = tot_reads
        self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                               align_dict)