Example #1
    def __init__(self, config, log, sampleinfo={}, input=""):
        self.config = config
        self.logger = log
        self.db_pusher = DB_Manipulator(config, log)
        self.referencer = Referencer(config, log)
        self.job_fallback = Job_Creator(config=config,
                                        log=log,
                                        sampleinfo=sampleinfo)
        self.infolder = os.path.abspath(input)
        self.sampledir = ""

        # All scraped folders are generated by Job_Creator, so the last folder name always carries a trailing datestring (NAME_DATESTRING)
        last_folder = self.infolder.split("/")[-1]
        self.name = last_folder.split("_")[0]

        self.sampleinfo = sampleinfo
        self.sample = None
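        # sampleinfo is a list of sample dicts for a whole project, or a single dict for one sample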
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

        self.gene2resistance = self.load_resistances()
Example #2
def add(ctx, organism, force):
    """ Adds a new internal organism from pubMLST """
    referee = Referencer(config=ctx.obj["config"], log=ctx.obj["log"], force=force)
    try:
        referee.add_pubmlst(organism)
    except Exception as e:
        click.echo(e.args[0])
        ctx.abort()
    click.echo("INFO - Checking versions of all references..")
    referee.update_refs()
Example #3
def analyse(ctx, sampleinfo_file, input, config, dry, email, skip_update,
            force_update, untrimmed, uncareful):
    """Sequence analysis, typing and resistance identification"""
    # Run section
    pool = []
    trimmed = not untrimmed
    careful = not uncareful
    set_cli_config(config)
    ctx.obj["config"]["regex"]["mail_recipient"] = email
    ctx.obj["config"]["dry"] = dry
    if not os.path.isdir(input):
        click.echo(
            "ERROR - Sequence data folder {} does not exist.".format(input))
        ctx.abort()
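    # Treat every subfolder of the input directory as one sample in the pool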
    for subfolder in os.listdir(input):
        if os.path.isdir("{}/{}".format(input, subfolder)):
            pool.append(subfolder)

    run_settings = {
        "input": input,
        "dry": dry,
        "email": email,
        "skip_update": skip_update,
        "trimmed": trimmed,
        "careful": careful,
        "pool": pool,
    }

    # Samples section
    sampleinfo = review_sampleinfo(sampleinfo_file)
    run_creator = Job_Creator(
        config=ctx.obj["config"],
        log=ctx.obj["log"],
        sampleinfo=sampleinfo,
        run_settings=run_settings,
    )

    ext_refs = Referencer(config=ctx.obj["config"],
                          log=ctx.obj["log"],
                          sampleinfo=sampleinfo,
                          force=force_update)
    click.echo("INFO - Checking versions of references..")
    try:
        if not skip_update:
            ext_refs.identify_new(project=True)
            ext_refs.update_refs()
            click.echo("INFO - Version check done. Creating sbatch jobs")
        else:
            click.echo("INFO - Skipping version check.")
    except Exception as e:
        click.echo("{}".format(e))
    if len(sampleinfo) > 1:
        run_creator.project_job()
    elif len(sampleinfo) == 1:
        run_creator.project_job(single_sample=True)
    else:
        ctx.abort()

    done()
Example #4
def review(ctx, type, customer, skip_update, email, output):
    """Generates information about novel ST"""
    # Trace exists by some samples having pubMLST_ST filled in. Make trace function later
    ctx.obj["config"]["regex"]["mail_recipient"] = email
    ext_refs = Referencer(config=ctx.obj["config"], log=ctx.obj["log"])
    if not skip_update:
        ext_refs.update_refs()
        ext_refs.resync()
    click.echo("INFO - Version check done. Generating output")
    if type == "report":
        codemonkey = Reporter(config=ctx.obj["config"],
                              log=ctx.obj["log"],
                              output=output)
        codemonkey.report(type="st_update", customer=customer)
    elif type == "list":
        ext_refs.resync(type=type)
    done()
Example #5
def autobatch(ctx, dry, skip_update, email):
    """Analyses all currently unanalysed projects in the seqdata folder"""
    # Trace exists by some samples having pubMLST_ST filled in. Make trace function later
    ctx.obj["config"]["regex"]["mail_recipient"] = email
    ext_refs = Referencer(config=ctx.obj["config"], log=ctx.obj["log"])
    if not skip_update:
        ext_refs.identify_new(project=True)
        ext_refs.update_refs()

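    # Ask SLURM for the names of all queued/running jobs (job name column only, no header)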
    process = subprocess.Popen(
        'squeue --format="%50j" -h -r'.split(), stdout=subprocess.PIPE
    )
    run_jobs, error = process.communicate()
    run_jobs = run_jobs.splitlines()
    run_jobs = [
        re.search('"(.+)"', str(jobname)).group(1).replace(" ", "")
        for jobname in run_jobs
    ]
    old_jobs = os.listdir(ctx.obj["config"]["folders"]["results"])

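    # Launch analysis only for seqdata folders that are neither running in SLURM nor present in results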
    for foldah in os.listdir(ctx.obj["config"]["folders"]["seqdata"]):
        # Project name not found in slurm list
        if len([job for job in run_jobs if foldah in job]) == 0:
            # Project name not found in results directories
            if len([job for job in old_jobs if foldah in job]) == 0:
                if dry:
                    click.echo(
                        "DRY - microSALT analyse project {} --skip_update".format(foldah)
                    )
                else:
                    process = subprocess.Popen(
                        "microSALT analyse project {} --skip_update".format(
                            foldah
                        ).split(),
                        stdout=subprocess.PIPE,
                    )
                    output, error = process.communicate()
            elif dry:
                click.echo(
                    "INFO - Skipping {} due to existing analysis in results folder".format(
                        foldah
                    )
                )
        elif dry:
            click.echo("INFO - Skipping {} due to concurrent SLURM run".format(foldah))
    done()
Example #6
def init_references(testdata):
    ref_obj = Referencer(config=preset_config, log=logger, sampleinfo=testdata)
    ref_obj.identify_new(testdata[0].get('CG_ID_project'), project=True)
    ref_obj.update_refs()
Example #7
def references():
    ref_obj = Referencer(config=preset_config, log=logger)
    ref_obj.identify_new(project_id, project=True)
    ref_obj.update_refs()
Example #8
def overwrite(ctx, sample_name, force):
    """Flags sample as resolved"""
    ext_refs = Referencer(config=ctx.obj["config"], log=ctx.obj["log"])
    ext_refs.resync(type="overwrite", sample=sample_name, ignore=force)
    done()
Example #9
def observe(ctx):
    """ Lists all stored organisms """
    refe = Referencer(config=ctx.obj["config"], log=ctx.obj["log"])
    click.echo("INFO - Currently stored organisms:")
    for org in sorted(refe.existing_organisms()):
        click.echo(org.replace("_", " ").capitalize())
Example #10
def finish(ctx, sampleinfo_file, input, track, config, dry, email, skip_update,
           report, output):
    """Sequence analysis, typing and resistance identification"""
    # Run section
    pool = []
    set_cli_config(config)
    ctx.obj["config"]["regex"]["mail_recipient"] = email
    ctx.obj["config"]["dry"] = dry
    if not os.path.isdir(input):
        click.echo(
            "ERROR - Sequence data folder {} does not exist.".format(input))
        ctx.abort()
    if output == "":
        output = input
    for subfolder in os.listdir(input):
        if os.path.isdir("{}/{}".format(input, subfolder)):
            pool.append(subfolder)

    run_settings = {
        "input": input,
        "track": track,
        "dry": dry,
        "email": email,
        "skip_update": skip_update,
    }

    # Samples section
    sampleinfo = review_sampleinfo(sampleinfo_file)
    ext_refs = Referencer(config=ctx.obj["config"],
                          log=ctx.obj["log"],
                          sampleinfo=sampleinfo)
    click.echo("INFO - Checking versions of references..")
    try:
        if not skip_update:
            ext_refs.identify_new(project=True)
            ext_refs.update_refs()
            click.echo("INFO - Version check done. Creating sbatch jobs")
        else:
            click.echo("INFO - Skipping version check.")
    except Exception as e:
        click.echo("{}".format(e))

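    # Scrape results into the database: the whole project when sampleinfo holds several samples, else one sample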
    res_scraper = Scraper(config=ctx.obj["config"],
                          log=ctx.obj["log"],
                          sampleinfo=sampleinfo,
                          input=input)
    if isinstance(sampleinfo, list) and len(sampleinfo) > 1:
        res_scraper.scrape_project()
        # for subfolder in pool:
        #  res_scraper.scrape_sample()
    else:
        res_scraper.scrape_sample()

    codemonkey = Reporter(
        config=ctx.obj["config"],
        log=ctx.obj["log"],
        sampleinfo=sampleinfo,
        output=output,
        collection=True,
    )
    codemonkey.report(report)
    done()
Example #11
    def __init__(self, config, log, sampleinfo={}, run_settings={}):
        self.config = config
        self.logger = log
        self.batchfile = "/tmp/batchfile.sbatch"

        self.filelist = list()
        if isinstance(run_settings.get("input"), list):
            self.filelist = run_settings.get("input")
            run_settings["input"] = "/tmp/"

        self.run_settings = run_settings
        self.indir = os.path.abspath(run_settings.get("input", "/tmp/"))
        self.trimmed = run_settings.get("trimmed", True)
        self.qc_only = run_settings.get("qc_only", False)
        self.careful = run_settings.get("careful", True)
        self.pool = run_settings.get("pool", [])
        self.finishdir = run_settings.get("finishdir", "")

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

        # If a timestamp (year.month.day_hour.minute.second) is provided, use it as the analysis time; otherwise use the current time
        if run_settings.get("timestamp") is not None:
            self.now = run_settings.get("timestamp")
            temp = run_settings.get("timestamp").replace("_", ".").split(".")
            self.dt = datetime(
                int(temp[0]),
                int(temp[1]),
                int(temp[2]),
                int(temp[3]),
                int(temp[4]),
                int(temp[5]),
            )
        else:
            self.dt = datetime.now()
            self.now = time.strftime("{}.{}.{}_{}.{}.{}".format(
                self.dt.year,
                self.dt.month,
                self.dt.day,
                self.dt.hour,
                self.dt.minute,
                self.dt.second,
            ))

        if run_settings.get("finishdir") is None:
            self.finishdir = "{}/{}_{}".format(config["folders"]["results"],
                                               self.name, self.now)
        self.db_pusher = DB_Manipulator(config, log)
        self.concat_files = dict()
        self.ref_resolver = Referencer(config, log)
Example #12
class Job_Creator:
    def __init__(self, config, log, sampleinfo={}, run_settings={}):
        self.config = config
        self.logger = log
        self.batchfile = "/tmp/batchfile.sbatch"

        self.filelist = list()
        if isinstance(run_settings.get("input"), list):
            self.filelist = run_settings.get("input")
            run_settings["input"] = "/tmp/"

        self.run_settings = run_settings
        self.indir = os.path.abspath(run_settings.get("input", "/tmp/"))
        self.trimmed = run_settings.get("trimmed", True)
        self.qc_only = run_settings.get("qc_only", False)
        self.careful = run_settings.get("careful", True)
        self.pool = run_settings.get("pool", [])
        self.finishdir = run_settings.get("finishdir", "")

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

        # If a timestamp (year.month.day_hour.minute.second) is provided, use it as the analysis time; otherwise use the current time
        if run_settings.get("timestamp") is not None:
            self.now = run_settings.get("timestamp")
            temp = run_settings.get("timestamp").replace("_", ".").split(".")
            self.dt = datetime(
                int(temp[0]),
                int(temp[1]),
                int(temp[2]),
                int(temp[3]),
                int(temp[4]),
                int(temp[5]),
            )
        else:
            self.dt = datetime.now()
            self.now = time.strftime("{}.{}.{}_{}.{}.{}".format(
                self.dt.year,
                self.dt.month,
                self.dt.day,
                self.dt.hour,
                self.dt.minute,
                self.dt.second,
            ))

        if run_settings.get("finishdir") is None:
            self.finishdir = "{}/{}_{}".format(config["folders"]["results"],
                                               self.name, self.now)
        self.db_pusher = DB_Manipulator(config, log)
        self.concat_files = dict()
        self.ref_resolver = Referencer(config, log)

    def get_sbatch(self):
        """ Returns the sbatch file path; slightly superfluous """
        return self.batchfile

    def get_headerargs(self):
        headerline = "-A {} -p {} -n {} -t {} -J {}_{} --qos {} --output {}/slurm_{}.log".format(
            self.config["slurm_header"]["project"],
            self.config["slurm_header"]["type"],
            self.config["slurm_header"]["threads"],
            self.config["slurm_header"]["time"],
            self.config["slurm_header"]["job_prefix"],
            self.name,
            self.config["slurm_header"]["qos"],
            self.finishdir,
            self.name,
        )
        return headerline

    def verify_fastq(self):
        """ Uses arg indir to return a dict of PE fastq tuples fulfilling naming convention """
        verified_files = list()
        files = os.listdir(self.indir)
        if files == []:
            raise Exception("Directory {} lacks fastq files.".format(
                self.indir))
        for file in files:
            file_match = re.match(self.config["regex"]["file_pattern"], file)
            if file_match:
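                # Capture group 1 of file_pattern holds the read direction: "1"/"2" or "forward"/"reverse"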
                # Check that symlinks resolve
                path = "{}/{}".format(self.indir, file)
                if os.path.islink(path):
                    if not os.path.exists(os.readlink(path)):
                        raise Exception(
                            "Some fastq files are unresolved symlinks in directory {}."
                            .format(self.indir))

                # Make sure both mates exist
                if (file_match[1] == "1" or file_match[1] == "2"
                        or file_match[1] == "forward"
                        or file_match[1] == "reverse"):
                    if file_match[1] == "forward" or file_match[1] == "reverse":
                        pairno = "forward"
                        if "forward" in file_match[1]:
                            pairno = "reverse"
                        pairname = file_match[0].replace(file_match[1], pairno)
                    else:
                        pairno = 2 - 1 % int(file_match[1])  # 1->2, 2->1
                        # Construct mate name
                        pairname = "{}{}{}".format(
                            file_match.string[:file_match.end(1) - 1],
                            pairno,
                            file_match.string[file_match.end(1):file_match.end(
                            )],
                        )
                    if pairname in files:
                        files.pop(files.index(pairname))
                        verified_files.append(file_match[0])
                        verified_files.append(pairname)
                else:
                    raise Exception(
                        "Some fastq files have no mate in directory {}.".
                        format(self.indir))
        if verified_files == []:
            raise Exception(
                "No files in directory {} match file_pattern '{}'.".format(
                    self.indir, self.config["regex"]["file_pattern"]))

        # Warn about file sizes
        for vfile in verified_files:
            try:
                bsize = os.stat("{}/{}".format(self.indir, vfile)).st_size
                bsize = bsize >> 20  # bytes -> MiB
                if bsize > 1000:
                    self.logger.warning(
                        "Input fastq {} exceeds 1000MB".format(vfile))
            except Exception as e:
                self.logger.warning(
                    "Unable to verify size of input file {}/{}".format(
                        self.indir, vfile))

        # Warn about invalid fastq files
        for vfile in verified_files:
            with gzip.open("{}/{}".format(self.indir, vfile), "r") as f:
                lines = f.read().splitlines()
            if len(lines) < 2 or "+" not in str(lines[-2]):
                self.logger.warning(
                    "Input fastq {} does not seem to end properly".format(
                        vfile))
        return sorted(verified_files)

    def create_assemblysection(self):
        batchfile = open(self.batchfile, "a+")
        # memory is actually 128 per node regardless of cores.
        batchfile.write("# Spades assembly\n")
        if self.trimmed:
            trimline = "-s {}".format(self.concat_files["i"])
        else:
            trimline = ""
        if self.careful:
            careline = "--careful"
        else:
            careline = ""

        batchfile.write(
            "spades.py --threads {} {} --memory {} -o {}/assembly -1 {} -2 {} {}\n"
            .format(
                self.config["slurm_header"]["threads"],
                careline,
                8 * int(self.config["slurm_header"]["threads"]),
                self.finishdir,
                self.concat_files["f"],
                self.concat_files["r"],
                trimline,
            ))

        batchfile.write(
            "mv {0}/assembly/contigs.fasta {0}/assembly/{1}_contigs.fasta\n".
            format(self.finishdir, self.name))
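        # Keep only the first 999 contigs (SPAdes orders contigs by size): print until NODE_1000_, then quit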
        batchfile.write(
            "sed -n '/NODE_1000_/q;p' {0}/assembly/{1}_contigs.fasta > {0}/assembly/{1}_trimmed_contigs.fasta\n"
            .format(self.finishdir, self.name))
        # batchfile.write("##Input cleanup\n")
        # batchfile.write("rm -r {}/trimmed\n".format(self.finishdir))
        batchfile.write("\n\n")
        batchfile.close()

    def blast_subset(self, name, search_string):
        # Create run
        file_list = glob.glob(search_string)
        batchfile = open(self.batchfile, "a+")
        batchfile.write("mkdir {}/blast_search/{}\n".format(
            self.finishdir, name))
        blast_format = '"7 stitle sstrand qaccver saccver pident evalue bitscore qstart qend sstart send length"'
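        # outfmt 7 is BLAST's tabular format with comment lines, using the column set defined above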

        if len(file_list) > 1:
            for ref in file_list:
                match = re.search(r"(\w+(?:\-\w+)*)\.\w+",
                                  os.path.basename(ref))
                if match is None:
                    self.logger.error(
                        "File {} does not match typical format. Consider deleting and redownloading"
                        .format(ref))
                    continue
                ref_nosuf = match.group(1)
                batchfile.write("# BLAST {} search for {}, {}\n".format(
                    name, self.sample.get("organism"), ref_nosuf))
                if name == "mlst":
                    batchfile.write(
                        "blastn -db {}/{}  -query {}/assembly/{}_contigs.fasta -out {}/blast_search/{}/loci_query_{}.txt -task megablast -num_threads {} -outfmt {}\n"
                        .format(
                            os.path.dirname(ref),
                            ref_nosuf,
                            self.finishdir,
                            self.name,
                            self.finishdir,
                            name,
                            ref_nosuf,
                            self.config["slurm_header"]["threads"],
                            blast_format,
                        ))
                else:
                    batchfile.write(
                        "blastn -db {}/{}  -query {}/assembly/{}_contigs.fasta -out {}/blast_search/{}/{}.txt -task megablast -num_threads {} -outfmt {}\n"
                        .format(
                            os.path.dirname(ref),
                            ref_nosuf,
                            self.finishdir,
                            self.name,
                            self.finishdir,
                            name,
                            ref_nosuf,
                            self.config["slurm_header"]["threads"],
                            blast_format,
                        ))
        elif len(file_list) == 1:
            ref_nosuf = re.search(r"(\w+(?:\-\w+)*)\.\w+",
                                  os.path.basename(file_list[0])).group(1)
            batchfile.write("## BLAST {} search in {}\n".format(
                name,
                self.sample.get("organism").replace("_", " ").capitalize()))
            batchfile.write(
                "blastn -db {}/{}  -query {}/assembly/{}_contigs.fasta -out {}/blast_search/{}/{}.txt -task megablast -num_threads {} -outfmt {}\n"
                .format(
                    os.path.dirname(search_string),
                    ref_nosuf,
                    self.finishdir,
                    self.name,
                    self.finishdir,
                    name,
                    ref_nosuf,
                    self.config["slurm_header"]["threads"],
                    blast_format,
                ))
        batchfile.write("\n")
        batchfile.close()

    def create_variantsection(self):
        """ Creates a job for variant calling based on local alignment """
        ref = "{}/{}.fasta".format(self.config["folders"]["genomes"],
                                   self.sample.get("reference"))
        localdir = "{}/alignment".format(self.finishdir)
        outbase = "{}/{}_{}".format(localdir, self.name,
                                    self.sample.get("reference"))

        # Create run
        batchfile = open(self.batchfile, "a+")
        batchfile.write("# Variant calling based on local alignment\n")
        batchfile.write("mkdir {}\n".format(localdir))

        batchfile.write("## Alignment & Deduplication\n")
        batchfile.write("bwa mem -M -t {} {} {} {} > {}.sam\n".format(
            self.config["slurm_header"]["threads"],
            ref,
            self.concat_files["f"],
            self.concat_files["r"],
            outbase,
        ))
        batchfile.write(
            "samtools view --threads {} -b -o {}.bam -T {} {}.sam\n".format(
                self.config["slurm_header"]["threads"], outbase, ref, outbase))
        batchfile.write(
            "samtools sort --threads {} -o {}.bam_sort {}.bam\n".format(
                self.config["slurm_header"]["threads"], outbase, outbase))
        batchfile.write(
            "picard MarkDuplicates I={}.bam_sort O={}.bam_sort_rmdup M={}.stats.dup REMOVE_DUPLICATES=true\n"
            .format(outbase, outbase, outbase))
        batchfile.write("samtools index {}.bam_sort_rmdup\n".format(outbase))
        batchfile.write(
            "samtools idxstats {}.bam_sort_rmdup &> {}.stats.ref\n".format(
                outbase, outbase))
        # Removal of temp aligment files
        batchfile.write("rm {}.bam {}.sam\n".format(outbase, outbase))

        batchfile.write("## Primary stats generation\n")
        # Insert stats, dedupped
        batchfile.write(
            "picard CollectInsertSizeMetrics I={}.bam_sort_rmdup O={}.stats.ins H={}.hist.ins\n"
            .format(outbase, outbase, outbase))
        # Coverage
        batchfile.write(
            "samtools stats --coverage 1,10000,1 {}.bam_sort_rmdup |grep ^COV | cut -f 2- &> {}.stats.cov\n"
            .format(outbase, outbase))
        # Mapped rate, no dedup,dedup in MWGS (trimming has no effect)!
        batchfile.write(
            "samtools flagstat {}.bam_sort &> {}.stats.map\n".format(
                outbase, outbase))
        # Total reads, no dedup,dedup in MWGS (trimming has no effect)!
        batchfile.write(
            "samtools view -c {}.bam_sort &> {}.stats.raw\n".format(
                outbase, outbase))

        batchfile.write("\n\n")
        batchfile.close()

    def create_preprocsection(self):
        """Concatinates data, possibly trims it, then makes the unstranded reads usable"""
        forward = list()
        reverse = list()
        for root, dirs, files in os.walk(self.config["folders"]["adapters"]):
            if "NexteraPE-PE.fa" not in files:
                self.logger.error(
                    "Adapters folder at {} does not contain NexteraPE-PE.fa. Review paths.yml"
                    .format(self.config["folders"]["adapters"]))
            else:
                break
        trimdir = "{}/trimmed".format(self.finishdir)
        files = self.verify_fastq()
        batchfile = open(self.batchfile, "a+")
        batchfile.write("#Trimmomatic section\n")
        batchfile.write("mkdir {}\n".format(trimdir))

        batchfile.write("##Pre-concatination\n")
        for index, file in enumerate(files):
            fullfile = "{}/{}".format(self.indir, file)
            # Even indexes = Forward, odd = Reverse (files are sorted mate pairs)
            if index % 2 == 0:
                forward.append(fullfile)
            else:
                reverse.append(fullfile)
        outfile = files[0].split("_")[0]

        self.concat_files["f"] = "{}/trimmed/{}_forward_reads.fastq.gz".format(
            self.finishdir, self.name)
        self.concat_files["r"] = "{}/trimmed/{}_reverse_reads.fastq.gz".format(
            self.finishdir, self.name)
        batchfile.write("cat {} > {}\n".format(" ".join(forward),
                                               self.concat_files.get("f")))
        batchfile.write("cat {} > {}\n".format(" ".join(reverse),
                                               self.concat_files.get("r")))

        if self.trimmed:
            fp = "{}/{}_trim_front_pair.fastq.gz".format(trimdir, outfile)
            fu = "{}/{}_trim_front_unpair.fastq.gz".format(trimdir, outfile)
            rp = "{}/{}_trim_rev_pair.fastq.gz".format(trimdir, outfile)
            ru = "{}/{}_trim_rev_unpair.fastq.gz".format(trimdir, outfile)
            batchfile.write("##Trimming section\n")
            batchfile.write(
                "trimmomatic PE -threads {} -phred33 {} {} {} {} {} {}\
      ILLUMINACLIP:{}/NexteraPE-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36\n"
                .format(
                    self.config["slurm_header"]["threads"],
                    self.concat_files.get("f"),
                    self.concat_files.get("r"),
                    fp,
                    fu,
                    rp,
                    ru,
                    self.config["folders"]["adapters"],
                ))

            batchfile.write("## Interlaced trimmed files\n")
            self.concat_files["f"] = fp
            self.concat_files["r"] = rp
            self.concat_files["i"] = "{}/{}_trim_unpair.fastq.gz".format(
                trimdir, outfile)

            batchfile.write("cat {} >> {}\n".format(" ".join(
                [fu, ru]), self.concat_files.get("i")))
        batchfile.write("\n")
        batchfile.close()

    def create_assemblystats_section(self):
        batchfile = open(self.batchfile, "a+")
        batchfile.write("# QUAST QC metrics\n")
        batchfile.write("mkdir {}/assembly/quast\n".format(self.finishdir))
        batchfile.write(
            "quast.py {}/assembly/{}_contigs.fasta -o {}/assembly/quast\n".
            format(self.finishdir, self.name, self.finishdir))
        batchfile.write(
            "mv {}/assembly/quast/report.tsv {}/assembly/quast/{}_report.tsv\n\n"
            .format(self.finishdir, self.finishdir, self.name))
        batchfile.close()

    def create_snpsection(self):
        snplist = self.filelist.copy()
        batchfile = open(self.batchfile, "a+")
        name = ""

        # VCFTools filters:
        vcffilter = "--minQ 30 --thin 50 --minDP 3 --min-meanDP 20"
        # BCFTools filters:
        bcffilter = "GL[0]<-500 & GL[1]=0 & QR/RO>30 & QA/AO>30 & QUAL>5000 & ODDS>1100 & GQ>140 & DP>100 & MQM>59 & SAP<15 & PAIRED>0.9 & EPP>3"

        for item in snplist:
            if item.count("/") >= 2:
                name = item.split("/")[-2]
            if "_" in name:
                name = name.split("_")[0]
            batchfile.write("# Basecalling for sample {}\n".format(name))
            ref = "{}/{}.fasta".format(self.config["folders"]["genomes"],
                                       self.sample.get("reference"))
            outbase = "{}/{}_{}".format(item, name,
                                        self.sample.get("reference"))
            batchfile.write(
                "samtools view -h -q 1 -F 4 -F 256 {}.bam_sort_rmdup | grep -v XA:Z | grep -v SA:Z| samtools view -b - > {}/{}.unique\n"
                .format(outbase, self.finishdir, name))
            batchfile.write(
                "freebayes -= --pvar 0.7 -j -J --standard-filters -C 6 --min-coverage 30 --ploidy 1 -f {} -b {}/{}.unique -v {}/{}.vcf\n"
                .format(ref, self.finishdir, name, self.finishdir, name))
            batchfile.write(
                "bcftools view {}/{}.vcf -o {}/{}.bcf.gz -O b --exclude-uncalled --types snps\n"
                .format(self.finishdir, name, self.finishdir, name))
            batchfile.write("bcftools index {}/{}.bcf.gz\n".format(
                self.finishdir, name))
            batchfile.write("\n")

            batchfile.write(
                "vcftools --bcf {}/{}.bcf.gz {} --remove-filtered-all --recode-INFO-all --recode-bcf --out {}/{}\n"
                .format(self.finishdir, name, vcffilter, self.finishdir, name))
            batchfile.write(
                'bcftools view {}/{}.recode.bcf -i "{}" -o {}/{}.recode.bcf.gz -O b --exclude-uncalled --types snps\n'
                .format(self.finishdir, name, bcffilter, self.finishdir, name))
            batchfile.write("bcftools index {}/{}.recode.bcf.gz\n\n".format(
                self.finishdir, name))

        batchfile.write("# SNP pair-wise distance\n")
        batchfile.write("touch {}/stats.out\n".format(self.finishdir))
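        # Pair-wise distance: pop one sample at a time and, for each remaining sample, count private SNPs via isec + merge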
        while len(snplist) > 1:
            nameOne = ""
            nameTwo = ""
            top = snplist.pop(0)
            if top.count("/") >= 2:
                nameOne = top.split("/")[-2]
            if "_" in nameOne:
                nameOne = nameOne.split("_")[0]
            for entry in snplist:
                if entry.count("/") >= 2:
                    nameTwo = entry.split("/")[-2]
                if "_" in nameTwo:
                    nameTwo = nameTwo.split("_")[0]

                pair = "{}_{}".format(nameOne, nameTwo)
                batchfile.write(
                    "bcftools isec {}/{}.recode.bcf.gz {}/{}.recode.bcf.gz -n=1 -c all -p {}/tmp -O b\n"
                    .format(self.finishdir, nameOne, self.finishdir, nameTwo,
                            self.finishdir))
                batchfile.write(
                    "bcftools merge -O b -o {}/{}.bcf.gz --force-samples {}/tmp/0000.bcf {}/tmp/0001.bcf\n"
                    .format(self.finishdir, pair, self.finishdir,
                            self.finishdir))
                batchfile.write("bcftools index {}/{}.bcf.gz\n".format(
                    self.finishdir, pair))

                batchfile.write(
                    "echo {} $( bcftools stats {}/{}.bcf.gz |grep SNPs: | cut -d $'\\t' -f4 ) >> {}/stats.out\n"
                    .format(pair, self.finishdir, pair, self.finishdir))
                batchfile.write("\n")
        batchfile.close()

    def create_collection(self):
        """Creates collection entry in database"""
        # Purge any stale collection record, then (re)create one entry per pooled sample
        if self.db_pusher.exists("Collections", {"ID_collection": self.name}):
            self.db_pusher.purge_rec(name=self.name, type="Collections")
        for sample in self.pool:
            self.db_pusher.add_rec(
                {
                    "ID_collection": self.name,
                    "CG_ID_sample": sample
                }, "Collections")

        addedprojs = list()
        for sample in self.pool:
            proj = re.search(r"(\w+)A(?:\w+)", sample).group(1)
            if proj not in addedprojs:
                self.create_project(proj)
                addedprojs.append(proj)

    def create_project(self, name):
        """Creates project in database"""
        proj_col = dict()
        proj_col["CG_ID_project"] = name
        proj_col["Customer_ID_project"] = self.sample.get(
            "Customer_ID_project")
        proj_col["Customer_ID"] = self.sample.get("Customer_ID")
        self.db_pusher.add_rec(proj_col, "Projects")
        self.db_pusher.upd_rec({"CG_ID_project": name}, "Projects", proj_col)

    def create_sample(self, name):
        """Creates sample in database"""
        try:
            sample_col = self.db_pusher.get_columns("Samples")
            sample_col["CG_ID_sample"] = self.sample.get("CG_ID_sample")
            sample_col["CG_ID_project"] = self.sample.get("CG_ID_project")
            sample_col["Customer_ID_sample"] = self.sample.get(
                "Customer_ID_sample")
            sample_col["reference_genome"] = self.sample.get("reference")
            sample_col["reference_length"] = self.sample.get(
                "reference_length")
            sample_col["date_analysis"] = self.dt
            sample_col["organism"] = self.sample.get("organism")
            sample_col["application_tag"] = self.sample.get("application_tag")
            sample_col["priority"] = self.sample.get("priority")
            sample_col["date_arrival"] = datetime.strptime(
                self.sample.get("date_arrival"), "%Y-%m-%d %H:%M:%S")
            sample_col["date_sequencing"] = datetime.strptime(
                self.sample.get("date_sequencing"), "%Y-%m-%d %H:%M:%S")
            sample_col["date_libprep"] = datetime.strptime(
                self.sample.get("date_libprep"), "%Y-%m-%d %H:%M:%S")
            sample_col["method_libprep"] = self.sample.get("method_libprep")
            sample_col["method_sequencing"] = self.sample.get(
                "method_sequencing")
            # self.db_pusher.purge_rec(sample_col['CG_ID_sample'], 'sample')
            self.db_pusher.add_rec(sample_col, "Samples")
        except Exception as e:
            self.logger.error("Unable to add sample {} to database".format(
                self.name))

    def project_job(self, single_sample=False):
        dry = self.config.get("dry", False)
        jobarray = list()
        if not os.path.exists(self.finishdir):
            os.makedirs(self.finishdir)
        # Loads project level info.
        try:
            if single_sample:
                self.create_project(self.sample.get("CG_ID_project"))
            elif self.pool:
                self.create_collection()
            else:
                self.create_project(self.name)
        except Exception as e:
            self.logger.error(
                "LIMS interaction failed. Unable to read/write project {}".
                format(self.name))
        # Writes the job creation sbatch
        if single_sample:
            try:
                self.sample_job()
                headerargs = self.get_headerargs()
                outfile = self.get_sbatch()
                bash_cmd = "sbatch {} {}".format(headerargs, outfile)
                if not dry and outfile != "":
                    samproc = subprocess.Popen(bash_cmd.split(),
                                               stdout=subprocess.PIPE)
                    output, error = samproc.communicate()
                    jobno = re.search(r"(\d+)", str(output)).group(0)
                    jobarray.append(jobno)
                else:
                    self.logger.info("Suppressed command: {}".format(bash_cmd))
            except Exception as e:
                self.logger.error("Unable to analyze single sample {}".format(
                    self.name))
        else:
            for ldir in glob.glob("{}/*/".format(self.indir)):
                ldir = os.path.basename(os.path.normpath(ldir))
                try:
                    sample_in = "{}/{}".format(self.indir, ldir)
                    sample_out = "{}/{}".format(self.finishdir, ldir)
                    linkedjson = None
                    local_sampleinfo = [
                        p for p in self.sampleinfo if p["CG_ID_sample"] == ldir
                    ]
                    if local_sampleinfo == []:
                        raise Exception(
                            "Sample {} has no counterpart in json file".format(
                                ldir))
                    else:
                        local_sampleinfo = local_sampleinfo[0]
                    sample_settings = dict(self.run_settings)
                    sample_settings["input"] = sample_in
                    sample_settings["finishdir"] = sample_out
                    sample_settings["timestamp"] = self.now
                    sample_instance = Job_Creator(
                        config=self.config,
                        log=self.logger,
                        sampleinfo=local_sampleinfo,
                        run_settings=sample_settings,
                    )
                    sample_instance.sample_job()
                    headerargs = sample_instance.get_headerargs()
                    outfile = ""
                    if os.path.isfile(sample_instance.get_sbatch()):
                        outfile = sample_instance.get_sbatch()
                    bash_cmd = "sbatch {} {}".format(headerargs, outfile)
                    if not dry and outfile != "":
                        projproc = subprocess.Popen(bash_cmd.split(),
                                                    stdout=subprocess.PIPE)
                        output, error = projproc.communicate()
                        jobno = re.search(r"(\d+)", str(output)).group(0)
                        jobarray.append(jobno)
                    else:
                        self.logger.info(
                            "Suppressed command: {}".format(bash_cmd))
                except Exception as e:
                    self.logger.error(
                        "Unable to create job for sample {}: {}".format(ldir, e))
        if not dry:
            self.finish_job(jobarray, single_sample)

    def finish_job(self, joblist, single_sample=False):
        """ Uploads data and sends an email once all analysis jobs are complete. """
        report = "default"
        if self.qc_only:
            report = "qc"
        custom_conf = ""
        if "config_path" in self.config:
            custom_conf = "--config {}".format(self.config["config_path"])

        process = subprocess.Popen("id -un".split(), stdout=subprocess.PIPE)
        user, error = process.communicate()
        user = str(user).replace(".", " ").title()
        # if not os.path.exists(self.finishdir):
        #  os.makedirs(self.finishdir)

        startfile = "{}/run_started.out".format(self.finishdir)
        configfile = "{}/config.log".format(self.finishdir)
        mailfile = "{}/mailjob.sh".format(self.finishdir)
        samplefile = "{}/sampleinfo.json".format(self.finishdir)
        with open(samplefile, "w+") as outfile:
            json.dump(self.sampleinfo, outfile)

        sb = open(startfile, "w+")
        cb = open(configfile, "w+")
        mb = open(mailfile, "w+")

        sb.write("#!/usr/bin/env bash\n")
        sb.close()
        configout = self.config.copy()
        if "genologics" in configout:
            del configout["genologics"]
        cb.write("ANALYSIS STARTED BY: {}\n".format(user))
        cb.write(json.dumps(configout, indent=2, separators=(",", ":")))
        cb.close()
        mb.write("#!/usr/bin/env bash\n\n")
        mb.write(
            "#Uploading of results to database and production of report\n")
        if "MICROSALT_CONFIG" in os.environ:
            mb.write("export MICROSALT_CONFIG={}\n".format(
                os.environ["MICROSALT_CONFIG"]))
        mb.write("source activate $CONDA_DEFAULT_ENV\n")

        mb.write(
            "microSALT utils finish {0}/sampleinfo.json --input {0} --email {1} --report {2} {3}\n"
            .format(
                self.finishdir,
                self.config["regex"]["mail_recipient"],
                report,
                custom_conf,
            ))
        mb.write("touch {}/run_complete.out".format(self.finishdir))
        mb.close()

        massagedJobs = list()
        final = ":".join(joblist)
        # Create subtracker if more than 50 samples
        maxlen = 50
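        # Each batch of maxlen job IDs gates a short no-op "subtracker" job; its ID is folded
        # into the next batch so the final mail job transitively waits on every analysis job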
        if len(joblist) > maxlen:
            i = 1
            while i <= len(joblist):
                if i + maxlen < len(joblist):
                    massagedJobs.append(":".join(joblist[i - 1:i + maxlen -
                                                         1]))
                else:
                    massagedJobs.append(":".join(joblist[i - 1:]))
                i += maxlen
            for entry in massagedJobs:
                if massagedJobs.index(entry) < len(massagedJobs) - 1:
                    head = "-A {} -p core -n 1 -t 00:00:10 -J {}_{}_SUBTRACKER --qos {} --dependency=afterany:{}".format(
                        self.config["slurm_header"]["project"],
                        self.config["slurm_header"]["job_prefix"],
                        self.name,
                        self.config["slurm_header"]["qos"],
                        entry,
                    )
                    bash_cmd = "sbatch {} {}".format(head, startfile)
                    mailproc = subprocess.Popen(bash_cmd.split(),
                                                stdout=subprocess.PIPE)
                    output, error = mailproc.communicate()
                    jobno = re.search(r"(\d+)", str(output)).group(0)
                    massagedJobs[massagedJobs.index(entry) +
                                 1] += ":{}".format(jobno)
                else:
                    final = entry
                    break

        head = "-A {} -p core -n 1 -t 6:00:00 -J {}_{}_MAILJOB --qos {} --open-mode append --dependency=afterany:{} --output {}".format(
            self.config["slurm_header"]["project"],
            self.config["slurm_header"]["job_prefix"],
            self.name,
            self.config["slurm_header"]["qos"],
            final,
            self.config["folders"]["log_file"],
        )
        bash_cmd = "sbatch {} {}".format(head, mailfile)
        mailproc = subprocess.Popen(bash_cmd.split(), stdout=subprocess.PIPE)
        output, error = mailproc.communicate()

        try:
            jobno = str(re.search(r"(\d+)", str(output)).group(0))
            joblist.append(jobno)
        except Exception as e:
            self.logger.info("Unable to grab SLURMID for {0}".format(
                self.name))

        try:
            #Generates file with all slurm ids
            slurmname = "{}_slurm_ids.yaml".format(self.name)
            slurmreport_storedir = Path(self.config["folders"]["reports"],
                                        "trailblazer", slurmname)
            slurmreport_workdir = Path(self.finishdir, slurmname)
            with open(slurmreport_workdir, "w") as slurm_out:
                yaml.safe_dump(data={"jobs": [str(job) for job in joblist]},
                               stream=slurm_out)
            shutil.copyfile(slurmreport_workdir, slurmreport_storedir)
            self.logger.info(
                "Saved Trailblazer slurm report file to %s and %s",
                slurmreport_storedir,
                slurmreport_workdir,
            )
        except Exception as e:
            self.logger.info(
                "Unable to generate Trailblazer slurm report file")

    def sample_job(self):
        """ Writes necessary sbatch job for each individual sample """
        try:
            if not os.path.exists(self.finishdir):
                os.makedirs(self.finishdir)
            try:
                # This is one job
                self.batchfile = "{}/runfile.sbatch".format(self.finishdir)
                batchfile = open(self.batchfile, "w+")
                batchfile.write("#!/bin/sh\n\n")
                batchfile.write("mkdir -p {}\n".format(self.finishdir))
                batchfile.close()
                self.create_preprocsection()
                self.create_variantsection()
                if not self.qc_only:
                    self.create_assemblysection()
                    self.create_assemblystats_section()
                    self.create_blast_search()

                self.logger.info(
                    "Created runfile for sample {} in folder {}".format(
                        self.name, self.finishdir))
            except Exception as e:
                raise
            try:
                self.create_sample(self.name)
            except Exception as e:
                self.logger.error(
                    "Unable to access LIMS info for sample {}".format(
                        self.name))
        except Exception as e:
            self.logger.error(
                "Unable to create job for sample {}\nSource: {}".format(
                    self.name, str(e)))
            shutil.rmtree(self.finishdir, ignore_errors=True)
            raise

    def create_blast_search(self):
        reforganism = self.ref_resolver.organism2reference(
            self.sample.get("organism"))
        self.batchfile = "{}/runfile.sbatch".format(self.finishdir)
        batchfile = open(self.batchfile, "a+")
        batchfile.write("mkdir -p {}/blast_search\n".format(self.finishdir))
        batchfile.close()
        self.blast_subset(
            "mlst",
            "{}/{}/*.tfa".format(self.config["folders"]["references"],
                                 reforganism),
        )
        self.blast_subset(
            "resistance",
            "{}/*.fsa".format(self.config["folders"]["resistances"]))
        if reforganism == "escherichia_coli":
            ss = "{}/*{}".format(
                os.path.dirname(self.config["folders"]["expec"]),
                os.path.splitext(self.config["folders"]["expec"])[1],
            )
            self.blast_subset("expec", ss)

    def snp_job(self):
        """ Writes a SNP calling job for a set of samples """
        if not os.path.exists(self.finishdir):
            os.makedirs(self.finishdir)

        self.batchfile = "{}/runfile.sbatch".format(self.finishdir)
        batchfile = open(self.batchfile, "w+")
        batchfile.write("#!/usr/bin/env bash\n")
        batchfile.write("mkdir -p {}\n\n".format(self.finishdir))
        batchfile.close()

        self.create_snpsection()

        headerline = (
            "-A {} -p {} -n 1 -t 24:00:00 -J {}_{} --qos {} --output {}/slurm_{}.log"
            .format(
                self.config["slurm_header"]["project"],
                self.config["slurm_header"]["type"],
                self.config["slurm_header"]["job_prefix"],
                self.name,
                self.config["slurm_header"]["qos"],
                self.finishdir,
                self.name,
            ))
        outfile = self.get_sbatch()
        bash_cmd = "sbatch {} {}".format(headerline, outfile)
        samproc = subprocess.Popen(bash_cmd.split(), stdout=subprocess.PIPE)
        output, error = samproc.communicate()
Example #13
class Scraper:
    def __init__(self, config, log, sampleinfo={}, input=""):
        self.config = config
        self.logger = log
        self.db_pusher = DB_Manipulator(config, log)
        self.referencer = Referencer(config, log)
        self.job_fallback = Job_Creator(config=config,
                                        log=log,
                                        sampleinfo=sampleinfo)
        self.infolder = os.path.abspath(input)
        self.sampledir = ""

        # All scraped folders are generated by Job_Creator, so the last folder name always carries a trailing datestring (NAME_DATESTRING)
        last_folder = self.infolder.split("/")[-1]
        self.name = last_folder.split("_")[0]

        self.sampleinfo = sampleinfo
        self.sample = None
        if isinstance(self.sampleinfo, list) and len(self.sampleinfo) > 1:
            self.name = self.sampleinfo[0].get("CG_ID_project")
            self.sample = self.sampleinfo[0]
            for entry in self.sampleinfo:
                if entry.get("CG_ID_project") != self.name:
                    raise Exception(
                        "Mixed projects in samples_info file. Do not know how to proceed"
                    )
        else:
            if isinstance(self.sampleinfo, list):
                self.sampleinfo = self.sampleinfo[0]
            self.name = self.sampleinfo.get("CG_ID_sample")
            self.sample = self.sampleinfo

        self.gene2resistance = self.load_resistances()

    def scrape_project(self, project=None):
        """Scrapes a project folder for information"""
        if project is None:
            project = self.name
        self.db_pusher.purge_rec(project, "Projects")
        if not self.db_pusher.exists("Projects", {"CG_ID_project": project}):
            self.logger.warning("Replacing project {}".format(project))
            self.job_fallback.create_project(project)

        # Scrape order matters a lot!
        for dir in os.listdir(self.infolder):
            subdir = "{}/{}".format(self.infolder, dir)
            local_param = [
                p for p in self.sampleinfo if p["CG_ID_sample"] == dir
            ]
            if local_param != []:
                local_param = local_param[0]
                sample_scraper = Scraper(
                    config=self.config,
                    log=self.logger,
                    sampleinfo=local_param,
                    input=subdir,
                )
                sample_scraper.scrape_sample()
            else:
                self.logger.warning(
                    "Skipping {} due to lacking info in sample_json file".
                    format(dir))

    def scrape_sample(self, sample=None):
        """Scrapes a sample folder for information"""
        if sample is None:
            sample = self.name
        self.db_pusher.purge_rec(sample, "Samples")

        if not self.db_pusher.exists(
                "Projects",
            {"CG_ID_project": self.sample.get("CG_ID_project")}):
            self.logger.warning("Replacing project {}".format(
                self.sample.get("CG_ID_project")))
            self.job_fallback.create_project(self.sample.get("CG_ID_project"))

        if not self.db_pusher.exists("Samples", {"CG_ID_sample": sample}):
            self.logger.warning("Replacing sample {}".format(sample))
            self.job_fallback.create_sample(sample)

        # Scrape order matters a lot!
        self.sampledir = self.infolder
        self.scrape_blast(type="seq_type")
        self.scrape_blast(type="resistance")
        if (self.referencer.organism2reference(
                self.sample.get("organism")) == "escherichia_coli"):
            self.scrape_blast(type="expec")
        self.scrape_alignment()
        self.scrape_quast()

    def scrape_quast(self, filename=""):
        """Scrapes a quast report for assembly information"""
        if filename == "":
            filename = "{}/assembly/quast/report.tsv".format(self.sampledir)
        quast = dict()
        try:
            with open(filename, "r") as infile:
                for line in infile:
                    lsplit = line.rstrip().split("\t")
                    if lsplit[0] == "# contigs":
                        quast["contigs"] = int(lsplit[1])
                    elif lsplit[0] == "Total length":
                        quast["genome_length"] = int(lsplit[1])
                    elif lsplit[0] == "GC (%)":
                        quast["gc_percentage"] = float(lsplit[1])
                    elif lsplit[0] == "N50":
                        quast["n50"] = int(lsplit[1])

            self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                   quast)
            self.logger.debug("Sample {} received quast stats: {}".format(
                self.name, quast))
        except Exception as e:
            self.logger.warning(
                "Cannot generate quast statistics for {}".format(self.name))

    def get_locilengths(self, foldername, suffix):
        """ Generates a dict mapping each locus to its sequence length """
        # Create dict with full fasta header as key, associated nucleotides as value.
        alleles = dict()
        finalalleles = dict()
        for file in os.listdir(foldername):
            if file.endswith(suffix):
                lastallele = ""
                with open("{}/{}".format(foldername, file), "r") as f:
                    for row in f:
                        if ">" in row:
                            lastallele = row.strip()
                            alleles[lastallele] = ""
                        else:
                            alleles[lastallele] = alleles[lastallele] + row.strip()
        for k, v in alleles.items():
            finalalleles[k] = len(v)
        return finalalleles

    def scrape_blast(self, type="", file_list=[]):
        hypo = list()
        type2db = type.capitalize() + "s"
        if type == "expec":
            type2db = "Expacs"
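        # Resulting DB table names: seq_type -> Seq_types, resistance -> Resistances, expec -> Expacs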

        if file_list == []:
            if type == "seq_type":
                file_list = glob.glob("{}/blast_search/mlst/*".format(
                    self.sampledir))
            else:
                file_list = glob.glob("{}/blast_search/{}/*".format(
                    self.sampledir, type))

        organism = self.referencer.organism2reference(
            self.sample.get("organism"))
        if organism:
            self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                   {"organism": organism})
        res_cols = self.db_pusher.get_columns("{}".format(type2db))

        try:
            old_ref = ""
            for file in file_list:
                filename = os.path.basename(file).rsplit(
                    ".", 1)[0]  # Removes suffix
                if filename == "lactam":
                    filename = "beta-lactam"
                if type == "resistance":
                    ref_folder = self.config["folders"]["resistances"]
                    suffix = "fsa"
                elif type == "expec":
                    ref_folder = os.path.dirname(
                        self.config["folders"]["expec"])
                    suffix = os.path.basename(
                        self.config["folders"]["expec"]).rsplit(".", 1)[1]
                elif type == "seq_type":
                    ref_folder = "{}/{}".format(
                        self.config["folders"]["references"], organism)
                    suffix = "tfa"
                locilengths = self.get_locilengths(ref_folder, suffix)

                with open("{}".format(file), "r") as sample:
                    for line in sample:
                        # Ignore commented fields
                        if not line.startswith("#"):
                            elem_list = line.rstrip().split("\t")
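                            # Column layout assumed from the upstream custom
                            # BLAST -outfmt: [2] contig name, [3] subject
                            # header, [4] identity, [5] e-value, [6] bitscore,
                            # [7]/[8] contig start/end, [11] subject length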
                            if elem_list[1] != "N/A":
                                hypo.append(dict())
                                hypo[-1]["CG_ID_sample"] = self.name
                                hypo[-1]["identity"] = elem_list[4]
                                hypo[-1]["evalue"] = elem_list[5]
                                hypo[-1]["bitscore"] = elem_list[6]
                                if int(elem_list[7]) < int(elem_list[8]):
                                    hypo[-1]["contig_start"] = int(
                                        elem_list[7])
                                    hypo[-1]["contig_end"] = int(elem_list[8])
                                else:
                                    hypo[-1]["contig_start"] = int(
                                        elem_list[8])
                                    hypo[-1]["contig_end"] = int(elem_list[7])
                                hypo[-1]["subject_length"] = int(elem_list[11])

                                if type == "resistance":
                                    hypo[-1]["instance"] = filename
                                    partials = re.search(
                                        r"(?:\>)*(.+)_(\d+){1,3}(?:_(.+))*",
                                        elem_list[3],
                                    )
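                                    # Groups for a header like
                                    # ">blaTEM_1_AB123456" (hypothetical):
                                    # (1) gene, (2) allele, (3) accession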
                                    hypo[-1]["reference"] = partials.group(3)
                                    hypo[-1]["gene"] = partials.group(1)
                                    if hypo[-1]["gene"] in self.gene2resistance:
                                        hypo[-1]["resistance"] = (
                                            self.gene2resistance[hypo[-1]["gene"]])
                                    else:
                                        hypo[-1]["resistance"] = (
                                            hypo[-1]["instance"].capitalize())
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])

                                elif type == "expec":
                                    hypo[-1]["instance"] = filename
                                    # Header formats vary between precompiled
                                    # ExPEC lists; try each known pattern
                                    if ">" in elem_list[3]:
                                        partials = re.search(
                                            r">*(\w+_\w+\.*\w+).+\((\w+)\).+\((\w+)\)_(\w+)_\[.+\]",
                                            elem_list[3],
                                        )
                                    else:
                                        partials = re.search(
                                            r"(\w+)\(gb\|\w+\)_\((\S+)\)_(.+)_\[(\S+)_.+\]_\[\S+\]",
                                            elem_list[3],
                                        )
                                    if not partials:
                                        partials = re.search(
                                            r"(\w+\.*\w+)\:*\w*_*(?:\(\w+\-\w+\))*_\((\w+)\)_([^[]+)\[\S+\]",
                                            elem_list[3],
                                        )
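                                    # The three patterns cover known header
                                    # variants; groups: (1) accession, (2) gene,
                                    # (3) generic group, (4) description when
                                    # present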
                                    # NC/Protein reference
                                    hypo[-1]["reference"] = partials.group(1)
                                    # Full gene name
                                    hypo[-1]["gene"] = partials.group(2)
                                    # More generic group
                                    hypo[-1]["instance"] = partials.group(
                                        3).strip("_")
                                    # Description
                                    if len(partials.groups()) >= 4:
                                        hypo[-1]["virulence"] = (
                                            partials.group(4).replace(
                                                "_", " ").capitalize())
                                    else:
                                        hypo[-1]["virulence"] = ""
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])

                                elif type == "seq_type":
                                    partials = re.search(
                                        r"(.+)_(\d+){1,3}(?:_(\w+))*",
                                        elem_list[3])
                                    hypo[-1]["loci"] = partials.group(1)
                                    hypo[-1]["allele"] = int(partials.group(2))
                                    hypo[-1]["span"] = (
                                        float(hypo[-1]["subject_length"]) /
                                        locilengths[">{}".format(
                                            elem_list[3])])

                                # elem_list[2] is a SPAdes-style contig name
                                # (NODE_<n>_length_<len>_cov_<cov>); split out
                                # node id, length and coverage
                                nodeinfo = elem_list[2].split("_")
                                hypo[-1]["contig_name"] = "{}_{}".format(
                                    nodeinfo[0], nodeinfo[1])
                                hypo[-1]["contig_length"] = int(nodeinfo[3])
                                hypo[-1]["contig_coverage"] = nodeinfo[5]
                                self.logger.debug(
                                    "scrape_blast scrape loop hit")
            self.logger.info("{} candidate {} hits found".format(
                len(hypo), type2db))
        except Exception as e:
            self.logger.error(
                "Unable to process BLAST hit pattern: {}".format(str(e)))

        # Cleanup of overlapping hits
        if type == "seq_type":
            identifier = "loci"
        elif type == "resistance" or type == "expec":
            identifier = "gene"
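        # O(n^2) pairwise scan: for hits overlapping on a contig or calling the
        # same gene/locus, keep the higher score identity * (1 - |1 - span|);
        # e.g. identity 99.0 at span 1.02 scores 99.0 * 0.98 = 97.02. Ties fall
        # back to contig coverage.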
        ind = 0
        while ind < len(hypo) - 1:
            targ = ind + 1
            while targ < len(hypo):
                ignore = False
                if (hypo[ind]["contig_name"] == hypo[targ]["contig_name"]
                        or hypo[ind][identifier] == hypo[targ][identifier]):
                    # Overlapping or shared gene
                    if ((hypo[ind].get("contig_start") >=
                         hypo[targ].get("contig_start")
                         and hypo[ind].get("contig_start") <=
                         hypo[targ].get("contig_end"))
                            or (hypo[ind].get("contig_end") >=
                                hypo[targ].get("contig_start")
                                and hypo[ind].get("contig_end") <=
                                hypo[targ].get("contig_end"))
                            or (hypo[ind].get(identifier)
                                == hypo[targ].get(identifier))):
                        # Rightmost is worse
                        if float(hypo[ind].get("identity")) * (
                                1 - abs(1 - hypo[ind].get("span"))) > float(
                                    hypo[targ].get("identity")) * (
                                        1 - abs(1 - hypo[targ].get("span"))):
                            del hypo[targ]
                            ignore = True
                        # Leftmost is worse
                        elif float(hypo[ind].get("identity")) * (
                                1 - abs(1 - hypo[ind].get("span"))) < float(
                                    hypo[targ].get("identity")) * (
                                        1 - abs(1 - hypo[targ].get("span"))):
                            del hypo[ind]
                            targ = ind + 1
                            ignore = True
                        # Identical identity and span, separating based on contig coverage
                        else:
                            # Rightmost is worse
                            if float(
                                    hypo[ind].get("contig_coverage")) >= float(
                                        hypo[targ].get("contig_coverage")):
                                del hypo[targ]
                                ignore = True
                            # Leftmost is worse
                            elif float(
                                    hypo[ind].get("contig_coverage")) < float(
                                        hypo[targ].get("contig_coverage")):
                                del hypo[ind]
                                targ = ind + 1
                                ignore = True
                if not ignore:
                    targ += 1
            ind += 1

        self.logger.info(
            "{} {} hits remained after removing overlapping and duplicate hits".
            format(len(hypo), type))
        for hit in hypo:
            self.logger.debug("Kept {}:{} with span {} and id {}".format(
                hit.get("loci"),
                hit.get("allele"),
                hit.get("span"),
                hit.get("identity"),
            ))
            self.db_pusher.add_rec(hit, type2db)

        if type == "seq_type":
            try:
                ST = self.db_pusher.alleles2st(self.name)
                self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                                       {"ST": ST})
                self.logger.info("Sample {} received ST {}".format(
                    self.name, ST))
            except Exception as e:
                self.logger.warning(
                    "Unable to type sample {} due to data value '{}'".format(
                        self.name, str(e)))

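    # Hypothetical call sequence (types and target tables as implemented above):
    #
    #   self.scrape_blast(type="seq_type")    # MLST alleles -> Seq_types, sets ST
    #   self.scrape_blast(type="resistance")  # resistance genes -> Resistances
    #   self.scrape_blast(type="expec")       # ExPEC virulence -> Expacs
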
    def load_resistances(self):
        """Legacy function, loads common resistance names for genes from notes file"""
        conversions = dict()
        try:
            with open("{}/notes.txt".format(
                    self.config["folders"]["resistances"])) as fh:
                for line in fh:
                    if "#" not in line:
                        fields = line.split(":")
                        cropped = re.sub(" resistance", "", fields[1]).strip()
                        conversions[fields[0]] = cropped
                        # Workaround for case issues
                        conversions[fields[0].lower()] = cropped
        except Exception as e:
            self.logger.error(
                "Unable to initialize trivial names for resistances ({})".
                format(e))
        return conversions

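    # notes.txt format (illustrative, assumed): a line such as
    # "blaTEM:Beta-lactam resistance" yields conversions["blaTEM"] = "Beta-lactam"
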
    def scrape_alignment(self, file_list=None):
        """Scrapes the set of alignment stat files for a sample"""
        if not file_list:
            file_list = glob.glob("{}/alignment/*.stats.*".format(
                self.sampledir))
        cov_dict = dict()
        align_dict = dict()
        align_dict["reference_genome"] = self.sample.get("reference")

        # Reading
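        # Each stats file is distinguished by its final suffix:
        #   .raw (total read count), .ins (insert size metrics),
        #   .cov (depth histogram), .ref (reference contig lengths),
        #   .dup (duplication metrics), .map (flagstat-style mapping counts)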
        map_rate = 0.0
        median_ins = 0
        ref_len = 0.0
        tot_reads = 0
        tot_map = 0
        duprate = 0.0
        for file in file_list:
            with open(file, "r") as fh:
                type = file.split(".")[-1]
                for line in fh.readlines():
                    lsplit = line.rstrip().split("\t")
                    if type == "raw":
                        try:
                            tot_reads = int(lsplit[0])
                        except Exception:
                            # Non-numeric (header) line; skip
                            pass
                    elif type == "ins":
                        if len(lsplit) >= 18 and lsplit[-12] in ["FF", "FR"]:
                            try:
                                median_ins = int(lsplit[0])
                            except Exception:
                                # Non-numeric line; skip
                                pass
                    elif type == "cov":
                        cov_dict[lsplit[1]] = int(lsplit[2])
                    elif type == "ref":
                        if lsplit[0] != "*" and len(lsplit) >= 2:
                            ref_len = ref_len + int(lsplit[1])
                    elif type == "dup":
                        if lsplit[0] == "Unknown Library":
                            try:
                                duprate = float(lsplit[8])
                            except Exception:
                                # Parse failure flagged as -1.0
                                duprate = -1.0
                    elif type == "map":
                        dsplit = line.rstrip().split(" ")
                        if len(dsplit) >= 5 and dsplit[4] == "total":
                            tot_map = int(dsplit[0])
                        elif len(dsplit) >= 4 and dsplit[3] == "mapped":
                            if tot_map > 0:
                                map_rate = int(dsplit[0]) / float(tot_map)

        # Mangling
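        # The .cov histogram maps depth -> number of reference bases at that
        # depth; breadth at 10x/30x/50x/100x is the base count above each
        # threshold divided by the reference length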
        sumz, plus10, plus30, plus50, plus100, total = 0, 0, 0, 0, 0, 0
        for k, v in cov_dict.items():
            sumz += int(k) * v
            total += v
            if int(k) > 10:
                plus10 += v
            if int(k) > 30:
                plus30 += v
            if int(k) > 50:
                plus50 += v
            if int(k) > 100:
                plus100 += v
        if total > 0 and ref_len > 0:
            align_dict["coverage_10x"] = plus10 / float(ref_len)
            align_dict["coverage_30x"] = plus30 / float(ref_len)
            align_dict["coverage_50x"] = plus50 / float(ref_len)
            align_dict["coverage_100x"] = plus100 / float(ref_len)
        else:
            align_dict["coverage_10x"] = 0.0
            align_dict["coverage_30x"] = 0.0
            align_dict["coverage_50x"] = 0.0
            align_dict["coverage_100x"] = 0.0

        align_dict["mapped_rate"] = map_rate
        align_dict["insert_size"] = median_ins
        if ref_len > 0:
            align_dict["duplication_rate"] = duprate
            align_dict["average_coverage"] = sumz / float(ref_len)
        else:
            align_dict["duplication_rate"] = 0.0
            align_dict["average_coverage"] = 0.0
        align_dict["total_reads"] = tot_reads
        self.db_pusher.upd_rec({"CG_ID_sample": self.name}, "Samples",
                               align_dict)