# Imports required by these snippets. Project-specific names (Utilities,
# cutadaptDir, spadesDir, randomize_gene_slice, randomize_sleep, attempt_func,
# ProjectDescriber, SAMPLE_NAMES, TOOL_VERSIONS, INDEX_COL_NAME, article_dir)
# come from the surrounding repository and are not defined here
import os
import json
import shutil
import subprocess

import jinja2
import pandas as pd
from Bio import SeqIO
from Bio.Blast import NCBIWWW, NCBIXML


def run_cutadapt(input_list: list):
    # Note the order: it strictly depends on the column order of the upstream dataframe
    sample_name, sample_file_1, sample_file_2 = input_list
    _ADAPTER = "AGATCGGAAGAG"
    out_file_1, out_file_2, log_file = [
        os.path.join(cutadaptDir, "{}_cutadapt.{}".format(sample_name, i))
        for i in ("1.fq.gz", "2.fq.gz", "log")
    ]
    cmd = "cutadapt -a {ad} -A {ad} -m 50 -o {o1} -p {o2} {i1} {i2}".format(
        ad=_ADAPTER,
        i1=sample_file_1,
        i2=sample_file_2,
        o1=out_file_1,
        o2=out_file_2)
    try:
        for _f in [out_file_1, out_file_2, log_file]:
            if os.path.exists(_f):
                os.remove(_f)
        log = subprocess.getoutput(cmd)
    except PermissionError:
        raise ValueError(
            "Permission denied, please run `sudo chmod -R 777 {}`".format(
                os.path.dirname(sample_file_1)))
    Utilities.dump_string(log, file=log_file)
    return {
        "sample_name": sample_name,
        "trimmed_file_1": out_file_1,
        "trimmed_file_2": out_file_2
    }
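
# A hypothetical usage sketch: the caller presumably maps rows of
# [sample_name, raw_file_1, raw_file_2] over a process pool; `sample_df` is
# an assumed name, not part of the original code:
#
#     import multiprocessing
#
#     with multiprocessing.Pool() as pool:
#         trimmed = pool.map(run_cutadapt, sample_df.values.tolist())
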
def run_spades(input_list: list):
    # The same ordering caveat as for run_cutadapt applies
    sample_name, sample_file_1, sample_file_2 = input_list
    out_dir = os.path.join(spadesDir, sample_name)
    subprocess.getoutput("rm -rf {}".format(out_dir))
    os.makedirs(out_dir)
    cmd = "spades.py --careful -o {out} -1 {i1} -2 {i2}".format(
        out=out_dir, i1=sample_file_1, i2=sample_file_2)
    log = subprocess.getoutput(cmd)
    log_file = os.path.join(out_dir, "{}_spades.log".format(sample_name))
    Utilities.dump_string(log, file=log_file)
    return {
        "sample_name": sample_name,
        "assembly": os.path.join(out_dir, "contigs.fasta")
    }
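
# A hypothetical chaining sketch: the dicts returned by run_cutadapt map
# directly onto the [name, file_1, file_2] lists run_spades expects:
#
#     assemblies = [
#         run_spades([d["sample_name"], d["trimmed_file_1"], d["trimmed_file_2"]])
#         for d in trimmed
#     ]
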
def mp_get_and_blast_largest_contig(assembly_file: str):
    if os.path.getsize(assembly_file) == 0:
        print("Cannot process the empty file: '{}'".format(assembly_file))
        return
    # `sorted` consumes the parser iterator directly, and SeqRecord supports
    # len(); the `with` block closes the file on exit
    with open(assembly_file) as f:
        contig_records = sorted(SeqIO.parse(f, "fasta"), key=len, reverse=True)
    largest_contig = randomize_gene_slice(contig_records[0]).format("fasta")
    # The delay to avoid NCBI ban
    randomize_sleep()
    # NCBI query
    result_handle = attempt_func(NCBIWWW.qblast,
                                 ("blastn", "nt", largest_contig))
    blast_record = NCBIXML.read(result_handle)
    # Based on: https://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc95
    _E_VALUE_THRESH = 0.04
    _QUERY_REPORT_SYMBOLS = 75
    high_scoring_pairs = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < _E_VALUE_THRESH:
                high_scoring_pairs.append(
                    dict(title=alignment.title,
                         length=alignment.length,
                         expect=hsp.expect,
                         score=hsp.score,
                         bits=hsp.bits,
                         identities=hsp.identities,
                         positives=hsp.positives,
                         assembly_file=assembly_file,
                         query="...\n".join([
                             hsp.query[:_QUERY_REPORT_SYMBOLS],
                             hsp.match[:_QUERY_REPORT_SYMBOLS],
                             hsp.sbjct[:_QUERY_REPORT_SYMBOLS], ""
                         ])))
    high_scoring_pairs = sorted(high_scoring_pairs,
                                key=lambda x: x.get("score"),
                                reverse=True)
    # Export BLAST results
    Utilities.dump_string(
        json.dumps(high_scoring_pairs, sort_keys=True, indent=4),
        "{}.BLAST.json".format(os.path.splitext(assembly_file)[0]))
    return high_scoring_pairs
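
# The `mp_` prefix hints at use under multiprocessing; a sketch, assuming the
# `assemblies` list from the previous step (parallel qblast calls may still
# trip NCBI rate limits despite randomize_sleep):
#
#     with multiprocessing.Pool() as pool:
#         blast_results = pool.map(mp_get_and_blast_largest_contig,
#                                  [d["assembly"] for d in assemblies])
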

# Example 4

def dump_index_guide(input_nucleotide_fasta: str, output_dir: str):
    if not Utilities.is_file_valid(input_nucleotide_fasta):
        raise ValueError(f"Invalid file: '{input_nucleotide_fasta}'")
    cmd_0 = f"""
    export IMG=ivasilyev/bwt_filtering_pipeline_worker:latest && \
    docker pull "$IMG" && \
    docker run --rm -v /data:/data -v /data1:/data1 -v /data2:/data2 -it "$IMG" \
        bash -c '
            cd "{output_dir}";
            python3 "$HOME/scripts/cook_the_reference.py" \
                --input "{input_nucleotide_fasta}" \
                --output "{output_dir}";
        '
    """
    cmd = Utilities.join_lines(cmd_0)
    out_file = os.path.join(output_dir, "index.sh")
    Utilities.dump_string(cmd, out_file)
    print(f"For indexing, run outside of Docker: 'bash \"{out_file}\"'")

# Example 5

# The beginning of this snippet was truncated in the source. Judging by the
# trailing `inplace=True)`, it renames antibiotic resistance class
# abbreviations, presumably on the `antibiogram_df` used below; the call head
# is a reconstruction, not the original code:
antibiogram_df.rename(columns={
    "Tmt": "Trimethoprim",
    "Bla": "CBL",
    "Bla_ESBL": "ESBL",
    "Bla_broad": "BSBL",
    "Bla_broad_inhR": "BSBL-inhR"
},
                      inplace=True)

phenotype_df = pd.concat([initial_sample_data_df, antibiogram_df],
                         axis=1,
                         sort=False).sort_index()
phenotype_df.index.names = [INDEX_COL_NAME]
phenotype_df = process_header(phenotype_df).transpose().reset_index()
#
Utilities.dump_tsv(phenotype_df, os.path.join(article_dir, "phenotype.tsv"))
Utilities.dump_string(phenotype_df.to_latex(index=False, header=True),
                      os.path.join(article_dir, "phenotype.tex"))

genotype_df = pd.concat(
    [
        ncbi_accessions_df, combined_assembly_statistics_df,
        kleborate_results_df
    ],
    axis=1,
    sort=False).sort_index()  # .sort_values(["Patient ID", "Sample Number"])
genotype_df.index.names = [INDEX_COL_NAME]
# genotype_df.replace({"_": "\\_"}, regex=True)
genotype_df = process_header(genotype_df,
                             capitalize=False).transpose().reset_index()
#
Utilities.dump_tsv(genotype_df, os.path.join(article_dir, "genotype.tsv"))
# The call below was cut off at the example boundary; the output path is
# reconstructed by analogy with the phenotype block above
Utilities.dump_string(genotype_df.to_latex(index=False, header=True),
                      os.path.join(article_dir, "genotype.tex"))


# Example 6

templates_dir = os.path.join(ProjectDescriber.ROOT_DIR, "reports", "1")
template = jinja2.Template(
    Utilities.load_string(os.path.join(templates_dir, "template.txt")))

# The input tables and the output directory do not depend on the sample, so
# they are prepared once before the loop
combined_assembly_statistics_df = Utilities.load_tsv(
    os.path.join(".", ProjectDescriber.OWNER, ProjectDescriber.NAME,
                 "data", "tables", "combined_assembly_statistics.tsv"))
submission_report_df = Utilities.load_tsv(
    os.path.join(".", ProjectDescriber.OWNER, ProjectDescriber.NAME,
                 "data", "tables", "ncbi", "submission_report.tsv"))

submission_combined_df = pd.concat([
    i.set_index(INDEX_COL_NAME)
    for i in (combined_assembly_statistics_df, submission_report_df)
],
                                   axis=1,
                                   sort=False)
submission_combined_df.index.names = [INDEX_COL_NAME]

out_dir = os.path.join(templates_dir, "out")
os.makedirs(out_dir, exist_ok=True)

for sample_name in SAMPLE_NAMES:
    rendering_dict = submission_combined_df.loc[sample_name, :].to_dict()
    rendering_dict.update(TOOL_VERSIONS)
    Utilities.dump_string(template.render(rendering_dict),
                          os.path.join(out_dir, "{}.txt".format(sample_name)))
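
# For reference, the template loaded above is a jinja2 template whose
# placeholders match the rendering dict keys; a minimal hypothetical
# template.txt line (the key names here are assumptions):
#
#     Sample {{ sample_name }} was assembled with SPAdes {{ spades_version }}.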