def merge(self):
    Utilities.concatenate_files(
        *Utilities.scan_whole_dir(os.path.join(self.reference_dir, "nucleotide")),
        target_file=self.nfasta)
    Utilities.concatenate_files(
        *Utilities.scan_whole_dir(os.path.join(self.reference_dir, "protein")),
        target_file=self.pfasta)
    self.describer.get_index_guide(self.nfasta)
    print("# Merge completed. \n# Protein FASTA to annotate: '{}'\n".format(self.pfasta))
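`Utilities.concatenate_files` is not shown on this page; a minimal sketch of the assumed behavior (stream each input file into `target_file`, in order), not the project's actual implementation:

import shutil


def concatenate_files(*input_files, target_file):
    # Assumption: plain byte-level concatenation of the inputs, in order
    with open(target_file, "wb") as target:
        for input_file in input_files:
            with open(input_file, "rb") as source:
                shutil.copyfileobj(source, target)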
Example No. 2
def __init__(self, reference_describer_instance, value_col_name):
    self.describer = reference_describer_instance
    self.value_col_name = value_col_name
    self.coverage_files = [
        i for i in Utilities.scan_whole_dir(projectDescriber.MAPPED_DATA_DIR)
        if all(j in i for j in [self.describer.ALIAS, "coverage.tsv"])
    ]
    self.annotation_file = self.describer.get_refdata_dict().get(
        "sequence_1").annotation_file
    self.raw_annotated_pivot = self.join_and_annotate()
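`join_and_annotate` is defined elsewhere in the project; a hedged pandas sketch of what such a method presumably does (outer-join per-sample coverage columns, then attach the annotation table), assuming every coverage TSV carries a shared `reference_id` column (the column name and the sample-name prefix below are hypothetical):

import os
import pandas as pd


def join_and_annotate(coverage_files, value_col_name, annotation_file,
                      index_col_name="reference_id"):
    joined_df = None
    for coverage_file in coverage_files:
        # Hypothetical sample naming: '<sample>_...coverage.tsv'
        sample_name = os.path.basename(coverage_file).split("_")[0]
        df = pd.read_csv(coverage_file, sep="\t",
                         usecols=[index_col_name, value_col_name])
        df = df.rename(columns={value_col_name: sample_name}).set_index(index_col_name)
        joined_df = df if joined_df is None else joined_df.join(df, how="outer")
    annotation_df = pd.read_csv(annotation_file, sep="\t").set_index(index_col_name)
    return annotation_df.join(joined_df, how="right")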
Example No. 3
def parse_spades_version(sample_name_):
    log_file = [
        i for i in Utilities.scan_whole_dir(
            "/data1/bio/projects/vradchenko/lactobacillus_salivarius/pga-pe/log"
        )
        if i.endswith(".log") and all(j in i for j in ["spades", sample_name_])
    ][0]
    log_lines = Utilities.load_list(log_file)
    image_version_line = [
        i for i in log_lines
        if i.strip().startswith("Status: Image is up to date for ")
    ][0].strip()
    spades_version = re.split("[\t ]+", image_version_line)[-1]
    return spades_version
Example No. 4

def parse_assembly_qualifiers(genbank_record, qualifiers_dict, organism, cds_number, d):
    # Hypothetical header: the original snippet is truncated and begins mid-function
    strain = " ".join(organism + [qualifiers_dict.get("strain")[0]])
    taxonomy_id = Utilities.safe_findall(r"\d+", [
        i for i in qualifiers_dict.get("db_xref")
        if i.split(":")[0].strip() == "taxon"
    ][0])
    return dict(assembly_file=d.get("assembly_file"),
                strain=strain,
                taxonomy_id=taxonomy_id,
                reference_accession_id=genbank_record.id,
                cds_number=cds_number,
                reference_bp=len(genbank_record),
                reference_description=genbank_record.description)


assemblies = [
    i for i in Utilities.scan_whole_dir(
        os.path.join(ProjectDescriber.ROOT_DIR, "pga-pe", "06_plasmid_merger"))
    if i.endswith(".fna") and os.path.getsize(i) > 0
]
# Browse properties of the largest contigs for each assembly
props = {
    i: sorted(list(SeqIO.parse(i, "fasta")),
              key=lambda x: len(x),
              reverse=True)[0].format("fasta")
    for i in assemblies
}
props_stats = {
    k: {
        "length": len(props.get(k)),
        "head": props.get(k)[:50]
    }
    for k in props
}
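A related self-contained sketch for summarizing each assembly (contig count, total length, N50) with plain Biopython; only `SeqIO` from the snippet above is assumed:

from Bio import SeqIO


def assembly_stats(fasta_file):
    # Contig lengths, longest first
    lengths = sorted((len(i) for i in SeqIO.parse(fasta_file, "fasta")), reverse=True)
    total_bp = sum(lengths)
    running = 0
    n50 = 0
    for length in lengths:
        running += length
        if running * 2 >= total_bp:  # N50: length of the contig crossing half the total
            n50 = length
            break
    return {"contigs": len(lengths), "total_bp": total_bp, "n50": n50}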
Example No. 5
import os
import pandas as pd
from meta.scripts.Utilities import Utilities

#%%

sra_dir = "/data1/bio/projects/vradchenko/lactobacillus_salivarius/sra"
sra_df = Utilities.load_tsv(os.path.join(sra_dir, "sra.tsv"))

queue = [{
    "func": Utilities.count_reads_statistics,
    "kwargs": {
        "reads_file": i,
        "type_": "fastq_gz"
    }
} for i in Utilities.scan_whole_dir(os.path.join(sra_dir, "reads"))]

raw_reads_base_stats = Utilities.multi_core_queue(Utilities.wrapper,
                                                  queue,
                                                  async_=True)

#%%

raw_reads_base_stat_df = pd.DataFrame(raw_reads_base_stats)
raw_reads_base_stat_df["reads_file"] = raw_reads_base_stat_df[
    "reads_file"].apply(os.path.basename)
raw_reads_base_stat_df["sample_name"] = raw_reads_base_stat_df[
    "reads_file"].str.extract(r"(.+)\[")

Utilities.dump_tsv(raw_reads_base_stat_df,
                   os.path.join(sra_dir, "raw_reads_base_stats.tsv"))
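`Utilities.wrapper` and `Utilities.multi_core_queue` are project helpers; a minimal sketch of the assumed pattern (each queue item bundles a callable with its keyword arguments, fanned out over a process pool):

import multiprocessing


def wrapper(queue_item):
    # Assumed item layout, as queued above: {"func": callable, "kwargs": dict}
    return queue_item["func"](**queue_item["kwargs"])


def multi_core_queue(func, queue, async_=True):
    with multiprocessing.Pool() as pool:
        if async_:
            return pool.map_async(func, queue).get()
        return pool.map(func, queue)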
Example No. 6
        # Tail of a plotting loop (the code before this snippet is truncated)
        plt.close()
        plt.clf()

# Map TADB data
"""
export IMG=ivasilyev/bwt_filtering_pipeline_worker:latest && \
docker pull $IMG && \
docker run --rm -v /data:/data -v /data1:/data1 -v /data2:/data2 -it $IMG \
python3 /home/docker/scripts/nBee.py \
-i /data1/bio/projects/inicolaeva/klebsiella_infants/trimmed.sampledata \
-r /data/reference/TADB/tadb_v2.0/index/tadb_v2.0_refdata.json \
-o /data1/bio/projects/inicolaeva/klebsiella_infants/map_data
"""

tadb_coverage_files = [
    i for i in Utilities.scan_whole_dir(MAPPED_STAT_DIR)
    if all(j in i for j in ["tadb", "coverage.tsv"])
]

for value_col_name in VALUE_COL_NAMES:
    tadb_annotated_df = join_and_annotate(
        tadb_coverage_files, value_col_name,
        "/data/reference/TADB/tadb_v2.0/index/tadb_v2.0_annotation.tsv")
    tadb_raw_dir = os.path.join(RAW_DIR, "tadb")
    os.makedirs(tadb_raw_dir, exist_ok=True)
    tadb_annotated_df.reset_index().to_csv(os.path.join(
        tadb_raw_dir, "tadb_annotated_pivot_by_{}.tsv".format(value_col_name)),
                                           sep="\t",
                                           index=False,
                                           header=True)
    genera_names_dict = generate_keywords_dict(
Example No. 7
        "The reference sequence was derived from [^\.]+\.", "https:[^ ]+",
        "The annotation was added by the NCBI Prokaryotic Genome Annotation Pipeline \(PGAP\)\.",
        "Information about PGAP can be found here:",
        "Annotation was added by the NCBI Prokaryotic Genome Annotation Pipeline \(released 2013\)\.",
        "Information about the Pipeline can be found here:",
        "Annotation Pipeline \(PGAP\) set[;]{0,1}",
        "Annotation Pipeline set[;]{0,1}",
        "GeneMark[^ ]+ repeat_region[;]{0,1}", "COMPLETENESS: full length\.")
    for regex in _RGX:
        s = re.sub(regex, "", s)
    return flatten_string(s)
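A quick usage sketch; `flatten_string` is not shown on this page, so a whitespace-collapsing stand-in is defined here, and `clean_genbank_comment` is the hypothetical name restored above for the truncated header (the accessions are dummy values):

import re


def flatten_string(s):
    # Stand-in: collapse whitespace runs into single spaces
    return re.sub(r"[ \t\r\n]+", " ", s).strip()


comment = ("The reference sequence was derived from NZ_CP012345. "
           "COMPLETENESS: full length. Biosample: SAMN00000000.")
print(clean_genbank_comment(comment))  # -> 'Biosample: SAMN00000000.'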


genbank_dir = "/data1/bio/projects/inicolaeva/klebsiella_infants/ncbi-dl/gbff"
genbank_files = [
    i for i in Utilities.scan_whole_dir(genbank_dir) if i.endswith(".gbff")
]

tree = Phylo.read(
    "/data1/bio/projects/inicolaeva/klebsiella_infants/roary/newick/iTOL_collapsed_tree.newick",
    "newick")
node_names = [
    j for j in [i.name for i in tree.find_clades()]
    if j is not None and j.startswith("GCF")
]

annotations_list = []
for node_name in node_names:
    # node_name = "GCF_005377825.1_ASM537782v1"
    genbank_file = os.path.join(genbank_dir,
                                "{}_genomic.gbff".format(node_name))
Example No. 8
"""
docker run --rm -v /data:/data -v /data1:/data1 -v /data2:/data2 --net=host -it ${IMG} bash

git pull
LC_ALL=C python3
"""

import os
import pandas as pd
from shutil import copy2
from meta.scripts.Utilities import Utilities
from vradchenko.lactobacillus_salivarius.ProjectDescriber import ProjectDescriber

# Get the raw reads files
raw_reads_files_dir = ProjectDescriber.RAW_DATA_DIR
raw_reads_files_list = [
    i for i in Utilities.scan_whole_dir(raw_reads_files_dir)
    if i.endswith("_001.fastq.gz")
]

# Split them into the two groups
STRANDS = ("R1", "R2")
raw_reads_list = []
for raw_reads_files_pair in Utilities.get_most_similar_word_pairs(
        raw_reads_files_list):
    # Illumina file names have template '[sample]_[sequence]_[lane]_[strand]_[number].fastq.gz'
    # E.g: '336g_S1_L001_R1_001.fastq.gz'
    sample_name = Utilities.safe_findall(
        "(.+)_S[0-9]+_L[0-9]+_R[0-9]+_[0-9]+",
        os.path.basename(raw_reads_files_pair[0]))
    raw_reads_dict = dict(sample_name=sample_name)
    for raw_reads_file in raw_reads_files_pair:
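The sample-name extraction above relies only on the standard Illumina template; an equivalent standalone check with plain `re`, using the example filename from the comment above:

import re

name = "336g_S1_L001_R1_001.fastq.gz"
print(re.findall(r"(.+)_S[0-9]+_L[0-9]+_R[0-9]+_[0-9]+", name)[0])  # -> '336g'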
Example No. 9

import os
import subprocess
import pandas as pd
from meta.scripts.Utilities import Utilities
from inicolaeva.klebsiella_infants.ProjectDescriber import ProjectDescriber
from Bio import SeqIO
from copy import deepcopy
from meta.scripts.ncbi_contamination_remover import ContaminationRemover
from shutil import copy2

ASSEMBLY_TYPES = ("genome", "plasmid")
ORGANISM = "Klebsiella pneumoniae"
ISOLATE_PREFIX = "KZN_INI_KINF"

assembler_result_dir = "/data1/bio/projects/inicolaeva/klebsiella_infants/pipeline/05_spades"
assembly_files = [
    i for i in Utilities.scan_whole_dir(assembler_result_dir)
    if os.path.basename(i) == "contigs.fasta"
]
assemblies_target_dir = "/data1/bio/projects/inicolaeva/klebsiella_infants/assemblies"

sample_dirs = sorted(
    set([os.path.dirname(os.path.dirname(i)) for i in assembly_files]))

_ = subprocess.getoutput("rm -rf {}".format(assemblies_target_dir))
os.makedirs(assemblies_target_dir, exist_ok=True)

assemblies_annotations = []
for sample_dir in sample_dirs:
    sample_name = os.path.basename(sample_dir)
    sample_number = Utilities.safe_findall("([0-9]+)", sample_name)
    sample_assemblies = [i for i in assembly_files if i.startswith(sample_dir)]
Example No. 10
def merge(self):
    for fasta_type, fasta_file in zip(self._fasta_types, [self.nfasta, self.pfasta]):
        source = os.path.join(self.reference_dir, fasta_type)
        Utilities.concatenate_files(*Utilities.scan_whole_dir(source), target_file=fasta_file)
    self.index_dir = self.describer.get_index_guide(self.nfasta)
    print("Merge completed")
Example No. 11
    # Tail of the first decontamination pass (the loop header is truncated);
    # the same pattern appears in full in the second pass below
    decontaminated_assembly = os.path.join(
        decontaminated_assemblies_dir, os.path.basename(contaminated_assembly))
    if not os.path.isfile(contamination_report):
        copy2(contaminated_assembly, decontaminated_assembly)
        continue
    remover = ContaminationRemover(contaminated_assembly, contamination_report)
    remover.export(decontaminated_assembly)

contamination_reports_dir2 = os.path.join(upload_dir, "contaminations2")
os.makedirs(contamination_reports_dir2, exist_ok=True)

decontaminated_assemblies_dir2 = os.path.join(upload_dir, "decontaminated2")
os.makedirs(decontaminated_assemblies_dir2, exist_ok=True)

for contaminated_assembly in [
        i for i in Utilities.scan_whole_dir(decontaminated_assemblies_dir)
        if i.endswith(".fna")
]:
    contaminated_basename = os.path.splitext(
        os.path.basename(contaminated_assembly))[0]
    contamination_report = os.path.join(
        contamination_reports_dir2,
        "Contamination_{}.txt".format(contaminated_basename))
    decontaminated_assembly = os.path.join(
        decontaminated_assemblies_dir2,
        os.path.basename(contaminated_assembly))
    if not os.path.isfile(contamination_report):
        continue
    remover = ContaminationRemover(contaminated_assembly, contamination_report)
    remover.export(decontaminated_assembly)
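`ContaminationRemover` is a project class; a rough sketch of the assumed behavior (keep only contigs whose FASTA IDs are not named in the NCBI contamination report), not the project's actual implementation:

from Bio import SeqIO


def remove_contaminated_contigs(assembly_fasta, contamination_report, out_fasta):
    # Assumption: the report mentions every contaminated contig by its FASTA ID
    with open(contamination_report) as f:
        report_text = f.read()
    kept_records = [i for i in SeqIO.parse(assembly_fasta, "fasta")
                    if i.id not in report_text]
    SeqIO.write(kept_records, out_fasta, "fasta")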