def merge(self):
    Utilities.concatenate_files(
        *Utilities.scan_whole_dir(os.path.join(self.reference_dir, "nucleotide")),
        target_file=self.nfasta)
    Utilities.concatenate_files(
        *Utilities.scan_whole_dir(os.path.join(self.reference_dir, "protein")),
        target_file=self.pfasta)
    self.describer.get_index_guide(self.nfasta)
    print("# Merge completed. \n# Protein FASTA to annotate: '{}'\n".format(self.pfasta))
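# The project-specific helpers used throughout these snippets are not shown
# here. Below is a minimal sketch of their assumed semantics: hypothetical
# stand-ins, not the actual meta.scripts.Utilities implementations.

import os


def scan_whole_dir(dir_name):
    """Recursively collect sorted absolute paths of all files under a directory."""
    collected = []
    for root, _, files in os.walk(dir_name):
        collected.extend(os.path.join(os.path.abspath(root), f) for f in files)
    return sorted(collected)


def concatenate_files(*source_files, target_file):
    """Dump the contents of every source file into a single target file."""
    with open(target_file, "w", encoding="utf-8") as target:
        for source_file in source_files:
            with open(source_file, "r", encoding="utf-8") as source:
                target.write(source.read())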
def __init__(self, reference_describer_instance, value_col_name):
    self.describer = reference_describer_instance
    self.value_col_name = value_col_name
    self.coverage_files = [
        i for i in Utilities.scan_whole_dir(ProjectDescriber.MAPPED_DATA_DIR)
        if all(j in i for j in [self.describer.ALIAS, "coverage.tsv"])
    ]
    self.annotation_file = self.describer.get_refdata_dict().get("sequence_1").annotation_file
    self.raw_annotated_pivot = self.join_and_annotate()
def parse_spades_version(sample_name_):
    log_file = [
        i for i in Utilities.scan_whole_dir(
            "/data1/bio/projects/vradchenko/lactobacillus_salivarius/pga-pe/log")
        if i.endswith(".log") and all(j in i for j in ["spades", sample_name_])
    ][0]
    log_lines = Utilities.load_list(log_file)
    image_version_line = [
        i for i in log_lines
        if i.strip().startswith("Status: Image is up to date for ")
    ][0].strip()
    spades_version = re.split(r"[\t ]+", image_version_line)[-1]
    return spades_version
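# A hedged usage sketch for the function above; "sample_01" is a hypothetical
# sample name. The returned value is the last whitespace-separated token of
# the matched docker status line, i.e. the pulled SPAdes image reference.
spades_version = parse_spades_version("sample_01")
print(spades_version)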
    strain = " ".join(organism + [qualifiers_dict.get("strain")[0]])
    taxonomy_id = Utilities.safe_findall(r"\d+", [
        i for i in qualifiers_dict.get("db_xref")
        if i.split(":")[0].strip() == "taxon"
    ][0])
    return dict(assembly_file=d.get("assembly_file"), strain=strain,
                taxonomy_id=taxonomy_id,
                reference_accession_id=genbank_record.id,
                cds_number=cds_number,
                reference_bp=len(genbank_record),
                reference_description=genbank_record.description)


assemblies = [
    i for i in Utilities.scan_whole_dir(
        os.path.join(ProjectDescriber.ROOT_DIR, "pga-pe", "06_plasmid_merger"))
    if i.endswith(".fna") and os.path.getsize(i) > 0
]

# Browse properties of the largest contig for each assembly
props = {
    i: sorted(SeqIO.parse(i, "fasta"), key=len, reverse=True)[0].format("fasta")
    for i in assemblies
}
props_stats = {
    k: {"length": len(props.get(k)), "head": props.get(k)[:50]}
    for k in props
}
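# The resulting per-assembly entry looks like (illustrative values only):
# props_stats["/path/to/06_plasmid_merger/sample.fna"] == {
#     "length": 54321,  # size of the largest contig's FASTA string
#     "head": ">NODE_1_length_54210_cov_25.1\nATGC",  # its first 50 characters
# }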
import os

import pandas as pd

from meta.scripts.Utilities import Utilities

#%%

sra_dir = "/data1/bio/projects/vradchenko/lactobacillus_salivarius/sra"
sra_df = Utilities.load_tsv(os.path.join(sra_dir, "sra.tsv"))
queue = [{
    "func": Utilities.count_reads_statistics,
    "kwargs": {"reads_file": i, "type_": "fastq_gz"}
} for i in Utilities.scan_whole_dir(os.path.join(sra_dir, "reads"))]
raw_reads_base_stats = Utilities.multi_core_queue(Utilities.wrapper, queue, async_=True)

#%%

raw_reads_base_stat_df = pd.DataFrame(raw_reads_base_stats)
raw_reads_base_stat_df["reads_file"] = raw_reads_base_stat_df["reads_file"].apply(os.path.basename)
raw_reads_base_stat_df["sample_name"] = raw_reads_base_stat_df["reads_file"].str.extract(r"(.+)\[")
Utilities.dump_tsv(raw_reads_base_stat_df,
                   os.path.join(sra_dir, "raw_reads_base_stats.tsv"))
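# Assumed semantics of the Utilities.wrapper + Utilities.multi_core_queue
# pair, inferred from the queue item layout above ("func" + "kwargs" keys).
# A minimal multiprocessing-based stand-in, not the project implementation
# (it also ignores the async_ flag):

import multiprocessing


def wrapper(queue_item):
    """Unpack one queue item and call its function with the stored kwargs."""
    return queue_item["func"](**queue_item["kwargs"])


def multi_core_queue(func, queue):
    """Fan the queued calls out over all available CPU cores."""
    # On spawn-based platforms this must run under `if __name__ == "__main__"`
    with multiprocessing.Pool() as pool:
        return pool.map(func, queue)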
plt.close()
plt.clf()

# Map TADB data
"""
export IMG=ivasilyev/bwt_filtering_pipeline_worker:latest && \
docker pull $IMG && \
docker run --rm -v /data:/data -v /data1:/data1 -v /data2:/data2 -it $IMG \
python3 /home/docker/scripts/nBee.py \
-i /data1/bio/projects/inicolaeva/klebsiella_infants/trimmed.sampledata \
-r /data/reference/TADB/tadb_v2.0/index/tadb_v2.0_refdata.json \
-o /data1/bio/projects/inicolaeva/klebsiella_infants/map_data
"""

tadb_coverage_files = [
    i for i in Utilities.scan_whole_dir(MAPPED_STAT_DIR)
    if all(j in i for j in ["tadb", "coverage.tsv"])
]
for value_col_name in VALUE_COL_NAMES:
    tadb_annotated_df = join_and_annotate(
        tadb_coverage_files, value_col_name,
        "/data/reference/TADB/tadb_v2.0/index/tadb_v2.0_annotation.tsv")
    tadb_raw_dir = os.path.join(RAW_DIR, "tadb")
    os.makedirs(tadb_raw_dir, exist_ok=True)
    tadb_annotated_df.reset_index().to_csv(
        os.path.join(tadb_raw_dir,
                     "tadb_annotated_pivot_by_{}.tsv".format(value_col_name)),
        sep="\t", index=False, header=True)

genera_names_dict = generate_keywords_dict(
"The reference sequence was derived from [^\.]+\.", "https:[^ ]+", "The annotation was added by the NCBI Prokaryotic Genome Annotation Pipeline \(PGAP\)\.", "Information about PGAP can be found here:", "Annotation was added by the NCBI Prokaryotic Genome Annotation Pipeline \(released 2013\)\.", "Information about the Pipeline can be found here:", "Annotation Pipeline \(PGAP\) set[;]{0,1}", "Annotation Pipeline set[;]{0,1}", "GeneMark[^ ]+ repeat_region[;]{0,1}", "COMPLETENESS: full length\.") for regex in _RGX: s = re.sub(regex, "", s) return flatten_string(s) genbank_dir = "/data1/bio/projects/inicolaeva/klebsiella_infants/ncbi-dl/gbff" genbank_files = [ i for i in Utilities.scan_whole_dir(genbank_dir) if i.endswith(".gbff") ] tree = Phylo.read( "/data1/bio/projects/inicolaeva/klebsiella_infants/roary/newick/iTOL_collapsed_tree.newick", "newick") node_names = [ j for j in [i.name for i in tree.find_clades()] if j is not None and j.startswith("GCF") ] annotations_list = [] for node_name in node_names: # node_name = "GCF_005377825.1_ASM537782v1" genbank_file = os.path.join(genbank_dir, "{}_genomic.gbff".format(node_name))
docker run --rm -v /data:/data -v /data1:/data1 -v /data2:/data2 --net=host -it ${IMG} bash
git pull
LC_ALL=C python3
"""

import os

import pandas as pd

from shutil import copy2

from meta.scripts.Utilities import Utilities
from vradchenko.lactobacillus_salivarius.ProjectDescriber import ProjectDescriber

# Get the raw reads files
raw_reads_files_dir = ProjectDescriber.RAW_DATA_DIR
raw_reads_files_list = [
    i for i in Utilities.scan_whole_dir(raw_reads_files_dir)
    if i.endswith("_001.fastq.gz")
]

# Split them into the two groups
STRANDS = ("R1", "R2")
raw_reads_list = []
for raw_reads_files_pair in Utilities.get_most_similar_word_pairs(raw_reads_files_list):
    # Illumina file names follow the template
    # '[sample]_[sequence]_[lane]_[strand]_[number].fastq.gz',
    # e.g. '336g_S1_L001_R1_001.fastq.gz'
    sample_name = Utilities.safe_findall(
        r"(.+)_S[0-9]+_L[0-9]+_R[0-9]+_[0-9]+",
        os.path.basename(raw_reads_files_pair[0]))
    raw_reads_dict = dict(sample_name=sample_name)
    for raw_reads_file in raw_reads_files_pair:
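# Quick standalone check of the Illumina filename pattern used above; the
# example filename comes from the comment in the snippet:
import re

example = "336g_S1_L001_R1_001.fastq.gz"
print(re.findall(r"(.+)_S[0-9]+_L[0-9]+_R[0-9]+_[0-9]+", example))  # ['336g']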
import os
import subprocess

import pandas as pd

from Bio import SeqIO
from copy import deepcopy
from shutil import copy2

from meta.scripts.Utilities import Utilities
from meta.scripts.ncbi_contamination_remover import ContaminationRemover
from inicolaeva.klebsiella_infants.ProjectDescriber import ProjectDescriber

ASSEMBLY_TYPES = ("genome", "plasmid")
ORGANISM = "Klebsiella pneumoniae"
ISOLATE_PREFIX = "KZN_INI_KINF"

assembler_result_dir = "/data1/bio/projects/inicolaeva/klebsiella_infants/pipeline/05_spades"
assembly_files = [
    i for i in Utilities.scan_whole_dir(assembler_result_dir)
    if os.path.basename(i) == "contigs.fasta"
]
assemblies_target_dir = "/data1/bio/projects/inicolaeva/klebsiella_infants/assemblies"
sample_dirs = sorted(set(os.path.dirname(os.path.dirname(i)) for i in assembly_files))
_ = subprocess.getoutput("rm -rf {}".format(assemblies_target_dir))
os.makedirs(assemblies_target_dir, exist_ok=True)

assemblies_annotations = []
for sample_dir in sample_dirs:
    sample_name = os.path.basename(sample_dir)
    sample_number = Utilities.safe_findall("([0-9]+)", sample_name)
    sample_assemblies = [i for i in assembly_files if i.startswith(sample_dir)]
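# Assumed behaviour of Utilities.safe_findall, inferred from its usages in
# these snippets: return the first match of the pattern, or an empty string
# when nothing matches. A hypothetical stand-in, not the project code:
import re


def safe_findall(pattern, string):
    found = re.findall(pattern, string)
    return found[0] if found else ""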
# Loop-based variant of the merge() method shown in the first snippet
def merge(self):
    for fasta_type, fasta_file in zip(self._fasta_types, [self.nfasta, self.pfasta]):
        source = os.path.join(self.reference_dir, fasta_type)
        Utilities.concatenate_files(*Utilities.scan_whole_dir(source),
                                    target_file=fasta_file)
    self.index_dir = self.describer.get_index_guide(self.nfasta)
    print("Merge completed")
    decontaminated_assembly = os.path.join(decontaminated_assemblies_dir,
                                           os.path.basename(contaminated_assembly))
    if not os.path.isfile(contamination_report):
        copy2(contaminated_assembly, decontaminated_assembly)
        continue
    remover = ContaminationRemover(contaminated_assembly, contamination_report)
    remover.export(decontaminated_assembly)

# Second decontamination pass
contamination_reports_dir2 = os.path.join(upload_dir, "contaminations2")
os.makedirs(contamination_reports_dir2, exist_ok=True)
decontaminated_assemblies_dir2 = os.path.join(upload_dir, "decontaminated2")
os.makedirs(decontaminated_assemblies_dir2, exist_ok=True)
for contaminated_assembly in [
    i for i in Utilities.scan_whole_dir(decontaminated_assemblies_dir)
    if i.endswith(".fna")
]:
    contaminated_basename = os.path.splitext(os.path.basename(contaminated_assembly))[0]
    contamination_report = os.path.join(
        contamination_reports_dir2,
        "Contamination_{}.txt".format(contaminated_basename))
    decontaminated_assembly = os.path.join(decontaminated_assemblies_dir2,
                                           os.path.basename(contaminated_assembly))
    if not os.path.isfile(contamination_report):
        continue
    remover = ContaminationRemover(contaminated_assembly, contamination_report)
    remover.export(decontaminated_assembly)