@staticmethod
def join_by_value_columns(tables: list, index_col_name: str, value_col_name_: str):
    # Load each TSV, index it by the shared reference column, and keep only the value
    # column, renamed after its source file so the wide table stays traceable.
    # NB: the @staticmethod decorator is assumed here because join_and_annotate below
    # calls this via `self` without passing an instance argument
    dfs_list = [
        Utilities.load_tsv(i).set_index(index_col_name)[value_col_name_].rename(i)
        for i in tables
    ]
    out = pd.concat(dfs_list, axis=1, sort=False).sort_index()
    out.index.names = [index_col_name]
    return out
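# A minimal usage sketch of join_by_value_columns, assuming every listed TSV shares a
# "reference" index column and carries an "rpkm" value column; the file names below are
# placeholders, and the call is written as a plain function for brevity
coverage_tables = ["sample_A.coverage.tsv", "sample_B.coverage.tsv"]
wide_df = join_by_value_columns(coverage_tables, "reference", "rpkm")
# The result is one wide table: rows are references, one column per input file
print(wide_df.head())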
def join_and_annotate(self):
    annotation_df = Utilities.load_tsv(self.annotation_file).set_index(REFERENCE_COL_NAME)
    values_df = self.join_by_value_columns(self.coverage_files, REFERENCE_COL_NAME,
                                           self.value_col_name)
    out = pd.concat([annotation_df, values_df], axis=1, sort=False)
    out.index.names = [REFERENCE_COL_NAME]
    return out
def annotate(self):
    self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    # Zero-fill width is the length of the longest VFDB ID; compare by length
    # explicitly instead of taking len() of the lexicographically largest string
    zf_len = max(len(i) for i in self._processed_nfasta_df["vfdb_id"].values.tolist())
    # Join the table assembled from pFASTA headers
    raw_pfasta_headers = []
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        for _line in _f:
            if _line.startswith(">"):
                raw_pfasta_headers.append(re.sub("^>", "", _line).strip())
    raw_pfasta_headers = sorted(set(i for i in raw_pfasta_headers if len(i) > 0))
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_pfasta_header, raw_pfasta_headers)
    ]
    self._processed_pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("protein_header")
    self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df[
        "vfdb_id"].str.zfill(zf_len)
    # Join the provided table. Note: the table file is expected in the same directory
    # as the merged protein FASTA file
    vfs_table_file = os.path.join(os.path.dirname(self._raw_pfasta_file), "VFs.xls")
    vfs_df = pd.read_excel(vfs_table_file, sheet_name="VFs", header=1).fillna("")
    vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract(r"VF(\d+)")[0].str.zfill(zf_len)
    self.merged_df = pd.concat([
        i.set_index("vfdb_id").sort_index()
        for i in [self._processed_nfasta_df, self._processed_pfasta_df, vfs_df]
    ], axis=1, sort=False).sort_index()
    self.merged_df.index.names = ["vfdb_id"]
    self.merged_df = self.merged_df.loc[
        self.merged_df["former_id"].str.len() > 0].reset_index()
    self.merged_df = Utilities.left_merge(self._raw_nfasta_df, self.merged_df, "former_id")
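# Why the zfill calls above matter: the three tables are joined on string IDs, so "42"
# and "042" would land in different rows. A tiny self-contained illustration:
import pandas as pd

left = pd.Series(["a"], index=["042"], name="left")
right = pd.Series(["b"], index=["42"], name="right")
# Without padding, the outer join yields two rows instead of one
print(pd.concat([left, right], axis=1, sort=False))
# After padding both indices to the same width, the rows align
right.index = right.index.str.zfill(3)
print(pd.concat([left, right], axis=1, sort=False))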
# NB: the enclosing function's def line is not part of this excerpt; the name and
# signature below are assumed for readability
def prettify_column_names(df: pd.DataFrame, capitalize: bool = False):
    _df = df.copy()
    if capitalize:
        _df.rename(columns={i: i[0].upper() + i[1:] for i in _df.columns}, inplace=True)
    return _df.rename(columns={i: i.replace("_", " ") for i in _df.columns})


data_dir = "./inicolaeva/klebsiella_infants/datasets"
article_dir = os.path.join(ProjectDescriber.DATA_DIGEST_DIR, "article")

INDEX_COL_NAME = "sample_name"
antibacterial_agents = ["klebsiella_phage", "pyo_bacteriophage"]

initial_sample_data_df = Utilities.load_tsv(
    os.path.join(data_dir, "initial_sample_data.tsv")
).loc[:, [INDEX_COL_NAME] + [
    "sample_number", "delivery", "patient_id", "checkpoint_age_days",
    "checkpoint_kpneumoniae_lg_cfu_per_g", "extended-spectrum_beta-lactamases"
] + antibacterial_agents].set_index(INDEX_COL_NAME).sort_index()

# Encode categorical fields compactly for the article table
initial_sample_data_df["delivery"].replace({"vaginal": "V", "caesarean": "C"},
                                           inplace=True)
initial_sample_data_df["extended-spectrum_beta-lactamases"].replace(
    {True: "+", False: "-"}, inplace=True)
for antibacterial_agent in antibacterial_agents:
    initial_sample_data_df[antibacterial_agent].replace(
        {
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#%%
import os
import pandas as pd
from meta.scripts.Utilities import Utilities

#%%
sra_dir = "/data1/bio/projects/vradchenko/lactobacillus_salivarius/sra"
sra_df = Utilities.load_tsv(os.path.join(sra_dir, "sra.tsv"))
queue = [{
    "func": Utilities.count_reads_statistics,
    "kwargs": {"reads_file": i, "type_": "fastq_gz"}
} for i in Utilities.scan_whole_dir(os.path.join(sra_dir, "reads"))]
raw_reads_base_stats = Utilities.multi_core_queue(Utilities.wrapper, queue, async_=True)

#%%
raw_reads_base_stat_df = pd.DataFrame(raw_reads_base_stats)
raw_reads_base_stat_df["reads_file"] = raw_reads_base_stat_df["reads_file"].apply(
    os.path.basename)
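# Each queue item above pairs a callable with its kwargs; Utilities.wrapper is assumed
# to be a thin adapter in the spirit of this sketch, letting the worker pool map over
# plain dicts
def wrapper_sketch(task: dict):
    # Call the queued function with its stored keyword arguments
    return task["func"](**task["kwargs"])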
for sra_regular_col_name, sra_regular_value in zip([
    "library_strategy", "library_source", "library_selection", "library_layout",
    "platform", "instrument_model", "filetype"
], [
    "WGS", "GENOMIC", "RANDOM", "paired", "ILLUMINA", "Illumina MiSeq", "fastq"
]):
    sra_df[sra_regular_col_name] = sra_regular_value

sra_df["design_description"] = raw_sampledata_df["R1"].apply(
    lambda x: os.path.dirname(x).split("_")[-1])
sra_df["library_ID"] = sra_df["sample_name"]
sra_df.set_index("sample_name", inplace=True)

submission_report_df = Utilities.load_tsv(
    "https://raw.githubusercontent.com/ivasilyev/curated_projects/master/vradchenko/lactobacillus_salivarius/data/tables/ncbi/submission_report.tsv"
).set_index("sample_name")
sra_df = pd.concat(
    [sra_df, submission_report_df.loc[:, ["BioSample", "BioProject"]]],
    axis=1, sort=False)
sra_df.rename(columns={
    "BioSample": "biosample_accession",
    "BioProject": "bioproject_accession"
}, inplace=True)

biosample_attributes_df = Utilities.load_tsv(
    "https://raw.githubusercontent.com/ivasilyev/curated_projects/master/vradchenko/lactobacillus_salivarius/data/tables/ncbi/biosample_attributes_microbe.tsv"
).set_index("*sample_name")
sra_df = pd.concat([
# NB: the enclosing function's def line is not part of this excerpt; the name and
# signature below are assumed for readability
def run_spades(sample_name: str, sample_file_1: str, sample_file_2: str, out_dir: str):
    subprocess.getoutput("rm -rf {}".format(out_dir))
    os.makedirs(out_dir)
    cmd = "spades.py --careful -o {out} -1 {i1} -2 {i2}".format(
        out=out_dir, i1=sample_file_1, i2=sample_file_2)
    log = subprocess.getoutput(cmd)
    log_file = os.path.join(out_dir, "{}_spades.log".format(sample_name))
    Utilities.dump_string(log, file=log_file)
    return {
        "sample_name": sample_name,
        "assembly": os.path.join(out_dir, "contigs.fasta")
    }


projectDescriber = ProjectDescriber()
rawSampledataDF = Utilities.load_tsv(
    "/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata")
# Prepare paths
rawReadsDir = os.path.join(projectDescriber.RAW_DATA_DIR, "reads")
cutadaptDir = os.path.join(rawReadsDir, "cutadapt")
os.makedirs(cutadaptDir, exist_ok=True)
# Trim reads
cutadaptResults = Utilities.multi_core_queue(
    run_cutadapt, queue=rawSampledataDF.values.tolist())
cutadaptResultsDF = pd.DataFrame.from_dict(cutadaptResults).sort_values("sample_name")
Utilities.dump_tsv(
    cutadaptResultsDF,
    table_file=projectDescriber.SAMPLE_DATA_FILE,
    col_names=["sample_name", "trimmed_file_1", "trimmed_file_2"])
# Assemble reads
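# A sketch of a safer variant of the shell calls inside the assembly worker:
# subprocess.run with an argument list avoids shell quoting issues in file paths and
# captures the log explicitly. The function name and signature are illustrative, not
# the project's.
import subprocess

def run_spades_checked(out_dir: str, reads_1: str, reads_2: str) -> str:
    result = subprocess.run(
        ["spades.py", "--careful", "-o", out_dir, "-1", reads_1, "-2", reads_2],
        capture_output=True, text=True)
    # SPAdes also writes its own log into out_dir; stdout/stderr are returned here
    return result.stdout + result.stderr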
# NB: this excerpt starts inside a per-record loop; the first line below is the tail of
# a string-formatting call whose opening is above the excerpt
        a=idx + 1, b=ORGANISM, c=ISOLATE_PREFIX, d=sample_number)
    if seq_record_processed.description.endswith(" PLASMID"):
        plasmid_counter += 1
        seq_record_processed.description = "[plasmid-name=unnamed{0:02d}]".format(
            plasmid_counter)
    else:
        seq_record_processed.description = ""
    assemblies_annotations.append(assemblies_annotation)
    # SeqIO.write(seq_records_processed, assembly_target_file, "fasta")

INDEX_COL_NAME = "sample_name"
assemblies_statistics_df = pd.DataFrame(assemblies_annotations).set_index(INDEX_COL_NAME)
reads_statistics_file = "/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/reads_statistics.tsv"
reads_statistics_df = Utilities.load_tsv(reads_statistics_file).set_index(INDEX_COL_NAME)
combined_statistics_df = pd.concat(
    [reads_statistics_df, assemblies_statistics_df], axis=1, sort=False)
combined_statistics_df.index.names = [INDEX_COL_NAME]
numeric_col_names = [
    i for i in combined_statistics_df.columns
    if any(j in i for j in ("_assembly_contigs_", "_assembly_bp_"))
]
combined_statistics_df.fillna(0, inplace=True)
combined_statistics_df = combined_statistics_df.astype({i: int for i in numeric_col_names})
# From the NCBI template ('Template_GenomeBatch.11700383121d.xlsx'):
# The estimated base coverage across the genome, e.g. 12x.
# This can be calculated by dividing the number of bases sequenced by the expected
# genome size and multiplying that by the percentage of bases that were placed in the
# final assembly.
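# A worked sketch of the NCBI coverage formula quoted above; the argument names are
# generic because the actual column names in combined_statistics_df are project-specific
def estimate_genome_coverage(sequenced_bp: int, expected_genome_size_bp: int,
                             assembled_bp: int) -> float:
    # (bases sequenced / expected genome size) * fraction of bases placed in the assembly
    placed_fraction = assembled_bp / sequenced_bp
    return sequenced_bp / expected_genome_size_bp * placed_fraction

# E.g. 600 Mbp sequenced against a ~5.5 Mbp genome with 550 Mbp placed in the assembly:
# (600e6 / 5.5e6) * (550e6 / 600e6) = 100.0, reported as "100x"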
def count_fasta_statistics(fasta_file: str, sample_name: str = None):
    from Bio import SeqIO
    with open(fasta_file, mode="r", encoding="utf-8") as f:
        seq_records = list(SeqIO.parse(f, "fasta"))
    out = dict(fasta_file=fasta_file,
               fasta_sequences_number=len(seq_records),
               fasta_total_bp=sum(len(i) for i in seq_records))
    if sample_name:
        out["sample_name"] = sample_name
    return out


# Process assemblies
blasted_data_df = Utilities.load_tsv(
    os.path.join(ProjectDescriber.SAMPLE_DATA_DIR, "BLASTed.sampledata"))
blasted_data_df["organism"] = blasted_data_df["strain"].apply(
    lambda x: " ".join(x.split(" ")[:2]))
# Prefix reference-related columns so they do not clash with assembly columns later
blasted_data_df.rename(columns={
    i: "reference_{}".format(i) for i in blasted_data_df.columns
    if all(j not in i for j in ["assembly", "reference", "sample"])
}, inplace=True)
assembly_files = blasted_data_df["assembly_file"].values.tolist()
assembly_stats_df = pd.DataFrame(
    Utilities.multi_core_queue(Utilities.count_assembly_statistics, assembly_files))
assembly_stats_df.rename(
TOOL_VERSIONS = dict(
    fastqc_version="quay.io/biocontainers/fastqc:0.11.8--1",
    trimmomatic_version="quay.io/biocontainers/trimmomatic:0.39--1",
    cutadapt_version="quay.io/biocontainers/cutadapt:2.5--py37h516909a_0",
    bowtie2_version="quay.io/biocontainers/bowtie2:2.3.5--py37he860b03_0",
    spades_version="quay.io/biocontainers/spades:3.9.1--0")

templates_dir = os.path.join(ProjectDescriber.ROOT_DIR, "reports", "1")
template = jinja2.Template(
    Utilities.load_string(os.path.join(templates_dir, "template.txt")))
for sample_name in SAMPLE_NAMES:
    # sample_name = SAMPLE_NAMES[0]
    combined_assembly_statistics_df = Utilities.load_tsv(
        os.path.join(".", ProjectDescriber.OWNER, ProjectDescriber.NAME, "data",
                     "tables", "combined_assembly_statistics.tsv"))
    submission_report_df = Utilities.load_tsv(
        os.path.join(".", ProjectDescriber.OWNER, ProjectDescriber.NAME, "data",
                     "tables", "ncbi", "submission_report.tsv"))
    submission_combined_df = pd.concat([
        i.set_index(INDEX_COL_NAME)
        for i in (combined_assembly_statistics_df, submission_report_df)
    ], axis=1, sort=False)
    submission_combined_df.index.names = [INDEX_COL_NAME]
    rendering_dict = submission_combined_df.loc[sample_name, :].to_dict()
    rendering_dict.update(TOOL_VERSIONS)
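    # The loop above only builds rendering_dict for each sample; the render-and-save
    # step it presumably feeds would continue like this sketch (the output file name
    # is an assumption)
    report_text = template.render(rendering_dict)
    Utilities.dump_string(
        report_text,
        file=os.path.join(templates_dir, "{}_report.txt".format(sample_name)))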