def parse_fastq(self, output_file: str):
    """Parse `self.raw_fastqs_list` chunk-wise and append results to `output_file`.

    Any pre-existing `output_file` is deleted first, then each chunk of
    `self.CHUNK_SIZE` items is parsed in parallel via
    `Utilities.multi_core_queue(self.mp_parse_fastq_line, ...)` and the
    joined lines are appended.

    :param output_file: path of the text file to (re)create.
    """
    import os
    import math
    if os.path.isfile(output_file):
        os.remove(output_file)
        print("Deleted file in order to replace it with new data: '{}'".format(output_file))
    total = len(self.raw_fastqs_list)
    chunks_number = math.ceil(total / self.CHUNK_SIZE)
    counter = 0
    # One loop over chunk starts replaces the original duplicated
    # while-loop + trailing-remainder branch. It also fixes the empty-input
    # case: the original tail branch (`first_position <= len(...)`) still
    # ran for an empty list and appended a spurious blank line.
    for first_position in range(0, total, self.CHUNK_SIZE):
        chunk = self.raw_fastqs_list[first_position:first_position + self.CHUNK_SIZE]
        with open(output_file, mode="a", encoding="utf-8") as f:
            f.write("{}\n".format("\n".join(
                Utilities.multi_core_queue(self.mp_parse_fastq_line, chunk))))
        # Preserve the original log messages: numbered for intermediate
        # chunks, a distinct message for the final one.
        if first_position + self.CHUNK_SIZE >= total:
            print("Passed FASTQ parse last iteration")
        else:
            counter += 1
            print("Passed FASTQ parse iteration: {} (of {})".format(counter, chunks_number))
    print("Finished parse FASTQ items: {}".format(total))
def annotate(self):
    """Build `self.merged_df`: VFDB annotations merged from nFASTA headers,
    pFASTA headers and the provided 'VFs.xls' table, keyed by zero-padded
    `vfdb_id` and finally left-joined back onto the raw nFASTA table.
    """
    self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    # Zero-padding width must be the length of the LONGEST id.
    # The original `len(max(ids))` returned the length of the
    # *lexicographically* greatest string, which is wrong whenever the id
    # lengths vary (e.g. max(["123", "45"]) == "45").
    zf_len = max(len(i) for i in self._processed_nfasta_df["vfdb_id"].values.tolist())
    # Join table assembled from pFASTA headers
    raw_pfasta_headers = []
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        for _line in _f:
            if _line.startswith(">"):
                raw_pfasta_headers.append(re.sub("^>", "", _line).strip())
    # (redundant `_f.close()` removed -- the `with` block closes the file)
    raw_pfasta_headers = sorted(set(i for i in raw_pfasta_headers if len(i) > 0))
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_pfasta_header, raw_pfasta_headers)
    ]
    self._processed_pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("protein_header")
    self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df[
        "vfdb_id"].str.zfill(zf_len)
    # Join provided table. Note the table file placed into the same dir with the merged protein FASTA file
    vfs_table_file = os.path.join(os.path.dirname(self._raw_pfasta_file), "VFs.xls")
    vfs_df = pd.read_excel(vfs_table_file, sheet_name="VFs", header=1).fillna("")
    # Raw string avoids the invalid "\d" escape-sequence warning
    vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract(r"VF(\d+)")[0].str.zfill(zf_len)
    self.merged_df = pd.concat([
        i.set_index("vfdb_id").sort_index()
        for i in [self._processed_nfasta_df, self._processed_pfasta_df, vfs_df]
    ], axis=1, sort=False).sort_index()
    self.merged_df.index.names = ["vfdb_id"]
    self.merged_df = self.merged_df.loc[
        self.merged_df["former_id"].str.len() > 0].reset_index()
    self.merged_df = Utilities.left_merge(self._raw_nfasta_df, self.merged_df, "former_id")
def annotate(self):
    """Build `self.merged_df` by merging parsed nucleotide- and
    protein-FASTA header tables, then collapsing duplicate reference rows.
    """
    # Process nucleotide FASTA
    self._raw_nfasta_df = pd.read_table(self.annotation_file, sep="\t", header=0)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df,
                                          self._processed_nfasta_df, "former_id")
    # Process protein FASTA
    # Fix: the original iterated a bare `open(...)` inside a comprehension,
    # leaking the file handle; `with` guarantees closure.
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as f:
        raw_pfasta_headers = sorted(set(
            j for j in (re.sub("^>", "", i).strip() for i in f if i.startswith(">"))
            if len(j) > 0))
    # NOTE(review): protein headers are parsed with the *nucleotide* header
    # parser and the resulting columns renamed below -- confirm intended.
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_pfasta_headers)
    ]
    self.pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("former_id")
    self.pfasta_df.rename(columns={"geninfo_id": "protein_geninfo_id",
                                   "refseq_id": "genpept_id",
                                   "description": "protein_description",
                                   "host": "protein_host"}, inplace=True)
    self.merged_df = Utilities.left_merge(self.nfasta_df, self.pfasta_df,
                                          "tadb_id", "category", "gene_symbol")
    self.merged_df = Utilities.combine_duplicate_rows(self.merged_df, "reference_id")
def annotate(self):
    """Build `self.nfasta_df`: parsed nFASTA headers progressively joined
    with the CARD reference tables ('aro_index.tsv',
    'aro_categories_index.tsv', 'aro.tsv'), then de-duplicated by
    'reference_id'.
    """
    self.annotation_file = self.describer.get_refdata_dict().get(
        "sequence_1").annotation_file
    self._raw_nfasta_df = pd.read_table(self.annotation_file, sep='\t', header=0)
    mp_result = Utilities.multi_core_queue(
        self._mp_parse_nfasta_header,
        self._raw_nfasta_df["former_id"].values.tolist())
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        mp_result).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df,
                                          self._processed_nfasta_df, "former_id")
    # Join 'aro_index.tsv'
    aro_index_df = pd.read_table(os.path.join(self.reference_dir, "data",
                                              "aro_index.tsv"),
                                 sep='\t', header=0)
    # Raw string avoids the invalid "\d" escape-sequence warning
    aro_index_df["aro_id"] = aro_index_df["ARO Accession"].str.extract(
        r"ARO:(\d+)")
    # 'aro_index.tsv' has more entries than 'nucleotide_fasta_protein_homolog_model.fasta' provides
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_index_df, "aro_id")
    # Join 'aro_categories_index.tsv'
    aro_categories_index_df = pd.read_table(os.path.join(
        self.reference_dir, "data", "aro_categories_index.tsv"),
        sep='\t', header=0)
    self.nfasta_df = Utilities.left_merge(self.nfasta_df,
                                          aro_categories_index_df,
                                          "Protein Accession")
    # Joining 'aro_categories.tsv' is useless: the resulting 'ARO Category' is filled by NaN
    # Join 'aro.tsv'
    aro_df = pd.read_table(os.path.join(self.reference_dir, "ontology", "aro.tsv"),
                           sep='\t', header=0)
    aro_df.rename(columns={
        "Accession": "ARO Accession",
        "Name": "ARO Name"
    }, inplace=True)
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_df, "ARO Accession")
    self.nfasta_df = Utilities.combine_duplicate_rows(
        self.nfasta_df, "reference_id")
# Keep only the largest contig (by sequence length) of each assembly.
# `max(..., key=len)` replaces the original
# `sorted(list(...), key=lambda x: len(x), reverse=True)[0]`:
# same first-maximal record, without sorting or materializing a list.
props = {
    i: max(SeqIO.parse(i, "fasta"), key=len).format("fasta")
    for i in assemblies
}
props_stats = {
    k: {
        "length": len(props[k]),
        "head": props[k][:50]
    } for k in props
}
# Create BLAST queries
blast_reports = Utilities.multi_core_queue(mp_get_and_blast_largest_contig, assemblies)
headers = Utilities.single_core_queue(process_blast_report, blast_reports)
# Create GenBank queries
genbank_reports = Utilities.multi_core_queue(mp_download_reference_genbank, headers)
reference_df = pd.DataFrame(
    Utilities.single_core_queue(process_genbank_report, genbank_reports))
# Sample name = assembly file basename without extension and trailing "_<suffix>"
reference_df["sample_name"] = reference_df["assembly_file"].apply(
    lambda x: "_".join(
        os.path.splitext(os.path.basename(x))[0].split("_")[:-1]))
reference_df.sort_values("sample_name", inplace=True)
reference_table = os.path.join(ProjectDescriber.SAMPLE_DATA_DIR, "BLASTed.sampledata")
Utilities.dump_tsv(reference_df, reference_table)
import os

import pandas as pd

from meta.scripts.Utilities import Utilities

# Fix: the visible import block used `os.path` below without importing `os`.
#%%
sra_dir = "/data1/bio/projects/vradchenko/lactobacillus_salivarius/sra"
sra_df = Utilities.load_tsv(os.path.join(sra_dir, "sra.tsv"))
# One read-counting job per gzipped FASTQ file in the reads directory,
# dispatched asynchronously through the generic wrapper
queue = [{
    "func": Utilities.count_reads_statistics,
    "kwargs": {
        "reads_file": i,
        "type_": "fastq_gz"
    }
} for i in Utilities.scan_whole_dir(os.path.join(sra_dir, "reads"))]
raw_reads_base_stats = Utilities.multi_core_queue(Utilities.wrapper, queue, async_=True)
#%%
raw_reads_base_stat_df = pd.DataFrame(raw_reads_base_stats)
raw_reads_base_stat_df["reads_file"] = raw_reads_base_stat_df[
    "reads_file"].apply(os.path.basename)
# Sample name = basename part before the first '['
raw_reads_base_stat_df["sample_name"] = raw_reads_base_stat_df[
    "reads_file"].str.extract(r"(.+)\[")
Utilities.dump_tsv(raw_reads_base_stat_df,
                   os.path.join(sra_dir, "raw_reads_base_stats.tsv"))
# Duplicate each FASTQ pair under a fresh 6-digit sample id, then gzip all outputs.
for copy_number in [1, 2]:
    max_id += 1
    for fastq_archive in [r1_fastq_archive, r2_fastq_archive]:
        # Output name: same directory, 6-digit counter substituted with the
        # new max_id, '.gz' suffix dropped (output is written uncompressed).
        # Raw string r"\.gz$" avoids the invalid "\." escape warning.
        # NOTE(review): re.sub replaces EVERY 6-digit run in the basename,
        # not only the first one -- confirm names contain a single counter.
        output_file = "{}{}".format(
            Utilities.ends_with_slash("/".join(fastq_archive.split("/")[:-1])),
            re.sub(r"\.gz$", "",
                   re.sub("[0-9]{6}", str(max_id).zfill(6),
                          fastq_archive.split("/")[-1])))
        print("Loading file '{}'".format(fastq_archive))
        fq_array = FASTAArray(
            subprocess.getoutput("zcat {}".format(fastq_archive)))
        print("Loaded file '{}'".format(fastq_archive))
        fq_array.parse_fastq(output_file)
        # Free the decompressed payload before loading the next archive
        del fq_array
        output_files_list.append(output_file)
        print("Saved file '{}'".format(output_file))
        gc.collect()


def mp_gzip_file(file):
    """Compress `file` at maximum gzip level into `<file>.gz`, keeping the original."""
    print("Compressing file '{}'".format(file))
    print(subprocess.getoutput("gzip -9 -c {a} > {a}.gz".format(a=file)))
    print("Compressed file '{}'".format(file))


print("Compressing {} files".format(len(output_files_list)))
print(Utilities.multi_core_queue(mp_gzip_file, output_files_list))
return { "sample_name": sample_name, "assembly": os.path.join(out_dir, "contigs.fasta") } projectDescriber = ProjectDescriber() rawSampledataDF = Utilities.load_tsv( "/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata" ) # Prepare path rawReadsDir = os.path.join(projectDescriber.RAW_DATA_DIR, "reads") cutadaptDir = os.path.join(rawReadsDir, "cutadapt") os.makedirs(cutadaptDir, exist_ok=True) # Trim reads cutadaptResults = Utilities.multi_core_queue( run_cutadapt, queue=rawSampledataDF.values.tolist()) cutadaptResultsDF = pd.DataFrame.from_dict(cutadaptResults).sort_values( "sample_name") Utilities.dump_tsv( cutadaptResultsDF, table_file=projectDescriber.SAMPLE_DATA_FILE, col_names=["sample_name", "trimmed_file_1", "trimmed_file_2"]) # Assemble reads spadesDir = os.path.join(rawReadsDir, "spades") spadesResults = Utilities.single_core_queue(run_spades, cutadaptResultsDF.values.tolist()) spadesResultsDF = pd.DataFrame.from_dict(spadesResults).sort_values( "sample_name") spadesResultsSampleData = os.path.join( os.path.dirname(projectDescriber.SAMPLE_DATA_FILE), "assemblies.sampledata")
# Process assemblies
blasted_data_df = Utilities.load_tsv(
    os.path.join(ProjectDescriber.SAMPLE_DATA_DIR, "BLASTed.sampledata"))
# Organism = first two whitespace-separated words of the strain name
blasted_data_df["organism"] = blasted_data_df["strain"].apply(
    lambda strain: " ".join(strain.split(" ")[:2]))
# Prefix reference-specific columns; keep assembly/reference/sample keys as-is
reference_renaming = {}
for column_name in blasted_data_df.columns:
    if all(keyword not in column_name for keyword in ["assembly", "reference", "sample"]):
        reference_renaming[column_name] = "reference_{}".format(column_name)
blasted_data_df.rename(columns=reference_renaming, inplace=True)
# Per-assembly statistics, computed in parallel
contig_files = blasted_data_df["assembly_file"].values.tolist()
contig_stats_df = pd.DataFrame(
    Utilities.multi_core_queue(Utilities.count_assembly_statistics, contig_files))
contig_stats_df.rename(
    columns={name: "assembly_{}".format(name) for name in contig_stats_df.columns},
    inplace=True)
# Align both tables on the assembly file path and join them column-wise
blasted_data_df = pd.concat(
    [blasted_data_df.set_index("assembly_file"),
     contig_stats_df.set_index("assembly_file")],
    axis=1, sort=False)
blasted_data_df.index.names = ["assembly_file"]
blasted_data_df.reset_index(inplace=True)
# Process raw reads
sample_data_df = Utilities.load_tsv(ProjectDescriber.SAMPLE_DATA_FILE)