def retrieve(self):
    """Download every linked file into a per-key subdirectory of the reference dir.

    One task dict per URL ({"url": ..., "out_dir": <reference_dir>/<key>}) is
    built and handed to the sequential downloader in a single batch.
    """
    # Iterate (key, urls) pairs directly instead of re-indexing the dict per
    # key, and build the queue with a comprehension instead of append-in-loop.
    queue = [
        {"url": url, "out_dir": os.path.join(self.reference_dir, key)}
        for key, urls in self._links_dict.items()
        for url in urls
    ]
    Utilities.single_core_queue(self._dl_handler, queue)
    print("Download completed")
def retrieve(self):
    """Fetch the hg19 reference: download each chromosome archive, merge the
    parsed sequences into one FASTA file and prepare the index guide."""
    if os.path.exists(self.reference_dir):
        print("Warning! The reference path exists: '{}'".format(
            self.reference_dir))
    os.makedirs(self.reference_dir, exist_ok=True)
    chromosomes_dir = os.path.join(self.reference_dir, "chromosomes")
    os.makedirs(chromosomes_dir, exist_ok=True)
    # UCSC returns HTTP 530 when attempting to download in multi-thread
    download_queue = []
    for chromosome in self.CHROMOSOMES:
        download_queue.append({"chromosome": chromosome,
                               "chromosomes_dir": chromosomes_dir})
    compressed_chromosomes = Utilities.single_core_queue(self._dl_wrapper,
                                                         download_queue)
    # Process sequence: parse each gzipped FASTA, then flatten the
    # per-chromosome record lists into one list
    parsed_per_chromosome = Utilities.single_core_queue(
        self._parse_gzip_fna, compressed_chromosomes)
    self.parsed_records = Utilities.flatten_2d_array(parsed_per_chromosome)
    self.nfasta_file = os.path.join(self.reference_dir, "hg19.fasta")
    SeqIO.write(self.parsed_records, self.nfasta_file, "fasta")
    # Process annotation
    self.index_dir = self.describer.get_index_guide(self.nfasta_file)
key=lambda x: len(x), reverse=True)[0].format("fasta") for i in assemblies } props_stats = { k: { "length": len(props.get(k)), "head": props.get(k)[:50] } for k in props } # Create BLAST queries blast_reports = Utilities.multi_core_queue(mp_get_and_blast_largest_contig, assemblies) headers = Utilities.single_core_queue(process_blast_report, blast_reports) # Create GenBank queries genbank_reports = Utilities.multi_core_queue(mp_download_reference_genbank, headers) reference_df = pd.DataFrame( Utilities.single_core_queue(process_genbank_report, genbank_reports)) reference_df["sample_name"] = reference_df["assembly_file"].apply( lambda x: "_".join( os.path.splitext(os.path.basename(x))[0].split("_")[:-1])) reference_df.sort_values("sample_name", inplace=True) reference_table = os.path.join(ProjectDescriber.SAMPLE_DATA_DIR, "BLASTed.sampledata") Utilities.dump_tsv(reference_df, reference_table)
def retrieve(self):
    """Run every queued download task sequentially."""
    # The handler's return values are not needed; run purely for side effects.
    Utilities.single_core_queue(self._download_handler, self._dl_queue)
    print("Download completed")
# Prepare path
rawReadsDir = os.path.join(projectDescriber.RAW_DATA_DIR, "reads")
cutadaptDir = os.path.join(rawReadsDir, "cutadapt")
os.makedirs(cutadaptDir, exist_ok=True)

# Trim reads with cutadapt across multiple cores, then persist the
# per-sample trimmed-file table
cutadaptResults = Utilities.multi_core_queue(
    run_cutadapt, queue=rawSampledataDF.values.tolist())
cutadaptResultsDF = pd.DataFrame.from_dict(
    cutadaptResults).sort_values("sample_name")
Utilities.dump_tsv(
    cutadaptResultsDF,
    table_file=projectDescriber.SAMPLE_DATA_FILE,
    col_names=["sample_name", "trimmed_file_1", "trimmed_file_2"])

# Assemble the trimmed reads with SPAdes (sequentially)
spadesDir = os.path.join(rawReadsDir, "spades")  # NOTE(review): unused here — presumably consumed elsewhere; verify
spadesResults = Utilities.single_core_queue(
    run_spades, cutadaptResultsDF.values.tolist())
spadesResultsDF = pd.DataFrame.from_dict(
    spadesResults).sort_values("sample_name")
spadesResultsSampleData = os.path.join(
    os.path.dirname(projectDescriber.SAMPLE_DATA_FILE),
    "assemblies.sampledata")
Utilities.dump_tsv(spadesResultsDF,
                   table_file=spadesResultsSampleData,
                   col_names=["sample_name", "assembly"])

# Report both output sampledata locations
print("\n".join([projectDescriber.SAMPLE_DATA_FILE, spadesResultsSampleData]))

"""
/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/trimmed.sampledata
/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/assemblies.sampledata
"""
Utilities.dump_tsv(df=raw_sampledata_df, table_file=raw_sampledata_file,
                   col_names=["sample_name", "R1", "R2"])
print(raw_sampledata_file)
# /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata

# Create more detailed sampledata
raw_sampledata_df["reads_files"] = raw_sampledata_df.loc[:, ["R1", "R2"]].apply(
    lambda row: ";".join(row), axis=1)
raw_sampledata_df["taxon"] = "Klebsiella pneumoniae"
pipeline_sampledata_file = os.path.join(
    project_describer.ROOT_DIR, "sample_data", "raw_reads_pipeline.sampledata")
# NOTE(review): a "reads" column is requested below but only "reads_files" is
# created above — confirm dump_tsv tolerates / maps the missing column name
Utilities.dump_tsv(df=raw_sampledata_df, table_file=pipeline_sampledata_file,
                   col_names=["sample_name", "reads", "taxon"])
print(pipeline_sampledata_file)
# /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads_pipeline.sampledata

# Collect read counts/base pairs from the R1 files only
reads_stats_list = Utilities.single_core_queue(
    Utilities.get_reads_stats_from_fq_gz,
    raw_sampledata_df["R1"].values.tolist())
reads_stats_df = pd.DataFrame(reads_stats_list)

# Illumina's PE reads always have same counts of base pairs and total reads
raw_sampledata_df["sample_reads_number"] = reads_stats_df["sample_reads_number"] * 2
raw_sampledata_df["sample_reads_bp"] = reads_stats_df["sample_reads_bp"] * 2

# Count the expected coverage according to https://www.genome.jp/kegg-bin/show_organism?org=kpm
raw_sampledata_df["reference_genome_refseq"] = "NC_016845.1"
raw_sampledata_df["reference_genome_bp"] = 5333942
coverage_ratio = (raw_sampledata_df["sample_reads_bp"]
                  / raw_sampledata_df["reference_genome_bp"])
raw_sampledata_df["expected_coverage"] = coverage_ratio.apply(
    lambda x: "{0:.1f}x".format(x))

reads_statistics_file = os.path.join(
    project_describer.ROOT_DIR, "sample_data", "reads_statistics.tsv")
Utilities.dump_tsv(raw_sampledata_df, reads_statistics_file,
                   col_names=["sample_name", "taxon", "sample_reads_number",
                              "reference_genome_refseq", "reference_genome_bp",
                              "sample_reads_bp", "expected_coverage"])