Example no. 1
0
 def retrieve(self):
     """Download every queued link into a per-key subdirectory of the reference dir.

     Flattens `self._links_dict` ({key: [urls]}) into one task list, where each
     task maps a URL to `<reference_dir>/<key>`, then runs the download handler
     over the tasks on a single core.
     """
     download_tasks = [
         {"url": link, "out_dir": os.path.join(self.reference_dir, group)}
         for group, links in self._links_dict.items()
         for link in links
     ]
     Utilities.single_core_queue(self._dl_handler, download_tasks)
     print("Download completed")
 def retrieve(self):
     """Download the hg19 reference genome chromosome-by-chromosome, merge the
     parsed sequences into one FASTA file, and resolve the index directory.

     Side effects: creates the reference and chromosome directories, writes
     `<reference_dir>/hg19.fasta`, and sets `self.parsed_records`,
     `self.nfasta_file` and `self.index_dir`.
     """
     # Warn but proceed if the target directory already exists — files in it
     # may be overwritten below.
     if os.path.exists(self.reference_dir):
         print("Warning! The reference path exists: '{}'".format(
             self.reference_dir))
     os.makedirs(self.reference_dir, exist_ok=True)
     chromosomes_dir = os.path.join(self.reference_dir, "chromosomes")
     os.makedirs(chromosomes_dir, exist_ok=True)
     # UCSC returns HTTP 530 when attempting to download in multi-thread
     compressed_chromosomes = Utilities.single_core_queue(
         self._dl_wrapper, [{
             "chromosome": i,
             "chromosomes_dir": chromosomes_dir
         } for i in self.CHROMOSOMES])
     # Process sequence: each downloaded gzip yields a list of records,
     # flattened into a single list before the combined FASTA is written.
     self.parsed_records = Utilities.flatten_2d_array(
         Utilities.single_core_queue(self._parse_gzip_fna,
                                     compressed_chromosomes))
     self.nfasta_file = os.path.join(self.reference_dir, "hg19.fasta")
     SeqIO.write(self.parsed_records, self.nfasta_file, "fasta")
     # Process annotation
     self.index_dir = self.describer.get_index_guide(self.nfasta_file)
              key=lambda x: len(x),
              reverse=True)[0].format("fasta")
    for i in assemblies
}
# Summarize each entry of `props`: total length plus a 50-character preview.
# Iterate items() instead of keys so each value is looked up once — the
# original called props.get(k) twice per key.
props_stats = {
    k: {
        "length": len(v),
        "head": v[:50]
    }
    for k, v in props.items()
}

# Create BLAST queries
# BLAST the largest contig of every assembly in parallel, then reduce each
# report to a header sequentially.
blast_reports = Utilities.multi_core_queue(mp_get_and_blast_largest_contig,
                                           assemblies)
headers = Utilities.single_core_queue(process_blast_report, blast_reports)

# Create GenBank queries
# Download the GenBank reference for each BLAST header in parallel and
# collect the processed reports into a DataFrame.
genbank_reports = Utilities.multi_core_queue(mp_download_reference_genbank,
                                             headers)
reference_df = pd.DataFrame(
    Utilities.single_core_queue(process_genbank_report, genbank_reports))
# Derive the sample name from the assembly file name: strip the directory
# and extension, then drop the last underscore-separated token.
reference_df["sample_name"] = reference_df["assembly_file"].apply(
    lambda x: "_".join(
        os.path.splitext(os.path.basename(x))[0].split("_")[:-1]))
reference_df.sort_values("sample_name", inplace=True)
reference_table = os.path.join(ProjectDescriber.SAMPLE_DATA_DIR,
                               "BLASTed.sampledata")

# Persist the reference table as TSV.
Utilities.dump_tsv(reference_df, reference_table)
 def retrieve(self):
     """Drain the download queue sequentially and announce completion.

     The queue is processed purely for its download side effects; the
     handler's results are discarded.
     """
     # Discard the return value directly — the original bound it to a
     # temporary (`tmp = ...; del tmp`) for no effect.
     Utilities.single_core_queue(self._download_handler, self._dl_queue)
     print("Download completed")
# Prepare path
rawReadsDir = os.path.join(projectDescriber.RAW_DATA_DIR, "reads")
cutadaptDir = os.path.join(rawReadsDir, "cutadapt")
os.makedirs(cutadaptDir, exist_ok=True)
# Trim reads with cutadapt (one worker per sample row) and keep the results
# sorted by sample name before dumping the trimmed-reads sample data table.
cutadaptResults = Utilities.multi_core_queue(
    run_cutadapt, queue=rawSampledataDF.values.tolist())
cutadaptResultsDF = pd.DataFrame.from_dict(cutadaptResults).sort_values(
    "sample_name")
Utilities.dump_tsv(
    cutadaptResultsDF,
    table_file=projectDescriber.SAMPLE_DATA_FILE,
    col_names=["sample_name", "trimmed_file_1", "trimmed_file_2"])
# Assemble reads
# NOTE(review): spadesDir is computed but never referenced below —
# presumably run_spades derives its own output path; confirm.
spadesDir = os.path.join(rawReadsDir, "spades")
spadesResults = Utilities.single_core_queue(run_spades,
                                            cutadaptResultsDF.values.tolist())
spadesResultsDF = pd.DataFrame.from_dict(spadesResults).sort_values(
    "sample_name")
spadesResultsSampleData = os.path.join(
    os.path.dirname(projectDescriber.SAMPLE_DATA_FILE),
    "assemblies.sampledata")
Utilities.dump_tsv(spadesResultsDF,
                   table_file=spadesResultsSampleData,
                   col_names=["sample_name", "assembly"])
# Report where both sample data tables were written (expected output below).
print("\n".join([projectDescriber.SAMPLE_DATA_FILE, spadesResultsSampleData]))
"""
/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/trimmed.sampledata
/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/assemblies.sampledata
"""
Example no. 6
0
# Dump the basic raw-reads sample data: one row per sample with forward (R1)
# and reverse (R2) read file paths.
Utilities.dump_tsv(df=raw_sampledata_df, table_file=raw_sampledata_file, col_names=["sample_name", "R1", "R2"])

print(raw_sampledata_file)  # /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata
# Create more detailed sampledata
# Paired read files are joined with ';' into a single pipeline-style column.
raw_sampledata_df["reads_files"] = raw_sampledata_df.loc[:, ["R1", "R2"]].apply(lambda x: ";".join(x), axis=1)
raw_sampledata_df["taxon"] = "Klebsiella pneumoniae"
pipeline_sampledata_file = os.path.join(project_describer.ROOT_DIR, "sample_data", "raw_reads_pipeline.sampledata")

# NOTE(review): the requested column here is "reads" but the column created
# above is "reads_files" — confirm whether dump_tsv maps or silently drops
# missing columns; this looks like a latent mismatch.
Utilities.dump_tsv(df=raw_sampledata_df, table_file=pipeline_sampledata_file,
                   col_names=["sample_name", "reads", "taxon"])

print(pipeline_sampledata_file)
# /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads_pipeline.sampledata

# Gather per-sample read statistics from the R1 files only.
reads_stats_list = Utilities.single_core_queue(Utilities.get_reads_stats_from_fq_gz,
                                               raw_sampledata_df["R1"].values.tolist())
reads_stats_df = pd.DataFrame(reads_stats_list)
# Illumina's PE reads always have same counts of base pairs and total reads
raw_sampledata_df["sample_reads_number"] = reads_stats_df["sample_reads_number"] * 2
raw_sampledata_df["sample_reads_bp"] = reads_stats_df["sample_reads_bp"] * 2
# Count the expected coverage according to https://www.genome.jp/kegg-bin/show_organism?org=kpm
raw_sampledata_df["reference_genome_refseq"] = "NC_016845.1"
raw_sampledata_df["reference_genome_bp"] = 5333942
raw_sampledata_df["expected_coverage"] = raw_sampledata_df["sample_reads_bp"] / raw_sampledata_df["reference_genome_bp"]
# Render coverage with one decimal and an 'x' suffix, e.g. "35.2x".
raw_sampledata_df["expected_coverage"] = raw_sampledata_df["expected_coverage"].apply(
    lambda x: "{0:.1f}x".format(x))
reads_statistics_file = os.path.join(project_describer.ROOT_DIR, "sample_data", "reads_statistics.tsv")

Utilities.dump_tsv(raw_sampledata_df, reads_statistics_file,
                   col_names=["sample_name", "taxon", "sample_reads_number", "reference_genome_refseq",
                              "reference_genome_bp", "sample_reads_bp", "expected_coverage"])