Example #1
 def parse_fastq(self, output_file: str):
     import os
     import math
     if os.path.isfile(output_file):
         os.remove(output_file)
         print("Deleted file in order to replace it with new data: '{}'".
               format(output_file))
     counter = 0
     first_position = 0
     last_position = self.CHUNK_SIZE
     chunks_number = math.ceil(len(self.raw_fastqs_list) / self.CHUNK_SIZE)
     while last_position < len(self.raw_fastqs_list):
         with open(output_file, mode="a", encoding="utf-8") as f:
             f.write("{}\n".format("\n".join(
                 Utilities.multi_core_queue(
                     self.mp_parse_fastq_line,
                     self.raw_fastqs_list[first_position:last_position]))))
         counter += 1
         print("Passed FASTQ parse iteration: {} (of {})".format(
             counter, chunks_number))
         first_position += self.CHUNK_SIZE
         last_position += self.CHUNK_SIZE
     if first_position < len(self.raw_fastqs_list):
         with open(output_file, mode="a", encoding="utf-8") as f:
             f.write("{}\n".format("\n".join(
                 Utilities.multi_core_queue(
                     self.mp_parse_fastq_line, self.raw_fastqs_list[
                         first_position:len(self.raw_fastqs_list)]))))
         print("Passed FASTQ parse last iteration")
     print("Finished parse FASTQ items: {}".format(len(
         self.raw_fastqs_list)))
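Every example on this page funnels work through `Utilities.multi_core_queue` from the repository's `meta.scripts.Utilities` module, whose source is not shown here. A minimal sketch of what such a helper typically looks like, assuming it is a thin wrapper around `multiprocessing.Pool.map` (the parameter names are this sketch's own, and the real helper also accepts extras such as the `async_` flag used in Example #6):

import multiprocessing


def multi_core_queue(func, queue):
    # Map func over the queue items in a worker pool and collect results;
    # a sketch of the assumed behavior, not the project's actual code
    with multiprocessing.Pool() as pool:
        return pool.map(func, queue)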
Example #2
 def annotate(self):
     self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
     raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
     processed_nfasta_headers = [
         Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
             self._mp_parse_nfasta_header, raw_nfasta_headers)
     ]
     self._processed_nfasta_df = Utilities.merge_pd_series_list(
         processed_nfasta_headers).sort_values("former_id")
     # Pad to the width of the longest ID; len(max(...)) would take the
     # lexicographic maximum, not the longest string
     zf_len = max(len(i) for i in
                  self._processed_nfasta_df["vfdb_id"].values.tolist())
     # Join table assembled from pFASTA headers
     raw_pfasta_headers = []
     with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
         for _line in _f:
             if _line.startswith(">"):
                 raw_pfasta_headers.append(re.sub("^>", "", _line).strip())
     raw_pfasta_headers = sorted(
         set([i for i in raw_pfasta_headers if len(i) > 0]))
     processed_pfasta_headers = [
         Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
             self._mp_parse_pfasta_header, raw_pfasta_headers)
     ]
     self._processed_pfasta_df = Utilities.merge_pd_series_list(
         processed_pfasta_headers).sort_values("protein_header")
     self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df[
         "vfdb_id"].str.zfill(zf_len)
     # Join the provided table. Note: the table file is expected in the same directory as the merged protein FASTA file
     vfs_table_file = os.path.join(os.path.dirname(self._raw_pfasta_file),
                                   "VFs.xls")
     vfs_df = pd.read_excel(vfs_table_file, sheet_name="VFs",
                            header=1).fillna("")
     vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract("VF(\d+)")[0].str.zfill(
         zf_len)
     self.merged_df = pd.concat([
         i.set_index("vfdb_id").sort_index() for i in
         [self._processed_nfasta_df, self._processed_pfasta_df, vfs_df]
     ],
                                axis=1,
                                sort=False).sort_index()
     self.merged_df.index.names = ["vfdb_id"]
     self.merged_df = self.merged_df.loc[
         self.merged_df["former_id"].str.len() > 0].reset_index()
     self.merged_df = Utilities.left_merge(self._raw_nfasta_df,
                                           self.merged_df, "former_id")
Example #3
 def annotate(self):
     # Process nucleotide FASTA
     self._raw_nfasta_df = pd.read_table(self.annotation_file, sep="\t", header=0)
     raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
     processed_nfasta_headers = [Utilities.dict2pd_series(i) for i in
                                 Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_nfasta_headers)]
     self._processed_nfasta_df = Utilities.merge_pd_series_list(processed_nfasta_headers).sort_values("former_id")
     self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df, "former_id")
     # Process protein FASTA
     with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as f:
         raw_pfasta_headers = sorted(set(
             j for j in (re.sub("^>", "", i).strip()
                         for i in f if i.startswith(">")) if len(j) > 0))
     processed_pfasta_headers = [Utilities.dict2pd_series(i) for i in
                                 Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_pfasta_headers)]
     self.pfasta_df = Utilities.merge_pd_series_list(processed_pfasta_headers).sort_values("former_id")
     self.pfasta_df.rename(columns={"geninfo_id": "protein_geninfo_id", "refseq_id": "genpept_id",
                                    "description": "protein_description", "host": "protein_host"}, inplace=True)
     self.merged_df = Utilities.left_merge(self.nfasta_df, self.pfasta_df, "tadb_id", "category", "gene_symbol")
     self.merged_df = Utilities.combine_duplicate_rows(self.merged_df, "reference_id")
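`Utilities.left_merge` takes the two frames plus one or more key column names. Assuming it wraps `pandas.DataFrame.merge` with `how="left"`, a minimal equivalent would be:

import pandas as pd


def left_merge(left_df, right_df, *on):
    # Keep every left_df row; attach matching right_df columns by key(s).
    # A sketch of the assumed behavior, not the project's implementation.
    return left_df.merge(right_df, how="left", on=list(on))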
Example #4
 def annotate(self):
     self.annotation_file = self.describer.get_refdata_dict().get(
         "sequence_1").annotation_file
     self._raw_nfasta_df = pd.read_table(self.annotation_file,
                                         sep='\t',
                                         header=0)
     mp_result = Utilities.multi_core_queue(
         self._mp_parse_nfasta_header,
         self._raw_nfasta_df["former_id"].values.tolist())
     self._processed_nfasta_df = Utilities.merge_pd_series_list(
         mp_result).sort_values("former_id")
     self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df,
                                           self._processed_nfasta_df,
                                           "former_id")
     # Join 'aro_index.tsv'
     aro_index_df = pd.read_table(os.path.join(self.reference_dir, "data",
                                               "aro_index.tsv"),
                                  sep='\t',
                                  header=0)
     aro_index_df["aro_id"] = aro_index_df["ARO Accession"].str.extract(
         "ARO:(\d+)")
     # 'aro_index.tsv' has more entries than 'nucleotide_fasta_protein_homolog_model.fasta' provides
     self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_index_df,
                                           "aro_id")
     # Join 'aro_categories_index.tsv'
     aro_categories_index_df = pd.read_table(os.path.join(
         self.reference_dir, "data", "aro_categories_index.tsv"),
                                             sep='\t',
                                             header=0)
     self.nfasta_df = Utilities.left_merge(self.nfasta_df,
                                           aro_categories_index_df,
                                           "Protein Accession")
     # Joining 'aro_categories.tsv' is useless: the resulting 'ARO Category' column is filled with NaN
     # Join 'aro.tsv'
     aro_df = pd.read_table(os.path.join(self.reference_dir, "ontology",
                                         "aro.tsv"),
                            sep='\t',
                            header=0)
     aro_df.rename(columns={
         "Accession": "ARO Accession",
         "Name": "ARO Name"
     },
                   inplace=True)
     self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_df,
                                           "ARO Accession")
     self.nfasta_df = Utilities.combine_duplicate_rows(
         self.nfasta_df, "reference_id")
Example #5
props = {
    i: sorted(list(SeqIO.parse(i, "fasta")),
              key=lambda x: len(x),
              reverse=True)[0].format("fasta")
    for i in assemblies
}
props_stats = {
    k: {
        "length": len(props.get(k)),
        "head": props.get(k)[:50]
    }
    for k in props
}

# Create BLAST queries
blast_reports = Utilities.multi_core_queue(mp_get_and_blast_largest_contig,
                                           assemblies)
headers = Utilities.single_core_queue(process_blast_report, blast_reports)

# Create GenBank queries
genbank_reports = Utilities.multi_core_queue(mp_download_reference_genbank,
                                             headers)
reference_df = pd.DataFrame(
    Utilities.single_core_queue(process_genbank_report, genbank_reports))
reference_df["sample_name"] = reference_df["assembly_file"].apply(
    lambda x: "_".join(
        os.path.splitext(os.path.basename(x))[0].split("_")[:-1]))
reference_df.sort_values("sample_name", inplace=True)
reference_table = os.path.join(ProjectDescriber.SAMPLE_DATA_DIR,
                               "BLASTed.sampledata")

Utilities.dump_tsv(reference_df, reference_table)
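The dictionary comprehension at the top of this example keeps only the longest contig per assembly by sorting each full record list in reverse, which is O(n log n); `max` with the same key does it in one pass. A drop-in helper, assuming Biopython's `SeqIO` (the fragment above uses it without showing the import):

from Bio import SeqIO


def largest_contig_fasta(assembly_file):
    # Return the longest record of a FASTA file, re-serialized as FASTA text
    records = SeqIO.parse(assembly_file, "fasta")
    return max(records, key=len).format("fasta")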
Example #6
import os

import pandas as pd
from meta.scripts.Utilities import Utilities

#%%

sra_dir = "/data1/bio/projects/vradchenko/lactobacillus_salivarius/sra"
sra_df = Utilities.load_tsv(os.path.join(sra_dir, "sra.tsv"))

queue = [{
    "func": Utilities.count_reads_statistics,
    "kwargs": {
        "reads_file": i,
        "type_": "fastq_gz"
    }
} for i in Utilities.scan_whole_dir(os.path.join(sra_dir, "reads"))]

raw_reads_base_stats = Utilities.multi_core_queue(Utilities.wrapper,
                                                  queue,
                                                  async_=True)

#%%

raw_reads_base_stat_df = pd.DataFrame(raw_reads_base_stats)
raw_reads_base_stat_df["reads_file"] = raw_reads_base_stat_df[
    "reads_file"].apply(os.path.basename)
raw_reads_base_stat_df["sample_name"] = raw_reads_base_stat_df[
    "reads_file"].str.extract(r"(.+)\[")

Utilities.dump_tsv(raw_reads_base_stat_df,
                   os.path.join(sra_dir, "raw_reads_base_stats.tsv"))
Example #7
    for copy_number in [1, 2]:
        max_id += 1
        for fastq_archive in [r1_fastq_archive, r2_fastq_archive]:
            output_file = "{}{}".format(
                Utilities.ends_with_slash("/".join(
                    fastq_archive.split("/")[:-1])),
                re.sub(
                    "\.gz$", "",
                    re.sub("[0-9]{6}",
                           str(max_id).zfill(6),
                           fastq_archive.split("/")[-1])))
            print("Loading file '{}'".format(fastq_archive))
            fq_array = FASTAArray(
                subprocess.getoutput("zcat {}".format(fastq_archive)))
            print("Loaded file '{}'".format(fastq_archive))
            fq_array.parse_fastq(output_file)
            del fq_array
            output_files_list.append(output_file)
            print("Saved file '{}'".format(output_file))
            gc.collect()


def mp_gzip_file(file):
    print("Compressing file '{}'".format(file))
    print(subprocess.getoutput("gzip -9 -c {a} > {a}.gz".format(a=file)))
    print("Compressed file '{}'".format(file))


print("Compressing {} files".format(len(output_files_list)))
print(Utilities.multi_core_queue(mp_gzip_file, output_files_list))
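`mp_gzip_file` interpolates the path directly into a shell command, which breaks on file names containing spaces or shell metacharacters. A safer variant of the same one-liner using `shlex.quote` (the name `mp_gzip_file_quoted` is this sketch's own; behavior is identical for plain paths):

import shlex
import subprocess


def mp_gzip_file_quoted(file):
    quoted = shlex.quote(file)
    print("Compressing file '{}'".format(file))
    print(subprocess.getoutput("gzip -9 -c {a} > {a}.gz".format(a=quoted)))
    print("Compressed file '{}'".format(file))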
Example #8
    return {
        "sample_name": sample_name,
        "assembly": os.path.join(out_dir, "contigs.fasta")
    }


projectDescriber = ProjectDescriber()
rawSampledataDF = Utilities.load_tsv(
    "/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata"
)
# Prepare paths
rawReadsDir = os.path.join(projectDescriber.RAW_DATA_DIR, "reads")
cutadaptDir = os.path.join(rawReadsDir, "cutadapt")
os.makedirs(cutadaptDir, exist_ok=True)
# Trim reads
cutadaptResults = Utilities.multi_core_queue(
    run_cutadapt, queue=rawSampledataDF.values.tolist())
cutadaptResultsDF = pd.DataFrame.from_dict(cutadaptResults).sort_values(
    "sample_name")
Utilities.dump_tsv(
    cutadaptResultsDF,
    table_file=projectDescriber.SAMPLE_DATA_FILE,
    col_names=["sample_name", "trimmed_file_1", "trimmed_file_2"])
# Assemble reads
spadesDir = os.path.join(rawReadsDir, "spades")
spadesResults = Utilities.single_core_queue(run_spades,
                                            cutadaptResultsDF.values.tolist())
spadesResultsDF = pd.DataFrame.from_dict(spadesResults).sort_values(
    "sample_name")
spadesResultsSampleData = os.path.join(
    os.path.dirname(projectDescriber.SAMPLE_DATA_FILE),
    "assemblies.sampledata")
Example #9
# Process assemblies
blasted_data_df = Utilities.load_tsv(
    os.path.join(ProjectDescriber.SAMPLE_DATA_DIR, "BLASTed.sampledata"))
blasted_data_df["organism"] = blasted_data_df["strain"].apply(
    lambda x: " ".join(x.split(" ")[:2]))

blasted_data_df.rename(columns={
    i: "reference_{}".format(i)
    for i in blasted_data_df.columns
    if all(j not in i for j in ["assembly", "reference", "sample"])
},
                       inplace=True)
assembly_files = blasted_data_df["assembly_file"].values.tolist()

assembly_stats_df = pd.DataFrame(
    Utilities.multi_core_queue(Utilities.count_assembly_statistics,
                               assembly_files))
assembly_stats_df.rename(
    columns={i: "assembly_{}".format(i)
             for i in assembly_stats_df.columns},
    inplace=True)
blasted_data_df = pd.concat([
    blasted_data_df.set_index("assembly_file"),
    assembly_stats_df.set_index("assembly_file")
],
                            axis=1,
                            sort=False)
blasted_data_df.index.names = ["assembly_file"]
blasted_data_df.reset_index(inplace=True)

# Process raw reads
sample_data_df = Utilities.load_tsv(ProjectDescriber.SAMPLE_DATA_FILE)
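The conditional rename near the top of this example prefixes every column that is not already an assembly/reference/sample field. The same dict-comprehension trick in isolation, on toy column names:

import pandas as pd

df = pd.DataFrame(columns=["sample_name", "strain", "taxid"])
df = df.rename(columns={
    i: "reference_{}".format(i)
    for i in df.columns
    if all(j not in i for j in ("assembly", "reference", "sample"))
})
print(df.columns.tolist())  # ['sample_name', 'reference_strain', 'reference_taxid']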