Example #1
0
 def generate(pair_2d_array: list,
              regex: str = DEFAULT_REGEX,
              extension: str = DEFAULT_READS_EXTENSION):
     """Assemble a SampleDataArray out of a 2D list of paired read files.

     Each inner list is sorted, the sample name is parsed from the first
     file's basename via `regex` (after stripping `extension`), and key
     collisions are resolved by appending an incrementing numeric suffix.
     """
     array = SampleDataArray()
     for read_files in pair_2d_array:
         read_files = sorted(read_files)
         sample_file = os.path.basename(read_files[0])
         sample_name = Utilities.safe_findall(
             regex, re.sub(f"{extension}$", "", sample_file))
         if len(sample_name) == 0:
             raise ValueError(
                 f"Cannot process the file '{sample_file}' with the regex '{regex}'"
             )
         if not all(sample_name in i for i in read_files):
             raise ValueError(
                 f"Some files from the list '{read_files}' do not contain {sample_name} parsed by the regex '{regex}'"
             )
         if sample_name in array.lines:
             print(
                 f"Duplicate sample data line key, the regex check is considered: '{sample_name}'"
             )
             base_name = str(sample_name)
             suffix = 0
             while sample_name in array.lines:
                 suffix += 1
                 sample_name = f"{base_name}.{suffix}"
         array.lines[sample_name] = SampleDataLine(sample_name, read_files)
     return array
def process_blast_report(high_scoring_pairs: list):
    """Summarize the top BLAST high-scoring pair.

    Takes a non-empty list of HSP dicts and returns a dict carrying the
    assembly file, the reference header and the GenBank accession ID
    parsed out of that header.
    """
    first_report = high_scoring_pairs[0]
    reference_header = first_report.get("title")
    # Raw string: the pattern contains backslash escapes ('\|') that are
    # invalid (deprecated) string escapes in a plain string literal.
    accession_id = Utilities.safe_findall(r"\|* *gi\| *([^|]+) *\|",
                                          reference_header)
    return dict(assembly_file=first_report.get("assembly_file"),
                reference_header=reference_header,
                accession_id=accession_id)
Example #3
0
 def get_genera_dict(input_list: list):
     """Map each genus-like word found in the input strings to an empty tuple.

     Non-string items are ignored; duplicates are collapsed via a set, and
     entries for which no genus-like word was found are dropped.
     """
     unique_strings = [i for i in set(input_list) if isinstance(i, str)]
     genera = [Utilities.safe_findall("([A-Z][a-z]{4,})", s).strip()
               for s in unique_strings]
     return {genus: () for genus in sorted(genera) if len(genus) > 0}
Example #4
0
 def _mp_parse_nfasta_header(header):
     """Parse a CARD-style nucleotide FASTA header into a pandas Series.

     Extracts the GenBank ID, strand, locus, ARO ID, host organism and a
     best-guess gene symbol: the shortest description word that is long
     enough and not a generic "DNA"/"RNA" token.
     """
     output_dict = dict(former_id=header)
     # Raw strings below keep the regex backslashes from being treated as
     # (deprecated) invalid string escapes; the patterns are unchanged.
     output_dict["genbank_id"] = Utilities.safe_findall(
         r"^gb\|([^|]+)", header)
     # A leading "-" in the third pipe-separated chunk marks the antisense strand.
     output_dict["is_antisense_strand"] = header.split("|")[2].startswith(
         "-")
     output_dict["locus"] = Utilities.safe_findall(r"\|(\d+\-\d+)", header)
     output_dict["aro_id"] = Utilities.safe_findall(r"\|ARO:(\d+)", header)
     gene_chunk = header.split("|")[-1]
     output_dict["host"] = Utilities.safe_findall(r"\[(.+)\]", gene_chunk)
     # The description is the last chunk with the bracketed host cut out.
     output_dict["gene_description"] = gene_chunk.replace(
         "[{}]".format(output_dict["host"]), "").strip()
     _MIN_GENE_SYMBOL_LENGTH = 3
     _NON_GENE_SYMBOL_WORDS = ("DNA", "RNA")
     candidate_words = [
         i.strip()
         for i in output_dict.get("gene_description").split(" ")
     ]
     output_dict["gene_symbol"] = min(
         [j for j in candidate_words
          if len(j) >= _MIN_GENE_SYMBOL_LENGTH
          and j not in _NON_GENE_SYMBOL_WORDS],
         key=len)
     return Utilities.dict2pd_series(output_dict)
 def _mp_parse_nfasta_header(header: str):
     """Parse a VFDB nucleotide FASTA header into a flat dict of strings.

     Each regex is applied in order; once a field is captured, its
     formatted representation is cut out of the header so that the final
     catch-all pattern yields the remaining free-text description.
     The spaces inside the replacement templates are significant.
     """
     # Raw strings keep the regex backslashes literal (plain literals with
     # '\(' / '\[' raise DeprecationWarnings for invalid escapes).
     _VFDB_REGEXES = (("vfdb_id", r"^VFG(\d+)", "VFG{}"),
                      ("gene_accession_id", r"\(([^\(]+)\) ", "({}) "),
                      ("gene_symbol", r"^\(([^\(]+)\) ", "({}) "),
                      ("gene_host", r"\[([^\]]+)\]$", "[{}]"),
                      ("gene_name", r" \[([^\]]+)\] $", " [{}] "),
                      ("gene_description", ".*", "{}"))
     out = {"former_id": header}
     for key, regex, replacement in _VFDB_REGEXES:
         out[key] = Utilities.safe_findall(regex, header)
         if len(out.get(key)) > 0:
             header = header.replace(replacement.format(out.get(key)), "")
     return {k: out.get(k).strip() for k in out}
Example #6
0
def define_species(_sample_name: str):
    _SPECIES = {
        "Bacillus subtilis BZR 336g": 336,
        "Bacillus subtilis BZR 517": 517,
        "Lactobacillus salivarius": 1,
        "Lactobacillus curvatus": 2,
        "Lactobacillus heilongjiangensis": 8
    }
    first_digits = Utilities.safe_findall("^\d+", _sample_name)
    if len(first_digits) > 0:
        first_digits = int(first_digits)
        for k in _SPECIES:
            if first_digits == _SPECIES.get(k):
                return k
    print("Cannot define species: '{}'".format(_sample_name))
    return "_"
Example #7
0
 def _mp_parse_nfasta_header(header):
     """Parse a TADB-style nucleotide FASTA header into a flat dict.

     Expects a pipe-delimited header whose second chunk encodes the record
     category (Toxin/Antitoxin/Regulator) and numeric TADB ID, and whose
     last chunk carries the description, gene symbol and host in square
     brackets.  Raises ValueError when the category cannot be determined.
     """
     out = {"former_id": header}
     # Normalize bracketed tags in place: strip inner whitespace so later
     # exact bracket lookups match.
     for tag in re.findall("\[(.+)\]", header):
         header = header.replace("[{}]".format(tag), "[{}]".format(tag.strip()))
     header_chunks = [i.strip() for i in header.split("|")]
     category_chunk = header_chunks[1].upper()
     # Category is decided by the chunk's leading letters ("T" is checked
     # first and cannot shadow "AT"/"RE", which start with other letters).
     if category_chunk.startswith("T"):
         out["category"] = "Toxin"
     elif category_chunk.startswith("AT"):
         out["category"] = "Antitoxin"
     elif category_chunk.startswith("RE"):
         out["category"] = "Regulator"
     else:
         raise ValueError("Cannot define the header's category: {}".format(header))
     # The digits inside the category chunk are the TADB record ID.
     out["tadb_id"] = Utilities.safe_findall("([0-9]+)", category_chunk)
     out["geninfo_id"] = Utilities.safe_findall("gi\|([0-9]+)\|", header.lower())
     # Prefer an explicit "REF|..." chunk; fall back to a RefSeq-style
     # accession pattern (NC_/NP_/YP_/NZ_ etc.) anywhere in the header.
     ref = Utilities.safe_findall("REF\|(.+)\|", header.upper()).split("|")[0]
     if len(ref) == 0:
         try:
             ref = Utilities.safe_findall("((N|Y)(C|P|Z)_\d+\.\d*)", header.upper())[0].split("|")[0]
         except IndexError:
             pass
     out["refseq_id"] = ref
     # A "c" prefix on the locus coordinates marks the antisense strand.
     locus = Utilities.safe_findall("\|:([c]{0,1}[0-9\-]+)", header.lower())
     out["is_antisense_strand"] = locus.startswith("c")
     out["locus"] = locus.replace("c", "")
     tail = header_chunks[-1]
     # Description: tail before the first bracket, with the locus token and
     # the RefSeq ID cut out.
     out["description"] = tail.split("[")[0].replace(
         ":{}{}".format(["", "c"][out["is_antisense_strand"]], out["locus"]), "").replace(ref, "")
     # Gene symbol is the last bracketed token of the tail.
     out["gene_symbol"] = Utilities.safe_findall("\[([^\[]+)\]$", tail)
     host = ""
     if tail.count("[") > 1:
         # With two or more bracketed tokens, the first one is the host.
         host = Utilities.safe_findall("\[([^\[]+)\]", tail, 0)
     if len(host) == 0:
         # Fallback: a binomial-looking name ("Genus species") in the tail.
         host = Utilities.safe_findall("([A-Z][a-z]+ [a-z]+[\.]{0,1})", tail)
     out["host"] = host
     # Final cleanup pass: strip whitespace from every string field.
     for key in out:
         if isinstance(out.get(key), str):
             out[key] = out.get(key).strip()
     return out
def process_genbank_report(d: dict):
    """Extract strain, taxonomy and assembly stats from a GenBank record.

    Uses only the first record of d["genbank_records"]; the record is
    expected to carry a "source" feature with "organism", "strain" and a
    "taxon:"-prefixed "db_xref" qualifier.
    """
    genbank_records = d.get("genbank_records")
    genbank_record = genbank_records[0]
    cds_number = len([i for i in genbank_record.features if i.type == "CDS"])
    qualifiers_dict = [
        i.qualifiers for i in genbank_record.features if i.type == "source"
    ][0]
    # Keep only genus and species from the organism name, then append the
    # strain qualifier to build the full strain designation.
    organism = Utilities.remove_empty_values(
        qualifiers_dict.get("organism")[0].split(" "))[:2]
    strain = " ".join(organism + [qualifiers_dict.get("strain")[0]])
    # Raw string avoids the deprecated '\d' string escape.
    taxonomy_id = Utilities.safe_findall(r"\d+", [
        i for i in qualifiers_dict.get("db_xref")
        if i.split(":")[0].strip() == "taxon"
    ][0])
    return dict(assembly_file=d.get("assembly_file"),
                strain=strain,
                taxonomy_id=taxonomy_id,
                reference_accession_id=genbank_record.id,
                cds_number=cds_number,
                reference_bp=len(genbank_record),
                reference_description=genbank_record.description)
Example #9
0
 # Ensure the digest output directory exists and dump the raw digest table.
 os.makedirs(card_digest_dir, exist_ok=True)
 association_digest.reset_index().to_csv(os.path.join(
     card_digest_dir,
     "digest_card_{}_{}.tsv".format(value_col_name,
                                    annotation_col_name)),
                                         sep="\t",
                                         index=False,
                                         header=True)
 # NOTE(review): the percentage table is computed but not used within this
 # snippet — confirm it is consumed later.
 association_digest_percentage = association_digest * 100 / association_digest.sum(
 )
 fig = plt.figure()
 sns.set(style="whitegrid", font_scale=1)
 # Shorten the column labels: extract the sample name from the absolute
 # coverage-file path (hard-coded project location).
 export_df = association_digest.rename(
     columns={
         i: Utilities.safe_findall(
             "/data1/bio/projects/inicolaeva/klebsiella_infants/map_data/Statistics/(.+)_card_v3.0.1_coverage.tsv",
             i)
         for i in list(association_digest)
     }).transpose()
 export_df.index.name = "sample_name"
 # NOTE(review): stacked='True' is a truthy string, not the boolean True —
 # pandas treats it as True, but the literal looks unintended.
 ax = export_df.plot(kind='bar', stacked='True', figsize=(20, 10))
 ax.set_ylabel(value_col_name)
 # Place the legend outside the axes, on the right-hand side.
 legend = ax.legend(loc="center left",
                    shadow=True,
                    fontsize="x-small",
                    bbox_to_anchor=(1.04, 0.5),
                    borderaxespad=0)
 # Target PNG path for the figure export.
 image_file_name = os.path.join(
     card_digest_dir,
     "digest_card_{}_{}.png".format(value_col_name,
                                    annotation_col_name))
Example #10
0
# Collect tree leaf names that look like RefSeq assembly IDs ("GCF...").
node_names = [
    j for j in [i.name for i in tree.find_clades()]
    if j is not None and j.startswith("GCF")
]

annotations_list = []
for node_name in node_names:
    # node_name = "GCF_005377825.1_ASM537782v1"
    genbank_file = os.path.join(genbank_dir,
                                "{}_genomic.gbff".format(node_name))
    seq_records = list(SeqIO.parse(genbank_file, "genbank"))
    # Only the first record's annotations represent the whole assembly.
    annotation_dict = {
        i: flatten_string(seq_records[0].annotations.get(i))
        for i in ["organism", "date", "comment"]
    }
    annotation_dict["comment"] = remove_maintenance_comments(
        annotation_dict["comment"])
    # NOTE(review): the character class "[S|s]" also matches a literal "|",
    # and ":*" means zero-or-more colons — confirm the pattern is intended.
    annotation_dict["strain"] = Utilities.safe_findall(
        "[S|s]train:* ([^ ]+)", seq_records[0].description)
    annotation_dict["refseq_id"] = Utilities.safe_findall(
        "GCF_[^_]+", node_name)
    # The assembly ID is whatever remains after cutting the RefSeq ID out.
    annotation_dict["assembly_id"] = node_name.replace(
        annotation_dict["refseq_id"], "").strip("_")
    annotations_list.append(annotation_dict)

# Export the per-node annotations as a TSV for iTOL tree decoration.
annotations_df = pd.DataFrame(annotations_list)
Utilities.dump_tsv(
    annotations_df,
    "/data1/bio/projects/inicolaeva/klebsiella_infants/roary/newick/iTOL_collapsed_tree_annotation.tsv"
)
Example #11
0
# Get the raw reads files
raw_reads_files_dir = ProjectDescriber.RAW_DATA_DIR
# Keep only Illumina-style compressed FASTQ files ("*_001.fastq.gz").
raw_reads_files_list = [
    i for i in Utilities.scan_whole_dir(raw_reads_files_dir)
    if i.endswith("_001.fastq.gz")
]

# Split them into the two groups
STRANDS = ("R1", "R2")  # forward / reverse read tags in Illumina file names
raw_reads_list = []
for raw_reads_files_pair in Utilities.get_most_similar_word_pairs(
        raw_reads_files_list):
    # Illumina file names have template '[sample]_[sequence]_[lane]_[strand]_[number].fastq.gz'
    # E.g: '336g_S1_L001_R1_001.fastq.gz'
    sample_name = Utilities.safe_findall(
        "(.+)_S[0-9]+_L[0-9]+_R[0-9]+_[0-9]+",
        os.path.basename(raw_reads_files_pair[0]))
    raw_reads_dict = dict(sample_name=sample_name)
    for raw_reads_file in raw_reads_files_pair:
        for reads_strand in STRANDS:
            if "_{}_".format(reads_strand) in os.path.splitext(
                    os.path.basename(raw_reads_file))[0]:
                raw_reads_dict[reads_strand] = raw_reads_file
    if all([
            raw_reads_dict.get(STRANDS[0]).replace("_{}_".format(
                STRANDS[0]), "_{}_".format(STRANDS[-1])) == raw_reads_dict.get(
                    STRANDS[-1])
    ] + [
            raw_reads_dict.get(STRANDS[-1]).replace("_{}_".format(
                STRANDS[-1]), "_{}_".format(STRANDS[0])) == raw_reads_dict.get(
                    STRANDS[0])
# Collect assembler results: one "contigs.fasta" per assembly run directory.
assembly_files = [
    i for i in Utilities.scan_whole_dir(assembler_result_dir)
    if os.path.basename(i) == "contigs.fasta"
]
assemblies_target_dir = "/data1/bio/projects/inicolaeva/klebsiella_infants/assemblies"

# Each sample directory sits two levels above its contigs file.
sample_dirs = sorted(
    set([os.path.dirname(os.path.dirname(i)) for i in assembly_files]))

# NOTE(review): wipes the target dir via a shell "rm -rf"; the path is a
# hard-coded constant here, but subprocess.run([...], shell=False) is safer.
_ = subprocess.getoutput("rm -rf {}".format(assemblies_target_dir))
os.makedirs(assemblies_target_dir, exist_ok=True)

assemblies_annotations = []
for sample_dir in sample_dirs:
    sample_name = os.path.basename(sample_dir)
    sample_number = Utilities.safe_findall("([0-9]+)", sample_name)
    sample_assemblies = [i for i in assembly_files if i.startswith(sample_dir)]
    assemblies_annotation = dict()
    seq_records_processed = []
    plasmid_counter = 0
    assembly_target_file = os.path.join(assemblies_target_dir,
                                        "{}_genome.fna".format(sample_name))
    for assembly_file_raw in sample_assemblies:
        for assembly_type in ASSEMBLY_TYPES:
            if os.path.dirname(assembly_file_raw).endswith(assembly_type):
                seq_records = sorted(list(
                    SeqIO.parse(assembly_file_raw, "fasta")),
                                     key=lambda x: len(x),
                                     reverse=True)
                assemblies_annotation["sample_name"] = sample_name
                assemblies_annotation["{}_file".format(
Example #13
0
 def generate_genera_dict(keywords: list):
     """Derive a genus-like word from each keyword and build a keywords dict."""
     genus_words = [Utilities.safe_findall("([A-Z][a-z]{4,})", keyword)
                    for keyword in keywords]
     return DigestAssociationsKeeper.generate_keywords_dict(genus_words)
Example #14
0
# Get the raw reads files
raw_reads_files_dir = "/data1/bio/190405_M01969_0041_000000000-C6B66/Conversion_shotgun/Klebsiella"
# Keep only FASTQ files located directly in the directory (no subdirs).
raw_reads_files_list = [i for i in Utilities.scan_whole_dir(raw_reads_files_dir) if
                        os.path.normpath(os.path.dirname(i)) == raw_reads_files_dir and i.endswith("_001.fastq.gz")]
# Split them into the two groups
raw_reads_dict = {
    i: sorted([j for j in raw_reads_files_list if "_{}_".format(i) in os.path.splitext(os.path.basename(j))[0]]) for i
    in ("R1", "R2")}
# Combine the dict into the pandas.DataFrame object
raw_sampledata_df = pd.DataFrame.from_dict(raw_reads_dict)
# Are reads files corresponding to each other?
# NOTE(review): `assert` is stripped under `python -O`; raise explicitly if
# this validation must always run.
assert all((raw_sampledata_df["R1"].str.replace("_R1_", "_R2_") == raw_sampledata_df["R2"]).values.tolist() + (
            raw_sampledata_df["R2"].str.replace("_R2_", "_R1_") == raw_sampledata_df["R1"]).values.tolist())
# Get the sample names from reads file names
# NOTE(review): in the regex, "[1|2]" also matches a literal "|" and the
# dots are unescaped — confirm the pattern is intended.
raw_sampledata_df["sample_name"] = raw_sampledata_df["R1"].map(
    lambda x: Utilities.safe_findall("(.+)_S[0-9]{2}_R[1|2]_001.fastq.gz", os.path.basename(x)))
# Export sampledata
project_describer = ProjectDescriber()
raw_sampledata_file = os.path.join(project_describer.ROOT_DIR, "sample_data", "raw_reads.sampledata")

Utilities.dump_tsv(df=raw_sampledata_df, table_file=raw_sampledata_file, col_names=["sample_name", "R1", "R2"])

print(raw_sampledata_file)  # /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata
# Create more detailed sampledata
raw_sampledata_df["reads_files"] = raw_sampledata_df.loc[:, ["R1", "R2"]].apply(lambda x: ";".join(x), axis=1)
raw_sampledata_df["taxon"] = "Klebsiella pneumoniae"
pipeline_sampledata_file = os.path.join(project_describer.ROOT_DIR, "sample_data", "raw_reads_pipeline.sampledata")

# NOTE(review): col_names lists "reads", but the column created above is
# named "reads_files" — verify dump_tsv tolerates the mismatch or fix it.
Utilities.dump_tsv(df=raw_sampledata_df, table_file=pipeline_sampledata_file,
                   col_names=["sample_name", "reads", "taxon"])