Example #1
    def load(self):
        super().load()
        start = perf_counter()
        parsed_nfasta_headers = jb.Parallel(n_jobs=-1)(
            jb.delayed(mp_parse_nfasta_header)(i)
            for i in self.raw_nucleotide_fasta_headers)
        self.nucleotide_header_df = pd.DataFrame(parsed_nfasta_headers)
        print(
            f"Nucleotide FASTA headers parsed into a table with shape {self.nucleotide_header_df.shape} in {count_elapsed_seconds(start)}"
        )

        start = perf_counter()
        pfasta_file = find_file_by_tail(self.reference_dir,
                                        "VFDB_setB_pro.fas")
        print(f"Use the protein FASTA file: '{pfasta_file}'")
        raw_pfasta_headers = load_headers_from_fasta(pfasta_file)
        print(f"Loaded {len(raw_pfasta_headers)} protein FASTA headers")
        parsed_pfasta_headers = jb.Parallel(n_jobs=-1)(
            jb.delayed(mp_parse_pfasta_header)(i) for i in raw_pfasta_headers)
        self.protein_header_df = pd.DataFrame(parsed_pfasta_headers)
        print(
            f"Protein FASTA headers parsed into a table with shape {self.protein_header_df.shape} in {count_elapsed_seconds(start)}"
        )

        vfs_table_file = find_file_by_tail(self.reference_dir, "VFs.xls")
        print(f"Use the VFs description file: '{vfs_table_file}'")
        self.vfs_df = pd.read_excel(vfs_table_file, engine="xlrd",
                                    header=1).fillna("")
        print(f"Loaded VFs description table with shape {self.vfs_df.shape}")
        self.vfs_df["vfdb_number"] = self.vfs_df["VFID"].str.extract(
            "([0-9]+)").astype(int)
    def load(self):
        super().load()
        start = perf_counter()
        parsed_nfasta_headers = jb.Parallel(n_jobs=-1)(
            jb.delayed(mp_parse_nfasta_header)(i)
            for i in self.raw_nucleotide_fasta_headers)
        self.nucleotide_header_df = pd.DataFrame(parsed_nfasta_headers)
        print(
            f"Nucleotide FASTA headers parsed into a table with shape {self.nucleotide_header_df.shape} in {count_elapsed_seconds(start)}"
        )

        reference_file = find_file_by_tail(self.reference_dir,
                                           "completeMvirDBTable.txt")
        print(f"Use the reference description file: '{reference_file}'")
        self.reference_df = pd.read_csv(
            reference_file,
            engine="python",
            header=0,
            on_bad_lines="warn",
            sep="\t").rename(columns={
                "#Virulence Factor ID": "vfid"
            }).sort_values("vfid")
        print(
            f"Loaded reference description table with shape {self.reference_df.shape}"
        )
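
Both load() variants above fan header parsing out with joblib. A self-contained sketch of that pattern follows; parse_header is a made-up stand-in for the project's mp_parse_nfasta_header, and the sample headers are invented:

import joblib as jb
import pandas as pd
from time import perf_counter

def parse_header(header: str) -> dict:
    # Stand-in parser: the real mp_parse_nfasta_header extracts
    # VFDB-specific fields; here we only split off the leading token.
    token, _, description = header.partition(" ")
    return {"id": token, "description": description}

if __name__ == "__main__":
    headers = [
        "VFG000001 example virulence factor 1",
        "VFG000002 example virulence factor 2",
    ]
    start = perf_counter()
    rows = jb.Parallel(n_jobs=-1)(
        jb.delayed(parse_header)(i) for i in headers)
    df = pd.DataFrame(rows)
    print(f"Parsed {df.shape[0]} headers in {perf_counter() - start:.2f} s")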
Example #3
def find_and_load_refdata(directory: str):
    print(f"Looking for '*refdata.json' in '{directory}'")
    refdata_file = find_file_by_tail(directory, "_refdata.json")
    if len(refdata_file) == 0:
        raise ValueError(
            f"Cannot find a ReferenceData-compatible file within the directory '{directory}'"
        )
    print(f"Reference data found at '{refdata_file}'")
    return ReferenceData.load(refdata_file)
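
Every example here leans on the project helper find_file_by_tail, whose implementation is not shown. Below is a minimal sketch of a compatible version, inferred from the call sites (suffix match, empty string on a miss, sorted list when multiple=True); it is not the actual project code:

import os

def find_file_by_tail(dir_name: str, tail: str, multiple: bool = False):
    # Inferred behaviour only: suffix match over regular files in dir_name
    hits = sorted(
        os.path.join(dir_name, name) for name in os.listdir(dir_name)
        if name.endswith(tail) and os.path.isfile(os.path.join(dir_name, name))
    )
    if multiple:
        return hits
    return hits[0] if hits else ""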
Example #4
def retrieve(self):
    if len(self.download_page_soup) == 0:
        self.get_latest_version()
    # Collect absolute download URLs, skipping links back to HTML pages
    self.download_links = [
        urljoin(self.DOMAIN_ROOT, i["href"])
        for i in self.download_page_soup.find_all("a", href=True)
        if not i["href"].endswith("htm")
    ]
    self.download()
    downloaded_nfasta = find_file_by_tail(
        self.REFERENCE_DOWNLOAD_DIRECTORY, "VFDB_setB_nt.fas")
    self.create_nucleotide_fasta_symlink(downloaded_nfasta,
                                         default_nfasta=True)
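
The link-collection step in retrieve() is ordinary BeautifulSoup scraping. An isolated, runnable sketch with made-up HTML (the real download page and DOMAIN_ROOT are not reproduced here):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = """
<a href="VFDB_setB_nt.fas.gz">nucleotide</a>
<a href="VFDB_setB_pro.fas.gz">protein</a>
<a href="download.htm">back</a>
"""
soup = BeautifulSoup(html, "html.parser")
links = [
    urljoin("http://example.org/VFs/", a["href"])
    for a in soup.find_all("a", href=True)
    if not a["href"].endswith("htm")
]
print(links)  # the trailing .htm link is filtered out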
Example #5
def create_sampledata_dict_from_dir(directory: str,
                                    reads_extension: str = DEFAULT_REGEX):
    reads_files = find_file_by_tail(directory,
                                    ".{}".format(reads_extension.strip(".")),
                                    multiple=True)
    tokenized_reads_files = [tokenize_reads_file_name(i) for i in reads_files]

    out = dict()
    # Group read files by sample name; appending keeps mates of a pair together
    for token_dict in tokenized_reads_files:
        sample_name = token_dict["sample_name"]
        if sample_name in out:
            out[sample_name]["reads"].append(token_dict["reads_file"])
            out[sample_name]["reads"].sort()
        else:
            out[sample_name] = {
                "name": sample_name,
                "reads": [
                    token_dict["reads_file"],
                ],
                "taxa": ""
            }
    return out
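
A hypothetical call to show the shape of the returned mapping; the file names are invented, and the exact tokens depend on tokenize_reads_file_name and DEFAULT_REGEX, which are not shown:

# Assumes tokenize_reads_file_name("/data/reads/sample_A_R1.fastq.gz") yields
# {"sample_name": "sample_A", "reads_file": "/data/reads/sample_A_R1.fastq.gz"}
sampledata = create_sampledata_dict_from_dir("/data/reads", "fastq.gz")
# sampledata == {
#     "sample_A": {
#         "name": "sample_A",
#         "reads": ["/data/reads/sample_A_R1.fastq.gz",
#                   "/data/reads/sample_A_R2.fastq.gz"],
#         "taxa": "",
#     },
# }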
Example #6
                        help="nBee stage directory")
    parser.add_argument("-o",
                        "--output_file",
                        metavar="<dir>",
                        required=True,
                        help="Output file")
    _namespace = parser.parse_args()
    return (_namespace.rgi_dir, _namespace.card_version, _namespace.nbee_dir,
            _namespace.output_file)


if __name__ == '__main__':
    (rgi_dir, card_version, nbee_dir, out_file) = _parse_args()
    # RGI
    rgi_tables = find_file_by_tail(dir_name=rgi_dir,
                                   multiple=True,
                                   tail=".txt")
    merged_rgi_df = pd.DataFrame()
    for rgi_table in rgi_tables:
        rgi_df = load_tsv(rgi_table)
        if rgi_df.shape[0] == 0:
            continue
        rgi_df = remove_longest_columns(rgi_df, CELL_SIZE_LIMIT)
        columns = rgi_df.columns.tolist()
        rgi_df["sample_name"] = filename_only(rgi_table)
        rgi_df = rgi_df.loc[:, ["sample_name"] + columns]
        print(
            f"Concatenating dataframes with shapes {rgi_df.shape} and {merged_rgi_df.shape}"
        )
        merged_rgi_df = pd.concat([merged_rgi_df, rgi_df],
                                  axis=0,