def load(self):
    """Load VFDB reference data: FASTA headers and the VFs description table.

    Populates ``self.nucleotide_header_df``, ``self.protein_header_df`` and
    ``self.vfs_df`` from files located under ``self.reference_dir``.
    """
    super().load()

    # Parse nucleotide FASTA headers in parallel across all available cores.
    timer = perf_counter()
    nucleotide_records = jb.Parallel(n_jobs=-1)(
        jb.delayed(mp_parse_nfasta_header)(header)
        for header in self.raw_nucleotide_fasta_headers
    )
    self.nucleotide_header_df = pd.DataFrame(nucleotide_records)
    print(
        f"Nucleotide FASTA headers parsed into table with shape {self.nucleotide_header_df.shape} with {count_elapsed_seconds(timer)}"
    )

    # Parse protein FASTA headers the same way.
    timer = perf_counter()
    pfasta_file = find_file_by_tail(self.reference_dir, "VFDB_setB_pro.fas")
    print(f"Use the protein FASTA file: '{pfasta_file}'")
    raw_pfasta_headers = load_headers_from_fasta(pfasta_file)
    print(f"Loaded {len(raw_pfasta_headers)} protein FASTA headers")
    protein_records = jb.Parallel(n_jobs=-1)(
        jb.delayed(mp_parse_pfasta_header)(header)
        for header in raw_pfasta_headers
    )
    self.protein_header_df = pd.DataFrame(protein_records)
    print(
        f"Protein FASTA headers parsed into table with shape {self.protein_header_df.shape} with {count_elapsed_seconds(timer)}"
    )

    # Load the VFs description spreadsheet; the legacy .xls format needs the
    # xlrd engine.  Missing cells become empty strings.
    vfs_table_file = find_file_by_tail(self.reference_dir, "VFs.xls")
    print(f"Use the VFs description file: '{vfs_table_file}'")
    self.vfs_df = pd.read_excel(vfs_table_file, engine="xlrd", header=1).fillna("")
    print(f"Loaded VFs description table with shape {self.vfs_df.shape}")
    # Extract the numeric part of the VFID (e.g. "VF0001" -> 1) for joins.
    self.vfs_df["vfdb_number"] = (
        self.vfs_df["VFID"].str.extract("([0-9]+)").astype(int)
    )
def load(self):
    """Load MvirDB reference data: FASTA headers and the description table.

    Populates ``self.nucleotide_header_df`` and ``self.reference_df`` from
    files located under ``self.reference_dir``.
    """
    super().load()

    # Parallel parse of the raw nucleotide FASTA headers.
    timer = perf_counter()
    header_records = jb.Parallel(n_jobs=-1)(
        jb.delayed(mp_parse_nfasta_header)(header)
        for header in self.raw_nucleotide_fasta_headers
    )
    self.nucleotide_header_df = pd.DataFrame(header_records)
    print(
        f"Nucleotide FASTA headers parsed into table with shape {self.nucleotide_header_df.shape} with {count_elapsed_seconds(timer)}"
    )

    reference_file = find_file_by_tail(self.reference_dir, "completeMvirDBTable.txt")
    print(f"Use the reference description file: '{reference_file}'")
    # The python engine tolerates ragged rows; malformed lines are warned
    # about instead of aborting the load.
    raw_reference_df = pd.read_csv(
        reference_file,
        engine="python",
        header=0,
        on_bad_lines="warn",
        sep="\t",
    )
    self.reference_df = raw_reference_df.rename(
        columns={"#Virulence Factor ID": "vfid"}
    ).sort_values("vfid")
    print(
        f"Loaded reference description table with shape {self.reference_df.shape}"
    )
def find_and_load_refdata(directory: str):
    """Locate a ``*_refdata.json`` file in *directory* and load it.

    :param directory: directory to search for a ReferenceData-compatible file
    :return: the loaded ``ReferenceData`` instance
    :raises ValueError: if no matching file is found in the directory
    """
    print(f"Looking for '*refdata.json' in '{directory}'")
    refdata_file = find_file_by_tail(directory, "_refdata.json")
    # Truthiness covers both an empty string and an empty list result.
    if not refdata_file:
        # Carry the failure reason in the exception itself instead of
        # printing it and raising a message-less ValueError.
        raise ValueError(
            f"Cannot find a ReferenceData compatible file within the directory '{directory}'"
        )
    print(f"Reference data found at '{refdata_file}'")
    return ReferenceData.load(refdata_file)
def retrieve(self):
    """Download the latest release files and symlink the default nucleotide FASTA."""
    # Scrape the download page first if it has not been fetched yet.
    if len(self.download_page_soup) == 0:
        self.get_latest_version()

    # Gather every anchor href, then keep only non-HTML targets as
    # absolute download URLs.
    hrefs = [
        anchor["href"]
        for anchor in self.download_page_soup.find_all("a", href=True)
    ]
    self.download_links = [
        urljoin(self.DOMAIN_ROOT, href)
        for href in hrefs
        if not href.endswith("htm")
    ]
    self.download()

    downloaded_nfasta = find_file_by_tail(
        self.REFERENCE_DOWNLOAD_DIRECTORY, "VFDB_setB_nt.fas"
    )
    self.create_nucleotide_fasta_symlink(downloaded_nfasta, default_nfasta=True)
def create_sampledata_dict_from_dir(directory: str,
                                    reads_extension: str = DEFAULT_REGEX):
    """Group read files found in *directory* into a per-sample data dict.

    :param directory: directory to scan for read files
    :param reads_extension: file extension used to match read files
        (leading/trailing dots are stripped before matching)
    :return: dict mapping sample name to
        ``{"name": <sample>, "reads": [<sorted files>], "taxa": ""}``
    """
    reads_files = find_file_by_tail(directory,
                                    ".{}".format(reads_extension.strip(".")),
                                    multiple=True)
    out = {}
    for token_dict in (tokenize_reads_file_name(i) for i in reads_files):
        sample_name = token_dict["sample_name"]
        # `setdefault` creates the entry on first sight; sorting a
        # one-element list is a no-op, so appending then sorting
        # unconditionally matches the original two-branch behavior
        # (and avoids the non-idiomatic `in out.keys()` lookup).
        entry = out.setdefault(
            sample_name, {"name": sample_name, "reads": [], "taxa": ""}
        )
        entry["reads"].append(token_dict["reads_file"])
        entry["reads"].sort()
    return out
help="nBee stage directory") parser.add_argument("-o", "--output_file", metavar="<dir>", required=True, help="Output file") _namespace = parser.parse_args() return (_namespace.rgi_dir, _namespace.card_version, _namespace.nbee_dir, _namespace.output_file) if __name__ == '__main__': (rgi_dir, card_version, nbee_dir, out_file) = _parse_args() # RGI rgi_tables = find_file_by_tail(dir_name=rgi_dir, multiple=True, tail=".txt") merged_rgi_df = pd.DataFrame() for rgi_table in rgi_tables: rgi_df = load_tsv(rgi_table) if rgi_df.shape[0] == 0: continue rgi_df = remove_longest_columns(rgi_df, CELL_SIZE_LIMIT) columns = rgi_df.columns.tolist() rgi_df["sample_name"] = filename_only(rgi_table) rgi_df = rgi_df.loc[:, ["sample_name"] + columns] print( f"Concatenate dataframes with shapes {rgi_df.shape}, {merged_rgi_df.shape}" ) merged_rgi_df = pd.concat([merged_rgi_df, rgi_df], axis=0,