def annotate(self):
    """
    Assemble the merged VFDB annotation table.

    Joins three sources on the zero-padded VFDB ID: parsed nucleotide FASTA
    headers, parsed protein FASTA headers, and the 'VFs.xls' reference table.
    Populates 'self.merged_df' in place; no return value.
    """
    # Table assembled from nFASTA headers
    self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_nfasta_headers)]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    # Zero-fill width for VFDB IDs. Bug fix: plain max() on strings picks the
    # lexicographically greatest value (e.g. "999" over "1000"), not the longest
    zf_len = max(map(len, self._processed_nfasta_df["vfdb_id"].values.tolist()))
    # Join table assembled from pFASTA headers
    raw_pfasta_headers = []
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        for _line in _f:
            if _line.startswith(">"):
                raw_pfasta_headers.append(re.sub("^>", "", _line).strip())
    # (redundant explicit close() removed: 'with' already closes the handle)
    raw_pfasta_headers = sorted(set(i for i in raw_pfasta_headers if len(i) > 0))
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(self._mp_parse_pfasta_header, raw_pfasta_headers)]
    self._processed_pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("protein_header")
    self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df["vfdb_id"].str.zfill(zf_len)
    # Join the provided table. Note the table file is placed into the same dir
    # with the merged protein FASTA file
    vfs_table_file = os.path.join(os.path.dirname(self._raw_pfasta_file), "VFs.xls")
    vfs_df = pd.read_excel(vfs_table_file, sheet_name="VFs", header=1).fillna("")
    # Raw string avoids the invalid-escape-sequence warning for '\d'
    vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract(r"VF(\d+)")[0].str.zfill(zf_len)
    self.merged_df = pd.concat(
        [i.set_index("vfdb_id").sort_index()
         for i in [self._processed_nfasta_df, self._processed_pfasta_df, vfs_df]],
        axis=1, sort=False).sort_index()
    self.merged_df.index.names = ["vfdb_id"]
    # Drop rows that did not come from the nucleotide FASTA side
    self.merged_df = self.merged_df.loc[self.merged_df["former_id"].str.len() > 0].reset_index()
    self.merged_df = Utilities.left_merge(self._raw_nfasta_df, self.merged_df, "former_id")
def annotate(self):
    """
    Assemble the merged TADB annotation table.

    Parses nucleotide FASTA headers into 'self.nfasta_df' and protein FASTA
    headers into 'self.pfasta_df', merges both into 'self.merged_df', and
    collapses duplicate reference rows. No return value.
    """
    # Process nucleotide FASTA
    self._raw_nfasta_df = pd.read_table(self.annotation_file, sep="\t", header=0)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_nfasta_headers)]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df, "former_id")
    # Process protein FASTA.
    # Bug fix: the original iterated an open() handle inside a comprehension and
    # never closed it; a context manager guarantees the file is released.
    raw_pfasta_headers = []
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as handle:
        for line in handle:
            if line.startswith(">"):
                header = re.sub("^>", "", line).strip()
                if len(header) > 0:
                    raw_pfasta_headers.append(header)
    raw_pfasta_headers = sorted(set(raw_pfasta_headers))
    # NOTE(review): protein headers are parsed with the *nucleotide* header
    # parser; the column rename below suggests this is intentional — confirm
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_pfasta_headers)]
    self.pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("former_id")
    self.pfasta_df.rename(columns={"geninfo_id": "protein_geninfo_id", "refseq_id": "genpept_id",
                                   "description": "protein_description", "host": "protein_host"},
                          inplace=True)
    self.merged_df = Utilities.left_merge(self.nfasta_df, self.pfasta_df,
                                          "tadb_id", "category", "gene_symbol")
    self.merged_df = Utilities.combine_duplicate_rows(self.merged_df, "reference_id")
def annotate(self):
    """
    Assemble the CARD annotation table.

    Parses nucleotide FASTA headers, then successively left-joins the ARO
    index, ARO category index, and ARO ontology reference tables, finally
    collapsing duplicate reference rows. Populates 'self.nfasta_df' in place;
    no return value.
    """
    self.annotation_file = self.describer.get_refdata_dict().get("sequence_1").annotation_file
    self._raw_nfasta_df = pd.read_table(self.annotation_file, sep='\t', header=0)
    mp_result = Utilities.multi_core_queue(
        self._mp_parse_nfasta_header, self._raw_nfasta_df["former_id"].values.tolist())
    self._processed_nfasta_df = Utilities.merge_pd_series_list(mp_result).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df, "former_id")
    # Join 'aro_index.tsv'
    aro_index_df = pd.read_table(os.path.join(self.reference_dir, "data", "aro_index.tsv"),
                                 sep='\t', header=0)
    # Raw string avoids the invalid-escape-sequence warning for '\d'
    aro_index_df["aro_id"] = aro_index_df["ARO Accession"].str.extract(r"ARO:(\d+)")
    # 'aro_index.tsv' has more entries than 'nucleotide_fasta_protein_homolog_model.fasta' provides
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_index_df, "aro_id")
    # Join 'aro_categories_index.tsv'
    aro_categories_index_df = pd.read_table(
        os.path.join(self.reference_dir, "data", "aro_categories_index.tsv"), sep='\t', header=0)
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_categories_index_df, "Protein Accession")
    # Joining 'aro_categories.tsv' is useless: the resulting 'ARO Category' is filled by NaN
    # Join 'aro.tsv'
    aro_df = pd.read_table(os.path.join(self.reference_dir, "ontology", "aro.tsv"),
                           sep='\t', header=0)
    aro_df.rename(columns={"Accession": "ARO Accession", "Name": "ARO Name"}, inplace=True)
    self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_df, "ARO Accession")
    self.nfasta_df = Utilities.combine_duplicate_rows(self.nfasta_df, "reference_id")
def digest_df(self, df: pd.DataFrame, associations: dict, columns_with_keywords: list,
              include_key: bool = True, all_in_lowercase: bool = False, strict: bool = False):
    """
    Group and sum DataFrame value columns by keyword matches.

    :param df: Pandas DataFrame object containing an index, keyword columns and value columns
    :param associations: Dictionary '{key: (keywords...)}'
    :param columns_with_keywords: List of columns to search keywords
    :param include_key: Should the key of keyword group be included?
    :param all_in_lowercase: Convert both strings to lowercase?
    :param strict: Only count full match
    :return: Pandas DataFrame object with keys as index and columns sums as values and
             dictionary with corresponding intermediate grouped Pandas DataFrame objects
    """
    def __regular_search(s: str):
        # True if any keyword occurs as a substring of the lookup string
        return any(i in str(s) for i in key_words)

    def __strict_search(s: str):
        # True only when the whole lookup string equals a keyword
        return any(i == str(s) for i in key_words)

    df = df.copy()
    df_columns = list(df)
    columns_with_keywords = Utilities.remove_empty_values(list(columns_with_keywords))
    columns_without_keywords = Utilities.remove_empty_values(
        [i for i in df_columns if i not in columns_with_keywords])
    if len(columns_with_keywords) == 0:
        print("No column for keyword search specified!")
        return
    try:
        # 'columns_with_keywords' might be more than 1; fuse them into one
        # space-joined lookup string per row
        df["lookup_column"] = df.loc[:, columns_with_keywords].astype(str).apply(
            lambda x: " ".join(self.prepare_list(x, lowercase=all_in_lowercase)), axis=1)
    except KeyError as e:
        print(e, list(df), associations, columns_with_keywords)
    keywords_series = []
    # Collect per-keyword frames and concatenate once at the end
    # (the original re-concatenated inside the loop, which is quadratic)
    raw_values_frames = []
    for main_word in associations:
        # Treat a missing or empty keyword group as an empty tuple
        key_words = associations.get(main_word) or ()
        if include_key:
            key_words = list(key_words) + [main_word, ]
        key_words = sorted(set(self.prepare_list(key_words, lowercase=all_in_lowercase)))
        if len(key_words) == 0:
            raise ValueError("No values to search: '{}: {}'".format(main_word, key_words))
        # Boolean masks are used directly; comparing them '== True' was redundant
        search = __strict_search if strict else __regular_search
        df_with_keywords = df.loc[df["lookup_column"].apply(search), columns_without_keywords]
        keywords_series.append(df_with_keywords.sum().rename(main_word))
        # Reset index to avoid exceptions thrown by duplicates
        raw_values_df = df_with_keywords.reset_index()
        raw_values_df["keyword"] = main_word
        raw_values_frames.append(raw_values_df)
    if raw_values_frames:
        raw_values_out = pd.concat(raw_values_frames, axis=0, ignore_index=True)
    else:
        raw_values_out = pd.DataFrame()
    out_df = Utilities.merge_pd_series_list(keywords_series).fillna(0)
    out_df.columns.name = "value"
    out_df.index.name = "keyword"
    return out_df, raw_values_out