def annotate(self):
    """Assemble ``self.merged_df`` from three sources joined on ``vfdb_id``.

    Sources, in order:

    1. the annotation table at ``self.annotation_file`` (its ``former_id``
       column holds the nucleotide FASTA headers, which are parsed with
       ``self._mp_parse_nfasta_header``);
    2. the ``>`` header lines of the protein FASTA ``self._raw_pfasta_file``
       (parsed with ``self._mp_parse_pfasta_header``);
    3. the ``VFs`` sheet of ``VFs.xls``, expected in the same directory as
       the protein FASTA file.

    Side effects: sets ``self._raw_nfasta_df``, ``self._processed_nfasta_df``,
    ``self._processed_pfasta_df`` and ``self.merged_df``.
    """
    # --- Nucleotide FASTA headers ---
    self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    # Zero-padding width must be the length of the LONGEST ID. Taking
    # len(max(ids)) would measure the lexicographically greatest string
    # instead (e.g. max(["9", "10"]) is "9"), giving too small a width.
    zf_len = max(
        len(i) for i in self._processed_nfasta_df["vfdb_id"].values.tolist())

    # --- Join table assembled from pFASTA headers ---
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        # Keep only header lines, drop the leading ">" and surrounding
        # whitespace; the context manager closes the file.
        raw_pfasta_headers = [
            _line[1:].strip() for _line in _f if _line.startswith(">")
        ]
    # Deduplicate, discard empty headers, and fix the order.
    raw_pfasta_headers = sorted({i for i in raw_pfasta_headers if len(i) > 0})
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(
            self._mp_parse_pfasta_header, raw_pfasta_headers)
    ]
    self._processed_pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("protein_header")
    self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df[
        "vfdb_id"].str.zfill(zf_len)

    # --- Join provided table ---
    # Note the table file is placed into the same dir as the merged
    # protein FASTA file.
    vfs_table_file = os.path.join(
        os.path.dirname(self._raw_pfasta_file), "VFs.xls")
    vfs_df = pd.read_excel(
        vfs_table_file, sheet_name="VFs", header=1).fillna("")
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract(
        r"VF(\d+)")[0].str.zfill(zf_len)

    # Column-wise concatenation aligned on the shared "vfdb_id" index.
    self.merged_df = pd.concat(
        [
            i.set_index("vfdb_id").sort_index()
            for i in [self._processed_nfasta_df,
                      self._processed_pfasta_df,
                      vfs_df]
        ],
        axis=1, sort=False).sort_index()
    self.merged_df.index.names = ["vfdb_id"]
    # Keep only rows that carry a non-empty nucleotide header.
    self.merged_df = self.merged_df.loc[
        self.merged_df["former_id"].str.len() > 0].reset_index()
    self.merged_df = Utilities.left_merge(
        self._raw_nfasta_df, self.merged_df, "former_id")
def annotate(self):
    """Assemble ``self.merged_df`` from nucleotide and protein FASTA headers.

    Reads the tab-separated annotation table at ``self.annotation_file``,
    parses its ``former_id`` column (the nucleotide FASTA headers), then
    parses the ``>`` header lines of ``self._raw_pfasta_file`` and
    left-merges the protein columns onto the nucleotide table.

    Side effects: sets ``self._raw_nfasta_df``, ``self._processed_nfasta_df``,
    ``self.nfasta_df``, ``self.pfasta_df`` and ``self.merged_df``.
    """
    # --- Process nucleotide FASTA ---
    self._raw_nfasta_df = pd.read_table(
        self.annotation_file, sep="\t", header=0)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    self.nfasta_df = Utilities.left_merge(
        self._raw_nfasta_df, self._processed_nfasta_df, "former_id")

    # --- Process protein FASTA ---
    # Read under a context manager so the file handle is always closed
    # (previously it was opened inside a comprehension and never closed).
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        stripped_headers = [
            re.sub("^>", "", _line).strip()
            for _line in _f if _line.startswith(">")
        ]
    # Deduplicate, discard empty headers, and fix the order.
    raw_pfasta_headers = sorted({j for j in stripped_headers if len(j) > 0})
    # NOTE(review): protein headers are parsed with the *nucleotide* header
    # parser here — presumably both header kinds share one format; confirm.
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i)
        for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_pfasta_headers)
    ]
    self.pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("former_id")
    # Rename protein-side columns so they do not collide with the
    # nucleotide-side columns of the same name during the merge.
    self.pfasta_df.rename(
        columns={
            "geninfo_id": "protein_geninfo_id",
            "refseq_id": "genpept_id",
            "description": "protein_description",
            "host": "protein_host",
        },
        inplace=True)

    self.merged_df = Utilities.left_merge(
        self.nfasta_df, self.pfasta_df, "tadb_id", "category", "gene_symbol")
    self.merged_df = Utilities.combine_duplicate_rows(
        self.merged_df, "reference_id")
def query2fasta(self):
    """Collect FASTA records and annotations for rows matching ``self._gene``.

    Every item of ``self._row_soups_list`` is converted to a dict with
    ``self.parse_table_row``. A row is kept when ``self._gene`` occurs
    (case-insensitively) in its "Name", "Description" or "Aliases" value
    and its "Sequence Location" splits on ".." into exactly two parts.

    Returns:
        dict with two parallel lists: "FASTAs_list" (results of
        ``self._get_fasta``) and "annotations_series_list" (one
        ``pd.Series`` per kept row, named after the FASTA header).
    """
    searched_fields = ("Name", "Description", "Aliases")
    fastas = []
    annotations = []
    for row_soup in self._row_soups_list:
        row = self.parse_table_row(row_soup)
        location_parts = row["Sequence Location"].split("..")
        needle = self._gene.lower()
        # any() stops at the first hit, so later fields are not inspected.
        mentions_gene = any(
            needle in row[field].lower() for field in searched_fields)
        if not mentions_gene or len(location_parts) != 2:
            continue
        fasta = self._get_fasta(row["Sequence ID"], location_parts)
        fastas.append(fasta)
        annotation = pd.Series(
            Utilities.dict2pd_series(row), name=fasta.header)
        annotations.append(annotation)
    return {"FASTAs_list": fastas, "annotations_series_list": annotations}
def _mp_parse_nfasta_header(header):
    """Parse one pipe-delimited nucleotide FASTA header into annotation fields.

    Args:
        header: the header string (without the leading ">").

    Returns:
        The result of ``Utilities.dict2pd_series`` applied to a dict with
        the keys ``former_id`` (the unmodified header), ``genbank_id``,
        ``is_antisense_strand``, ``locus``, ``aro_id``, ``host``,
        ``gene_description`` and ``gene_symbol``.

    Headers with fewer than three "|"-separated fields yield
    ``is_antisense_strand=False``, and descriptions without any qualifying
    token yield ``gene_symbol=""``, rather than raising.
    """
    _MIN_GENE_SYMBOL_LENGTH = 3
    _NON_GENE_SYMBOL_WORDS = ("DNA", "RNA")

    chunks = header.split("|")
    output_dict = dict(former_id=header)
    # Regex patterns are raw strings: "\|", "\d", "\[" in plain literals
    # are invalid escape sequences.
    output_dict["genbank_id"] = Utilities.safe_findall(
        r"^gb\|([^|]+)", header)
    # The third field starts with "-" for the antisense strand; guard
    # against short headers instead of failing with IndexError.
    output_dict["is_antisense_strand"] = (
        len(chunks) > 2 and chunks[2].startswith("-"))
    output_dict["locus"] = Utilities.safe_findall(r"\|(\d+\-\d+)", header)
    output_dict["aro_id"] = Utilities.safe_findall(r"\|ARO:(\d+)", header)

    # The last field holds the free-text description with "[host]" inside.
    gene_chunk = chunks[-1]
    output_dict["host"] = Utilities.safe_findall(r"\[(.+)\]", gene_chunk)
    output_dict["gene_description"] = gene_chunk.replace(
        "[{}]".format(output_dict["host"]), "").strip()

    # Gene symbol heuristic: the shortest word of the description that is
    # long enough and is not a generic word such as "DNA" / "RNA".
    symbol_candidates = [
        word
        for word in (
            token.strip()
            for token in output_dict["gene_description"].split(" "))
        if len(word) >= _MIN_GENE_SYMBOL_LENGTH
        and word not in _NON_GENE_SYMBOL_WORDS
    ]
    # default="" keeps min() from raising ValueError when nothing qualifies.
    output_dict["gene_symbol"] = min(symbol_candidates, key=len, default="")
    return Utilities.dict2pd_series(output_dict)