def annotate(self):
     """Assemble ``self.merged_df`` from three annotation sources keyed by ``vfdb_id``.

     Steps:
       1. Load the TSV at ``self.annotation_file`` and parse every value of
          its ``former_id`` column with ``self._mp_parse_nfasta_header``.
       2. Collect the ``>`` header lines of ``self._raw_pfasta_file`` and
          parse them with ``self._mp_parse_pfasta_header``.
       3. Read the ``VFs`` sheet of ``VFs.xls``, which is expected in the
          same directory as the protein FASTA file.
       4. Concatenate the three tables column-wise on ``vfdb_id``, keep only
          rows whose ``former_id`` is a non-empty string, and merge the
          result onto the raw annotation table via
          ``Utilities.left_merge(..., "former_id")``.

     Sets ``self._raw_nfasta_df``, ``self._processed_nfasta_df``,
     ``self._processed_pfasta_df`` and ``self.merged_df``; returns ``None``.
     """
     self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
     raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
     processed_nfasta_headers = [
         Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
             self._mp_parse_nfasta_header, raw_nfasta_headers)
     ]
     self._processed_nfasta_df = Utilities.merge_pd_series_list(
         processed_nfasta_headers).sort_values("former_id")
     # Zero-padding width = length of the LONGEST nucleotide-derived ID.
     # (Taking len() of the lexicographic max would pick e.g. "9" over
     # "1234" and yield a too-small width.)
     zf_len = max(
         len(i) for i in self._processed_nfasta_df["vfdb_id"].values.tolist())
     # Join table assembled from pFASTA headers: unique, non-empty, sorted.
     # The context manager closes the file; no explicit close() is needed.
     with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
         raw_pfasta_headers = sorted({
             _line[1:].strip() for _line in _f if _line.startswith(">")
         } - {""})
     processed_pfasta_headers = [
         Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
             self._mp_parse_pfasta_header, raw_pfasta_headers)
     ]
     self._processed_pfasta_df = Utilities.merge_pd_series_list(
         processed_pfasta_headers).sort_values("protein_header")
     # Pad the protein-side IDs to the same width as the nucleotide-side IDs
     # so that they can serve as a common index below.
     self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df[
         "vfdb_id"].str.zfill(zf_len)
     # Join provided table. Note the table file placed into the same dir with the merged protein FASTA file
     vfs_table_file = os.path.join(os.path.dirname(self._raw_pfasta_file),
                                   "VFs.xls")
     vfs_df = pd.read_excel(vfs_table_file, sheet_name="VFs",
                            header=1).fillna("")
     # Numeric part of e.g. "VF0001", padded like the other two tables.
     vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract(
         r"VF(\d+)")[0].str.zfill(zf_len)
     indexed_tables = [
         i.set_index("vfdb_id").sort_index() for i in
         (self._processed_nfasta_df, self._processed_pfasta_df, vfs_df)
     ]
     self.merged_df = pd.concat(indexed_tables, axis=1,
                                sort=False).sort_index()
     self.merged_df.index.names = ["vfdb_id"]
     # Drop rows that have no nucleotide counterpart: their "former_id" is
     # missing after the column-wise concat, so the length test is False.
     self.merged_df = self.merged_df.loc[
         self.merged_df["former_id"].str.len() > 0].reset_index()
     self.merged_df = Utilities.left_merge(self._raw_nfasta_df,
                                           self.merged_df, "former_id")
Пример #2
0
 def annotate(self):
     """Build ``self.merged_df`` from the nucleotide annotation and protein FASTA headers.

     The TSV at ``self.annotation_file`` is loaded and each ``former_id`` is
     parsed with ``self._mp_parse_nfasta_header``; the parsed columns are
     merged back onto the raw table as ``self.nfasta_df``. The ``>`` header
     lines of ``self._raw_pfasta_file`` are parsed the same way into
     ``self.pfasta_df``, whose columns are renamed with protein-specific
     names. Both tables are then combined via ``Utilities.left_merge`` on
     ``tadb_id``, ``category`` and ``gene_symbol`` and passed through
     ``Utilities.combine_duplicate_rows(..., "reference_id")``.

     Sets ``self._raw_nfasta_df``, ``self._processed_nfasta_df``,
     ``self.nfasta_df``, ``self.pfasta_df`` and ``self.merged_df``;
     returns ``None``.
     """
     # Process nucleotide FASTA
     self._raw_nfasta_df = pd.read_table(self.annotation_file, sep="\t", header=0)
     raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
     processed_nfasta_headers = [Utilities.dict2pd_series(i) for i in
                                 Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_nfasta_headers)]
     self._processed_nfasta_df = Utilities.merge_pd_series_list(processed_nfasta_headers).sort_values("former_id")
     self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df, "former_id")
     # Process protein FASTA: unique, non-empty, sorted header lines.
     # The file is read under a context manager so the handle is closed
     # deterministically instead of being left to the garbage collector.
     with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as pfasta_handle:
         raw_pfasta_headers = sorted({
             line[1:].strip() for line in pfasta_handle if line.startswith(">")
         } - {""})
     # NOTE(review): protein headers are parsed with the *nucleotide* header
     # parser; the rename below suggests this is deliberate -- confirm.
     processed_pfasta_headers = [Utilities.dict2pd_series(i) for i in
                                 Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_pfasta_headers)]
     self.pfasta_df = Utilities.merge_pd_series_list(processed_pfasta_headers).sort_values("former_id")
     # Give the protein-side columns distinct names so they do not collide
     # with the nucleotide-side columns in the merge.
     self.pfasta_df.rename(columns={"geninfo_id": "protein_geninfo_id", "refseq_id": "genpept_id",
                                    "description": "protein_description", "host": "protein_host"}, inplace=True)
     self.merged_df = Utilities.left_merge(self.nfasta_df, self.pfasta_df, "tadb_id", "category", "gene_symbol")
     self.merged_df = Utilities.combine_duplicate_rows(self.merged_df, "reference_id")
Пример #3
0
 def query2fasta(self):
     """Collect FASTA records and annotations for table rows matching ``self._gene``.

     Every element of ``self._row_soups_list`` is converted to a dict with
     ``self.parse_table_row``. A row is kept when ``self._gene`` occurs
     (case-insensitively) in its "Name", "Description" or "Aliases" value
     and its "Sequence Location" splits on ".." into exactly two parts.

     Returns a dict with two parallel lists:
       * "FASTAs_list" -- results of ``self._get_fasta`` for kept rows;
       * "annotations_series_list" -- one ``pd.Series`` per kept row, named
         after the ``header`` attribute of the corresponding FASTA.
     """
     searched_fields = ("Name", "Description", "Aliases")
     fastas = []
     annotations = []
     for row_soup in self._row_soups_list:
         row = self.parse_table_row(row_soup)
         locations = row["Sequence Location"].split("..")
         # Filtering expression: gene mentioned in any searched field ...
         gene_mentioned = any(
             self._gene.lower() in row[field].lower()
             for field in searched_fields)
         # ... and the location is a plain "start..end" pair.
         if not gene_mentioned or len(locations) != 2:
             continue
         fasta = self._get_fasta(row["Sequence ID"], locations)
         fastas.append(fasta)
         annotation = pd.Series(Utilities.dict2pd_series(row),
                                name=fasta.header)
         annotations.append(annotation)
     return {"FASTAs_list": fastas, "annotations_series_list": annotations}
Пример #4
0
 def _mp_parse_nfasta_header(header):
     """Parse one pipe-delimited nucleotide FASTA header into annotation fields.

     NOTE(review): the patterns suggest a layout like
     ``gb|<accession>|<strand>|<start>-<end>|ARO:<id>|<description> [<host>]``
     -- confirm against the actual reference data.

     :param header: FASTA header string without the leading ">".
     :return: result of ``Utilities.dict2pd_series`` applied to a dict with
         the keys ``former_id``, ``genbank_id``, ``is_antisense_strand``,
         ``locus``, ``aro_id``, ``host``, ``gene_description`` and
         ``gene_symbol``.

     Headers with fewer than three "|"-separated fields yield
     ``is_antisense_strand = False``; when no word of the description
     qualifies as a gene symbol, ``gene_symbol`` is an empty string.
     """
     _MIN_GENE_SYMBOL_LENGTH = 3
     _NON_GENE_SYMBOL_WORDS = ("DNA", "RNA")
     header_chunks = header.split("|")
     output_dict = dict(former_id=header)
     output_dict["genbank_id"] = Utilities.safe_findall(
         r"^gb\|([^|]+)", header)
     # The third field carries the strand sign; guard against short headers
     # instead of raising IndexError.
     output_dict["is_antisense_strand"] = (
         len(header_chunks) > 2 and header_chunks[2].startswith("-"))
     output_dict["locus"] = Utilities.safe_findall(r"\|(\d+\-\d+)", header)
     output_dict["aro_id"] = Utilities.safe_findall(r"\|ARO:(\d+)", header)
     # The last field holds the free-text description plus "[host]".
     gene_chunk = header_chunks[-1]
     output_dict["host"] = Utilities.safe_findall(r"\[(.+)\]", gene_chunk)
     output_dict["gene_description"] = gene_chunk.replace(
         "[{}]".format(output_dict["host"]), "").strip()
     # Heuristic: the gene symbol is the shortest word of the description
     # that is long enough and is not a generic term such as "DNA"/"RNA".
     symbol_candidates = [
         word for word in (
             chunk.strip()
             for chunk in output_dict["gene_description"].split(" "))
         if len(word) >= _MIN_GENE_SYMBOL_LENGTH
         and word not in _NON_GENE_SYMBOL_WORDS
     ]
     # default="" avoids ValueError when no word qualifies.
     output_dict["gene_symbol"] = min(symbol_candidates, key=len, default="")
     return Utilities.dict2pd_series(output_dict)