Пример #1
0
 def annotate(self):
     """Build ``self.nfasta_df`` from the annotation table and the ARO tables.

     Steps, in order:

     1. Resolve ``self.annotation_file`` from the ``"sequence_1"`` entry of
        ``self.describer.get_refdata_dict()`` and load it as a TSV.
     2. Parse every ``former_id`` header with ``self._mp_parse_nfasta_header``
        (dispatched through ``Utilities.multi_core_queue``) and merge the
        parsed fields back onto the raw table by ``former_id``.
     3. Merge in ``data/aro_index.tsv`` (by ``aro_id``),
        ``data/aro_categories_index.tsv`` (by ``Protein Accession``) and
        ``ontology/aro.tsv`` (by ``ARO Accession``), all located under
        ``self.reference_dir``.
     4. Pass the result through ``Utilities.combine_duplicate_rows`` keyed on
        ``reference_id``.

     Side effects: sets ``self.annotation_file``, ``self._raw_nfasta_df``,
     ``self._processed_nfasta_df`` and ``self.nfasta_df``. Returns ``None``.

     NOTE(review): ``Utilities.left_merge`` is presumably a left join on the
     given column and ``combine_duplicate_rows`` presumably collapses rows
     sharing the key -- confirm against the ``Utilities`` implementation.
     """
     self.annotation_file = self.describer.get_refdata_dict().get(
         "sequence_1").annotation_file
     self._raw_nfasta_df = pd.read_table(self.annotation_file,
                                         sep='\t',
                                         header=0)
     former_ids = self._raw_nfasta_df["former_id"].values.tolist()
     parsed_headers = Utilities.multi_core_queue(
         self._mp_parse_nfasta_header, former_ids)
     self._processed_nfasta_df = Utilities.merge_pd_series_list(
         parsed_headers).sort_values("former_id")
     self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df,
                                           self._processed_nfasta_df,
                                           "former_id")
     data_dir = os.path.join(self.reference_dir, "data")
     # Join 'aro_index.tsv'
     aro_index_df = pd.read_table(os.path.join(data_dir, "aro_index.tsv"),
                                  sep='\t',
                                  header=0)
     # Raw string: '\d' in a plain literal is an invalid escape sequence.
     # expand=False yields a Series for the single capture group regardless
     # of the pandas version's default.
     aro_index_df["aro_id"] = aro_index_df["ARO Accession"].str.extract(
         r"ARO:(\d+)", expand=False)
     # 'aro_index.tsv' has more entries than 'nucleotide_fasta_protein_homolog_model.fasta' provides
     self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_index_df,
                                           "aro_id")
     # Join 'aro_categories_index.tsv'
     aro_categories_index_df = pd.read_table(
         os.path.join(data_dir, "aro_categories_index.tsv"),
         sep='\t',
         header=0)
     self.nfasta_df = Utilities.left_merge(self.nfasta_df,
                                           aro_categories_index_df,
                                           "Protein Accession")
     # Joining 'aro_categories.tsv' is useless: the resulting 'ARO Category' is filled by NaN
     # Join 'aro.tsv'
     aro_df = pd.read_table(os.path.join(self.reference_dir, "ontology",
                                         "aro.tsv"),
                            sep='\t',
                            header=0)
     # Rename so the ontology columns line up with the names used in
     # 'aro_index.tsv' before merging on "ARO Accession".
     aro_df.rename(columns={
         "Accession": "ARO Accession",
         "Name": "ARO Name"
     },
                   inplace=True)
     self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_df,
                                           "ARO Accession")
     self.nfasta_df = Utilities.combine_duplicate_rows(
         self.nfasta_df, "reference_id")
Пример #2
0
 def annotate(self):
     """Build ``self.merged_df`` from the nucleotide and protein FASTA headers.

     Nucleotide part: loads ``self.annotation_file`` as a TSV, parses each
     ``former_id`` with ``self._mp_parse_nfasta_header`` (dispatched through
     ``Utilities.multi_core_queue``), converts the results with
     ``Utilities.dict2pd_series`` and merges them back onto the raw table by
     ``former_id`` into ``self.nfasta_df``.

     Protein part: collects the unique, non-empty header lines (those
     starting with ``>``) of ``self._raw_pfasta_file``, parses them with the
     same header parser, and renames the overlapping columns so they do not
     clash with the nucleotide ones.

     Finally both tables are merged on ``tadb_id``, ``category`` and
     ``gene_symbol`` and passed through ``Utilities.combine_duplicate_rows``
     keyed on ``reference_id``.

     Side effects: sets ``self._raw_nfasta_df``, ``self._processed_nfasta_df``,
     ``self.nfasta_df``, ``self.pfasta_df`` and ``self.merged_df``.
     Returns ``None``.

     NOTE(review): ``Utilities.left_merge`` is presumably a left join on the
     given columns -- confirm against the ``Utilities`` implementation.
     """
     # Process nucleotide FASTA
     self._raw_nfasta_df = pd.read_table(self.annotation_file, sep="\t", header=0)
     raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
     processed_nfasta_headers = [
         Utilities.dict2pd_series(parsed)
         for parsed in Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_nfasta_headers)
     ]
     self._processed_nfasta_df = Utilities.merge_pd_series_list(processed_nfasta_headers).sort_values("former_id")
     self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df, "former_id")
     # Process protein FASTA
     # Read the headers inside a context manager so the file handle is
     # closed deterministically instead of being left to the garbage collector.
     with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as pfasta_handle:
         # Every kept line starts with ">", so dropping the first character
         # removes exactly that marker.
         unique_headers = {line[1:].strip() for line in pfasta_handle if line.startswith(">")}
     raw_pfasta_headers = sorted(header for header in unique_headers if header)
     processed_pfasta_headers = [
         Utilities.dict2pd_series(parsed)
         for parsed in Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_pfasta_headers)
     ]
     self.pfasta_df = Utilities.merge_pd_series_list(processed_pfasta_headers).sort_values("former_id")
     self.pfasta_df.rename(columns={"geninfo_id": "protein_geninfo_id", "refseq_id": "genpept_id",
                                    "description": "protein_description", "host": "protein_host"}, inplace=True)
     self.merged_df = Utilities.left_merge(self.nfasta_df, self.pfasta_df, "tadb_id", "category", "gene_symbol")
     self.merged_df = Utilities.combine_duplicate_rows(self.merged_df, "reference_id")