def annotate(self):
    """
    Build the merged VFDB annotation table.

    Joins three sources on the zero-padded 'vfdb_id' key:
    nucleotide FASTA headers (from the annotation TSV), protein FASTA
    headers (from the raw pFASTA file), and the 'VFs.xls' spreadsheet
    located in the same directory as the protein FASTA. The result is
    left-merged back onto the raw annotation table on 'former_id' and
    stored in 'self.merged_df'.
    """
    self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    # Zero-pad width must be the length of the *longest* ID.
    # The original 'len(max(ids))' used the lexicographic maximum,
    # which e.g. picks '9' over '10' and yields the wrong width.
    zf_len = max(len(i) for i in
                 self._processed_nfasta_df["vfdb_id"].values.tolist())
    # Join table assembled from pFASTA headers
    raw_pfasta_headers = []
    # 'with' closes the handle automatically; no explicit close() needed
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        for _line in _f:
            if _line.startswith(">"):
                raw_pfasta_headers.append(re.sub("^>", "", _line).strip())
    raw_pfasta_headers = sorted({i for i in raw_pfasta_headers if len(i) > 0})
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_pfasta_header, raw_pfasta_headers)
    ]
    self._processed_pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("protein_header")
    self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df[
        "vfdb_id"].str.zfill(zf_len)
    # Join provided table. Note the table file placed into the same dir with the merged protein FASTA file
    vfs_table_file = os.path.join(os.path.dirname(self._raw_pfasta_file),
                                  "VFs.xls")
    vfs_df = pd.read_excel(vfs_table_file, sheet_name="VFs",
                           header=1).fillna("")
    # Raw string avoids the invalid-escape-sequence warning for '\d'
    vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract(r"VF(\d+)")[0].str.zfill(
        zf_len)
    self.merged_df = pd.concat([
        i.set_index("vfdb_id").sort_index() for i in
        [self._processed_nfasta_df, self._processed_pfasta_df, vfs_df]
    ],
                               axis=1,
                               sort=False).sort_index()
    self.merged_df.index.names = ["vfdb_id"]
    # Keep only rows that actually came from the nucleotide side
    self.merged_df = self.merged_df.loc[
        self.merged_df["former_id"].str.len() > 0].reset_index()
    self.merged_df = Utilities.left_merge(self._raw_nfasta_df,
                                          self.merged_df, "former_id")
# --- Example no. 2 ---
 def annotate(self):
     """
     Build the merged TADB annotation table.

     Parses nucleotide FASTA headers listed in the annotation table and
     protein FASTA headers from the raw pFASTA file, renames the
     protein-derived columns to avoid clashes, then merges both on the
     shared TADB keys and collapses duplicate reference rows.
     """
     # Process nucleotide FASTA
     self._raw_nfasta_df = pd.read_table(self.annotation_file, sep="\t", header=0)
     raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
     processed_nfasta_headers = [Utilities.dict2pd_series(i) for i in
                                 Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_nfasta_headers)]
     self._processed_nfasta_df = Utilities.merge_pd_series_list(processed_nfasta_headers).sort_values("former_id")
     self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df, self._processed_nfasta_df, "former_id")
     # Process protein FASTA.
     # A context manager closes the handle deterministically; the original
     # opened the file inside a comprehension and leaked the handle.
     with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as handle:
         stripped_headers = [re.sub("^>", "", line).strip()
                             for line in handle if line.startswith(">")]
     raw_pfasta_headers = sorted({header for header in stripped_headers if len(header) > 0})
     # NOTE(review): protein headers are parsed with the *nucleotide* header
     # parser — presumably both header formats are identical here; confirm.
     processed_pfasta_headers = [Utilities.dict2pd_series(i) for i in
                                 Utilities.multi_core_queue(self._mp_parse_nfasta_header, raw_pfasta_headers)]
     self.pfasta_df = Utilities.merge_pd_series_list(processed_pfasta_headers).sort_values("former_id")
     self.pfasta_df.rename(columns={"geninfo_id": "protein_geninfo_id", "refseq_id": "genpept_id",
                                    "description": "protein_description", "host": "protein_host"}, inplace=True)
     self.merged_df = Utilities.left_merge(self.nfasta_df, self.pfasta_df, "tadb_id", "category", "gene_symbol")
     self.merged_df = Utilities.combine_duplicate_rows(self.merged_df, "reference_id")
# --- Example no. 3 ---
 def annotate(self):
     """
     Build the merged CARD annotation table.

     Parses nucleotide FASTA headers from the reference annotation file,
     then successively left-joins the CARD metadata tables
     ('aro_index.tsv', 'aro_categories_index.tsv', 'aro.tsv') from the
     reference directory and collapses duplicate reference rows into
     'self.nfasta_df'.
     """
     self.annotation_file = self.describer.get_refdata_dict().get(
         "sequence_1").annotation_file
     self._raw_nfasta_df = pd.read_table(self.annotation_file,
                                         sep='\t',
                                         header=0)
     mp_result = Utilities.multi_core_queue(
         self._mp_parse_nfasta_header,
         self._raw_nfasta_df["former_id"].values.tolist())
     self._processed_nfasta_df = Utilities.merge_pd_series_list(
         mp_result).sort_values("former_id")
     self.nfasta_df = Utilities.left_merge(self._raw_nfasta_df,
                                           self._processed_nfasta_df,
                                           "former_id")
     # Join 'aro_index.tsv'
     aro_index_df = pd.read_table(os.path.join(self.reference_dir, "data",
                                               "aro_index.tsv"),
                                  sep='\t',
                                  header=0)
     # Raw string avoids the invalid-escape-sequence warning for '\d'
     aro_index_df["aro_id"] = aro_index_df["ARO Accession"].str.extract(
         r"ARO:(\d+)")
     # 'aro_index.tsv' has more entries than 'nucleotide_fasta_protein_homolog_model.fasta' provides
     self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_index_df,
                                           "aro_id")
     # Join 'aro_categories_index.tsv'
     aro_categories_index_df = pd.read_table(os.path.join(
         self.reference_dir, "data", "aro_categories_index.tsv"),
                                             sep='\t',
                                             header=0)
     self.nfasta_df = Utilities.left_merge(self.nfasta_df,
                                           aro_categories_index_df,
                                           "Protein Accession")
     # Joining 'aro_categories.tsv' is useless: the resulting 'ARO Category' is filled by NaN
     # Join 'aro.tsv'
     aro_df = pd.read_table(os.path.join(self.reference_dir, "ontology",
                                         "aro.tsv"),
                            sep='\t',
                            header=0)
     # Align column names with 'aro_index.tsv' before the final join
     aro_df.rename(columns={
         "Accession": "ARO Accession",
         "Name": "ARO Name"
     },
                   inplace=True)
     self.nfasta_df = Utilities.left_merge(self.nfasta_df, aro_df,
                                           "ARO Accession")
     self.nfasta_df = Utilities.combine_duplicate_rows(
         self.nfasta_df, "reference_id")
# --- Example no. 4 ---
    def digest_df(self, df: pd.DataFrame, associations: dict, columns_with_keywords: list, include_key: bool = True,
                  all_in_lowercase: bool = False, strict: bool = False):
        """
        Sum the value columns of *df* per keyword group.

        :param df: Pandas DataFrame object containing an index, keyword columns and value columns
        :param associations: Dictionary '{key: (keywords...)}'
        :param columns_with_keywords: List of columns to search keywords
        :param include_key: Should the key of keyword group be included?
        :param all_in_lowercase: Convert both strings to lowercase?
        :param strict: Only count full match
        :return: Tuple of (Pandas DataFrame with keys as index and column sums as values,
                 Pandas DataFrame of the matched raw rows in long format with a 'keyword' column),
                 or None if no keyword column was specified
        :raises ValueError: if a keyword group is empty after preparation
        """
        # Keywords are passed explicitly instead of being read from the
        # enclosing scope — the original closures relied on late binding
        # of the loop variable 'key_words'.
        def _regular_search(s: str, key_words) -> bool:
            # Substring match against any keyword
            return any(i in str(s) for i in key_words)

        def _strict_search(s: str, key_words) -> bool:
            # Full-string match against any keyword
            return any(i == str(s) for i in key_words)

        df = df.copy()
        df_columns = list(df)
        columns_with_keywords = Utilities.remove_empty_values(list(columns_with_keywords))
        columns_without_keywords = Utilities.remove_empty_values(
            [i for i in df_columns if i not in columns_with_keywords])
        if len(columns_with_keywords) == 0:
            # Preserved behavior: warn and return None instead of raising
            print("No column for keyword search specified!")
            return
        try:
            # 'columns_with_keywords' might be more than 1
            df["lookup_column"] = df.loc[:, columns_with_keywords].astype(str).apply(
                lambda x: " ".join(self.prepare_list(x, lowercase=all_in_lowercase)), axis=1)
        except KeyError as e:
            print(e, list(df), associations, columns_with_keywords)
        search = _strict_search if strict else _regular_search
        keywords_series = []
        raw_values_chunks = []
        for main_word in associations:
            key_words = associations.get(main_word) or ()
            if include_key:
                key_words = list(key_words) + [main_word]
            key_words = sorted(set(self.prepare_list(key_words, lowercase=all_in_lowercase)))
            if len(key_words) == 0:
                raise ValueError("No values to search: '{}: {}'".format(main_word, key_words))
            mask = df["lookup_column"].apply(lambda s: search(s, key_words))
            df_with_keywords = df.loc[mask, columns_without_keywords]
            keywords_series.append(df_with_keywords.sum().rename(main_word))
            # Reset index to avoid exceptions thrown by duplicates
            raw_values_df = df_with_keywords.reset_index()
            raw_values_df["keyword"] = main_word
            raw_values_chunks.append(raw_values_df)
        # Single concat instead of the original pairwise concat inside the
        # loop, which re-copied the accumulator each iteration (O(n^2))
        if raw_values_chunks:
            raw_values_ds = pd.concat(raw_values_chunks, axis=0, ignore_index=True)
        else:
            raw_values_ds = pd.DataFrame()
        out_df = Utilities.merge_pd_series_list(keywords_series).fillna(0)
        out_df.columns.name = "value"
        out_df.index.name = "keyword"
        return out_df, raw_values_ds