Example #1
def __init__(self, sample_name: str, sample_read_files: list):
    self.state = dict()
    self.name = sample_name.strip()
    # Keep only non-empty read file paths, in a deterministic order
    self.reads = sorted(Utilities.remove_empty_values(sample_read_files))
    self.taxa = ""
    self.is_valid = False
    self._validate_reads()
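
`Utilities.remove_empty_values` is a project-specific helper used throughout these examples. A minimal sketch consistent with how it is called here (an assumption, not the project's confirmed implementation):

def remove_empty_values(input_list: list) -> list:
    # Hypothetical sketch: drop falsy entries (None, "", empty lists)
    # while preserving the order of the remaining items
    return [i for i in input_list if i]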
Example #2
def __init__(self, fastas_string):
    self._fastas_string = fastas_string
    # re.split("\n>", ...) strips the ">" from every chunk except the first,
    # so restore it before parsing
    self._raw_fastas_list = [
        ">{}".format(j) if not j.startswith(">") else j
        for j in [i.strip() for i in re.split("\n>", self._fastas_string)]
    ]
    self._parsed_fastas_list = Utilities.remove_empty_values(
        [FASTA(i) for i in self._raw_fastas_list])
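
The `re.split("\n>", ...)` call is worth tracing: only the first chunk keeps its leading ">", which is why the comprehension above restores it. A short demonstration:

import re

fastas_string = ">seq1\nACGT\n>seq2\nGGCC"
chunks = [i.strip() for i in re.split("\n>", fastas_string)]
print(chunks)  # ['>seq1\nACGT', 'seq2\nGGCC'] -- only the first chunk keeps '>'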
Example #3
def process_genbank_report(d: dict):
    genbank_records = d.get("genbank_records")
    genbank_record = genbank_records[0]
    cds_number = len([i for i in genbank_record.features if i.type == "CDS"])
    qualifiers_dict = [
        i.qualifiers for i in genbank_record.features if i.type == "source"
    ][0]
    # The first two words of the "organism" qualifier give the genus and species
    organism = Utilities.remove_empty_values(
        qualifiers_dict.get("organism")[0].split(" "))[:2]
    strain = " ".join(organism + [qualifiers_dict.get("strain")[0]])
    taxonomy_id = Utilities.safe_findall(r"\d+", [
        i for i in qualifiers_dict.get("db_xref")
        if i.split(":")[0].strip() == "taxon"
    ][0])
    return dict(assembly_file=d.get("assembly_file"),
                strain=strain,
                taxonomy_id=taxonomy_id,
                reference_accession_id=genbank_record.id,
                cds_number=cds_number,
                reference_bp=len(genbank_record),
                reference_description=genbank_record.description)
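
`genbank_records` is expected to contain parsed Biopython records. A hedged sketch of how the input dictionary could be assembled (the file paths are placeholders):

from Bio import SeqIO

d = dict(assembly_file="assembly.fna",
         genbank_records=list(SeqIO.parse("assembly.gbff", "genbank")))
report = process_genbank_report(d)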
Example #4
    def __init__(self, fna_file, contamination_report):
        self.sequence_file = fna_file
        self.contamination_file = contamination_report
        self.contamination_lines = Utilities.load_2d_array(
            self.contamination_file)

        exclude_index = self.find_index(self.contamination_lines, ["Exclude:"])
        trim_index = self.find_index(self.contamination_lines, ["Trim:"])
        duplicated_index = self.find_index(self.contamination_lines,
                                           ["Duplicated:"])
        # Issue lines order: exclude, trim, duplicated
        headers_to_remove = []
        if exclude_index:
            exclude_lines = []
            if trim_index:
                exclude_lines = self.contamination_lines[exclude_index +
                                                         2:trim_index]
            elif duplicated_index:
                exclude_lines = self.contamination_lines[exclude_index +
                                                         2:duplicated_index]
            else:
                exclude_lines = self.contamination_lines[exclude_index + 2:]
            exclude_lines = Utilities.remove_empty_values(
                [i[0] for i in exclude_lines])
            headers_to_remove.extend(exclude_lines)

        if trim_index:
            trim_lines_processed = dict()
            if duplicated_index:
                trim_lines_raw = self.contamination_lines[trim_index +
                                                          2:duplicated_index]
            else:
                trim_lines_raw = self.contamination_lines[trim_index + 2:]
            for trim_line_raw in Utilities.remove_empty_values(trim_lines_raw):
                for trim_span in Utilities.remove_empty_values(
                        trim_line_raw[2].split(",")):
                    trim_indices = [
                        int(i.strip()) for i in trim_span.split("..")
                    ]
                    # Reported sequence positions are one-based;
                    # convert the span start to a zero-based index
                    trim_indices[0] -= 1
                    trim_lines_processed[trim_line_raw[0]] = trim_indices
            headers_to_remove.extend(list(trim_lines_processed.keys()))

        if duplicated_index:
            processed_duplicated_lines = list()
            duplicated_lines = self.contamination_lines[duplicated_index + 2:]
            # For each "Duplicated" group, mark only the first listed header for removal
            for duplicated_line in duplicated_lines:
                duplicated_str = duplicated_line[0]
                if duplicated_str.startswith("# "):
                    continue
                processed_duplicated_lines.append(
                    duplicated_str.strip().split(" ")[0])
            headers_to_remove.extend(processed_duplicated_lines)

        self.seq_records = list(SeqIO.parse(self.sequence_file, "fasta"))
        print(
            f"Imported {len(self.seq_records)} raw records from '{self.sequence_file}'"
        )
        headers_to_remove = sorted(
            set(Utilities.remove_empty_values(headers_to_remove)))
        print("{} headers were marked to remove: '{}'".format(
            len(headers_to_remove), "', '".join(headers_to_remove)))
        out_records = []
        for record_raw in Utilities.remove_duplicate_sequences(
                self.seq_records):
            record_id = record_raw.id.split(" ")[0].strip()
            if record_id not in headers_to_remove:
                out_records.append(record_raw)
            # After NCBI processing, some spans from the "Trim" entry may be moved
            # into the "Exclude" entry; such records are removed outright rather than trimmed
        self.valid_records = [
            i for i in out_records if len(i) >= self._NCBI_MIN_SEQ_LENGTH
        ]
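
`find_index` is not shown in this snippet; judging from the calls above, it locates the row that opens each section of the NCBI contamination report. A sketch with assumed semantics:

@staticmethod
def find_index(lines_2d: list, query_row: list):
    # Assumed behavior: return the index of the first row whose leading cells
    # match the query (e.g. ["Exclude:"]); a falsy result means the section is absent
    for idx, row in enumerate(lines_2d):
        if row[:len(query_row)] == query_row:
            return idx
    return None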
Example #5
        return d
    d["is_correlation_valid"] = True
    return _process_out()


# Standard-library / third-party imports used below; project-specific helpers
# (Utilities, ProjectDescriber, load_tsv) are assumed to be imported elsewhere
import os
import sys
from time import sleep

import numpy as np

try:
    print("Running on the node {}".format(os.uname()[1]))
except Exception:
    # os.uname() is Unix-only; ignore the failure on other platforms
    pass

# Random delay (up to 90 s) to desynchronize workers that poll the same queue
sleep(np.random.randint(90))
print("Polling the queue")

remote_queue = os.path.join(ProjectDescriber.DATA_DIR, "correlation_data",
                            "group_datasets", "tables.txt")
correlation_tables = Utilities.remove_empty_values(
    Utilities.load_list(remote_queue))
if len(correlation_tables) == 0:
    print("Empty remote queue")
    sys.exit(0)

# Pop the head of the queue by writing the remaining entries back
Utilities.dump_list(correlation_tables[1:], remote_queue)

correlation_table = correlation_tables[0]
print("Now processing: '{}'".format(correlation_table))

group_name = os.path.splitext(os.path.basename(correlation_table))[0]
out_dir = os.path.join(ProjectDescriber.DATA_DIR, "correlation_data",
                       "group_results", group_name)

correlation_df = load_tsv(correlation_table).dropna(axis=0, how="any")
feature_groups = sorted({i.split("@")[0] for i in correlation_df.columns})
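
The queue file holds one table path per line. Hedged stand-ins for the project-specific `Utilities.load_list` and `Utilities.dump_list` helpers, assuming exactly that format:

def load_list(path: str) -> list:
    with open(path) as handle:
        return [line.strip() for line in handle]

def dump_list(entries: list, path: str):
    with open(path, "w") as handle:
        handle.write("\n".join(entries) + "\n")

Note that popping the head by rewriting the file is not atomic, so two workers can still grab the same entry; the random start-up sleep above only makes this less likely.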
Example #6
 def process(self):
     value_col_name_raw_pivot_annotated_mask = self.create_mirrored_path(
         [projectDescriber.DATA_DIGEST_DIR, self.value_col_name],
         makedirs=True)
     Utilities.dump_tsv(
         self.raw_annotated_pivot.reset_index(),
         "{}_raw_annotated_pivot.tsv".format(
             value_col_name_raw_pivot_annotated_mask))
     for col_name_with_keywords in KEYWORDS_ASSOCIATIVE_PAIRS:
         df_to_digest = self.raw_annotated_pivot.loc[:, [
             col_name_with_keywords
         ] + self.sample_names]
         associations = KEYWORDS_ASSOCIATIVE_PAIRS.get(
             col_name_with_keywords)
         if col_name_with_keywords == HOST_COL_NAME:
             associations = digestAssociationsKeeper.generate_genera_dict(
                 df_to_digest[col_name_with_keywords].values.tolist())
         digest_df, raw_ds = digestAssociationsKeeper.digest_df(
             df_to_digest,
             associations=associations,
             columns_with_keywords=[col_name_with_keywords])
         raw_ds = Utilities.left_merge(
             raw_ds,
             self.raw_annotated_pivot[RAW_LABEL_COL_NAME].reset_index(),
             REFERENCE_COL_NAME).fillna("")
         raw_ds[RAW_LABEL_COL_NAME] = raw_ds[RAW_LABEL_COL_NAME].apply(
             lambda x: min(Utilities.remove_empty_values(x.strip().split(" ")),
                           key=len))
         keyword_export_mask = self.create_mirrored_path([
             projectDescriber.DATA_DIGEST_DIR, self.value_col_name,
             col_name_with_keywords
         ],
                                                         makedirs=True)
         Utilities.dump_tsv(digest_df.reset_index(),
                            "{}_digest.tsv".format(keyword_export_mask))
         Utilities.dump_tsv(raw_ds,
                            "{}_raw.tsv".format(keyword_export_mask))
         for sample_name in digest_df.columns:
             _BASE_FONT_SIZE = 15
             _WEDGE_WIDTH = 0.3
             _WEDGE_PROPERTIES = dict(width=_WEDGE_WIDTH, edgecolor="w")
             _LABEL_PROPERTIES = dict(fontsize=_BASE_FONT_SIZE,
                                      rotation_mode="anchor",
                                      verticalalignment="center",
                                      horizontalalignment="center")
             major_digest_df = Utilities.get_n_majors_from_df(
                 digest_df, sample_name, n=INNER_DONUT_GROUPS - 1)
             # Create visualization
             fig, ax = plt.subplots()
             plt.rcParams.update({
                 "font.size": _BASE_FONT_SIZE,
                 "figure.figsize": (20, 20)
             })
             ax.axis("equal")
             y_col_name = major_digest_df.columns[0]
             # ax.pie with autopct returns (wedges, label texts, autopct texts)
             pie_int = ax.pie(major_digest_df[sample_name],
                              radius=1 - _WEDGE_WIDTH,
                              labels=major_digest_df.index,
                              labeldistance=1 - _WEDGE_WIDTH,
                              rotatelabels=False,
                              autopct=self.make_autopct(
                                  major_digest_df[y_col_name]),
                              pctdistance=1 - _WEDGE_WIDTH / 2.0,
                              wedgeprops=_WEDGE_PROPERTIES,
                              textprops=_LABEL_PROPERTIES)
             # Map each inner-wedge label to its RGBA face color
             pie_int_colors = {
                 pie_int[1][idx].get_text(): wedge.get_facecolor()
                 for idx, wedge in enumerate(pie_int[0])
             }
             # Manually order the raw-value dataset to match the digest keyword order
             major_raw_ds = pd.DataFrame()
             for digest_keyword in major_digest_df.index:
                 if digest_keyword == "Other":
                     major_raw_ds_append = pd.DataFrame(
                         major_digest_df.loc["Other"]).transpose()
                     major_raw_ds_append.index.name = DIGEST_LABEL_COL_NAME
                     major_raw_ds_append = major_raw_ds_append.reset_index()
                 else:
                     major_raw_ds_append_right = raw_ds.loc[
                         raw_ds[DIGEST_LABEL_COL_NAME] == digest_keyword, [
                             REFERENCE_COL_NAME, sample_name,
                             DIGEST_LABEL_COL_NAME, RAW_LABEL_COL_NAME
                         ]]
                     major_raw_ds_append_left = Utilities.get_n_majors_from_df(
                         major_raw_ds_append_right.set_index(
                             REFERENCE_COL_NAME),
                         sample_name,
                         n=OUTER_DONUT_SUBGROUPS -
                         1).rename(index={
                             "Other": digest_keyword
                         }).reset_index()
                     major_raw_ds_append = Utilities.left_merge(
                         major_raw_ds_append_left,
                         major_raw_ds_append_right, REFERENCE_COL_NAME)
                     major_raw_ds_append[
                         RAW_LABEL_COL_NAME] = major_raw_ds_append[
                             RAW_LABEL_COL_NAME].fillna(
                                 "{}_Other".format(digest_keyword))
                     major_raw_ds_append[
                         DIGEST_LABEL_COL_NAME] = major_raw_ds_append[
                             DIGEST_LABEL_COL_NAME].fillna("Other")
                 pie_ext_append_colors = []
                 for row_number in major_raw_ds_append.index.values:
                     row_color = pie_int_colors.get(digest_keyword)
                     if not row_color:
                         continue
                     row_old_alpha = row_color[3]
                     _MINIMAL_ALPHA = 0.2
                     # Fade the parent wedge color across its subgroup rows,
                     # never dropping below the minimal alpha
                     if major_raw_ds_append.shape[0] < 4:
                         row_new_alpha = row_old_alpha - (
                             row_old_alpha * row_number * _MINIMAL_ALPHA)
                     else:
                         row_new_alpha = row_old_alpha - (
                             (row_old_alpha - _MINIMAL_ALPHA) * row_number /
                             float(major_raw_ds_append.shape[0] - 1))
                     pie_ext_append_colors.append(";".join(
                         str(i)
                         for i in list(row_color[:3]) + [row_new_alpha]))
                 major_raw_ds_append["color"] = pie_ext_append_colors
                 if major_raw_ds_append.shape[0] > 0:
                     if major_raw_ds.shape[0] == 0:
                         major_raw_ds = major_raw_ds_append
                     else:
                         major_raw_ds = pd.concat(
                             [major_raw_ds, major_raw_ds_append],
                             axis=0,
                             ignore_index=True,
                             sort=False)
             major_raw_ds = major_raw_ds.fillna("Other")
             pie_ext = ax.pie(
                 major_raw_ds[sample_name],
                 radius=1,
                 labels=major_raw_ds[RAW_LABEL_COL_NAME],
                 labeldistance=1 - _WEDGE_WIDTH / 2,
                 rotatelabels=True,
                 wedgeprops=_WEDGE_PROPERTIES,
                 textprops=_LABEL_PROPERTIES,
                 colors=major_raw_ds["color"].apply(lambda x: tuple(
                     float(i) for i in x.split(";"))).values.tolist())
             # Export visualization tables
             sample_export_mask = self.create_mirrored_path([
                 projectDescriber.DATA_DIGEST_DIR, self.value_col_name,
                 col_name_with_keywords, sample_name
             ],
                                                            makedirs=True)
             Utilities.dump_tsv(
                 major_digest_df.reset_index(),
                 "{}_inner_values.tsv".format(sample_export_mask))
             Utilities.dump_tsv(
                 major_raw_ds,
                 "{}_outer_values.tsv".format(sample_export_mask))
             # Set labels
             ax.set_xlabel(y_col_name)
             ax.set_ylabel(self.value_col_name)
             plt.tight_layout()
             # Export PNG
             pie_file = "{}_double_donut.png".format(sample_export_mask)
             fig.suptitle(pie_file, fontsize=_BASE_FONT_SIZE)
             plt.savefig(pie_file, dpi=300, bbox_inches="tight")
             # Release figure resources before the next sample; clear the current
             # figure first, since clf() after close("all") would open a new one
             plt.clf()
             plt.close("all")
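
`make_autopct` is not part of this snippet. A common matplotlib closure pattern that matches its call signature here (an assumption, not the author's confirmed code):

@staticmethod
def make_autopct(values):
    # Assumed closure: label each wedge with its percentage and absolute value
    def _autopct(percent):
        total = float(sum(values))
        return "{:.1f}%\n({:.0f})".format(percent, percent * total / 100.0)
    return _autopct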
Example #7
    def digest_df(self, df: pd.DataFrame, associations: dict, columns_with_keywords: list, include_key: bool = True,
                  all_in_lowercase: bool = False, strict: bool = False):
        """
        :param df: Pandas DataFrame with an index, keyword columns, and value columns
        :param associations: Dictionary of '{key: (keywords...)}' pairs
        :param columns_with_keywords: List of columns to search for keywords
        :param include_key: Include the key of each keyword group among its search terms?
        :param all_in_lowercase: Convert both sides of the comparison to lowercase?
        :param strict: Count only full matches
        :return: A Pandas DataFrame with keys as index and per-column sums as values,
                 and a second DataFrame concatenating the matched raw rows per keyword
        """
        def __regular_search(s: str):
            return any(i in str(s) for i in key_words)

        def __strict_search(s: str):
            return any(i == str(s) for i in key_words)

        df = df.copy()
        df_columns = list(df)
        columns_with_keywords = Utilities.remove_empty_values(list(columns_with_keywords))
        columns_without_keywords = Utilities.remove_empty_values(
            [i for i in df_columns if i not in columns_with_keywords])
        if len(columns_with_keywords) == 0:
            # Callers unpack two return values, so fail loudly instead of returning None
            raise ValueError("No column for keyword search specified!")
        try:
            # 'columns_with_keywords' might be more than 1
            df["lookup_column"] = df.loc[:, columns_with_keywords].astype(str).apply(
                lambda x: " ".join(self.prepare_list(x, lowercase=all_in_lowercase)), axis=1)
        except KeyError as e:
            print(e, list(df), associations, columns_with_keywords)
        keywords_series = []
        raw_values_ds = pd.DataFrame()
        for main_word in associations:
            key_words = associations.get(main_word)
            if not key_words:
                key_words = ()
            if include_key:
                key_words = list(key_words) + [main_word, ]
            key_words = sorted(set(self.prepare_list(key_words, lowercase=all_in_lowercase)))
            if len(key_words) == 0:
                raise ValueError("No values to search: '{}: {}'".format(main_word, key_words))
            search = __strict_search if strict else __regular_search
            df_with_keywords = df.loc[df["lookup_column"].apply(search),
                                      columns_without_keywords]
            keywords_series.append(df_with_keywords.sum().rename(main_word))
            # Reset index to avoid exceptions thrown by duplicates
            raw_values_df = df_with_keywords.reset_index()
            raw_values_df["keyword"] = main_word
            if raw_values_ds.shape[0] == 0:
                raw_values_ds = raw_values_df
            else:
                raw_values_ds = pd.concat([raw_values_ds, raw_values_df], axis=0, ignore_index=True)
        out_df = Utilities.merge_pd_series_list(keywords_series).fillna(0)
        out_df.columns.name = "value"
        out_df.index.name = "keyword"
        return out_df, raw_values_ds
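
A hedged usage sketch for `digest_df`; the instance name `keeper` and the toy data are assumptions:

import pandas as pd

keeper = DigestAssociationsKeeper()
df = pd.DataFrame({
    "description": ["Escherichia coli plasmid", "Staphylococcus aureus", "unclassified"],
    "sample_1": [10, 5, 1],
})
associations = {"Escherichia": ("coli",), "Staphylococcus": ("aureus",)}
digest_df, raw_ds = keeper.digest_df(df, associations,
                                     columns_with_keywords=["description"])
# digest_df: one row per association key, with value columns summed over matching rows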
Example #8
def prepare_list(lst: list, lowercase: bool = False):
    # No 'self' parameter: used as a static helper on DigestAssociationsKeeper
    return Utilities.remove_empty_values(
        [DigestAssociationsKeeper.prepare_string(i, lowercase) for i in lst])
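
`prepare_string` is the per-item counterpart of `prepare_list`. A sketch consistent with this call (assumed normalization, not the confirmed implementation):

@staticmethod
def prepare_string(s, lowercase: bool = False):
    # Assumed behavior: strip surrounding whitespace and optionally lowercase
    s = str(s).strip()
    return s.lower() if lowercase else s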