def __init__(self, sample_name: str, sample_read_files: list):
    self.state = dict()
    self.name = sample_name.strip()
    self.reads = sorted(Utilities.remove_empty_values(sample_read_files))
    self.taxa = ""
    self.is_valid = False
    self._validate_reads()
def __init__(self, fastas_string):
    self._fastas_string = fastas_string
    # Split the multi-FASTA string on record boundaries; re.split strips the
    # leading '>' from every record but the first, so restore it
    self._raw_fastas_list = [
        ">{}".format(j) if not j.startswith(">") else j
        for j in [i.strip() for i in re.split("\n>", self._fastas_string)]
    ]
    self._parsed_fastas_list = Utilities.remove_empty_values(
        [FASTA(i) for i in self._raw_fastas_list])
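# Hypothetical usage sketch: assuming this constructor belongs to a multi-FASTA
# container class (the name 'FASTAArray' below is illustrative) and that
# FASTA() parses a single '>'-headed record:
#
#   arr = FASTAArray(">contig_1\nACGT\n>contig_2\nTTGA")
#   # arr._raw_fastas_list -> ['>contig_1\nACGT', '>contig_2\nTTGA']
#   # arr._parsed_fastas_list -> two parsed FASTA objects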
def process_genbank_report(d: dict):
    genbank_records = d.get("genbank_records")
    # Only the first record of the assembly is considered
    genbank_record = genbank_records[0]
    cds_number = len([i for i in genbank_record.features if i.type == "CDS"])
    qualifiers_dict = [
        i.qualifiers for i in genbank_record.features if i.type == "source"
    ][0]
    # The first two words of the 'organism' qualifier, i.e. genus and species
    organism = Utilities.remove_empty_values(
        qualifiers_dict.get("organism")[0].split(" "))[:2]
    strain = " ".join(organism + [qualifiers_dict.get("strain")[0]])
    # Extract the numeric ID from the 'taxon:<id>' database cross-reference
    taxonomy_id = Utilities.safe_findall(r"\d+", [
        i for i in qualifiers_dict.get("db_xref")
        if i.split(":")[0].strip() == "taxon"
    ][0])
    return dict(assembly_file=d.get("assembly_file"),
                strain=strain,
                taxonomy_id=taxonomy_id,
                reference_accession_id=genbank_record.id,
                cds_number=cds_number,
                reference_bp=len(genbank_record),
                reference_description=genbank_record.description)
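# Illustrative call sketch: the 'genbank_records' entries are Bio.SeqRecord
# objects such as those produced by Bio.SeqIO.parse(handle, "genbank");
# the file names below are hypothetical.
#
#   report = process_genbank_report(dict(
#       assembly_file="assembly.fna",
#       genbank_records=list(SeqIO.parse("assembly.gbff", "genbank"))))
#   # -> dict with 'strain', 'taxonomy_id', 'reference_accession_id',
#   #    'cds_number', 'reference_bp', 'reference_description'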
def __init__(self, fna_file, contamination_report):
    self.sequence_file = fna_file
    self.contamination_file = contamination_report
    self.contamination_lines = Utilities.load_2d_array(self.contamination_file)
    exclude_index = self.find_index(self.contamination_lines, ["Exclude:"])
    trim_index = self.find_index(self.contamination_lines, ["Trim:"])
    duplicated_index = self.find_index(self.contamination_lines, ["Duplicated:"])
    # Issue lines order: exclude, trim, duplicated
    headers_to_remove = []
    if exclude_index:
        if trim_index:
            exclude_lines = self.contamination_lines[exclude_index + 2:trim_index]
        elif duplicated_index:
            exclude_lines = self.contamination_lines[exclude_index + 2:duplicated_index]
        else:
            exclude_lines = self.contamination_lines[exclude_index + 2:]
        exclude_lines = Utilities.remove_empty_values([i[0] for i in exclude_lines])
        headers_to_remove.extend(exclude_lines)
    if trim_index:
        trim_lines_processed = dict()
        if duplicated_index:
            trim_lines_raw = self.contamination_lines[trim_index + 2:duplicated_index]
        else:
            trim_lines_raw = self.contamination_lines[trim_index + 2:]
        for trim_line_raw in Utilities.remove_empty_values(trim_lines_raw):
            for trim_span in Utilities.remove_empty_values(trim_line_raw[2].split(",")):
                trim_indices = [int(i.strip()) for i in trim_span.split("..")]
                # The reported sequence positions are one-based, so convert
                # the span start to a zero-based index
                trim_indices[0] -= 1
                trim_lines_processed[trim_line_raw[0]] = trim_indices
        headers_to_remove.extend(list(trim_lines_processed.keys()))
    if duplicated_index:
        processed_duplicated_lines = list()
        duplicated_lines = self.contamination_lines[duplicated_index + 2:]
        # Remove only the first occurrence listed in each 'Duplicated' entry
        for duplicated_line in duplicated_lines:
            duplicated_str = duplicated_line[0]
            if duplicated_str.startswith("# "):
                continue
            processed_duplicated_lines.append(duplicated_str.strip().split(" ")[0])
        headers_to_remove.extend(processed_duplicated_lines)
    self.seq_records = list(SeqIO.parse(self.sequence_file, "fasta"))
    print(f"Imported {len(self.seq_records)} raw records from '{self.sequence_file}'")
    headers_to_remove = sorted(set(Utilities.remove_empty_values(headers_to_remove)))
    print("{} headers were marked to remove: '{}'".format(
        len(headers_to_remove), "', '".join(headers_to_remove)))
    out_records = []
    for record_raw in Utilities.remove_duplicate_sequences(self.seq_records):
        record_id = record_raw.id.split(" ")[0].strip()
        if record_id not in headers_to_remove:
            out_records.append(record_raw)
    # Some positions from the 'Trim' entry may be moved into the 'Exclude'
    # entry after NCBI processing; such sequences are removed instead of
    # trimmed, and short leftovers are filtered out
    self.valid_records = [
        i for i in out_records if len(i) >= self._NCBI_MIN_SEQ_LENGTH
    ]
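# Approximate shape of an NCBI contamination report consumed above (spacing
# and exact columns are an assumption inferred from the slicing logic: each
# section body starts two lines after its header, spans sit in column 3):
#
#   Exclude:
#   Sequence name, length, apparent source
#   contig00012	1243	mitochondrion-not_cleaned
#
#   Trim:
#   Sequence name, length, span(s), apparent source
#   contig00007	21230	1..170,20900..21230	adaptor:NGB00972.1-not_cleaned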
        return d

    d["is_correlation_valid"] = True
    return _process_out()


try:
    print("Running on the node {}".format(os.uname()[1]))
except Exception:
    # 'os.uname' is unavailable on some platforms, e.g. Windows
    pass
# Random delay of up to 90 s to stagger concurrent workers polling the queue
sleep(np.random.randint(90))
print("Polling the queue")
remote_queue = os.path.join(ProjectDescriber.DATA_DIR, "correlation_data",
                            "group_datasets", "tables.txt")
correlation_tables = Utilities.remove_empty_values(
    Utilities.load_list(remote_queue))
if len(correlation_tables) == 0:
    print("Empty remote queue")
    sys.exit(0)
# Pop the head of the queue and write the remainder back
Utilities.dump_list(correlation_tables[1:], remote_queue)
correlation_table = correlation_tables[0]
print("Now processing: '{}'".format(correlation_table))
group_name = os.path.splitext(os.path.basename(correlation_table))[0]
out_dir = os.path.join(ProjectDescriber.DATA_DIR, "correlation_data",
                       "group_results", group_name)
correlation_df = load_tsv(correlation_table).dropna(axis=0, how="any")
# Column names are assumed to look like '<feature group>@<feature>'
feature_groups = sorted(set([i.split("@")[0] for i in correlation_df.columns]))
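# Note: popping the head of a shared queue file this way is not atomic; the
# random sleep above only lowers the chance that two nodes take the same
# table. A hypothetical hardening sketch with an advisory lock (POSIX-only):
#
#   import fcntl
#   with open(remote_queue + ".lock", "w") as lock_handle:
#       fcntl.flock(lock_handle, fcntl.LOCK_EX)
#       # read, pop and rewrite the queue while the lock is held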
def process(self):
    value_col_name_raw_pivot_annotated_mask = self.create_mirrored_path(
        [projectDescriber.DATA_DIGEST_DIR, self.value_col_name], makedirs=True)
    Utilities.dump_tsv(
        self.raw_annotated_pivot.reset_index(),
        "{}_raw_annotated_pivot.tsv".format(value_col_name_raw_pivot_annotated_mask))
    for col_name_with_keywords in KEYWORDS_ASSOCIATIVE_PAIRS:
        df_to_digest = self.raw_annotated_pivot.loc[
            :, [col_name_with_keywords] + self.sample_names]
        associations = KEYWORDS_ASSOCIATIVE_PAIRS.get(col_name_with_keywords)
        if col_name_with_keywords == HOST_COL_NAME:
            associations = digestAssociationsKeeper.generate_genera_dict(
                df_to_digest[col_name_with_keywords].values.tolist())
        digest_df, raw_ds = digestAssociationsKeeper.digest_df(
            df_to_digest, associations=associations,
            columns_with_keywords=[col_name_with_keywords])
        raw_ds = Utilities.left_merge(
            raw_ds, self.raw_annotated_pivot[RAW_LABEL_COL_NAME].reset_index(),
            REFERENCE_COL_NAME).fillna("")
        # Keep the shortest non-empty token as the raw label
        raw_ds[RAW_LABEL_COL_NAME] = raw_ds[RAW_LABEL_COL_NAME].apply(
            lambda x: min(Utilities.remove_empty_values(x.strip().split(" ")),
                          key=len))
        keyword_export_mask = self.create_mirrored_path(
            [projectDescriber.DATA_DIGEST_DIR, self.value_col_name,
             col_name_with_keywords], makedirs=True)
        Utilities.dump_tsv(digest_df.reset_index(),
                           "{}_digest.tsv".format(keyword_export_mask))
        Utilities.dump_tsv(raw_ds, "{}_raw.tsv".format(keyword_export_mask))
        for sample_name in digest_df.columns:
            _BASE_FONT_SIZE = 15
            _WEDGE_WIDTH = 0.3
            _WEDGE_PROPERTIES = dict(width=_WEDGE_WIDTH, edgecolor="w")
            _LABEL_PROPERTIES = dict(fontsize=_BASE_FONT_SIZE,
                                     rotation_mode="anchor",
                                     verticalalignment="center",
                                     horizontalalignment="center")
            major_digest_df = Utilities.get_n_majors_from_df(
                digest_df, sample_name, n=INNER_DONUT_GROUPS - 1)
            # Create the visualization; figsize must be passed at figure
            # creation, updating rcParams afterwards would not resize it
            plt.rcParams.update({"font.size": _BASE_FONT_SIZE})
            fig, ax = plt.subplots(figsize=(20, 20))
            ax.axis("equal")
            y_col_name = major_digest_df.columns[0]
            # Returned value: (wedges, label texts, autopct texts)
            pie_int = ax.pie(major_digest_df[sample_name],
                             radius=1 - _WEDGE_WIDTH,
                             labels=major_digest_df.index,
                             labeldistance=1 - _WEDGE_WIDTH,
                             rotatelabels=False,
                             autopct=self.make_autopct(major_digest_df[y_col_name]),
                             pctdistance=1 - _WEDGE_WIDTH / 2.0,
                             wedgeprops=_WEDGE_PROPERTIES,
                             textprops=_LABEL_PROPERTIES)
            # Map each inner label to its wedge's RGBA face color
            pie_int_colors = {
                pie_int[1][idx].get_text(): wedge.get_facecolor()
                for idx, wedge in enumerate(pie_int[0])
            }
            # Manually sort the dataset with raw values to follow the order
            # of the digest keywords
            major_raw_ds = pd.DataFrame()
            for digest_keyword in major_digest_df.index:
                if digest_keyword == "Other":
                    major_raw_ds_append = pd.DataFrame(
                        major_digest_df.loc["Other"]).transpose()
                    major_raw_ds_append.index.name = DIGEST_LABEL_COL_NAME
                    major_raw_ds_append = major_raw_ds_append.reset_index()
                else:
                    major_raw_ds_append_right = raw_ds.loc[
                        raw_ds[DIGEST_LABEL_COL_NAME] == digest_keyword,
                        [REFERENCE_COL_NAME, sample_name, DIGEST_LABEL_COL_NAME,
                         RAW_LABEL_COL_NAME]]
                    major_raw_ds_append_left = Utilities.get_n_majors_from_df(
                        major_raw_ds_append_right.set_index(REFERENCE_COL_NAME),
                        sample_name,
                        n=OUTER_DONUT_SUBGROUPS - 1).rename(
                            index={"Other": digest_keyword}).reset_index()
                    major_raw_ds_append = Utilities.left_merge(
                        major_raw_ds_append_left, major_raw_ds_append_right,
                        REFERENCE_COL_NAME)
                    major_raw_ds_append[RAW_LABEL_COL_NAME] = major_raw_ds_append[
                        RAW_LABEL_COL_NAME].fillna("{}_Other".format(digest_keyword))
                    major_raw_ds_append[DIGEST_LABEL_COL_NAME] = major_raw_ds_append[
                        DIGEST_LABEL_COL_NAME].fillna("Other")
                # Fade the parent wedge color linearly towards _MINIMAL_ALPHA,
                # e.g. with base alpha 1.0 and 5 rows: 1.0, 0.8, 0.6, 0.4, 0.2
                pie_ext_append_colors = []
                for row_number in major_raw_ds_append.index.values:
                    row_color = pie_int_colors.get(digest_keyword)
                    if not row_color:
                        continue
                    row_old_alpha = row_color[3]
                    _MINIMAL_ALPHA = 0.2
                    if major_raw_ds_append.shape[0] < 4:
                        row_new_alpha = row_old_alpha - (
                            row_old_alpha * row_number * _MINIMAL_ALPHA)
                    else:
                        row_new_alpha = row_old_alpha - (
                            (row_old_alpha - _MINIMAL_ALPHA) * row_number /
                            float(major_raw_ds_append.shape[0] - 1))
                    pie_ext_append_colors.append(";".join(
                        str(i) for i in list(row_color[:3]) + [row_new_alpha]))
                major_raw_ds_append["color"] = pie_ext_append_colors
                if major_raw_ds_append.shape[0] > 0:
                    if major_raw_ds.shape[0] == 0:
                        major_raw_ds = major_raw_ds_append
                    else:
                        major_raw_ds = pd.concat(
                            [major_raw_ds, major_raw_ds_append], axis=0,
                            ignore_index=True, sort=False)
            major_raw_ds = major_raw_ds.fillna("Other")
            pie_ext = ax.pie(
                major_raw_ds[sample_name], radius=1,
                labels=major_raw_ds[RAW_LABEL_COL_NAME],
                labeldistance=1 - _WEDGE_WIDTH / 2, rotatelabels=True,
                wedgeprops=_WEDGE_PROPERTIES, textprops=_LABEL_PROPERTIES,
                colors=major_raw_ds["color"].apply(lambda x: tuple(
                    float(i) for i in x.split(";"))).values.tolist())
            # Export visualization tables
            sample_export_mask = self.create_mirrored_path(
                [projectDescriber.DATA_DIGEST_DIR, self.value_col_name,
                 col_name_with_keywords, sample_name], makedirs=True)
            Utilities.dump_tsv(major_digest_df.reset_index(),
                               "{}_inner_values.tsv".format(sample_export_mask))
            Utilities.dump_tsv(major_raw_ds,
                               "{}_outer_values.tsv".format(sample_export_mask))
            # Set labels
            ax.set_xlabel(y_col_name)
            ax.set_ylabel(self.value_col_name)
            plt.tight_layout()
            # Export PNG
            pie_file = "{}_double_donut.png".format(sample_export_mask)
            fig.suptitle(pie_file, fontsize=_BASE_FONT_SIZE)
            plt.savefig(pie_file, dpi=300, bbox_inches="tight")
            plt.close("all")
            plt.clf()
def digest_df(self, df: pd.DataFrame, associations: dict,
              columns_with_keywords: list, include_key: bool = True,
              all_in_lowercase: bool = False, strict: bool = False):
    """
    :param df: Pandas DataFrame containing an index, keyword columns and value columns
    :param associations: Dictionary of '{key: (keywords, ...)}' pairs
    :param columns_with_keywords: List of columns to search for keywords
    :param include_key: Should the key of the keyword group itself be counted as a keyword?
    :param all_in_lowercase: Convert both strings to lowercase before comparison?
    :param strict: Count full matches only?
    :return: Tuple of a Pandas DataFrame with keys as index and column sums as
             values, and a DataFrame with the corresponding intermediate
             grouped raw values
    """
    def __regular_search(s: str):
        return any(i in str(s) for i in key_words)

    def __strict_search(s: str):
        return any(i == str(s) for i in key_words)

    df = df.copy()
    df_columns = list(df)
    columns_with_keywords = Utilities.remove_empty_values(columns_with_keywords)
    columns_without_keywords = Utilities.remove_empty_values(
        [i for i in df_columns if i not in columns_with_keywords])
    if len(columns_with_keywords) == 0:
        print("No column for keyword search specified!")
        return
    try:
        # 'columns_with_keywords' may contain more than one column
        df["lookup_column"] = df.loc[:, columns_with_keywords].astype(str).apply(
            lambda x: " ".join(self.prepare_list(x, lowercase=all_in_lowercase)),
            axis=1)
    except KeyError as e:
        print(e, list(df), associations, columns_with_keywords)
    keywords_series = []
    raw_values_ds = pd.DataFrame()
    for main_word in associations:
        key_words = associations.get(main_word)
        if not key_words:
            key_words = ()
        if include_key:
            key_words = list(key_words) + [main_word]
        key_words = sorted(set(self.prepare_list(key_words,
                                                 lowercase=all_in_lowercase)))
        if len(key_words) == 0:
            raise ValueError("No values to search: '{}: {}'".format(main_word,
                                                                    key_words))
        search = __strict_search if strict else __regular_search
        df_with_keywords = df.loc[df["lookup_column"].apply(search),
                                  columns_without_keywords]
        keywords_series.append(df_with_keywords.sum().rename(main_word))
        # Reset the index to avoid exceptions caused by duplicated index values
        raw_values_df = df_with_keywords.reset_index()
        raw_values_df["keyword"] = main_word
        if raw_values_ds.shape[0] == 0:
            raw_values_ds = raw_values_df
        else:
            raw_values_ds = pd.concat([raw_values_ds, raw_values_df], axis=0,
                                      ignore_index=True)
    out_df = Utilities.merge_pd_series_list(keywords_series).fillna(0)
    out_df.columns.name = "value"
    out_df.index.name = "keyword"
    return out_df, raw_values_ds
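# Hypothetical usage sketch ('keeper' stands for a DigestAssociationsKeeper
# instance; the data values are invented):
#
#   demo_df = pd.DataFrame({"gene": ["geneA", "geneB", "geneC"],
#                           "annotation": ["iron transporter", "toxin subunit",
#                                          "iron uptake"],
#                           "sample_1": [10, 5, 2]}).set_index("gene")
#   sums_df, raw_ds = keeper.digest_df(demo_df,
#                                      associations={"iron": (), "toxin": ()},
#                                      columns_with_keywords=["annotation"])
#   # With include_key=True the group keys double as keywords, so sums_df has
#   # index ('iron', 'toxin') and column 'sample_1' with sums (12, 5)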
@staticmethod
def prepare_list(lst: list, lowercase: bool = False):
    return Utilities.remove_empty_values(
        [DigestAssociationsKeeper.prepare_string(i, lowercase) for i in lst])