def load(self):
    # Most of the reference sequences are short enough not to be split
    self.annotation_file = self.refdata.get_sequence_dict()["annotation"]
    print(f"Using the annotation file: '{self.annotation_file}'")
    self.annotation_df = load_tsv(self.annotation_file)
    print(f"Loaded annotation table with shape {self.annotation_df.shape}")
    self._annotation_df = self.annotation_df.copy()
    self.raw_nucleotide_fasta_headers = self.annotation_df[
        "former_id"].values.tolist()
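# NOTE: `load_tsv` is a repository utility that is not defined in this chunk.
# A minimal sketch under the assumption that it simply wraps pandas.read_csv
# for a tab-separated table with a header row (not the actual implementation):
import pandas as pd

def load_tsv(table_file: str) -> pd.DataFrame:
    # Read a tab-separated table; the first row is used as the column header
    return pd.read_csv(table_file, sep="\t", header=0, encoding="utf-8")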
        _namespace.separator,
        _namespace.output_file,
    )


if __name__ == '__main__':
    (
        target_file,
        target_column_name,
        source_table_file,
        source_column_names,
        separator,
        out_file,
    ) = _parse_args()

    source_df = load_tsv(source_table_file)
    source_sub_df = pd.DataFrame()
    source_sub_df["target"] = source_df[target_column_name].copy()
    source_sub_df["source"] = source_df.loc[:, source_column_names].fillna(
        "").agg(separator.join, axis=1)
    # Keep only rows where the joined source string is non-empty
    source_sub_df = source_sub_df.loc[
        source_sub_df["source"].map(lambda x: len(x) > 0), :]

    target_string = load_string(target_file)
    collector_string = str(target_string)
    counter = 0
    # Column order in `source_sub_df` is ["target", "source"], so `source`
    # receives the target-column value and `target` the joined source string
    for source, target in source_sub_df.values:
        collector_string = collector_string.replace(str(source), target)
        counter += 1
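# NOTE: `load_string` is another helper that is not defined in this chunk.
# A plausible minimal version (an assumption, not the repository's code):
def load_string(file_path: str) -> str:
    # Read the whole file into a single UTF-8 string
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()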
                        choices=["inner", "outer"],
                        help="(Optional) How to handle indexes on other axis (or axes)")
    parser.add_argument("-o", "--output", required=True, help="Output table")
    _namespace = parser.parse_args()
    return (_namespace.input, _namespace.axis, _namespace.index,
            _namespace.join, _namespace.output)


if __name__ == '__main__':
    input_tables, axis, index, join, output_table = parse_args()

    table_files = remove_empty_values(
        [i for i in input_tables if is_file_valid(i)])
    if len(table_files) == 0:
        raise ValueError("No valid tables!")

    dataframes = []
    for table_file in table_files:
        dataframe = load_tsv(table_file)
        if dataframe.shape[0] == 0:
            continue
        dataframe[PATH_COL_NAME] = table_file
        dataframes.append(dataframe)

    is_index = len(index) > 0
    if is_index:
        for dataframe in dataframes:
            dataframe.set_index(index, inplace=True)

    print(
        f"Concatenate {len(dataframes)} dataframes with the shapes: {[i.shape for i in dataframes]}"
    )
    start = perf_counter()
    out_df = pd.concat(dataframes,
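# NOTE: `remove_empty_values` and `is_file_valid` are utilities imported from
# elsewhere in the repository. Rough sketches under assumed semantics (an
# existing, non-empty file counts as valid; falsy entries are dropped):
import os

def is_file_valid(file_path: str) -> bool:
    # True for a path that points to an existing, non-empty regular file
    return os.path.isfile(file_path) and os.path.getsize(file_path) > 0

def remove_empty_values(values):
    # Drop falsy entries (None, empty strings) while preserving order
    return [i for i in values if i]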
                        required=True,
                        help="File with text to perform replacements in")
    parser.add_argument("-t", "--table", required=True,
                        help="Table with values to fetch")
    parser.add_argument("-o", "--output", required=True, help="Output file")
    _namespace = parser.parse_args()
    return _namespace.input, _namespace.table, _namespace.output


if __name__ == '__main__':
    text_file, combined_blast_result_file, out_file = parse_args()

    text_content = load_string(text_file)
    combined_blast_result_df = load_tsv(combined_blast_result_file).set_index(
        "geninfo_id")
    print(
        f"Loaded replacer table with the shape {combined_blast_result_df.shape}"
    )

    renaming_dict = combined_blast_result_df["strain"].map(
        lambda x: str(x).strip()).to_dict()

    # Use a raw string for the regex to avoid an invalid escape sequence;
    # strip '.gbk' / '.gff' extensions before renaming
    text_content_replaced = re.sub(r"\.(gbk|gff)", "", text_content)
    counter = 0
    for renaming_key, renaming_value in renaming_dict.items():
        text_content_replaced = text_content_replaced.replace(
            *[str(i) for i in [renaming_key, renaming_value]])
        counter += 1
    print(f"{counter} replacements were performed")
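# Small self-contained illustration of the extension-stripping step above,
# using a made-up input string (hypothetical, for demonstration only):
import re

assert re.sub(r"\.(gbk|gff)", "", "sample_01.gbk vs sample_02.gff vs notes.txt") \
    == "sample_01 vs sample_02 vs notes.txt"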
                        required=True, help="Output file")
    _namespace = parser.parse_args()
    return (_namespace.rgi_dir, _namespace.card_version, _namespace.nbee_dir,
            _namespace.output_file)


if __name__ == '__main__':
    (rgi_dir, card_version, nbee_dir, out_file) = _parse_args()

    # RGI
    rgi_tables = find_file_by_tail(dir_name=rgi_dir, multiple=True,
                                   tail=".txt")
    merged_rgi_df = pd.DataFrame()
    for rgi_table in rgi_tables:
        rgi_df = load_tsv(rgi_table)
        if rgi_df.shape[0] == 0:
            continue
        rgi_df = remove_longest_columns(rgi_df, CELL_SIZE_LIMIT)
        columns = rgi_df.columns.tolist()
        rgi_df["sample_name"] = filename_only(rgi_table)
        rgi_df = rgi_df.loc[:, ["sample_name"] + columns]
        print(
            f"Concatenate dataframes with shapes {rgi_df.shape}, {merged_rgi_df.shape}"
        )
        merged_rgi_df = pd.concat([merged_rgi_df, rgi_df],
                                  axis=0,
                                  ignore_index=True)

    reference_name = "_".join(remove_empty_values(["card", card_version]))
    print(f"Finished concatenating tables for '{reference_name}'")
    sheets = {reference_name: merged_rgi_df}
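# NOTE: `find_file_by_tail` and `filename_only` are repository utilities not
# shown in this chunk; `remove_longest_columns` presumably trims columns whose
# cells exceed CELL_SIZE_LIMIT characters (not sketched here). Minimal sketches
# under assumed semantics, not the actual implementations:
import os

def find_file_by_tail(dir_name: str, tail: str, multiple: bool = False):
    # Collect files inside `dir_name` whose names end with `tail`
    hits = sorted(
        os.path.join(dir_name, i) for i in os.listdir(dir_name)
        if i.endswith(tail)
    )
    return hits if multiple else (hits[0] if len(hits) > 0 else "")

def filename_only(file_path: str) -> str:
    # Base name without the extension, e.g. '/data/sample_01.txt' -> 'sample_01'
    return os.path.splitext(os.path.basename(file_path))[0]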