def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
    """Group the repertoire's rows, aggregate them and build a new repertoire.

    The data returned by ``repertoire.load_data()`` is loaded into a
    DataFrame, grouped by the fields returned by
    ``DuplicateSequenceFilter._prepare_group_by_field`` and aggregated with
    the dict returned by ``DuplicateSequenceFilter._prepare_agg_dict``.
    The aggregated rows are then passed to ``Repertoire.build``.

    Args:
        repertoire: source repertoire; its ``load_data()``, ``metadata`` and
            ``data_filename`` are read.
        params: forwarded to the two ``_prepare_*`` helpers; must also
            contain ``"result_path"``, which is passed to
            ``Repertoire.build`` as ``path``.

    Returns:
        The repertoire produced by ``Repertoire.build``, with a deep copy of
        the source metadata and a filename base of
        ``<original data file stem>_filtered``.
    """
    data = pd.DataFrame(repertoire.load_data())

    # Every column that is not a standard Repertoire field is treated as a
    # custom list and carried over to the new repertoire.
    custom_lists = list(set(data.columns) - set(Repertoire.FIELDS))
    groupby_fields = DuplicateSequenceFilter._prepare_group_by_field(params, data.columns)
    agg_dict = DuplicateSequenceFilter._prepare_agg_dict(params, data.columns, custom_lists)

    # Chain objects can not be aggregated, so swap them for their `.value`
    # before grouping; a missing column is created and filled with None.
    if "chains" not in data.columns:
        data["chains"] = None
    else:
        data["chains"] = [entry if not isinstance(entry, Chain) else entry.value
                          for entry in data["chains"]]

    grouped = data.groupby(groupby_fields)
    deduplicated = grouped.agg(agg_dict).reset_index()

    def column_or_none(name):
        """Return the aggregated column as a list, or None when it is absent."""
        if name in deduplicated.columns:
            return list(deduplicated[name])
        return None

    # Rebuild Chain objects from the values stored for the aggregation.
    chain_values = column_or_none("chains")
    chains = None if chain_values is None else [Chain(value) for value in chain_values]

    return Repertoire.build(
        sequence_aas=column_or_none("sequence_aas"),
        sequences=column_or_none("sequences"),
        v_genes=column_or_none("v_genes"),
        j_genes=column_or_none("j_genes"),
        chains=chains,
        counts=column_or_none("counts"),
        region_types=column_or_none("region_types"),
        custom_lists={name: list(deduplicated[name]) for name in custom_lists},
        # Unlike the optional fields above, this column is read unconditionally.
        sequence_identifiers=list(deduplicated["sequence_identifiers"]),
        metadata=copy.deepcopy(repertoire.metadata),
        path=params["result_path"],
        filename_base=f"{repertoire.data_filename.stem}_filtered",
    )
def _repertoire_to_dataframe(repertoire: Repertoire, region_type):
    """Load a repertoire into a DataFrame and rename its columns for AIRR.

    Args:
        repertoire: repertoire whose ``load_data()`` result (all fields,
            custom ones included) fills the DataFrame.
        region_type: passed to ``AIRRExporter.get_sequence_field`` and
            ``AIRRExporter.get_sequence_aa_field`` to pick the target names
            of the ``sequences`` and ``sequence_aas`` columns.

    Returns:
        The renamed DataFrame; columns without an entry in the mapping keep
        their original names.
    """
    frame = pd.DataFrame(repertoire.load_data())

    # Make sure all four gene/allele columns exist, filling absent ones
    # with empty strings, before the gene columns are updated.
    for gene_column in ('v_alleles', 'j_alleles', 'v_genes', 'j_genes'):
        if gene_column in frame.columns:
            continue
        frame.loc[:, gene_column] = ''

    AIRRExporter.update_gene_columns(frame, 'alleles', 'genes')

    # Mandatory fields are renamed for AIRR compliance.
    airr_names = {
        "sequence_identifiers": "sequence_id",
        "v_alleles": "v_call",
        "j_alleles": "j_call",
        "chains": "locus",
        "counts": "duplicate_count",
    }
    # The sequence column names depend on the region type.
    airr_names["sequences"] = AIRRExporter.get_sequence_field(region_type)
    airr_names["sequence_aas"] = AIRRExporter.get_sequence_aa_field(region_type)

    return frame.rename(columns=airr_names)