def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Prepare a VDJdb data frame for import.

    Steps, in order:
        - every row gets the frame type SequenceFrameType.IN
        - ImportHelper.junction_to_cdr3 is applied for the requested region type
        - rows whose sequence identifier is "0" are either dropped (paired,
          non-repertoire import; a warning reports how many) or get their
          identifier replaced by None (all other imports)
        - a "chains" column is derived from the genes if it is missing
        - the original sequence identifiers are kept as "receptor_identifiers"
          and new sequence identifiers are built from identifier and chain
        - gene info is updated, empty / illegal sequences are dropped and the
          meta columns are extracted

    Arguments:
        df: data frame to preprocess; it is modified in place by most steps
        params: DatasetImportParams object defining what to import and how to do it

    Returns:
        the data frame returned by VDJdbImport.extract_meta_columns
    """
    df["frame_types"] = SequenceFrameType.IN.name
    ImportHelper.junction_to_cdr3(df, params.region_type)

    keep_unpaired = params.is_repertoire or not params.paired
    # the warning below refers to rows with identifier "0" as single chains
    single_chain_mask = df["sequence_identifiers"] == "0"

    if keep_unpaired:
        df.loc[single_chain_mask, "sequence_identifiers"] = None
    else:
        n_single_chains = int(single_chain_mask.sum())
        if n_single_chains > 0:
            df.drop(df.index[single_chain_mask], inplace=True)
            warnings.warn(f"VDJdbImport: {n_single_chains} single chains were removed when trying to create a ReceptorDataset.\n"
                          f"To import all chains as a SequenceDataset, use paired = False")

    if "chains" not in df.columns:
        df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

    # keep the raw identifier as receptor id before it is overwritten below
    df["receptor_identifiers"] = df["sequence_identifiers"]
    df["sequence_identifiers"] = VDJdbImport.get_sequence_identifiers(df["sequence_identifiers"],
                                                                      df["chains"])

    ImportHelper.update_gene_info(df)
    ImportHelper.drop_empty_sequences(df,
                                      params.import_empty_aa_sequences,
                                      params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

    return VDJdbImport.extract_meta_columns(df, params)
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Run the common ImportHelper preprocessing steps on a data frame.

    The helpers are called for their effect on ``df`` (their return values are
    not used here), in this order: junction-to-CDR3 conversion, removal of
    empty sequences, removal of sequences with illegal characters, gene info
    update.

    Arguments:
        df: data frame to preprocess
        params: DatasetImportParams object defining what to import and how to do it

    Returns:
        the same data frame object that was passed in
    """
    # adapt the sequences to the requested region type
    ImportHelper.junction_to_cdr3(df, params.region_type)

    # filter rows according to the import settings
    ImportHelper.drop_empty_sequences(
        df,
        params.import_empty_aa_sequences,
        params.import_empty_nt_sequences,
    )
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

    # gene columns are updated last, on the remaining rows
    ImportHelper.update_gene_info(df)

    return df
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Preprocess the data loaded from one MiXCR file, such that:
        - for the given region type, the amino acid and nucleotide sequences are
          copied from the columns named in MiXCRImport.SEQUENCE_NAME_MAP into
          "sequence_aas" and "sequences"
        - ImportHelper.junction_to_cdr3 is applied for the given region type
        - counts are converted to integers (going through float first)
        - V and J genes are loaded with MiXCRImport._load_genes
        - the chain of each sequence is obtained with ImportHelper.load_chains_from_genes
        - gene info is updated and empty / illegal sequences are dropped

    Arguments:
        df: data frame with the content of the MiXCR file; it is modified in place
        params: DatasetImportParams object defining what to import and how to do it

    Returns:
        the preprocessed data frame (the same object that was passed in)
    """
    region_columns = MiXCRImport.SEQUENCE_NAME_MAP[params.region_type]
    df["sequence_aas"] = df[region_columns["AA"]]
    df["sequences"] = df[region_columns["NT"]]

    ImportHelper.junction_to_cdr3(df, params.region_type)

    # two-step cast: to float first, then to int
    counts_as_float = df["counts"].astype(float)
    df["counts"] = counts_as_float.astype(int)

    for gene_column in ("v_genes", "j_genes"):
        df[gene_column] = MiXCRImport._load_genes(df, gene_column)

    df["chains"] = ImportHelper.load_chains_from_genes(df)

    ImportHelper.update_gene_info(df)
    ImportHelper.drop_empty_sequences(df,
                                      params.import_empty_aa_sequences,
                                      params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

    return df
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Preprocess a data frame whose "productive" column holds the strings
    "True" / "False".

    Steps, in order:
        - "frame_types" is set to SequenceFrameType.IN for rows where
          productive is "True" and to None for all other rows
        - rows are kept only if their productive value is allowed by
          params.import_productive / params.import_unproductive
        - ImportHelper.junction_to_cdr3 is applied for the requested region type
        - empty sequences and sequences with illegal characters are dropped
        - gene info is updated
        - a "chains" column is derived from the genes if it is missing

    Arguments:
        df: data frame to preprocess; the "frame_types" column is added to it
            in place, all later steps work on a filtered copy
        params: DatasetImportParams object defining what to import and how to do it

    Returns:
        the filtered and preprocessed data frame (a new object, not ``df``)
    """
    df["frame_types"] = None
    df.loc[df["productive"].eq("True"), "frame_types"] = SequenceFrameType.IN.name

    allowed_productive_values = []
    if params.import_productive:
        allowed_productive_values.append("True")
    if params.import_unproductive:
        allowed_productive_values.append("False")

    # Take an explicit copy of the filtered rows: the helpers below and the
    # "chains" assignment write into this frame, which on a bare boolean-mask
    # selection triggers pandas' SettingWithCopyWarning.
    df = df[df["productive"].isin(allowed_productive_values)].copy()

    ImportHelper.junction_to_cdr3(df, params.region_type)
    ImportHelper.drop_empty_sequences(df,
                                      params.import_empty_aa_sequences,
                                      params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
    ImportHelper.update_gene_info(df)

    if "chains" not in df.columns:
        df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

    return df
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
    """
    Function for preprocessing data from a dataframe containing AIRR data, such that:
        - "frame_types" is derived from the "productive", "vj_in_frame" and
          "stop_codon" columns (when present); a stop codon takes precedence
          because it is applied last
        - if a "productive" column exists, rows are filtered to the frame types
          returned by ImportHelper.prepare_frame_type_list(params)
        - if the region type is IMGT_CDR3 and no sequence columns exist yet, the
          "cdr3" / "cdr3_aa" columns are used as sequences; otherwise, if junction
          columns were mapped, ImportHelper.junction_to_cdr3 is applied
        - if no chain column was specified, the chains are obtained with
          ImportHelper.load_chains_from_genes
        - gene info is updated and empty / illegal sequences are dropped

    Arguments:
        df: data frame with AIRR data; columns may be added to it in place
        params: DatasetImportParams object defining what to import and how to do it

    Returns:
        the preprocessed data frame, as returned by ImportHelper.update_gene_info
        and then filtered by the drop_* helpers
    """
    has_productive = "productive" in df.columns

    if has_productive:
        df["frame_types"] = SequenceFrameType.OUT.name
        # element-wise comparison on a Series; `is True` would not work here
        df.loc[df["productive"] == True, "frame_types"] = SequenceFrameType.IN.name  # noqa: E712
    else:
        df["frame_types"] = None

    if "vj_in_frame" in df.columns:
        df.loc[df["vj_in_frame"] == True, "frame_types"] = SequenceFrameType.IN.name  # noqa: E712
    if "stop_codon" in df.columns:
        df.loc[df["stop_codon"] == True, "frame_types"] = SequenceFrameType.STOP.name  # noqa: E712

    if has_productive:
        frame_type_list = ImportHelper.prepare_frame_type_list(params)
        # Explicit copy: the filtered frame is renamed in place and assigned to
        # below, which on a bare boolean-mask selection triggers pandas'
        # SettingWithCopyWarning.
        df = df[df["frame_types"].isin(frame_type_list)].copy()

    if params.region_type == RegionType.IMGT_CDR3:
        if "sequence_aas" not in df.columns and "sequences" not in df.columns:
            if "cdr3" in df.columns:
                df.rename(columns={"cdr3": "sequences"}, inplace=True)
            if "cdr3_aa" in df.columns:
                df.rename(columns={"cdr3_aa": "sequence_aas"}, inplace=True)
            df.loc[:, "region_types"] = params.region_type.name
        elif "junction" in params.column_mapping or "junction_aa" in params.column_mapping:
            ImportHelper.junction_to_cdr3(df, params.region_type)
    # todo else: support "full_sequence" import through regiontype?

    if "chains" not in df.columns:
        df.loc[:, "chains"] = ImportHelper.load_chains_from_genes(df)

    df = ImportHelper.update_gene_info(df)
    ImportHelper.drop_empty_sequences(df,
                                      params.import_empty_aa_sequences,
                                      params.import_empty_nt_sequences)
    ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)

    return df