def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
    """
    Build a new repertoire in which duplicate sequences are collapsed into single rows.

    The repertoire data is loaded into a dataframe, grouped by the fields returned by
    DuplicateSequenceFilter._prepare_group_by_field and aggregated with the dictionary returned by
    DuplicateSequenceFilter._prepare_agg_dict. Columns that are not listed in Repertoire.FIELDS
    are treated as custom lists.

    Args:
        repertoire: the repertoire whose rows are aggregated
        params: must contain "result_path"; it is also passed on to the two helper functions above

    Returns:
        the repertoire built from the aggregated rows, with a deep copy of the original metadata
        and the file name base "<original data file stem>_filtered"
    """
    data = pd.DataFrame(repertoire.load_data())

    groupby_fields = DuplicateSequenceFilter._prepare_group_by_field(params, data.columns)
    custom_lists = list(set(data.columns) - set(Repertoire.FIELDS))
    agg_dict = DuplicateSequenceFilter._prepare_agg_dict(params, data.columns, custom_lists)

    # Chain objects can not be aggregated, so they are replaced by their string values first
    if "chains" not in data.columns:
        data["chains"] = None
    else:
        data["chains"] = [chain.value if isinstance(chain, Chain) else chain for chain in data["chains"]]

    deduplicated = data.groupby(groupby_fields).agg(agg_dict).reset_index()

    def column_or_none(name):
        # optional fields are passed to Repertoire.build as None when the column is absent
        return list(deduplicated[name]) if name in deduplicated.columns else None

    chain_keys = column_or_none("chains")
    chains = None if chain_keys is None else [Chain(key) for key in chain_keys]

    build_args = {
        "sequence_aas": column_or_none("sequence_aas"),
        "sequences": column_or_none("sequences"),
        "v_genes": column_or_none("v_genes"),
        "j_genes": column_or_none("j_genes"),
        "chains": chains,
        "counts": column_or_none("counts"),
        "region_types": column_or_none("region_types"),
        "custom_lists": {key: list(deduplicated[key]) for key in custom_lists},
        "sequence_identifiers": list(deduplicated["sequence_identifiers"]),
        "metadata": copy.deepcopy(repertoire.metadata),
        "path": params["result_path"],
        "filename_base": f"{repertoire.data_filename.stem}_filtered",
    }

    return Repertoire.build(**build_args)
def __init__(self, v_subgroup: str = None, v_gene: str = None, v_allele: str = None, j_subgroup: str = None,
             j_gene: str = None, j_allele: str = None, chain=None, count: int = None,
             frame_type: str = SequenceFrameType.IN.name, region_type: str = None, cell_id: str = None,
             custom_params: dict = None):
    """
    Store the gene annotations and normalise the enum-like arguments.

    chain, frame_type and region_type may each be given as a non-empty string (converted with
    Chain.get_chain, SequenceFrameType and RegionType respectively) or as an instance of the
    corresponding class (stored as is); any other value is stored as None. A count given as a
    string is converted with int(float(count)). custom_params defaults to a new empty dict.
    """
    self.v_subgroup = v_subgroup
    self.v_gene = v_gene
    self.v_allele = v_allele
    self.j_subgroup = j_subgroup
    self.j_gene = j_gene
    self.j_allele = j_allele

    if chain and isinstance(chain, str):
        self.chain = Chain.get_chain(chain)
    elif isinstance(chain, Chain):
        self.chain = chain
    else:
        self.chain = None

    # string counts such as "3.0" go through float first
    if isinstance(count, str):
        self.count = int(float(count))
    else:
        self.count = count

    if frame_type and isinstance(frame_type, str):
        self.frame_type = SequenceFrameType(frame_type)
    elif isinstance(frame_type, SequenceFrameType):
        self.frame_type = frame_type
    else:
        self.frame_type = None

    if region_type and isinstance(region_type, str):
        self.region_type = RegionType(region_type)
    elif isinstance(region_type, RegionType):
        self.region_type = region_type
    else:
        self.region_type = None

    self.cell_id = cell_id
    self.custom_params = {} if custom_params is None else custom_params
def _write_repertoire_sizes(self):
    """
    Writes the repertoire sizes (# clones & # reads) per subject, per chain.

    A table with one row per (subject, chain) combination is created from the example ids and the
    "chain" feature annotations of the encoded data. For every repertoire, the counts of the
    sequences of each chain are summed into n_reads and the number of those sequences is added to
    n_clones. The table is written to "repertoire_sizes.csv" in the result path.

    Returns:
        a ReportOutput pointing to the written csv file
    """
    subject_ids = self.dataset.encoded_data.example_ids
    chain_names = sorted(set(self.dataset.encoded_data.feature_annotations["chain"]))

    subject_chain_pairs = list(itertools.product(subject_ids, chain_names))
    sizes = pd.DataFrame(subject_chain_pairs, columns=["subject_id", "chain"])
    sizes["n_reads"] = 0
    sizes["n_clones"] = 0

    for repertoire in self.dataset.repertoires:
        counts = repertoire.get_counts()
        chains = repertoire.get_chains()

        for chain_name in chain_names:
            in_chain = chains == Chain.get_chain(chain_name.upper())
            # the single table row belonging to this subject and chain
            target_rows = (sizes.subject_id == repertoire.metadata["subject_id"]) & (sizes.chain == chain_name)
            chain_counts = counts[in_chain]

            sizes.loc[target_rows, 'n_reads'] += np.sum(chain_counts)
            sizes.loc[target_rows, 'n_clones'] += len(chain_counts)

    output_file = self.result_path / "repertoire_sizes.csv"
    sizes.to_csv(output_file, index=False)

    return ReportOutput(output_file, "Repertoire sizes")
def _get_feature_info(self):
    """
    Collects one feature row per (receptor, chain) combination for which a regex is present.

    Returns a pandas dataframe with the columns:
        - receptor_id: the 'id' value of the row in self.regex_df
        - chain_id: "<id>_<chain type>"
        - chain: lower-case name of the Chain matching the chain type
        - regex: the value of the "<chain type>_regex" column
        - v_gene: value of the "<chain type>V" column, or None when that column is absent
          (only present if self.match_v_genes is set)
    """
    column_names = ["receptor_id", "chain_id", "chain", "regex"]
    if self.match_v_genes:
        column_names.append("v_gene")
    features = {name: [] for name in column_names}

    for _, row in self.regex_df.iterrows():
        for chain_type in self.chains:
            regex = row[f"{chain_type}_regex"]

            # chains without a regex do not produce a feature
            if regex is None:
                continue

            features["receptor_id"].append(f"{row['id']}")
            features["chain_id"].append(f"{row['id']}_{chain_type}")
            features["chain"].append(Chain.get_chain(chain_type).name.lower())
            features["regex"].append(regex)

            if self.match_v_genes:
                v_column = f"{chain_type}V"
                features["v_gene"].append(row[v_column] if v_column in row else None)

    return pd.DataFrame(features)
def preprocess_dataframe(df: pd.DataFrame, params):
    """
    Reshape the imported table into one row per chain sequence.

    For every chain in params.receptor_chains.value and every chain duplicate ("1", and also "2"
    when params.import_dual_chains is True) a sub-frame is built from the columns "Clonotype ID",
    "Chain: <chain> (<dup>)", "<chain> - V gene (<dup>)" and "<chain> - J gene (<dup>)", plus any
    columns listed in params.extra_columns_to_load. The sub-frames are concatenated and rows that
    lack a sequence, V gene or J gene are dropped.

    If params.import_all_gene_combinations is set, gene handling is delegated to
    IRISImport.import_all_gene_combinations; otherwise one of the " | "-separated gene names is
    picked at random for each row and passed through IRISImport._load_gene.

    Args:
        df: the raw dataframe containing the columns named above
        params: import parameters providing the attributes read above

    Returns:
        the reshaped dataframe
    """
    # A one-element tuple needs the trailing comma: ("1") is just the string "1", which only
    # behaved correctly because iterating a one-character string yields that same character.
    chain_dups_to_process = ("1", "2") if params.import_dual_chains is True else ("1",)

    subframes = []
    for chain in params.receptor_chains.value:
        for chain_dup in chain_dups_to_process:
            subframe_dict = {"cell_ids": df["Clonotype ID"],
                             "sequence_aas": df[f"Chain: {chain} ({chain_dup})"],
                             "v_genes": df[f"{chain} - V gene ({chain_dup})"],
                             "j_genes": df[f"{chain} - J gene ({chain_dup})"],
                             "chains": Chain.get_chain(chain).value}

            if params.extra_columns_to_load is not None:
                for extra_col in params.extra_columns_to_load:
                    subframe_dict[extra_col] = df[extra_col]

            subframes.append(pd.DataFrame(subframe_dict))

    df = pd.concat(subframes, axis=0)
    df.dropna(subset=["sequence_aas", "v_genes", "j_genes"], inplace=True)
    df.reset_index(drop=True, inplace=True)

    if params.import_all_gene_combinations:
        df = IRISImport.import_all_gene_combinations(df)
    else:
        for gene_column in ("v_genes", "j_genes"):
            # several candidate genes may be listed for one sequence; one is chosen at random
            df[gene_column] = [IRISImport._load_gene(rn.choice(raw_gene_string.split(" | ")))
                               for raw_gene_string in df[gene_column]]

    ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)

    return df
def get_chains(self):
    """
    Return the "chains" attribute converted with Chain.get_chain.

    Returns:
        None if the attribute is None; otherwise a numpy array with one entry per stored value,
        where None entries are kept as None
    """
    raw_chains = self.get_attribute("chains")
    if raw_chains is None:
        return raw_chains

    converted = []
    for chain_str in raw_chains:
        converted.append(None if chain_str is None else Chain.get_chain(chain_str))

    return np.array(converted)
def get_chain_for_row(row):
    """
    Derive the chain value of a row from the first available gene annotation.

    The columns are checked in a fixed order; for the first one that is present and not None, the
    first three characters of its string representation are passed to Chain.get_chain and the
    value of the resulting chain is returned.

    Returns:
        the chain value, or None if none of the columns is present with a non-None value
    """
    candidate_columns = ("v_subgroup", "j_subgroup", "v_genes", "j_genes", "v_alleles", "j_alleles")

    for column in candidate_columns:
        if column not in row:
            continue
        annotation = row[column]
        if annotation is None:
            continue
        prefix = str(annotation)[0:3]
        return Chain.get_chain(prefix).value

    return None
def import_receptors(df, params) -> List[Receptor]:
    """
    Import the receptors for every unique value of the "receptor_identifiers" column.

    If params.receptor_chains is None, the chain pair is derived from the unique values of the
    "chains" column through Chain.get_chain and ChainPair.get_chain_pair. The values of
    params.metadata_column_mapping (if it is set and non-empty) are passed on as metadata columns.

    Returns:
        a flat list with the receptors returned by ImportHelper.import_receptors_by_id for each
        identifier, in the order of the identifiers
    """
    identifiers = df["receptor_identifiers"].unique()

    chain_pair = params.receptor_chains
    if chain_pair is None:
        chains = [Chain.get_chain(chain) for chain in df["chains"].unique()]
        chain_pair = ChainPair.get_chain_pair(chains)

    if params.metadata_column_mapping:
        metadata_columns = list(params.metadata_column_mapping.values())
    else:
        metadata_columns = None

    return [receptor
            for identifier in identifiers
            for receptor in ImportHelper.import_receptors_by_id(df, identifier, chain_pair, metadata_columns)]
def _import_from_files(filenames: List[str], generic_params: DatasetImportParams) -> ReceptorDataset:
    """
    Read receptors from the given files and build a receptor dataset from them.

    Each file is read with the separator and columns from generic_params, rows with missing values
    and duplicate rows are removed, and the columns are renamed according to
    generic_params.column_mapping. The first and last character are stripped from the alpha/beta
    amino acid sequence columns and the first and last three characters from the alpha/beta
    nucleotide sequence columns, when those columns are present. Gene columns are added per chain
    with SingleLineReceptorImport.make_gene_columns, and one receptor is built per row.

    Args:
        filenames: paths of the files to import
        generic_params: import parameters (separator, columns_to_load, column_mapping,
            receptor_chains, region_type, sequence_file_size and result_path are read here)

    Returns:
        the ReceptorDataset built from all receptors of all files
    """
    # number of characters removed from both ends of each raw sequence column
    trim_lengths = {"alpha_amino_acid_sequence": 1, "beta_amino_acid_sequence": 1,
                    "alpha_nucleotide_sequence": 3, "beta_nucleotide_sequence": 3}

    elements = []

    for file in filenames:
        df = pd.read_csv(file, sep=generic_params.separator, usecols=generic_params.columns_to_load)

        # dropna() and drop_duplicates() return new frames; the results were previously discarded,
        # so neither call had any effect. The index is reset so that later steps still see a
        # contiguous index after rows have been removed.
        df = df.dropna().drop_duplicates().reset_index(drop=True)
        df.rename(columns=generic_params.column_mapping, inplace=True)

        for column, trim in trim_lengths.items():
            if column in df:
                df[column] = df[column].str[trim:-trim]

        chain_vals = [ch for ch in generic_params.receptor_chains.value]
        chain_names = [Chain.get_chain(ch).name.lower() for ch in generic_params.receptor_chains.value]

        for chain_name in chain_names:
            df = SingleLineReceptorImport.make_gene_columns(df, ["v", "j"], chain_name)

        # columns whose name contains any of these parts are not passed on as receptor metadata
        excluded_key_parts = ["v_gene", 'j_gene', "count", "identifier"] + chain_names

        for index, row in df.iterrows():
            sequences = {}
            for i, name in enumerate(chain_names):
                aa_column = name + "_amino_acid_sequence"
                nt_column = name + "_nucleotide_sequence"

                metadata = SequenceMetadata(v_gene=row[f"{name}_v_gene"], v_allele=row[f"{name}_v_allele"],
                                            v_subgroup=row[f'{name}_v_subgroup'],
                                            j_gene=row[f"{name}_j_gene"], j_allele=row[f"{name}_j_allele"],
                                            j_subgroup=row[f'{name}_j_subgroup'],
                                            chain=name, count=row["count"],
                                            region_type=generic_params.region_type.value)

                sequences[chain_vals[i]] = ReceptorSequence(
                    amino_acid_sequence=row[aa_column] if aa_column in row else None,
                    nucleotide_sequence=row[nt_column] if nt_column in row else None,
                    metadata=metadata)

            receptor_metadata = {key: row[key] for key in row.keys()
                                 if all(item not in key for item in excluded_key_parts)}

            elements.append(ReceptorBuilder.build_object(sequences, row["identifier"], receptor_metadata))

    return ReceptorDataset.build(elements, generic_params.sequence_file_size, generic_params.result_path)
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    """
    Keep only the repertoires in which every sequence has the requested chain.

    Args:
        dataset: the dataset to filter; it is cloned, not modified
        params: must contain "keep_chain" (passed to Chain.get_chain) and "result_path" (directory
            that is created and used for the new metadata file)

    Returns:
        a clone of the dataset holding only the matching repertoires and a metadata file rebuilt
        by ChainRepertoireFilter.build_new_metadata for the kept indices
    """
    processed_dataset = dataset.clone()
    PathBuilder.build(params["result_path"])

    # resolved once instead of once per sequence: the requested chain does not change in the loop
    keep_chain = Chain.get_chain(params["keep_chain"])

    repertoires = []
    indices = []
    for index, repertoire in enumerate(dataset.get_data()):
        if all(sequence.metadata.chain == keep_chain for sequence in repertoire.sequences):
            repertoires.append(repertoire)
            indices.append(index)

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = ChainRepertoireFilter.build_new_metadata(processed_dataset, indices,
                                                                               params["result_path"])

    Filter.check_dataset_not_empty(processed_dataset, "ChainRepertoireFilter")

    return processed_dataset
def _postprocess_dataframe(df):
    """
    Convert internal columns of the dataframe to the exported representation.

    - "locus": every non-empty value is replaced by the value of the matching Chain, every empty
      value by ''.
    - "frame_types": used to derive the columns "productive" and "vj_in_frame" (frame type equals
      SequenceFrameType.IN.name) and "stop_codon" (frame type equals SequenceFrameType.STOP.name);
      the derived columns are '' where the frame type is missing. The "frame_types" column itself
      is then removed.
    - "region_types": removed.

    The dataframe is modified in place and also returned.
    """
    if "locus" in df.columns:
        df["locus"] = [Chain.get_chain(chain).value if chain else '' for chain in df["locus"]]

    if "frame_types" in df.columns:
        AIRRExporter._enums_to_strings(df, "frame_types")

        frame_missing = df["frame_types"].isnull()

        df["productive"] = df["frame_types"] == SequenceFrameType.IN.name
        df.loc[frame_missing, "productive"] = ''

        df["vj_in_frame"] = df["productive"]

        df["stop_codon"] = df["frame_types"] == SequenceFrameType.STOP.name
        df.loc[frame_missing, "stop_codon"] = ''

        # DataFrame.drop returns a new frame unless inplace=True; the result used to be discarded,
        # which left the column in the output
        df.drop(columns=["frame_types"], inplace=True)

    if "region_types" in df.columns:
        # same as above: drop in place so that the column is really removed
        df.drop(columns=["region_types"], inplace=True)

    return df
def load_chains_from_chains(df: pd.DataFrame) -> list:
    """
    Convert the entries of the "chains" column to chain values.

    Returns:
        a list with Chain.get_chain(entry).value for every entry that is not None, and None for
        the entries that are None
    """
    chain_values = []
    for chain_str in df["chains"]:
        if chain_str is None:
            chain_values.append(None)
        else:
            chain_values.append(Chain.get_chain(chain_str).value)
    return chain_values
def __init__(self, keep_chain, result_path: Path = None):
    """
    Initialise the parent class with the result path and store the chain to keep.

    Args:
        keep_chain: chain specification, converted with Chain.get_chain before it is stored
        result_path: passed unchanged to the parent constructor
    """
    super().__init__(result_path)
    resolved_chain = Chain.get_chain(keep_chain)
    self.keep_chain = resolved_chain
def test_process(self):
    """
    Check that DuplicateSequenceFilter collapses duplicates by amino acid sequence (summing the
    counts) and by nucleotide sequence (taking the minimum count).
    """
    path = EnvironmentSettings.root_path / "test/tmp/duplicatesequencefilter/"
    PathBuilder.build(path)

    # the temporary directory is removed even when an assertion fails, so that files left over
    # from a failed run cannot influence later runs
    try:
        repertoire = Repertoire.build(sequence_aas=["AAA", "AAA", "CCC", "AAA", "CCC", "CCC", "CCC"],
                                      sequences=["ntAAA", "ntBBB", "ntCCC", "ntAAA", "ntCCC", "ntCCC", "ntDDD"],
                                      v_genes=["v1", "v1", "v1", "v1", "v1", "v1", "v1"],
                                      j_genes=["j1", "j1", "j1", "j1", "j1", "j1", "j1"],
                                      chains=[Chain.ALPHA, Chain.ALPHA, Chain.ALPHA, Chain.ALPHA, Chain.ALPHA,
                                              Chain.ALPHA, Chain.BETA],
                                      counts=[10, 20, 30, 5, 20, None, 40],
                                      region_types=["IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3", "IMGT_CDR3",
                                                    "IMGT_CDR3", "IMGT_CDR3"],
                                      custom_lists={"custom1": ["yes", "yes", "yes", "no", "no", "no", "no"],
                                                    "custom2": ["yes", "yes", "yes", "no", "no", "no", "no"]},
                                      sequence_identifiers=[1, 2, 3, 4, 5, 6, 7],
                                      path=path)
        dataset = RepertoireDataset(repertoires=[repertoire])

        # collapse by amino acids & use sum counts
        dupfilter = DuplicateSequenceFilter(filter_sequence_type=SequenceType.AMINO_ACID,
                                            count_agg=CountAggregationFunction.SUM, batch_size=4)

        reduced_repertoire = dupfilter.process_dataset(dataset=dataset, result_path=path).repertoires[0]

        attr = reduced_repertoire.get_attributes(["sequence_identifiers", "sequence_aas", "sequences", "counts",
                                                  "chains"])

        self.assertEqual(3, len(reduced_repertoire.get_sequence_identifiers()))
        self.assertListEqual(["AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
        self.assertListEqual(["ntAAA", "ntCCC", "ntDDD"], list(attr["sequences"]))
        self.assertListEqual([35, 50, 40], list(attr["counts"]))
        self.assertListEqual([1, 3, 7], list(attr["sequence_identifiers"]))
        self.assertListEqual([Chain.get_chain("A"), Chain.get_chain("A"), Chain.get_chain('B')],
                             list(attr["chains"]))

        # collapse by nucleotides & use min counts
        dupfilter = DuplicateSequenceFilter(filter_sequence_type=SequenceType.NUCLEOTIDE,
                                            count_agg=CountAggregationFunction.MIN, batch_size=4)

        reduced_repertoire = dupfilter.process_dataset(dataset=dataset, result_path=path).repertoires[0]

        attr = reduced_repertoire.get_attributes(["sequence_identifiers", "sequence_aas", "sequences", "counts"])

        self.assertEqual(4, len(reduced_repertoire.get_sequence_identifiers()))
        self.assertListEqual([1, 2, 3, 7], list(attr["sequence_identifiers"]))
        self.assertListEqual(["AAA", "AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
        self.assertListEqual(["ntAAA", "ntBBB", "ntCCC", "ntDDD"], list(attr["sequences"]))
        self.assertListEqual([5, 20, 20, 40], list(attr["counts"]))
    finally:
        shutil.rmtree(path)
def load_chains_from_column(df: pd.DataFrame, column_name) -> list:
    """
    Derive chain values from the first three characters of the entries of a column.

    Args:
        df: dataframe containing the column
        column_name: name of a string column; only the first three characters of each entry are
            passed to Chain.get_chain

    Returns:
        a list with the chain value for every prefix that is not None, and None otherwise
    """
    prefixes = df[column_name].str[0:3]

    chain_values = []
    for chain_str in prefixes:
        if chain_str is None:
            chain_values.append(None)
        else:
            chain_values.append(Chain.get_chain(chain_str).value)
    return chain_values
def load_chains(df: pd.DataFrame, column_name="chains") -> list:
    """
    Convert the entries of a column to chain values.

    Args:
        df: dataframe containing the column
        column_name: name of the column holding the chain specifications ("chains" by default)

    Returns:
        a list with Chain.get_chain(entry).value for every entry that is not None, and None for
        the entries that are None
    """
    chain_values = []
    for chain_str in df[column_name]:
        if chain_str is None:
            chain_values.append(None)
        else:
            chain_values.append(Chain.get_chain(chain_str).value)
    return chain_values