def _get_feature_info(self):
        """
        Build a table describing every chain-specific regex feature.

        One output row is created for each combination of a row in
        ``self.regex_df`` and a chain type in ``self.chains`` whose
        ``<chain_type>_regex`` entry is not None.

        Returns:
            a pandas DataFrame with the columns:
             - receptor_id: the ``id`` of the regex row formatted as a string
             - chain_id: ``<id>_<chain_type>``
             - chain: lower-cased name of the Chain resolved from the chain type
             - regex: the regular expression itself
             - v_gene: content of the ``<chain_type>V`` column, or None if the row
               has no such column (only present when ``self.match_v_genes`` is truthy)
        """
        column_names = ["receptor_id", "chain_id", "chain", "regex"]
        if self.match_v_genes:
            column_names.append("v_gene")

        features = {name: [] for name in column_names}

        for _, regex_row in self.regex_df.iterrows():
            for chain_type in self.chains:
                pattern = regex_row[f"{chain_type}_regex"]

                # motifs without a regex for this chain do not produce a feature
                if pattern is None:
                    continue

                receptor_id = f"{regex_row['id']}"
                features["receptor_id"].append(receptor_id)
                features["chain_id"].append(f"{receptor_id}_{chain_type}")
                features["chain"].append(Chain.get_chain(chain_type).name.lower())
                features["regex"].append(pattern)

                if self.match_v_genes:
                    v_column = f"{chain_type}V"
                    features["v_gene"].append(regex_row[v_column] if v_column in regex_row else None)

        return pd.DataFrame(features)
예제 #2
0
 def __init__(self,
              v_subgroup: str = None,
              v_gene: str = None,
              v_allele: str = None,
              j_subgroup: str = None,
              j_gene: str = None,
              j_allele: str = None,
              chain=None,
              count: int = None,
              frame_type: str = SequenceFrameType.IN.name,
              region_type: str = None,
              cell_id: str = None,
              custom_params: dict = None):
     """
     Store the annotation of a single sequence, normalising a few of the inputs.

     Gene-related arguments and ``cell_id`` are stored unchanged. The other
     arguments are normalised as follows:
      - chain: a non-empty string is resolved with ``Chain.get_chain``; a Chain
        instance is kept; anything else is stored as None
      - count: a string is converted with ``int(float(count))``; other values
        are stored unchanged
      - frame_type / region_type: a non-empty string is passed to the
        SequenceFrameType / RegionType constructor; an instance of the enum is
        kept; anything else is stored as None
      - custom_params: None is replaced by a new empty dict
     """
     self.v_subgroup = v_subgroup
     self.v_gene = v_gene
     self.v_allele = v_allele
     self.j_subgroup = j_subgroup
     self.j_gene = j_gene
     self.j_allele = j_allele

     if chain and isinstance(chain, str):
         self.chain = Chain.get_chain(chain)
     elif isinstance(chain, Chain):
         self.chain = chain
     else:
         self.chain = None

     if isinstance(count, str):
         # going through float accepts strings such as "3.0"
         self.count = int(float(count))
     else:
         self.count = count

     if frame_type and isinstance(frame_type, str):
         self.frame_type = SequenceFrameType(frame_type)
     elif isinstance(frame_type, SequenceFrameType):
         self.frame_type = frame_type
     else:
         self.frame_type = None

     if region_type and isinstance(region_type, str):
         self.region_type = RegionType(region_type)
     elif isinstance(region_type, RegionType):
         self.region_type = region_type
     else:
         self.region_type = None

     self.cell_id = cell_id
     self.custom_params = {} if custom_params is None else custom_params
예제 #3
0
파일: Matches.py 프로젝트: uio-bmi/immuneML
    def _write_repertoire_sizes(self):
        """
        Write the repertoire sizes (number of reads and number of clones) per subject and chain.

        A table with one row per (subject, chain) combination is created from the
        example ids and the "chain" feature annotation of the encoded data. For each
        repertoire and chain, the sum of the counts of the matching sequences is added
        to ``n_reads`` and their number to ``n_clones``. The table is stored as
        ``repertoire_sizes.csv`` under ``self.result_path``.

        Returns:
            a ReportOutput pointing to the written CSV file
        """
        encoded_data = self.dataset.encoded_data
        subject_ids = encoded_data.example_ids
        chain_names = sorted(set(encoded_data.feature_annotations["chain"]))

        subject_chain_pairs = list(itertools.product(subject_ids, chain_names))
        results_df = pd.DataFrame(subject_chain_pairs, columns=["subject_id", "chain"])
        results_df["n_reads"] = 0
        results_df["n_clones"] = 0

        for repertoire in self.dataset.repertoires:
            rep_counts = repertoire.get_counts()
            rep_chains = repertoire.get_chains()

            for chain_name in chain_names:
                # counts of the sequences in this repertoire that belong to the current chain
                chain_counts = rep_counts[rep_chains == Chain.get_chain(chain_name.upper())]

                # rows of the result table for this subject and chain
                row_selector = (results_df.subject_id == repertoire.metadata["subject_id"]) \
                    & (results_df.chain == chain_name)

                results_df.loc[row_selector, 'n_reads'] += np.sum(chain_counts)
                results_df.loc[row_selector, 'n_clones'] += len(chain_counts)

        results_path = self.result_path / "repertoire_sizes.csv"
        results_df.to_csv(results_path, index=False)

        return ReportOutput(results_path, "Repertoire sizes")
예제 #4
0
    def preprocess_dataframe(df: pd.DataFrame, params):
        """
        Reshape a table with one clonotype per row into a table with one chain per row.

        For every chain in ``params.receptor_chains.value`` and every chain duplicate
        ("1", and also "2" when ``params.import_dual_chains`` is True) a sub-table with the
        columns cell_ids, sequence_aas, v_genes, j_genes and chains (plus any
        ``params.extra_columns_to_load``) is extracted. The sub-tables are concatenated and
        rows lacking a sequence, V gene or J gene are removed.

        Gene columns may list several genes separated by " | ": either all combinations are
        imported (``params.import_all_gene_combinations``) or one gene is picked at random
        per row.

        Args:
            df: the raw table; must contain "Clonotype ID" and the per-chain columns read below
            params: import parameters providing the attributes used in this function

        Returns:
            the preprocessed pandas DataFrame
        """
        subframes = []

        # a one-element tuple needs the trailing comma: ("1") is just the string "1"
        chain_dups_to_process = ("1", "2") if params.import_dual_chains is True else ("1",)

        for chain in params.receptor_chains.value:
            for chain_dup in chain_dups_to_process:
                subframe_dict = {"cell_ids": df["Clonotype ID"],
                                 "sequence_aas": df[f"Chain: {chain} ({chain_dup})"],
                                 "v_genes": df[f"{chain} - V gene ({chain_dup})"],
                                 "j_genes": df[f"{chain} - J gene ({chain_dup})"],
                                 "chains": Chain.get_chain(chain).value}
                if params.extra_columns_to_load is not None:
                    for extra_col in params.extra_columns_to_load:
                        subframe_dict[extra_col] = df[extra_col]
                subframes.append(pd.DataFrame(subframe_dict))

        df = pd.concat(subframes, axis=0)
        df.dropna(subset=["sequence_aas", "v_genes", "j_genes"], inplace=True)

        # the concatenated sub-tables repeat the original index, so a fresh one is created
        df.reset_index(drop=True, inplace=True)

        if params.import_all_gene_combinations:
            df = IRISImport.import_all_gene_combinations(df)
        else:
            for gene_column in ("v_genes", "j_genes"):
                # pick one of the " | "-separated candidate genes at random for every row
                processed_names = [IRISImport._load_gene(rn.choice(raw_gene_string.split(" | ")))
                                   for raw_gene_string in df[gene_column]]
                df[gene_column] = processed_names

        # NOTE(review): the return value is ignored here, so this relies on
        # drop_empty_sequences modifying df in place - confirm against ImportHelper
        ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)

        return df
예제 #5
0
 def get_chains(self):
     """
     Return the "chains" attribute with every entry converted by ``Chain.get_chain``.

     Returns:
         None if the attribute is None; otherwise a numpy array in which each
         non-None entry was passed through ``Chain.get_chain`` and None entries
         are kept as None
     """
     raw_chains = self.get_attribute("chains")
     if raw_chains is None:
         return raw_chains

     converted = []
     for chain_str in raw_chains:
         converted.append(None if chain_str is None else Chain.get_chain(chain_str))
     return np.array(converted)
예제 #6
0
 def get_chain_for_row(row):
     """
     Derive the chain value of a row from the first gene-related column that is set.

     The columns v_subgroup, j_subgroup, v_genes, j_genes, v_alleles and j_alleles are
     checked in that order. The first three characters of the first value that is
     present and not None are passed to ``Chain.get_chain``.

     Returns:
         the ``value`` of the resolved chain, or None if none of the columns is usable
     """
     gene_columns = ("v_subgroup", "j_subgroup", "v_genes", "j_genes", "v_alleles", "j_alleles")
     usable_values = (row[name] for name in gene_columns if name in row and row[name] is not None)

     first_value = next(usable_values, None)
     if first_value is None:
         return None

     return Chain.get_chain(str(first_value)[0:3]).value
예제 #7
0
    def import_receptors(df, params) -> List[Receptor]:
        """
        Import all receptors listed in the given table.

        The chain pair is taken from ``params.receptor_chains``; when that is None it is
        derived from the unique values of the "chains" column. Receptors are then imported
        separately for every unique value of the "receptor_identifiers" column.

        Args:
            df: table with at least the columns "receptor_identifiers" and "chains"
            params: import parameters providing ``receptor_chains`` and ``metadata_column_mapping``

        Returns:
            a flat list with the receptors of all identifiers
        """
        chain_pair = params.receptor_chains
        if chain_pair is None:
            observed_chains = [Chain.get_chain(chain) for chain in df["chains"].unique()]
            chain_pair = ChainPair.get_chain_pair(observed_chains)

        if params.metadata_column_mapping:
            metadata_columns = list(params.metadata_column_mapping.values())
        else:
            metadata_columns = None

        return [receptor
                for identifier in df["receptor_identifiers"].unique()
                for receptor in ImportHelper.import_receptors_by_id(df, identifier, chain_pair, metadata_columns)]
예제 #8
0
    def _process_repertoire(self, repertoire: Repertoire) -> Repertoire:
        """
        Collapse the duplicate sequences of one repertoire into a new repertoire.

        The repertoire data is loaded into a table, grouped by the fields returned by
        ``self._prepare_group_by_field`` and aggregated according to
        ``self._prepare_agg_dict``. Columns that are not in ``Repertoire.FIELDS`` are
        treated as custom lists.

        Args:
            repertoire: the repertoire to process

        Returns:
            a new Repertoire built from the aggregated table, stored under
            ``self.result_path`` with the file name base ``<original stem>_filtered``
        """
        data = pd.DataFrame(repertoire.load_data())

        groupby_fields = self._prepare_group_by_field(data.columns)
        custom_lists = list(set(data.columns) - set(Repertoire.FIELDS))
        agg_dict = self._prepare_agg_dict(data.columns, custom_lists)

        # Chain objects can not be aggregated, convert to strings
        if "chains" not in data.columns:
            data["chains"] = None
        else:
            data["chains"] = [chain.value if isinstance(chain, Chain) else chain
                              for chain in data["chains"]]

        no_duplicates = data.groupby(groupby_fields).agg(agg_dict).reset_index()

        def column_or_none(name):
            # list with the column's values, or None when the column is missing
            return list(no_duplicates[name]) if name in no_duplicates.columns else None

        chain_keys = column_or_none("chains")
        # turn the string values back into Chain objects
        chains = None if chain_keys is None else [Chain.get_chain(key) for key in chain_keys]

        build_args = {
            "sequence_aas": column_or_none("sequence_aas"),
            "sequences": column_or_none("sequences"),
            "v_genes": column_or_none("v_genes"),
            "j_genes": column_or_none("j_genes"),
            "chains": chains,
            "counts": column_or_none("counts"),
            "region_types": column_or_none("region_types"),
            "custom_lists": {key: list(no_duplicates[key]) for key in custom_lists},
            "sequence_identifiers": list(no_duplicates["sequence_identifiers"]),
            "metadata": copy.deepcopy(repertoire.metadata),
            "path": self.result_path,
            "filename_base": f"{repertoire.data_filename.stem}_filtered",
        }

        return Repertoire.build(**build_args)
    def _import_from_files(filenames: List[str], generic_params: DatasetImportParams) -> ReceptorDataset:
        """
        Build a ReceptorDataset from delimited files that describe one receptor per line.

        For every file:
         - rows with missing values and duplicated rows are removed
         - columns are renamed according to ``generic_params.column_mapping``
         - the first and last character of the alpha/beta amino acid sequences and the first
           and last three characters of the alpha/beta nucleotide sequences are cut off
         - gene columns are prepared per chain by ``SingleLineReceptorImport.make_gene_columns``
         - one receptor is built per remaining row; columns whose name does not contain
           "v_gene", "j_gene", "count", "identifier" or one of the chain names are passed
           on as additional receptor information

        Args:
            filenames: paths of the files to import
            generic_params: import parameters (separator, columns to load, column mapping,
                receptor chains, region type, sequence file size, result path)

        Returns:
            the ReceptorDataset built from all imported receptors
        """
        elements = []

        for file in filenames:
            df = pd.read_csv(file, sep=generic_params.separator, usecols=generic_params.columns_to_load)
            # dropna() and drop_duplicates() return new frames; without keeping the result
            # nothing would be filtered. The index is rebuilt so it stays contiguous.
            df = df.dropna().drop_duplicates().reset_index(drop=True)
            df.rename(columns=generic_params.column_mapping, inplace=True)

            # (column, number of leading and trailing characters to remove)
            trimmed_columns = (("alpha_amino_acid_sequence", 1), ("beta_amino_acid_sequence", 1),
                               ("alpha_nucleotide_sequence", 3), ("beta_nucleotide_sequence", 3))
            for column, trim in trimmed_columns:
                if column in df:
                    df[column] = df[column].str[trim:-trim]

            chain_vals = list(generic_params.receptor_chains.value)
            chain_names = [Chain.get_chain(ch).name.lower() for ch in chain_vals]

            for chain_name in chain_names:
                df = SingleLineReceptorImport.make_gene_columns(df, ["v", "j"], chain_name)

            # substrings marking columns that are not passed on as additional information
            excluded_markers = ["v_gene", 'j_gene', "count", "identifier"] + chain_names

            for _, row in df.iterrows():
                sequences = {}
                for chain_val, chain_name in zip(chain_vals, chain_names):
                    aa_column = f"{chain_name}_amino_acid_sequence"
                    nt_column = f"{chain_name}_nucleotide_sequence"

                    metadata = SequenceMetadata(v_gene=row[f"{chain_name}_v_gene"],
                                                v_allele=row[f"{chain_name}_v_allele"],
                                                v_subgroup=row[f'{chain_name}_v_subgroup'],
                                                j_gene=row[f"{chain_name}_j_gene"],
                                                j_allele=row[f"{chain_name}_j_allele"],
                                                j_subgroup=row[f'{chain_name}_j_subgroup'],
                                                chain=chain_name, count=row["count"],
                                                region_type=generic_params.region_type.value)

                    sequences[chain_val] = ReceptorSequence(
                        amino_acid_sequence=row[aa_column] if aa_column in row else None,
                        nucleotide_sequence=row[nt_column] if nt_column in row else None,
                        metadata=metadata)

                extra_info = {key: row[key] for key in row.keys()
                              if all(marker not in key for marker in excluded_markers)}

                elements.append(ReceptorBuilder.build_object(sequences, row["identifier"], extra_info))

        return ReceptorDataset.build(elements, generic_params.sequence_file_size, generic_params.result_path)
예제 #10
0
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        """
        Keep only the repertoires in which every sequence belongs to the requested chain.

        Args:
            dataset: the dataset to filter; it is cloned, not modified
            params: dict with the keys "keep_chain" (passed to ``Chain.get_chain``)
                and "result_path" (built with ``PathBuilder`` and used for the new metadata file)

        Returns:
            the cloned dataset containing only the matching repertoires and pointing to a
            newly built metadata file; ``Filter.check_dataset_not_empty`` is called on it
            before it is returned
        """
        processed_dataset = dataset.clone()
        PathBuilder.build(params["result_path"])

        # the chain to keep does not depend on the repertoire or sequence, so resolve it once
        keep_chain = Chain.get_chain(params["keep_chain"])

        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            if all(sequence.metadata.chain == keep_chain for sequence in repertoire.sequences):
                repertoires.append(repertoire)
                indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = ChainRepertoireFilter.build_new_metadata(
            processed_dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset,
                                       "ChainRepertoireFilter")

        return processed_dataset
예제 #11
0
    def _postprocess_dataframe(df):
        """
        Convert internal columns of the table to their exported representation.

         - "locus": every non-empty entry is replaced by the ``value`` of the chain resolved
           with ``Chain.get_chain``; empty entries become ''
         - "frame_types": converted to strings and translated into the columns "productive",
           "vj_in_frame" (a copy of "productive") and "stop_codon"; rows without a frame type
           get '' in these columns. The "frame_types" column itself is removed.
         - "region_types": removed

        The table is modified in place.

        Args:
            df: the pandas DataFrame to postprocess

        Returns:
            the same DataFrame object after modification
        """
        if "locus" in df.columns:
            df["locus"] = [
                Chain.get_chain(chain).value if chain else ''
                for chain in df["locus"]
            ]

        if "frame_types" in df.columns:
            AIRRExporter._enums_to_strings(df, "frame_types")

            df["productive"] = df["frame_types"] == SequenceFrameType.IN.name
            df.loc[df["frame_types"].isnull(), "productive"] = ''

            df["vj_in_frame"] = df["productive"]

            df["stop_codon"] = df["frame_types"] == SequenceFrameType.STOP.name
            df.loc[df["frame_types"].isnull(), "stop_codon"] = ''

            # drop() returns a new frame unless inplace=True; without it the column stayed
            df.drop(columns=["frame_types"], inplace=True)

        if "region_types" in df.columns:
            df.drop(columns=["region_types"], inplace=True)

        return df
예제 #12
0
 def load_chains_from_chains(df: pd.DataFrame) -> list:
     """
     Translate the entries of the "chains" column into chain values.

     Returns:
         a list with, per row, the ``value`` of the chain resolved by
         ``Chain.get_chain``, or None where the entry is None
     """
     chain_values = []
     for chain_str in df["chains"]:
         if chain_str is None:
             chain_values.append(None)
         else:
             chain_values.append(Chain.get_chain(chain_str).value)
     return chain_values
예제 #13
0
 def __init__(self, keep_chain, result_path: Path = None):
     """
     Set up the filter.

     Args:
         keep_chain: description of the chain to keep; resolved with ``Chain.get_chain``
         result_path: forwarded to the parent class constructor
     """
     super().__init__(result_path)
     resolved_chain = Chain.get_chain(keep_chain)
     self.keep_chain = resolved_chain
    def test_process(self):
        """
        Check DuplicateSequenceFilter on one repertoire with seven sequences:
        first collapsing by amino acid sequence with summed counts, then
        collapsing by nucleotide sequence with minimum counts.
        """
        path = EnvironmentSettings.root_path / "test/tmp/duplicatesequencefilter/"
        PathBuilder.build(path)

        n_sequences = 7
        repertoire = Repertoire.build(
            sequence_aas=["AAA", "AAA", "CCC", "AAA", "CCC", "CCC", "CCC"],
            sequences=["ntAAA", "ntBBB", "ntCCC", "ntAAA", "ntCCC", "ntCCC", "ntDDD"],
            v_genes=["v1"] * n_sequences,
            j_genes=["j1"] * n_sequences,
            chains=[Chain.ALPHA] * 6 + [Chain.BETA],
            counts=[10, 20, 30, 5, 20, None, 40],
            region_types=["IMGT_CDR3"] * n_sequences,
            custom_lists={
                "custom1": ["yes"] * 3 + ["no"] * 4,
                "custom2": ["yes"] * 3 + ["no"] * 4,
            },
            sequence_identifiers=[1, 2, 3, 4, 5, 6, 7],
            path=path)
        dataset = RepertoireDataset(repertoires=[repertoire])

        # collapse by amino acids & use sum counts
        aa_filter = DuplicateSequenceFilter(filter_sequence_type=SequenceType.AMINO_ACID,
                                            count_agg=CountAggregationFunction.SUM,
                                            batch_size=4)
        aa_result = aa_filter.process_dataset(dataset=dataset, result_path=path)
        reduced_by_aa = aa_result.repertoires[0]

        aa_attr = reduced_by_aa.get_attributes(
            ["sequence_identifiers", "sequence_aas", "sequences", "counts", "chains"])

        self.assertEqual(3, len(reduced_by_aa.get_sequence_identifiers()))
        self.assertListEqual(["AAA", "CCC", "CCC"], list(aa_attr["sequence_aas"]))
        self.assertListEqual(["ntAAA", "ntCCC", "ntDDD"], list(aa_attr["sequences"]))
        self.assertListEqual([35, 50, 40], list(aa_attr["counts"]))
        self.assertListEqual([1, 3, 7], list(aa_attr["sequence_identifiers"]))
        expected_chains = [Chain.get_chain("A"), Chain.get_chain("A"), Chain.get_chain('B')]
        self.assertListEqual(expected_chains, list(aa_attr["chains"]))

        # collapse by nucleotides & use min counts
        nt_filter = DuplicateSequenceFilter(filter_sequence_type=SequenceType.NUCLEOTIDE,
                                            count_agg=CountAggregationFunction.MIN,
                                            batch_size=4)
        nt_result = nt_filter.process_dataset(dataset=dataset, result_path=path)
        reduced_by_nt = nt_result.repertoires[0]

        nt_attr = reduced_by_nt.get_attributes(
            ["sequence_identifiers", "sequence_aas", "sequences", "counts"])

        self.assertEqual(4, len(reduced_by_nt.get_sequence_identifiers()))
        self.assertListEqual([1, 2, 3, 7], list(nt_attr["sequence_identifiers"]))
        self.assertListEqual(["AAA", "AAA", "CCC", "CCC"], list(nt_attr["sequence_aas"]))
        self.assertListEqual(["ntAAA", "ntBBB", "ntCCC", "ntDDD"], list(nt_attr["sequences"]))
        self.assertListEqual([5, 20, 20, 40], list(nt_attr["counts"]))

        shutil.rmtree(path)
예제 #15
0
 def load_chains_from_column(df: pd.DataFrame, column_name) -> list:
     """
     Derive chain values from the first three characters of a column's entries.

     Args:
         df: table containing the column
         column_name: name of the column whose three-character prefixes are
             passed to ``Chain.get_chain``

     Returns:
         a list with, per row, the ``value`` of the resolved chain, or None
         where the prefix is None
     """
     prefixes = df[column_name].str[0:3]

     chain_values = []
     for chain_str in prefixes:
         if chain_str is None:
             chain_values.append(None)
         else:
             chain_values.append(Chain.get_chain(chain_str).value)
     return chain_values
예제 #16
0
 def load_chains(df: pd.DataFrame, column_name="chains") -> list:
     """
     Translate the entries of a column into chain values.

     Args:
         df: table containing the column
         column_name: name of the column to translate, "chains" by default

     Returns:
         a list with, per row, the ``value`` of the chain resolved by
         ``Chain.get_chain``, or None where the entry is None
     """
     def to_chain_value(chain_str):
         # None entries are passed through untouched
         return None if chain_str is None else Chain.get_chain(chain_str).value

     return list(map(to_chain_value, df[column_name]))