Example #1
    def process_dataset(self, dataset: RepertoireDataset, result_path: Path = None):
        self.check_dataset_type(dataset, [RepertoireDataset], "ClonesPerRepertoireFilter")
        self.result_path = result_path if result_path is not None else self.result_path

        processed_dataset = dataset.clone()
        repertoires, indices = [], []

        for index, repertoire in enumerate(dataset.get_data()):
            if self.lower_limit != -1 and len(repertoire.sequences) < self.lower_limit:
                continue
            if self.upper_limit != -1 and len(repertoire.sequences) > self.upper_limit:
                continue
            repertoires.append(dataset.repertoires[index])
            indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = self._build_new_metadata(dataset, indices)

        self.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")

        return processed_dataset
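
A minimal invocation sketch for the filter above; the constructor arguments (lower_limit and upper_limit, with -1 as the "disabled" sentinel, matching the checks in the loop) and the path are assumptions for illustration:

    clone_filter = ClonesPerRepertoireFilter(lower_limit=10, upper_limit=-1)  # keep repertoires with at least 10 clones
    filtered_dataset = clone_filter.process_dataset(dataset, result_path=Path("tmp/filtered"))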
Example #2
    def test_process(self):
        path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
        PathBuilder.build(path)

        reps = [
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence("AAA", identifier="1")],
                path=path,
                metadata={"subject_id": "patient1"}),
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence("AAC", identifier="2")],
                path=path,
                metadata={"subject_id": "patient1"}),
            Repertoire.build_from_sequence_objects(
                [ReceptorSequence("AAC", identifier="3")],
                path=path,
                metadata={"subject_id": "patient3"})
        ]

        dataset = RepertoireDataset(repertoires=reps)

        dataset2 = SubjectRepertoireCollector.process(
            dataset, {"result_path": path / "result"})

        self.assertEqual(2, len(dataset2.get_data()))
        self.assertEqual(3, len(dataset.get_data()))
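        # Three input repertoires span two subjects: the collector merges repertoires
        # per subject_id, so patient1's "AAA" and "AAC" end up in one repertoire with
        # 2 sequences while patient3 keeps 1, hence the sizes [2, 1] checked below.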

        values = [2, 1]
        for index, rep in enumerate(dataset2.get_data()):
            self.assertEqual(values[index], len(rep.sequences))

        shutil.rmtree(path)
Example #3
    def process_dataset(self, dataset: RepertoireDataset):
        extract_fn = self.build_matching_fn()
        repertoire_count = dataset.get_example_count()
        for index, repertoire in enumerate(dataset.get_data()):
            self.process_repertoire(repertoire, str(repertoire.identifier), extract_fn)
            logging.info(f"Repertoire {repertoire.identifier} ({index + 1}/{repertoire_count}) processed.")
            logging.info(f"Currently, there are {self.item_count} items in the comparison data matrix.")
        self.merge_tmp_batches_to_matrix()
Example #4
    def match(self, dataset: RepertoireDataset, reference_sequences: list,
              max_distance: int,
              summary_type: SequenceMatchingSummaryType) -> dict:

        matched = {"repertoires": []}

        for index, repertoire in enumerate(dataset.get_data()):
            matched["repertoires"].append(
                self.match_repertoire(repertoire, index, reference_sequences,
                                      max_distance, summary_type))

        return matched
Example #5
    def create_model(self, dataset: RepertoireDataset, k: int, vector_size: int, batch_size: int, model_path: Path):
        model = Word2Vec(size=vector_size, min_count=1, window=5)  # creates an empty model (gensim 3.x API; 'size' was renamed to 'vector_size' in gensim 4)
        all_kmers = KmerHelper.create_all_kmers(k=k, alphabet=EnvironmentSettings.get_sequence_alphabet())
        all_kmers = [[kmer] for kmer in all_kmers]
        model.build_vocab(all_kmers)

        for repertoire in dataset.get_data(batch_size=batch_size):
            sentences = KmerHelper.create_sentences_from_repertoire(repertoire=repertoire, k=k)
            model.train(sentences=sentences, total_words=len(all_kmers), epochs=15)

        model.save(str(model_path))

        return model
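
For context, a self-contained sketch of the same training pattern with plain gensim (3.x API, as noted above). The alphabet, sequence, and k below are illustrative only:

    from itertools import product
    from gensim.models import Word2Vec

    alphabet = "ACDEFGHIKLMNPQRSTVWY"  # the 20 standard amino acids
    k = 3
    all_kmers = [["".join(p)] for p in product(alphabet, repeat=k)]  # one k-mer per "sentence"

    model = Word2Vec(size=16, min_count=1, window=5)
    model.build_vocab(all_kmers)  # fix the full k-mer vocabulary up front, as above

    sequence = "CASSLGTDTQYF"  # one hypothetical CDR3 sequence
    sentence = [sequence[i:i + k] for i in range(len(sequence) - k + 1)]  # overlapping k-mers
    model.train([sentence], total_words=len(all_kmers), epochs=15)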
Example #6
    def export_updated_metadata(dataset: RepertoireDataset, result_path: Path, repertoire_folder: str):
        df = pd.read_csv(dataset.metadata_file, comment=Constants.COMMENT_SIGN)
        identifiers = df["identifier"].values.tolist() if "identifier" in df.columns else dataset.get_example_ids()
        df["filename"] = [str(Path(repertoire_folder) / f"{repertoire.data_filename.stem}.tsv")
                          for repertoire in dataset.get_data()]
        df["identifier"] = identifiers
        df.to_csv(result_path / "metadata.csv", index=False)
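
The exported metadata.csv then pairs each repertoire identifier with its new filename, alongside whatever columns the original metadata file already had; the filenames and identifiers here are illustrative only:

    filename,identifier
    repertoires/repertoire_1.tsv,1
    repertoires/repertoire_2.tsv,2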
Example #7
    def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
        # Rows = repertoires, Columns = reference chains (two chains per reference receptor)
        encoded_repertories = np.zeros((dataset.get_example_count(),
                                        len(self.reference_receptors) * 2),
                                       dtype=int)
        labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

        for i, repertoire in enumerate(dataset.get_data()):
            encoded_repertories[i] = self._match_repertoire_to_receptors(repertoire)

            if labels is not None:
                for label_name in params.label_config.get_labels_by_name():
                    labels[label_name].append(repertoire.metadata[label_name])

        return encoded_repertories, labels, dataset.get_repertoire_ids()
Example #8
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        Preprocessor.check_dataset_type(dataset, [RepertoireDataset], "ClonesPerRepertoireFilter")
        processed_dataset = dataset.clone()
        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            if "lower_limit" in params.keys() and len(repertoire.sequences) >= params["lower_limit"] or \
                "upper_limit" in params.keys() and len(repertoire.sequences) <= params["upper_limit"]:
                repertoires.append(dataset.repertoires[index])
                indices.append(index)
        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = ClonesPerRepertoireFilter.build_new_metadata(dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")

        return processed_dataset
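
A call sketch in the same static style as the tests in this set; the limits and path are illustrative:

    dataset2 = ClonesPerRepertoireFilter.process(dataset, {"lower_limit": 10,
                                                           "upper_limit": 1000,
                                                           "result_path": path / "results"})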
Example #9
    def test_process(self):

        path = EnvironmentSettings.root_path / "test/tmp/chain_filter/"
        PathBuilder.build(path)

        rep1 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAA", metadata=SequenceMetadata(chain="A"), identifier="1")],
            path=path, metadata={})
        rep2 = Repertoire.build_from_sequence_objects(
            [ReceptorSequence("AAC", metadata=SequenceMetadata(chain="B"), identifier="2")],
            path=path, metadata={})

        metadata = pd.DataFrame({"CD": [1, 0]})
        metadata.to_csv(path / "metadata.csv")

        dataset = RepertoireDataset(repertoires=[rep1, rep2],
                                    metadata_file=path / "metadata.csv")

        dataset2 = ChainRepertoireFilter.process(
            dataset, {
                "keep_chain": "ALPHA",
                "result_path": path / "results"
            })

        self.assertEqual(1, len(dataset2.get_data()))
        self.assertEqual(2, len(dataset.get_data()))

        metadata_dict = dataset2.get_metadata(["CD"])
        self.assertEqual(1, len(metadata_dict["CD"]))
        self.assertEqual(1, metadata_dict["CD"][0])

        for rep in dataset2.get_data():
            self.assertEqual("AAA", rep.sequences[0].get_sequence())

        self.assertRaises(AssertionError, ChainRepertoireFilter.process,
                          dataset, {
                              "keep_chain": "GAMMA",
                              "result_path": path / "results"
                          })

        shutil.rmtree(path)
Example #10
    def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
        # Rows = repertoires, Columns = regex matches (one chain per column)
        encoded_repertoires = np.zeros((dataset.get_example_count(),
                                        self.feature_count),
                                       dtype=int)
        labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

        n_repertoires = dataset.get_example_count()

        for i, repertoire in enumerate(dataset.get_data()):
            print(f"{datetime.datetime.now()}: Encoding repertoire {i+1}/{n_repertoires}")
            encoded_repertoires[i] = self._match_repertoire_to_regexes(repertoire)

            if labels is not None:
                for label_name in params.label_config.get_labels_by_name():
                    labels[label_name].append(repertoire.metadata[label_name])

        return encoded_repertoires, labels
Example #11
    def _encode_repertoires(self, dataset: RepertoireDataset, params):
        # Rows = repertoires, Columns = reference sequences
        encoded_repertories = np.zeros(
            (dataset.get_example_count(), len(self.reference_sequences)),
            dtype=int)

        labels = {
            label: []
            for label in params.label_config.get_labels_by_name()
        } if params.encode_labels else None

        for i, repertoire in enumerate(dataset.get_data()):
            encoded_repertories[i] = self._match_repertoire_to_reference(
                repertoire)

            if labels is not None:  # labels is None when params.encode_labels is False
                for label_name in params.label_config.get_labels_by_name():
                    labels[label_name].append(repertoire.metadata[label_name])

        return encoded_repertories, labels
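
A toy illustration of the rows-by-reference-columns layout these _encode_repertoires variants fill; plain string equality stands in here for _match_repertoire_to_reference, which may apply distance thresholds:

    import numpy as np

    references = ["CASSL", "CASRG", "CSARD"]     # reference sequences (columns)
    repertoires = [["CASSL", "CASSL", "CSARD"],  # repertoire 0
                   ["CASRG"]]                    # repertoire 1

    encoded = np.zeros((len(repertoires), len(references)), dtype=int)
    for i, repertoire in enumerate(repertoires):
        encoded[i] = [sum(seq == ref for seq in repertoire) for ref in references]

    # encoded is now [[2, 0, 1],
    #                 [0, 1, 0]]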
Example #12
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        processed_dataset = dataset.clone()
        PathBuilder.build(params["result_path"])
        repertoires = []
        indices = []
        keep_chain = Chain.get_chain(params["keep_chain"])
        for index, repertoire in enumerate(dataset.get_data()):
            if all(sequence.metadata.chain == keep_chain for sequence in repertoire.sequences):
                repertoires.append(repertoire)
                indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = ChainRepertoireFilter.build_new_metadata(
            processed_dataset, indices, params["result_path"])

        Filter.check_dataset_not_empty(processed_dataset,
                                       "ChainRepertoireFilter")

        return processed_dataset
Example #13
    def process_dataset(self, dataset: RepertoireDataset, result_path: Path):
        self.check_dataset_type(dataset, [RepertoireDataset],
                                "ChainRepertoireFilter")
        processed_dataset = dataset.clone()
        self.result_path = result_path if result_path is not None else self.result_path

        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            if all(sequence.metadata.chain == self.keep_chain
                   for sequence in repertoire.sequences):
                repertoires.append(repertoire)
                indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = self._build_new_metadata(
            processed_dataset, indices)

        self.check_dataset_not_empty(processed_dataset,
                                     "ChainRepertoireFilter")

        return processed_dataset
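
A minimal invocation sketch for this variant, assuming the constructor sets keep_chain to a Chain value (e.g. Chain.ALPHA, as suggested by Chain.get_chain("ALPHA") in Example #12); the path is illustrative:

    chain_filter = ChainRepertoireFilter(keep_chain=Chain.ALPHA)
    filtered_dataset = chain_filter.process_dataset(dataset, result_path=Path("tmp/chain_filtered"))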