def test_create_all_kmers(self):
    """Check that create_all_kmers enumerates every k-mer over the alphabet."""
    alphabet = list("ABCD")
    k = 2

    kmers = KmerHelper.create_all_kmers(k=k, alphabet=alphabet)

    # 4 letters, length 2 -> 4 ** 2 == 16 possible k-mers.
    self.assertEqual(len(kmers), len(alphabet) ** k)
    # assertIn reports the missing k-mer and the container on failure,
    # unlike assertTrue(x in y), which only says "False is not true".
    self.assertIn("BD", kmers)
    self.assertIn("DA", kmers)
def create_model(self, dataset: RepertoireDataset, k: int, vector_size: int, batch_size: int, model_path: Path):
    """Train a Word2Vec model on k-mer sentences from each repertoire and save it.

    The vocabulary is built up front from all k-mers of length ``k`` over the
    sequence alphabet, each supplied as a single-word sentence. The model is
    then trained once per repertoire yielded by ``dataset.get_data``.

    Args:
        dataset: source of repertoires, read via ``get_data(batch_size=...)``.
        k: k-mer length.
        vector_size: passed to ``Word2Vec`` as ``size``.
        batch_size: forwarded to ``dataset.get_data``.
        model_path: destination passed (as ``str``) to ``model.save``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # No corpus is given to the constructor, so the model starts out empty.
    w2v = Word2Vec(size=vector_size, min_count=1, window=5)

    # One single-word "sentence" per possible k-mer, so every k-mer is in the vocabulary.
    vocabulary = [
        [kmer]
        for kmer in KmerHelper.create_all_kmers(k=k, alphabet=EnvironmentSettings.get_sequence_alphabet())
    ]
    w2v.build_vocab(vocabulary)
    vocabulary_size = len(vocabulary)

    for repertoire in dataset.get_data(batch_size=batch_size):
        w2v.train(
            sentences=KmerHelper.create_sentences_from_repertoire(repertoire=repertoire, k=k),
            total_words=vocabulary_size,
            epochs=15,
        )

    w2v.save(str(model_path))
    return w2v
def create_model(self, dataset: RepertoireDataset, k: int, vector_size: int, batch_size: int, model_path: Path):
    """Train a Word2Vec model on k-mer neighbourhoods and save it.

    The vocabulary is built from all k-mers of length ``k`` over the sequence
    alphabet, each supplied as a single-word sentence. For every k-mer, the
    model is then trained on the output of
    ``KmerHelper.create_kmers_within_HD(..., distance=1)``
    (presumably k-mers within Hamming distance 1 -- confirm against KmerHelper).

    Args:
        dataset: accepted but not used in this method.
        k: k-mer length.
        vector_size: passed to ``Word2Vec`` as ``size``.
        batch_size: accepted but not used in this method.
        model_path: destination passed (as ``str``) to ``model.save``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # No corpus is given to the constructor, so the model starts out empty.
    model = Word2Vec(size=vector_size, min_count=1, window=5)

    # The alphabet does not change during training: look it up once instead of
    # once per k-mer inside the loop.
    alphabet = EnvironmentSettings.get_sequence_alphabet()

    # One single-word "sentence" per possible k-mer, so every k-mer is in the vocabulary.
    all_kmers = [[kmer] for kmer in KmerHelper.create_all_kmers(k=k, alphabet=alphabet)]
    model.build_vocab(all_kmers)
    total_words = len(all_kmers)

    for (kmer,) in all_kmers:
        sentences = KmerHelper.create_kmers_within_HD(kmer=kmer, alphabet=alphabet, distance=1)
        model.train(sentences=sentences, total_words=total_words, epochs=model.epochs)

    model.save(str(model_path))
    return model