class FastTextWordEmbeddings(Transformator):
    def __init__(self):
        super().__init__(embedding_dim=300)
        self.transformer = {}
        self.fitted: bool = False
        self.dict_length: int = 0
        self.api = MongoDb()

    def fit(self, instances: List[Dict[str, Any]], labels: List[str]) -> None:
        if self.fitted is False:
            print("transforming data...")
            transformed_data = [_transform_text(sample) for sample in tqdm(instances)]
            tokenized_text = [
                token
                for text in transformed_data
                for token in stop_words_filter(tokenizer(text))
            ]
            distinct_tokens = set(tokenized_text)
            print("creating (words -> embeddings) hash map...")
            for token in tqdm(distinct_tokens):
                embedding = self.get_embedding_from_database(token)
                if embedding is not None:
                    self.transformer[token] = embedding
            print("creating labels encoding hash map...")
            self.encoding_mapper = {
                value: index for index, value in enumerate(set(labels))
            }
            self.reverse_mapper = {
                index: value for index, value in enumerate(set(labels))
            }
            self.fitted = True
            self.dict_length = len(self.transformer.keys())

    def get_embedding_from_database(self, token: str) -> torch.Tensor:
        embedding = self.api.get_record(
            collection_name="Fasttext_pretrained_embeddings",
            collection_id=11,
            hash=create_hash([token]),
        )[0]
        if embedding is None:
            # Unknown token: fall back to a zero vector of the embedding dimension.
            return torch.zeros(300)
        return torch.tensor(embedding["record"]["value"])

    def transform_instances(
        self, data: List[Dict[str, Any]]
    ) -> List[List[torch.Tensor]]:
        transformed_data = [_transform_text(sample) for sample in tqdm(data)]
        with torch.no_grad():
            transformed_instances = [
                [
                    _sentence_embed(
                        [
                            self.transformer[token]
                            for token in stop_words_filter(tokenizer(sentence))
                            if token in self.transformer
                        ]
                        + [torch.zeros(300)]  # guard against sentences with no known tokens
                    )
                    for sentence in sent_tokenize(text)
                ]
                for text in transformed_data
            ]
        return transformed_instances

    def transform_labels(self, data: List[str]) -> List[int]:
        return [self.transform_label(sample) for sample in tqdm(data)]

    def reset(self) -> None:
        self.transformer = {}
        self.fitted = False
        self.dict_length = 0
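# The helpers used above (_transform_text, tokenizer, stop_words_filter,
# _sentence_embed) come from the surrounding package and are not shown here.
# The sketch below is only an assumption about what _sentence_embed does:
# mean-pool the token embeddings of one sentence into a single 300-d vector.
# It uses a hypothetical name so it does not clash with the real helper.

from typing import List

import torch


def _sentence_embed_sketch(token_embeddings: List[torch.Tensor]) -> torch.Tensor:
    # Average the per-token vectors; the zero vector appended in
    # transform_instances guarantees the list is never empty.
    return torch.stack(token_embeddings).mean(dim=0)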
def main():
    api = MongoDb()
    records_1 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label="Fake",
        limit=200,
    )
    records_2 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label="True",
        limit=200,
    )

    # Alternative dataset: Tweets Emotions records.
    # tweet_positive_records = api.get_record(
    #     collection_name=TweetsEmotions_collection_name,
    #     collection_id=TweetsEmotions_collection_id,
    #     label=0,
    #     limit=500,
    # )
    # tweet_negative_records = api.get_record(
    #     collection_name=TweetsEmotions_collection_name,
    #     collection_id=TweetsEmotions_collection_id,
    #     label=4,
    #     limit=500,
    # )

    # Define model specifications
    model_1 = "bayesian_dropout_nn_fast_text_embeddings"
    model_2 = "bayesian_dropout_nn_fast_text_embeddings"
    algorithm = "nn"
    acquisition_function_1 = "random"
    acquisition_function_2 = "entropy"
    active_learning_loops = 1
    active_learning_step = 10
    max_active_learning_iters = 10
    initial_training_data_size = 10
    validation_data_size = 400
    category_1 = "SPORTS"
    category_2 = "COMEDY"
    transformation_needed = False

    instances = records_1 + records_2
    labels = [sample["record"]["label"] for sample in records_1 + records_2]

    # instances = tweet_positive_records + tweet_negative_records
    # labels = [
    #     sample["record"]["label"]
    #     for sample in tweet_positive_records + tweet_negative_records
    # ]

    instances_from_db, labels_from_db = shuffle(instances, labels, random_state=0)

    # data_transformator = word_embed_transformator()
    data_transformator = transformator()  # generic transformator, not HuffPost-specific
    data_transformator.fit(instances_from_db, labels_from_db)

    if transformation_needed:
        # Transformation is handled later inside the simulation, so pass raw records.
        instances = instances_from_db
        labels = labels_from_db
    else:
        instances = data_transformator.transform_instances(instances_from_db)
        labels = data_transformator.transform_labels(labels_from_db)

    # Get AUC results from an active learning simulation (random acquisition)
    auc_active_learning_random_10_runs_nn = active_learning_simulation(
        data_transformator,
        None,
        active_learning_loops,
        max_active_learning_iters,
        active_learning_step,
        algorithm,
        instances,
        labels,
        initial_training_data_size,
        transformation_needed,
    )

    # Pack specifications and results into a list for uploading to the Peony database
    list_to_upload = [
        model_1,
        acquisition_function_1,
        active_learning_loops,
        active_learning_step,
        max_active_learning_iters,
        initial_training_data_size,
        validation_data_size,
        category_1,
        category_2,
        auc_active_learning_random_10_runs_nn,
    ]

    # Upload results to the Peony database
    # api.load_model_results(*list_to_upload)

    # Get AUC results from an active learning simulation (entropy acquisition)
    auc_active_learning_entropy_10_runs_nn = active_learning_simulation(
        data_transformator,
        entropy_sampling,  # false_positive_sampling,
        active_learning_loops,
        max_active_learning_iters,
        active_learning_step,
        algorithm,
        instances,
        labels,
        initial_training_data_size,
        transformation_needed,
    )

    # Pack specifications and results into a list for uploading to the Peony database
    list_to_upload = [
        model_2,
        acquisition_function_2,
        active_learning_loops,
        active_learning_step,
        max_active_learning_iters,
        initial_training_data_size,
        validation_data_size,
        category_1,
        category_2,
        auc_active_learning_entropy_10_runs_nn,
    ]

    # Upload results to the Peony database
    # api.load_model_results(*list_to_upload)

    visualize_two_auc_evolutions(
        auc_active_learning_random_10_runs_nn,
        auc_active_learning_entropy_10_runs_nn,
    )
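# entropy_sampling is the acquisition function imported from the Peony package;
# its implementation is not shown here. As an illustration only: an entropy-based
# acquisition function typically scores each unlabelled instance by the Shannon
# entropy of the model's predicted class distribution and requests the most
# uncertain ones. A minimal sketch under that assumption (hypothetical name and
# signature):

import numpy as np


def entropy_sampling_sketch(predicted_probs: np.ndarray, step: int) -> np.ndarray:
    """Return indices of the `step` samples with the highest predictive entropy.

    predicted_probs: array of shape (n_samples, n_classes) with rows summing to 1.
    """
    # Shannon entropy per sample; the small constant avoids log(0).
    entropy = -np.sum(predicted_probs * np.log(predicted_probs + 1e-12), axis=1)
    # Indices of the `step` most uncertain samples.
    return np.argsort(entropy)[-step:]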
def main():
    api = MongoDb()
    label_1_records = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label=0,
        limit=300,
    )
    label_2_records = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label=4,
        limit=300,
    )

    # Smaller samples for quick experiments:
    # label_1_records = api.get_record(
    #     collection_name=COLLECTION_NAME,
    #     collection_id=COLLECTION_ID,
    #     label=0,
    #     limit=10,
    # )
    # label_2_records = api.get_record(
    #     collection_name=COLLECTION_NAME,
    #     collection_id=COLLECTION_ID,
    #     label=4,
    #     limit=10,
    # )

    instances = label_1_records + label_2_records
    labels = [sample["record"]["label"] for sample in label_1_records + label_2_records]
    instances, labels = shuffle(instances, labels, random_state=0)

    data_transformator = transformator()
    # data_transformator.fit(instances, labels)
    data_transformator.fit(labels)

    peony_model = PeonyBoxModel(
        data_transformator,
        active_learning_step=5,
        acquisition_function=entropy_sampling,
    )

    # peony_model.bayesian_dropout_nn.fit(instances[50:], labels[50:])
    # peony_model.bayesian_denfi_nn.reset()
    # peony_model.bayesian_denfi_nn.epsilon_greedy_coef = 1
    # indexes = peony_model.bayesian_denfi_nn.get_learning_samples(instances[:50])
    # add_training = [instances[index] for index in indexes.tolist()]
    # add_labels = [labels[index] for index in indexes.tolist()]
    # peony_model.feed_forward_nn.add_new_learning_samples(add_training, add_labels)
    # peony_model.feed_forward_nn.fit(instances, labels)
    # predicted = peony_model.bayesian_dropout_nn.predict(instances[50:])

    start_time = time.time()
    k_fold = k_fold_corss_validation(
        peony_model.bayesian_dropout_nn, data_transformator, instances, labels, 2
    )
    print(f"elapsed time is {time.time() - start_time}")

    print(auc_metrics(k_fold))
    scores = [
        accuracy_score(evaluation["true"], evaluation["predicted"], normalize=True)
        for evaluation in k_fold
    ]
    print(scores)
    print("test")
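# auc_metrics belongs to the Peony evaluation utilities and is not shown here.
# Judging from the fold dictionaries above (keys "true" and "predicted"), a
# per-fold AUC could be computed roughly as sketched below; the real helper may
# instead work with predicted class probabilities. Hypothetical name, and the
# labels are assumed to be encoded as 0/1 after transformation.

from sklearn.metrics import roc_auc_score


def auc_metrics_sketch(k_fold_results):
    # One ROC AUC value per fold.
    return [
        roc_auc_score(evaluation["true"], evaluation["predicted"])
        for evaluation in k_fold_results
    ]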