Example #1
# Standard/third-party imports used by this snippet; project-internal helpers
# (Transformator, MongoDb, create_hash, tokenizer, stop_words_filter,
# _transform_text, _sentence_embed) are assumed to be imported from the
# surrounding package.
from typing import Any, Dict, List

import torch
from nltk.tokenize import sent_tokenize
from tqdm import tqdm


class FastTextWordEmbeddings(Transformator):
    def __init__(self):
        super().__init__(embedding_dim=300)
        self.transformer = {}
        self.fitted: bool = False
        self.dict_length: int = 0
        self.api = MongoDb()

    def fit(self, instances: List[Dict[str, Any]], labels: List[str]) -> None:
        if not self.fitted:
            print("transforming data...")
            transformed_data = [
                _transform_text(sample) for sample in tqdm(instances)
            ]
            tokenized_text = [
                token for text in transformed_data
                for token in stop_words_filter(tokenizer(text))
            ]
            distinct_tokens = set(tokenized_text)
            print("creating (words -> embeddings) hash map...")
            for token in tqdm(distinct_tokens):
                embedding = self.get_embedding_from_database(token)
                if embedding is not None:
                    self.transformer[token] = embedding
            print("creating labels encoding hash map...")
            # Enumerate the distinct labels once so that both mappings share
            # the same ordering.
            distinct_labels = set(labels)
            self.encoding_mapper = {
                value: index for index, value in enumerate(distinct_labels)
            }
            self.reverse_mapper = {
                index: value for index, value in enumerate(distinct_labels)
            }
            self.fitted = True
            self.dict_length = len(self.transformer)

    def get_embedding_from_database(self, token: str) -> torch.Tensor:
        records = self.api.get_record(
            collection_name="Fasttext_pretrained_embeddings",
            collection_id=11,
            hash=create_hash([token]),
        )
        if not records or records[0] is None:
            # Fall back to a zero vector when no embedding is stored for the token.
            return torch.zeros(300)
        return torch.tensor(records[0]["record"]["value"])

    def transform_instances(
            self, data: List[Dict[str, Any]]) -> List[List[torch.Tensor]]:
        transformed_data = [_transform_text(sample) for sample in tqdm(data)]

        with torch.no_grad():
            transformed_instances = [[
                _sentence_embed([
                    self.transformer[token]
                    for token in stop_words_filter(tokenizer(sentence))
                    if token in self.transformer
                ] + [torch.zeros((300))]) for sentence in sent_tokenize(text)
            ] for text in transformed_data]
        return transformed_instances

    def transform_labels(self, data: List[str]) -> List[int]:
        return [self.transform_label(sample) for sample in tqdm(data)]

    def reset(self) -> None:
        self.transformer = {}
        self.fitted = False
        self.dict_length = 0
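
The class above delegates sentence pooling to a project-internal _sentence_embed helper that is not shown in these examples. A minimal mean-pooling sketch consistent with how it is called in transform_instances (a list of 300-dimensional token tensors, always padded with one zero vector) might look like the following; it is a hypothetical illustration, not the project's actual implementation:

from typing import List

import torch


def _sentence_embed(token_vectors: List[torch.Tensor]) -> torch.Tensor:
    # Mean-pool the token embeddings into a single 300-dimensional sentence
    # vector. transform_instances always appends a zero vector, so the list is
    # never empty and torch.stack is safe.
    return torch.stack(token_vectors).mean(dim=0)
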
Example #2
# sklearn's shuffle is used below; project-internal helpers (MongoDb,
# transformator, active_learning_simulation, entropy_sampling,
# visualize_two_auc_evolutions, COLLECTION_NAME, COLLECTION_ID) are assumed to
# be imported from the surrounding package.
from sklearn.utils import shuffle


def main():

    api = MongoDb()

    records_1 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label="Fake",
        limit=200,
    )

    records_2 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label="True",
        limit=200,
    )

    # tweet_positive_records = api.get_record(
    #     collection_name=TweetsEmotions_collection_name,
    #     collection_id=TweetsEmotions_collection_id,
    #     label=0,
    #     limit=500,
    # )
    # tweet_negative_records = api.get_record(
    #     collection_name=TweetsEmotions_collection_name,
    #     collection_id=TweetsEmotions_collection_id,
    #     label=4,
    #     limit=500,
    # )

    # Define model specifications
    model_1 = "bayesian_dropout_nn_fast_text_embeddings"
    model_2 = "bayesian_dropout_nn_fast_text_embeddings"
    algorithm = "nn"
    acquisition_function_1 = "random"
    acquisition_function_2 = "entropy"
    active_learning_loops = 1
    active_learning_step = 10
    max_active_learning_iters = 10
    initial_training_data_size = 10
    validation_data_size = 400
    category_1 = "SPORTS"
    category_2 = "COMEDY"
    transformation_needed = False

    instances = records_1 + records_2
    labels = [sample["record"]["label"] for sample in records_1 + records_2]

    # instances = tweet_positive_records + tweet_negative_records
    # labels = [
    #     sample["record"]["label"]
    #     for sample in tweet_positive_records + tweet_negative_records
    # ]

    instances_from_db, labels_from_db = shuffle(instances,
                                                labels,
                                                random_state=0)

    # HuffPostTransform = word_embed_transformator()

    # Not actually the HuffPost transformator; the variable name is kept from
    # an earlier experiment to avoid renaming it everywhere.
    HuffPostTransform = transformator()

    HuffPostTransform.fit(instances_from_db, labels_from_db)

    # When transformation_needed is True, the raw records are passed through so
    # that active_learning_simulation (which also receives the flag) can handle
    # the transformation itself; otherwise the data is transformed up front.
    if transformation_needed:
        instances = instances_from_db
        labels = labels_from_db
    else:
        instances = HuffPostTransform.transform_instances(instances_from_db)
        labels = HuffPostTransform.transform_labels(labels_from_db)

    # Get AUC results from an active learning simulation
    auc_active_learning_random_10_runs_nn = active_learning_simulation(
        HuffPostTransform,
        None,
        active_learning_loops,
        max_active_learning_iters,
        active_learning_step,
        algorithm,
        instances,
        labels,
        initial_training_data_size,
        transformation_needed,
    )

    # Pack specifications and results into a list for uploading to the Peony Database
    list_to_upload = [
        model_1,
        acquisition_function_1,
        active_learning_loops,
        active_learning_step,
        max_active_learning_iters,
        initial_training_data_size,
        validation_data_size,
        category_1,
        category_2,
        auc_active_learning_random_10_runs_nn,
    ]

    # Upload results to Peony Database
    # api.load_model_results(*list_to_upload)

    # Get AUC results from an active learning simulation
    auc_active_learning_entropy_10_runs_nn = active_learning_simulation(
        HuffPostTransform,
        entropy_sampling,  # false_positive_sampling,
        active_learning_loops,
        max_active_learning_iters,
        active_learning_step,
        algorithm,
        instances,
        labels,
        initial_training_data_size,
        transformation_needed,
    )

    # Pack specifications and results into a list for uploading to the Peony Database
    list_to_upload = [
        model_2,
        acquisition_function_2,
        active_learning_loops,
        active_learning_step,
        max_active_learning_iters,
        initial_training_data_size,
        validation_data_size,
        category_1,
        category_2,
        auc_active_learning_entropy_10_runs_nn,
    ]

    # Upload results to Peony Database
    # api.load_model_results(*list_to_upload)

    visualize_two_auc_evolutions(auc_active_learning_random_10_runs_nn,
                                 auc_active_learning_entropy_10_runs_nn)
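
Example #2 passes entropy_sampling as the acquisition function, but its definition is not part of these snippets. A hedged sketch of an entropy-based acquisition function (the signature and the use of predicted class probabilities are assumptions, not the project's actual API) could look like this:

import numpy as np


def entropy_sampling(probabilities: np.ndarray, step: int) -> np.ndarray:
    # Hypothetical sketch: return the indices of the `step` samples whose
    # predictive distribution has the highest entropy, i.e. where the model
    # is least certain.
    entropy = -np.sum(probabilities * np.log(probabilities + 1e-12), axis=1)
    return np.argsort(entropy)[-step:]
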
Example #3
# time plus sklearn's shuffle and accuracy_score are used below;
# project-internal helpers (MongoDb, transformator, PeonyBoxModel,
# entropy_sampling, k_fold_corss_validation, auc_metrics, COLLECTION_NAME,
# COLLECTION_ID) are assumed to be imported from the surrounding package.
import time

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle


def main():
    api = MongoDb()
    label_1 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label=0,
        limit=300,
    )

    label_2 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label=4,
        limit=300,
    )

    # label_1 = api.get_record(
    #     collection_name=COLLECTION_NAME,
    #     collection_id=COLLECTION_ID,
    #     label=0,
    #     limit=10,
    # )
    # label_2 = api.get_record(
    #     collection_name=COLLECTION_NAME,
    #     collection_id=COLLECTION_ID,
    #     label=4,
    #     limit=10,
    # )

    instances = label_1 + label_2
    labels = [sample["record"]["label"] for sample in label_1 + label_2]

    instances, labels = shuffle(instances, labels, random_state=0)

    Transformator = transformator()
    Transformator.fit(instances, labels)

    peony_model = PeonyBoxModel(
        Transformator,
        active_learning_step=5,
        acquisition_function=entropy_sampling,
    )
    # peony_model.bayesian_dropout_nn.fit(instances[50:], labels[50:])
    # peony_model.bayesian_denfi_nn.reset()
    # peony_model.bayesian_denfi_nn.epsilon_greedy_coef = 1
    # indexes = peony_model.bayesian_denfi_nn.get_learning_samples(instances[:50])

    # add_training = [instances[index] for index in indexes.tolist()]
    # add_labels = [labels[index] for index in indexes.tolist()]

    # peony_model.feed_forward_nn.add_new_learning_samples(add_training, add_labels)
    # peony_model.feed_forward_nn.fit(instances, labels)
    # predicted = peony_model.bayesian_dropout_nn.predict(instances[50:])

    start_time = time.time()
    k_fold = k_fold_corss_validation(peony_model.bayesian_dropout_nn,
                                     Transformator, instances, labels, 2)
    print(f"elapsed time is {time.time() - start_time}")

    print(auc_metrics(k_fold))

    scores = [
        accuracy_score(fold["true"], fold["predicted"], normalize=True)
        for fold in k_fold
    ]

    print(scores)
    print("test")