Example #1
def __init__(self):
    self.transformer = {}
    self.fitted: bool = False
    self.dict_length: int = 0
    self.api = MongoDb()
    self.encoding_mapper: Dict[int, int] = {}
    self.reverse_mapper: Dict[int, str] = {}
Example #2
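# Assumed imports for this excerpt: input_args, fasttext_collection_name,
# fasttext_loader and fasttext_transformer come from project modules not shown here.
from pathlib import Path

from PeonyPackage.PeonyDb import MongoDb
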
def main():
    args = input_args().parse_args()

    api = MongoDb()

    api.load_data_to_database(
        fasttext_collection_name,
        Path(args.fasttext),
        fasttext_loader,
        fasttext_transformer,
    )
Example #3
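# Assumed imports for this excerpt: transformator, entropy_sampling,
# active_learning_simulation and the COLLECTION_* constants come from project
# modules not shown here.
from sklearn.utils import shuffle

from PeonyPackage.PeonyDb import MongoDb
from Peony_visualization.src.peony_visualization import visualize_two_auc_evolutions
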
def main():

    api = MongoDb()

    records_1 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label="Fake",
        limit=200,
    )

    records_2 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label="True",
        limit=200,
    )

    # tweet_positive_records = api.get_record(
    #     collection_name=TweetsEmotions_collection_name,
    #     collection_id=TweetsEmotions_collection_id,
    #     label=0,
    #     limit=500,
    # )
    # tweet_negative_records = api.get_record(
    #     collection_name=TweetsEmotions_collection_name,
    #     collection_id=TweetsEmotions_collection_id,
    #     label=4,
    #     limit=500,
    # )

    # Define model specifications
    model_1 = "bayesian_dropout_nn_fast_text_embeddings"
    model_2 = "bayesian_dropout_nn_fast_text_embeddings"
    algorithm = "nn"
    acquisition_function_1 = "random"
    acquisition_function_2 = "entropy"
    active_learning_loops = 1
    active_learning_step = 10
    max_active_learning_iters = 10
    initial_training_data_size = 10
    validation_data_size = 400
    category_1 = "SPORTS"
    category_2 = "COMEDY"
    transformation_needed = False

    instances = records_1 + records_2
    labels = [sample["record"]["label"] for sample in records_1 + records_2]

    # instances = tweet_positive_records + tweet_negative_records
    # labels = [
    #     sample["record"]["label"]
    #     for sample in tweet_positive_records + tweet_negative_records
    # ]

    instances_from_db, labels_from_db = shuffle(instances,
                                                labels,
                                                random_state=0)

    # HuffPostTransform = word_embed_transformator()

    # Not actually a HuffPost-specific transformator; the variable name is kept as-is.
    HuffPostTransform = transformator()

    HuffPostTransform.fit(instances_from_db, labels_from_db)

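    # Assumption: transformation_needed=True means the simulation transforms the raw
    # records itself, so they are passed through unchanged here; otherwise they are
    # transformed up-front.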
    if transformation_needed:
        instances = instances_from_db
        labels = labels_from_db
    else:
        instances = HuffPostTransform.transform_instances(instances_from_db)
        labels = HuffPostTransform.transform_labels(labels_from_db)

    # Get AUC results from an active learning simulation
    auc_active_learning_random_10_runs_nn = active_learning_simulation(
        HuffPostTransform,
        None,
        active_learning_loops,
        max_active_learning_iters,
        active_learning_step,
        algorithm,
        instances,
        labels,
        initial_training_data_size,
        transformation_needed,
    )

    # Pack specifications and results into a list for uploading to the Peony Database
    list_to_upload = [
        model_1,
        acquisition_function_1,
        active_learning_loops,
        active_learning_step,
        max_active_learning_iters,
        initial_training_data_size,
        validation_data_size,
        category_1,
        category_2,
        auc_active_learning_random_10_runs_nn,
    ]

    # Upload results to Peony Database
    # api.load_model_results(*list_to_upload)

    # Get AUC results from an active learning simulation
    auc_active_learning_entropy_10_runs_nn = active_learning_simulation(
        HuffPostTransform,
        entropy_sampling,  # false_positive_sampling,
        active_learning_loops,
        max_active_learning_iters,
        active_learning_step,
        algorithm,
        instances,
        labels,
        initial_training_data_size,
        transformation_needed,
    )

    # Pack specifications and results into a list for uploading to the Peony Database
    list_to_upload = [
        model_2,
        acquisition_function_2,
        active_learning_loops,
        active_learning_step,
        max_active_learning_iters,
        initial_training_data_size,
        validation_data_size,
        category_1,
        category_2,
        auc_active_learning_entropy_10_runs_nn,
    ]

    # Upload results to Peony Database
    # api.load_model_results(*list_to_upload)

    visualize_two_auc_evolutions(auc_active_learning_random_10_runs_nn,
                                 auc_active_learning_entropy_10_runs_nn)
Example #4
from PeonyPackage.PeonyDb import MongoDb
from Peony_visualization.src.peony_visualization import visualize_two_auc_evolutions

api = MongoDb()

# Random acquisition function
svm_random_sampling_results = api.get_model_results({
    "model": "bayesian_denfi_nn_hot_start_fast_text_embeddings",
    "acquisition_function": "random",
    "category_1": "POSITIVE_EMOTIONS_TWEETS",
})
svm_random_sampling_results = [
    item for val in svm_random_sampling_results for item in val["results"]
]

# Entropy acquisition function
svm_entropy_sampling_results = api.get_model_results({
    "model": "bayesian_denfi_nn_hot_start_fast_text_embeddings",
    "acquisition_function": "entropy",
    "category_1": "POSITIVE_EMOTIONS_TWEETS",
})
svm_entropy_sampling_results = [
    item for val in svm_entropy_sampling_results for item in val["results"]
]
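
# A sketch (assumption): plot the two retrieved AUC evolutions against each other,
# assuming visualize_two_auc_evolutions accepts these flattened result lists.
visualize_two_auc_evolutions(svm_random_sampling_results,
                             svm_entropy_sampling_results)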
Example #5
def __init__(self):
    self.api = MongoDb()
    self.data = self.api.get_model_results(filter_dict={})
Example #6
def __init__(self):
    super().__init__(embedding_dim=300)
    self.transformer = {}
    self.fitted: bool = False
    self.dict_length: int = 0
    self.api = MongoDb()
Example #7
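# Assumed imports for this excerpt: input_args and the per-dataset
# *_collection_name / *_loader / *_transformer names come from project modules
# not shown here.
from pathlib import Path

from PeonyPackage.PeonyDb import MongoDb
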
def main():
    args = input_args().parse_args()

    api = MongoDb()

    if args.huffpost:
        api.load_data_to_database(
            HuffPost_collection_name,
            Path(args.huffpost),
            HuffPost_loader,
            HuffPost_transformer,
        )

    if args.newsgroups:
        api.load_data_to_database(
            NewsGroups_collection_name,
            Path(args.newsgroups),
            NewsGroups_loader,
            NewsGroups_transformer,
        )

    if args.tweets:
        api.load_data_to_database(
            Tweets_collection_name,
            Path(args.tweets),
            Tweets_loader,
            Tweets_transformer,
        )

    if args.comments:
        api.load_data_to_database(
            Comments_collection_name,
            Path(args.comments),
            Comments_loader,
            Comments_transformer,
        )

    if args.emotions:
        api.load_data_to_database(
            Emotions_collection_name,
            Path(args.emotions),
            Emotions_loader,
            Emotions_transformer,
        )

    if args.fake_news:
        api.load_data_to_database(
            fake_news_collection_name,
            Path(args.fake_news),
            fake_news_loader,
            fake_news_transformer,
        )

    if args.fake_news_detection:
        api.load_data_to_database(
            fake_news_detection_collection_name,
            Path(args.fake_news_detection),
            fake_news_detection_loader,
            fake_news_detection_transformer,
        )

    if args.liar_paragraph:
        api.load_data_to_database(
            liar_paragraph_collection_name,
            Path(args.liar_paragraph),
            liar_paragraph_loader,
            liar_paragraph_transformer,
        )

    if args.liar_full_text:
        api.load_data_to_database(
            liar_full_text_collection_name,
            Path(args.liar_full_text),
            liar_full_text_loader,
            liar_full_text_transformer,
        )
Example #8
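# Assumed imports for this excerpt: transformator, PeonyBoxModel, entropy_sampling,
# k_fold_corss_validation, auc_metrics and the COLLECTION_* constants come from
# project modules not shown here.
import time

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

from PeonyPackage.PeonyDb import MongoDb
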
def main():
    api = MongoDb()
    label_1 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label=0,
        limit=300,
    )

    label_2 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label=4,
        limit=300,
    )

    # label_1 = api.get_record(
    #     collection_name=COLLECTION_NAME,
    #     collection_id=COLLECTION_ID,
    #     label=0,
    #     limit=10,
    # )
    # label_2 = api.get_record(
    #     collection_name=COLLECTION_NAME,
    #     collection_id=COLLECTION_ID,
    #     label=4,
    #     limit=10,
    # )

    instances = label_1 + label_2
    labels = [sample["record"]["label"] for sample in label_1 + label_2]

    instances, labels = shuffle(instances, labels, random_state=0)

    Transformator = transformator()
    # Transformator.fit(instances, labels)
    Transformator.fit(labels)
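    # Assumption: this transformator's fit only needs the labels here; the
    # two-argument call above is kept commented out for reference.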

    peony_model = PeonyBoxModel(
        Transformator,
        active_learning_step=5,
        acquisition_function=entropy_sampling,
    )
    # peony_model.bayesian_dropout_nn.fit(instances[50:], labels[50:])
    # peony_model.bayesian_denfi_nn.reset()
    # peony_model.bayesian_denfi_nn.epsilon_greedy_coef = 1
    # indexes = peony_model.bayesian_denfi_nn.get_learning_samples(instances[:50])

    # add_training = [instances[index] for index in indexes.tolist()]
    # add_labels = [labels[index] for index in indexes.tolist()]

    # peony_model.feed_forward_nn.add_new_learning_samples(add_training, add_labels)
    # peony_model.feed_forward_nn.fit(instances, labels)
    # predicted = peony_model.bayesian_dropout_nn.predict(instances[50:])

    start_time = time.time()
    k_fold = k_fold_corss_validation(peony_model.bayesian_dropout_nn,
                                     Transformator, instances, labels, 2)
    print(f"elapsed time is {time.time() - start_time}")

    print(auc_metrics(k_fold))

    scores = [
        accuracy_score(fold["true"], fold["predicted"], normalize=True)
        for fold in k_fold
    ]

    print(scores)
    print("test")