Example #1
def main():
    args = input_args().parse_args()

    api = MongoDb()

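    # Load the FastText pretrained embeddings into their MongoDB collection.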
    api.load_data_to_database(
        fasttext_collection_name,
        Path(args.fasttext),
        fasttext_loader,
        fasttext_transformer,
    )
Example #2
def __init__(self):
    self.transformer = {}
    self.fitted: bool = False
    self.dict_length: int = 0
    self.api = MongoDb()
    self.encoding_mapper: Dict[int, int] = {}
    self.reverse_mapper: Dict[int, str] = {}
Example #3
from typing import Dict

from PeonyPackage.PeonyDb import MongoDb


class PeonyDbResults:
    def __init__(self):
        self.api = MongoDb()
        self.data = self.api.get_model_results(filter_dict={})

    def structurize_data(self) -> Dict[str, Dict[str, dict]]:
        # Nest results as model -> dataset -> acquisition function and merge the
        # "results" lists of records that share all three keys.
        structurized_data: Dict[str, Dict[str, dict]] = {}
        for record in self.data:

            model = record["model"]
            category_1 = record["category_1"]
            category_2 = record["category_2"]
            dataset = f"{category_1} / {category_2}"
            acquisition_function = record["acquisition_function"]

            if model not in structurized_data:
                structurized_data[model] = {dataset: {acquisition_function: record}}
            elif dataset not in structurized_data[model]:
                structurized_data[model][dataset] = {acquisition_function: record}
            elif acquisition_function not in structurized_data[model][dataset]:
                structurized_data[model][dataset][acquisition_function] = record
            else:
                structurized_data[model][dataset][acquisition_function]["results"] = (
                    structurized_data[model][dataset][acquisition_function]["results"]
                    + record["results"]
                )
        return structurized_data
Example #4
def main():

    api = MongoDb()

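    # Fetch 200 records per label from the configured collection.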
    records_1 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label="Fake",
        limit=200,
    )

    records_2 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label="True",
        limit=200,
    )

    # tweet_positive_records = api.get_record(
    #     collection_name=TweetsEmotions_collection_name,
    #     collection_id=TweetsEmotions_collection_id,
    #     label=0,
    #     limit=500,
    # )
    # tweet_negative_records = api.get_record(
    #     collection_name=TweetsEmotions_collection_name,
    #     collection_id=TweetsEmotions_collection_id,
    #     label=4,
    #     limit=500,
    # )

    # Define model specifications
    model_1 = "bayesian_dropout_nn_fast_text_embeddings"
    model_2 = "bayesian_dropout_nn_fast_text_embeddings"
    algorithm = "nn"
    acquisition_function_1 = "random"
    acquisition_function_2 = "entropy"
    active_learning_loops = 1
    active_learning_step = 10
    max_active_learning_iters = 10
    initial_training_data_size = 10
    validation_data_size = 400
    category_1 = "SPORTS"
    category_2 = "COMEDY"
    transformation_needed = False

    instances = records_1 + records_2
    labels = [sample["record"]["label"] for sample in records_1 + records_2]

    # instances = tweet_positive_records + tweet_negative_records
    # labels = [
    #     sample["record"]["label"]
    #     for sample in tweet_positive_records + tweet_negative_records
    # ]

    instances_from_db, labels_from_db = shuffle(instances,
                                                labels,
                                                random_state=0)

    # HuffPostTransform = word_embed_transformator()

    HuffPostTransform = (
        transformator()
    )  # Not actually the HuffPost transformator; the name is kept to avoid renaming all the variables below.

    HuffPostTransform.fit(instances_from_db, labels_from_db)

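    # Pass the shuffled records through untransformed when transformation_needed
    # is True; otherwise transform them here.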
    if transformation_needed:
        instances = instances_from_db
        labels = labels_from_db
    else:
        instances = HuffPostTransform.transform_instances(instances_from_db)
        labels = HuffPostTransform.transform_labels(labels_from_db)

    # Get AUC results from an active learning simulation
    auc_active_learning_random_10_runs_nn = active_learning_simulation(
        HuffPostTransform,
        None,
        active_learning_loops,
        max_active_learning_iters,
        active_learning_step,
        algorithm,
        instances,
        labels,
        initial_training_data_size,
        transformation_needed,
    )

    # Pack specifications and results into a list for uploading to Peony Database
    list_to_upload = [
        model_1,
        acquisition_function_1,
        active_learning_loops,
        active_learning_step,
        max_active_learning_iters,
        initial_training_data_size,
        validation_data_size,
        category_1,
        category_2,
        auc_active_learning_random_10_runs_nn,
    ]

    # Upload results to Peony Database
    # api.load_model_results(*list_to_upload)

    # Get AUC results from an active learning simulation
    auc_active_learning_entropy_10_runs_nn = active_learning_simulation(
        HuffPostTransform,
        entropy_sampling,  # false_positive_sampling,
        active_learning_loops,
        max_active_learning_iters,
        active_learning_step,
        algorithm,
        instances,
        labels,
        initial_training_data_size,
        transformation_needed,
    )

    # Pack specifications and results into a list for uploading to Peony Database
    list_to_upload = [
        model_2,
        acquisition_function_2,
        active_learning_loops,
        active_learning_step,
        max_active_learning_iters,
        initial_training_data_size,
        validation_data_size,
        category_1,
        category_2,
        auc_active_learning_entropy_10_runs_nn,
    ]

    # Upload results to Peony Database
    # api.load_model_results(*list_to_upload)

    visualize_two_auc_evolutions(auc_active_learning_random_10_runs_nn,
                                 auc_active_learning_entropy_10_runs_nn)
Example #5
from PeonyPackage.PeonyDb import MongoDb
from Peony_visualization.src.peony_visualization import visualize_two_auc_evolutions

api = MongoDb()

# Random acquisition function
svm_random_sampling_results = api.get_model_results({
    "model": "bayesian_denfi_nn_hot_start_fast_text_embeddings",
    "acquisition_function": "random",
    "category_1": "POSITIVE_EMOTIONS_TWEETS",
})
svm_random_sampling_results = [
    item for val in svm_random_sampling_results for item in val["results"]
]

# Entropy acquisition function
svm_false_positive_sampling_results = api.get_model_results({
    "model": "bayesian_denfi_nn_hot_start_fast_text_embeddings",
    "acquisition_function": "entropy",
    "category_1": "POSITIVE_EMOTIONS_TWEETS",
})
svm_false_positive_sampling_results = [
    item for val in svm_false_positive_sampling_results
    for item in val["results"]
]
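
# The two series can then be compared with the helper imported above, as in
# Example #4 (a sketch; this call is not part of the original excerpt).
visualize_two_auc_evolutions(svm_random_sampling_results,
                             svm_false_positive_sampling_results)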
Example #6
def __init__(self):
    self.api = MongoDb()
    self.data = self.api.get_model_results(filter_dict={})
Example #7
import dash
import dash_html_components as html
import plotly.express as px
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import datetime
import re

from dash.dependencies import Input, Output, State
from plotly.colors import n_colors
from PeonyPackage.PeonyDb import MongoDb
from Peony_database.src.database_results.results_summary import PeonyDbResults

external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"]
api = MongoDb()
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
database_results = PeonyDbResults()

tabs_styles = {"height": "44px"}

tab_style = {
    "borderBottom": "1px solid #d6d6d6",
    "padding": "6px",
}

tab_selected_style = {
    "borderTop": "1px solid #d6d6d6",
    "borderBottom": "1px solid #d6d6d6",
    "backgroundColor": "#1f2e2e",
    "color": "white",
}
Example #8
def __init__(self):
    super().__init__(embedding_dim=300)
    self.transformer = {}
    self.fitted: bool = False
    self.dict_length: int = 0
    self.api = MongoDb()
Example #9
class FastTextWordEmbeddings(Transformator):
    def __init__(self):
        super().__init__(embedding_dim=300)
        self.transformer = {}
        self.fitted: bool = False
        self.dict_length: int = 0
        self.api = MongoDb()

    def fit(self, instances: List[Dict[str, Any]], labels: List[str]) -> None:
        if self.fitted is False:
            print("transforming data...")
            transformed_data = [
                _transform_text(sample) for sample in tqdm(instances)
            ]
            tokenized_text = [
                token for text in transformed_data
                for token in stop_words_filter(tokenizer(text))
            ]
            distinct_tokens = set(tokenized_text)
            print("creating (words -> embeddings) hash map...")
            for token in tqdm(distinct_tokens):
                embedding = self.get_embedding_from_database(token)
                if embedding is not None:
                    self.transformer[token] = embedding
            print("creating labels encoding hash map...")
            self.encoding_mapper = {
                value: index
                for index, value in enumerate(set(labels))
            }
            self.reverse_mapper = {
                index: value
                for index, value in enumerate(set(labels))
            }
            self.fitted = True
            self.dict_length = len(self.transformer.keys())

    def get_embedding_from_database(self, token: str) -> torch.Tensor:
        records = self.api.get_record(
            collection_name="Fasttext_pretrained_embeddings",
            collection_id=11,
            hash=create_hash([token]),
        )
        # Fall back to a zero vector when the token has no stored embedding.
        if not records or records[0] is None:
            return torch.tensor([0.0 for i in range(300)])
        return torch.tensor(records[0]["record"]["value"])

    def transform_instances(
            self, data: List[Dict[str, Any]]) -> List[List[torch.Tensor]]:
        transformed_data = [_transform_text(sample) for sample in tqdm(data)]

        with torch.no_grad():
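            # For each document, build one embedding per sentence from the known
            # token embeddings; the appended zero vector keeps the input non-empty.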
            transformed_instances = [[
                _sentence_embed([
                    self.transformer[token]
                    for token in stop_words_filter(tokenizer(sentence))
                    if token in self.transformer
                ] + [torch.zeros((300))]) for sentence in sent_tokenize(text)
            ] for text in transformed_data]
        return transformed_instances

    def transform_labels(self, data: List[str]) -> List[int]:
        return [self.transform_label(sample) for sample in tqdm(data)]

    def reset(self) -> None:
        self.transformer = {}
        self.fitted = False
        self.dict_length = 0
Example #10
def main():
    args = input_args().parse_args()

    api = MongoDb()

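    # Each CLI flag selects a dataset dump to load into its own collection.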
    if args.huffpost:
        api.load_data_to_database(
            HuffPost_collection_name,
            Path(args.huffpost),
            HuffPost_loader,
            HuffPost_transformer,
        )

    if args.newsgroups:
        api.load_data_to_database(
            NewsGroups_collection_name,
            Path(args.newsgroups),
            NewsGroups_loader,
            NewsGroups_transformer,
        )

    if args.tweets:
        api.load_data_to_database(
            Tweets_collection_name, Path(args.tweets), Tweets_loader, Tweets_transformer
        )

    if args.comments:
        api.load_data_to_database(
            Comments_collection_name,
            Path(args.comments),
            Comments_loader,
            Comments_transformer,
        )

    if args.emotions:
        api.load_data_to_database(
            Emotions_collection_name,
            Path(args.emotions),
            Emotions_loader,
            Emotions_transformer,
        )

    if args.fake_news:
        api.load_data_to_database(
            fake_news_collection_name,
            Path(args.fake_news),
            fake_news_loader,
            fake_news_transformer,
        )

    if args.fake_news_detection:
        api.load_data_to_database(
            fake_news_detection_collection_name,
            Path(args.fake_news_detection),
            fake_news_detection_loader,
            fake_news_detection_transformer,
        )

    if args.liar_paragraph:
        api.load_data_to_database(
            liar_paragraph_collection_name,
            Path(args.liar_paragraph),
            liar_paragraph_loader,
            liar_paragraph_transformer,
        )

    if args.liar_full_text:
        api.load_data_to_database(
            liar_full_text_collection_name,
            Path(args.liar_full_text),
            liar_full_text_loader,
            liar_full_text_transformer,
        )
Example #11
def main():
    api = MongoDb()
    label_1 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label=0,
        limit=300,
    )

    label_2 = api.get_record(
        collection_name=COLLECTION_NAME,
        collection_id=COLLECTION_ID,
        label=4,
        limit=300,
    )

    # label_1 = api.get_record(
    #     collection_name=COLLECTION_NAME,
    #     collection_id=COLLECTION_ID,
    #     label=0,
    #     limit=10,
    # )
    # label_2 = api.get_record(
    #     collection_name=COLLECTION_NAME,
    #     collection_id=COLLECTION_ID,
    #     label=4,
    #     limit=10,
    # )

    instances = label_1 + label_2
    labels = [sample["record"]["label"] for sample in label_1 + label_2]

    instances, labels = shuffle(instances, labels, random_state=0)

    Transformator = transformator()
    # Transformator.fit(instances, labels)
    Transformator.fit(labels)

    peony_model = PeonyBoxModel(
        Transformator,
        active_learning_step=5,
        acquisition_function=entropy_sampling,
    )
    # peony_model.bayesian_dropout_nn.fit(instances[50:], labels[50:])
    # peony_model.bayesian_denfi_nn.reset()
    # peony_model.bayesian_denfi_nn.epsilon_greedy_coef = 1
    # indexes = peony_model.bayesian_denfi_nn.get_learning_samples(instances[:50])

    # add_training = [instances[index] for index in indexes.tolist()]
    # add_labels = [labels[index] for index in indexes.tolist()]

    # peony_model.feed_forward_nn.add_new_learning_samples(add_training, add_labels)
    # peony_model.feed_forward_nn.fit(instances, labels)
    # predicted = peony_model.bayesian_dropout_nn.predict(instances[50:])

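    # Evaluate the Bayesian dropout network with k-fold cross validation and
    # report AUC and per-fold accuracy.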
    start_time = time.time()
    k_fold = k_fold_corss_validation(peony_model.bayesian_dropout_nn,
                                     Transformator, instances, labels, 2)
    print(f"elapsed time is {time.time() - start_time}")

    print(auc_metrics(k_fold))

    scores = [
        accuracy_score(evaluation["true"], evaluation["predicted"], normalize=True)
        for evaluation in k_fold
    ]

    print(scores)
    print("test")