def tagging(documents):
    nDocs = len(documents)
    documentsProcessed = []
    unigram_tagger = None
    try:
        unigram_tagger = file_utils.load_object('tagger1', 'tagger', None)
    except:
        train_set = mac_morpho.tagged_sents()  # Train the tagger on the mac_morpho corpus
        unigram_tagger = nltk.UnigramTagger(train_set)
        file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None)  # Cache the trained tagger for later runs
    for iDoc in range(0, nDocs):  # Tag each document in turn
        documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))  # Append the tagged token list
    return documentsProcessed
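A minimal usage sketch for the function above (it assumes file_utils, nltk and mac_morpho are already imported as in the snippet; the sample sentences and the use of nltk.word_tokenize are only illustrative):

# Hypothetical call: each document is a list of tokens.
docs = [nltk.word_tokenize("O gato subiu no telhado"),
        nltk.word_tokenize("A casa é azul")]
tagged_docs = tagging(docs)
print(tagged_docs[0])  # e.g. [('O', 'ART'), ('gato', 'N'), ...]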
Example #2
def get_random_predicted_samples(model, dataset, labels):
    samples_amount = 5
    correct_predictions = []
    incorrect_predictions = []
    num_samples, _ = dataset.shape
    i = 0

    while i < num_samples and \
            (len(correct_predictions) < samples_amount or len(incorrect_predictions) < samples_amount):
        feature_vector = np.array(dataset.iloc[i, :]).reshape(1, -1)
        predicted_label = model.predict(feature_vector)[0]
        correct_label = labels[i]
        prediction = (i, predicted_label, correct_label)
        if predicted_label == correct_label:
            correct_predictions.append(prediction)
        else:
            incorrect_predictions.append(prediction)
        i += 1

    for k, (i, predicted_label,
            correct_label) in enumerate(correct_predictions):
        tweet_id = dataset.index[i]
        tweet = load_object("data/tweets/" + str(tweet_id))
        tweet_prediction = (tweet, predicted_label, correct_label)
        correct_predictions[k] = tweet_prediction
    for k, (i, predicted_label,
            correct_label) in enumerate(incorrect_predictions):
        tweet_id = dataset.index[i]
        tweet = load_object("data/tweets/" + str(tweet_id))
        tweet_prediction = (tweet, predicted_label, correct_label)
        incorrect_predictions[k] = tweet_prediction

    if len(correct_predictions) > samples_amount:
        correct_predictions = sample(correct_predictions, samples_amount)
    if len(incorrect_predictions) > samples_amount:
        incorrect_predictions = sample(incorrect_predictions, samples_amount)

    return correct_predictions, incorrect_predictions
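A hedged usage sketch for get_random_predicted_samples (clf, features_df and y are placeholders for a fitted scikit-learn-style classifier, a pandas DataFrame of feature vectors indexed by tweet id, and the matching label array; load_object must be able to find the pickled tweets under data/tweets/):

# Hypothetical call with placeholder objects.
correct, incorrect = get_random_predicted_samples(clf, features_df, y)
for tweet, predicted_label, correct_label in correct:
    print(predicted_label, correct_label, tweet)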
Example #3
def tagging(documents):
    nDocs = len(documents)
    #    print nDocs
    documentsProcessed = []
    unigram_tagger = []
    try:
        unigram_tagger = file_utils.load_object('tagger1', 'tagger', None)
        print(unigram_tagger)
    except:
        train_set = mac_morpho.tagged_sents()
        #test_set =  mac_morpho.tagged_sents()[10001:10010]
        unigram_tagger = nltk.UnigramTagger(train_set)
        file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None)

    for iDoc in range(0, nDocs):
        #tokens = documents[iDoc]
        documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))
    return documentsProcessed
Example #4
def tagging(documents):
    nDocs = len(documents)
    documentsProcessed = []
    unigram_tagger = None
    try:
        unigram_tagger = file_utils.load_object('tagger1', 'tagger', None)
    except:
        # Train the tagger on the mac_morpho corpus and cache it for later runs
        train_set = mac_morpho.tagged_sents()
        unigram_tagger = nltk.UnigramTagger(train_set)
        file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None)
    for iDoc in range(0, nDocs):  # Tag each document in turn
        documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))
    return documentsProcessed
Example #5
    def tagging(self, documents, savePath, language):
        nDocs = len(documents)
        documentsProcessed = []
        unigram_tagger = []
        from data_core.file_utils import FileUtils
        file_utils = FileUtils(savePath)
        try:
            unigram_tagger = file_utils.load_object('tagger_' + language,
                                                    'tagger')
        except:
            if language == "pt":
                train_set = mac_morpho.tagged_sents()
            elif language == "en":
                # Lowercase the Brown corpus words so the tagger matches the
                # lowercased tokens of the processed documents.
                train_set = []
                for tagged_sent in brown.tagged_sents(tagset='universal'):
                    train_set.append([
                        (self.text_lower_one([word])[0], tag)
                        for word, tag in tagged_sent
                    ])
            unigram_tagger = nltk.UnigramTagger(train_set)
            file_utils.save_object(unigram_tagger, 'tagger_' + language,
                                   'tagger')

        for iDoc in range(0, nDocs):
            #tokens = documents[iDoc]
            documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))
        return documentsProcessed
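A minimal usage sketch for the method above (TextProcessor is a hypothetical name for the class that defines tagging and text_lower_one; the cache directory is likewise only an example):

# Hypothetical usage: tag lowercased English tokens and cache the tagger.
processor = TextProcessor()
docs = [["the", "cat", "sat", "on", "the", "mat"]]
tagged = processor.tagging(docs, "cache/", "en")
print(tagged[0])  # e.g. [('the', 'DET'), ('cat', 'NOUN'), ...]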
Example #6
    plt.plot([0, 1], [0, 1], color='red', lw=3, linestyle='--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate', fontsize=18)
    plt.xticks(fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=18)
    plt.yticks(fontsize=14)
    plt.title('ROC Curve - ' + str(name) + '\nAccuracy: ' + str(accuracy) +
              '\nROC AUC score: ' + str(roc_score),
              fontsize=24)
    plt.savefig(file, dpi="figure")
    return confusion_mat


if __name__ == "__main__":
    subreddits_dictionary = load_object(
        'data/dictionaries/subreddits_dictionary_reduced')

    involvement_dictionary = load_object(
        'data/dictionaries/involvement_dictionary_reduced')
    involvement = [value for value in involvement_dictionary.values()]
    involvement = remove_extreme_values(involvement)

    generate_distribution_plot(
        involvement, 'User-subreddit involvement',
        "plots/user_subreddit_involvement_distribution.png")

    sus_network = nx.read_gpickle('data/networks/SuS')
    sus_degrees = dict(nx.degree(sus_network))
    sus_degrees = list(sus_degrees.values())

    sus_network_reduced = reduce_graph_by_weight_threshold(
Example #7
    if len(incorrect_predictions) > samples_amount:
        incorrect_predictions = sample(incorrect_predictions, samples_amount)

    return correct_predictions, incorrect_predictions


def print_tweet_samples(tweet_samples):
    for tweet, predicted_class, correct_class in tweet_samples:
        print("Tweet:", tweet)
        print("Predicted class:", predicted_class)
        print("Tweet class:", correct_class)
        print()


if __name__ == "__main__":
    top_comment_features = load_object(
        "data/features/top/top_comment_features")
    print("Top comment features:", top_comment_features)
    top_query_features = load_object("data/features/top/top_query_features")
    print("Top query features:", top_query_features)
    top_support_deny_features = load_object(
        "data/features/top/top_support_deny_features")
    print("Top support vs deny features:", top_support_deny_features)
    top_veracity_features = load_object(
        "data/features/top/top_veracity_features")
    print("Top veracity features:", top_veracity_features)
    print()

    comment_feature_labels = load_object(
        'data/features/labels/task_a_comment_feature_labels')
    query_feature_labels = load_object(
        'data/features/labels/task_a_query_feature_labels')
Example #8
import numpy as np
from file_utils import load_object
from graph_utils import get_average_distance_from_node_to_nodes
import pandas as pd

subreddits_dictionary = load_object(
    'data/dictionaries/subreddits_dictionary_reduced')
all_subreddits = list(subreddits_dictionary.keys())
users_dictionary = load_object('data/dictionaries/users_dictionary_reduced')
all_users = list(users_dictionary.keys())
involvement_dictionary = load_object(
    'data/dictionaries/involvement_dictionary_reduced')

sus_node_features = load_object("data/features/SuS_node_features")
usu_node_features = load_object("data/features/UsU_node_features")
sus_hop_count_matrix = load_object("data/distances/SuS_hop_count")
sus_weighted_distance_matrix = load_object("data/distances/SuS_weighted")
usu_hop_count_matrix = load_object("data/distances/UsU_hop_count")
usu_weighted_distance_matrix = load_object("data/distances/UsU_weighted")
sus_community_membership = load_object(
    "data/features/SuS_community_membership")
usu_community_membership = load_object(
    "data/features/UsU_community_membership")

sus_node_embeddings = load_object("data/features/SuS_node_embeddings")
usu_node_embeddings = load_object("data/features/UsU_node_embeddings")

subreddits_keyword_embeddings = load_object(
    "data/features/subreddits_keyword_embeddings")
users_keyword_embeddings = load_object(
    "data/features/users_keyword_embeddings")
Example #9
    datasets.append(dataset_without_columns)
    dataset_without_columns = dataset.drop(extra_columns, axis=1)
    datasets.append(dataset_without_columns)
    return datasets


if __name__ == "__main__":
    if not isfile('scores/ablation/task_a_comment_scores.tsv'):
        print("Performing ablation experiment for comment model...")
        comment_dataset = pd.read_csv(
            'data/datasets/task_a_comment_dataset.tsv',
            sep='\t',
            index_col=False,
            header=0,
            encoding='utf-8')
        comment_class_labels = load_object(
            'data/class_labels/task_a_comment_class_labels')

        comment_ablation_scores = {}
        ablation_datasets = get_ablation_datasets(comment_dataset)
        del comment_dataset

        for ablation_label, ablation_dataset in zip(ablation_labels,
                                                    ablation_datasets):
            _, scores = train_model(ablation_dataset,
                                    comment_class_labels,
                                    optimize_parameters=False)
            comment_ablation_scores[ablation_label] = scores['Ensemble model']
        comment_ablation_scores = pd.DataFrame(np.array(list(
            comment_ablation_scores.values()),
                                                        dtype='float'),
                                               index=ablation_labels,