def tagging(documents):
    nDocs = len(documents)
    documentsProcessed = []
    unigram_tagger = []
    try:
        # Reuse a previously trained tagger if one has been saved
        unigram_tagger = file_utils.load_object('tagger1', 'tagger', None)
    except:
        # Train a unigram tagger on the mac_morpho tagged corpus
        train_set = mac_morpho.tagged_sents()
        unigram_tagger = nltk.UnigramTagger(train_set)
        # Persist the trained tagger so it can be reused on later runs
        file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None)
    # Apply the tagger to each document and append the result to the list
    for iDoc in range(0, nDocs):
        documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))
    return documentsProcessed
def get_random_predicted_samples(model, dataset, labels):
    samples_amount = 5
    correct_predictions = []
    incorrect_predictions = []
    num_samples, _ = dataset.shape
    # Collect predictions until enough correct and incorrect samples are found
    i = 0
    while i < num_samples and (len(correct_predictions) < samples_amount or
                               len(incorrect_predictions) < samples_amount):
        feature_vector = np.array(dataset.iloc[i, :]).reshape(1, -1)
        predicted_label = model.predict(feature_vector)[0]
        correct_label = labels[i]
        prediction = (i, predicted_label, correct_label)
        if predicted_label == correct_label:
            correct_predictions.append(prediction)
        else:
            incorrect_predictions.append(prediction)
        i += 1
    # Replace row indices with the corresponding tweets loaded from disk
    for k, (i, predicted_label, correct_label) in enumerate(correct_predictions):
        tweet_id = dataset.index[i]
        tweet = load_object("data/tweets/" + str(tweet_id))
        correct_predictions[k] = (tweet, predicted_label, correct_label)
    for k, (i, predicted_label, correct_label) in enumerate(incorrect_predictions):
        tweet_id = dataset.index[i]
        tweet = load_object("data/tweets/" + str(tweet_id))
        incorrect_predictions[k] = (tweet, predicted_label, correct_label)
    # Keep at most samples_amount examples of each outcome
    if len(correct_predictions) > samples_amount:
        correct_predictions = sample(correct_predictions, samples_amount)
    if len(incorrect_predictions) > samples_amount:
        incorrect_predictions = sample(incorrect_predictions, samples_amount)
    return correct_predictions, incorrect_predictions
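# Hedged standalone sketch of the same "split predictions into correct and
# incorrect, then subsample" pattern, using a synthetic dataset instead of the
# project's pickled tweets; the toy_* names and the LogisticRegression choice
# are illustrative assumptions, not part of the original pipeline.
import numpy as np
import pandas as pd
from random import sample
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
toy_features = pd.DataFrame(rng.normal(size=(40, 3)))
toy_labels = list(rng.integers(0, 2, size=40))
toy_model = LogisticRegression().fit(toy_features.values, toy_labels)

correct, incorrect = [], []
for i in range(len(toy_features)):
    row = np.array(toy_features.iloc[i, :]).reshape(1, -1)
    predicted = toy_model.predict(row)[0]
    target = correct if predicted == toy_labels[i] else incorrect
    target.append((i, predicted, toy_labels[i]))

# Keep at most five examples of each outcome, as in the function above
correct = sample(correct, min(5, len(correct)))
incorrect = sample(incorrect, min(5, len(incorrect)))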
def tagging(documents):
    nDocs = len(documents)
    # print(nDocs)
    documentsProcessed = []
    unigram_tagger = []
    try:
        unigram_tagger = file_utils.load_object('tagger1', 'tagger', None)
        print(unigram_tagger)
    except:
        train_set = mac_morpho.tagged_sents()
        # test_set = mac_morpho.tagged_sents()[10001:10010]
        unigram_tagger = nltk.UnigramTagger(train_set)
        file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None)
    for iDoc in range(0, nDocs):
        # tokens = documents[iDoc]
        documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))
    return documentsProcessed
def tagging(self, documents, savePath, language):
    nDocs = len(documents)
    documentsProcessed = []
    unigram_tagger = []
    from data_core.file_utils import FileUtils
    file_utils = FileUtils(savePath)
    try:
        # Reuse a previously trained tagger for this language if available
        unigram_tagger = file_utils.load_object('tagger_' + language, 'tagger')
    except:
        if language == "pt":
            train_set = mac_morpho.tagged_sents()
        elif language == "en":
            train_set = brown.tagged_sents(tagset='universal')
        # print(train_set[0:1])
        # Lowercase every word in the training sentences so the tagger
        # matches the lowercased input documents
        nSents = len(train_set)
        train_set_lower = []
        for iSent in range(0, nSents):
            nWords = len(train_set[iSent])
            words = []
            for iWord in range(0, nWords):
                words.append((self.text_lower_one([train_set[iSent][iWord][0]])[0],
                              train_set[iSent][iWord][1]))
            train_set_lower.append(words)
        # print(train_set_lower[0:1])
        # test_set = mac_morpho.tagged_sents()[10001:10010]
        unigram_tagger = nltk.UnigramTagger(train_set_lower)
        file_utils.save_object(unigram_tagger, 'tagger_' + language, 'tagger')
    for iDoc in range(0, nDocs):
        # tokens = documents[iDoc]
        documentsProcessed.append(unigram_tagger.tag(documents[iDoc]))
    return documentsProcessed
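# A minimal, self-contained sketch of the underlying NLTK step used above
# (training a UnigramTagger and tagging a tokenized document), without the
# FileUtils caching or the lowercasing loop; it assumes the Brown corpus and
# universal tagset have been fetched, e.g. via nltk.download('brown') and
# nltk.download('universal_tagset').
import nltk
from nltk.corpus import brown

train_set = brown.tagged_sents(tagset='universal')
unigram_tagger = nltk.UnigramTagger(train_set)
print(unigram_tagger.tag(['the', 'cat', 'sat', 'on', 'the', 'mat']))
# Words never seen during training are tagged as None by a plain UnigramTagger.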
    plt.plot([0, 1], [0, 1], color='red', lw=3, linestyle='--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate', fontsize=18)
    plt.xticks(fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=18)
    plt.yticks(fontsize=14)
    plt.title('ROC Curve - ' + str(name) + '\nAccuracy: ' + str(accuracy) +
              '\nROC AUC score: ' + str(roc_score), fontsize=24)
    plt.savefig(file, dpi="figure")
    return confusion_mat


if __name__ == "__main__":
    subreddits_dictionary = load_object(
        'data/dictionaries/subreddits_dictionary_reduced')
    involvement_dictionary = load_object(
        'data/dictionaries/involvement_dictionary_reduced')
    involvement = [value for value in involvement_dictionary.values()]
    involvement = remove_extreme_values(involvement)
    generate_distribution_plot(
        involvement, 'User-subreddit involvement',
        "plots/user_subreddit_involvement_distribution.png")

    sus_network = nx.read_gpickle('data/networks/SuS')
    sus_degrees = dict(nx.degree(sus_network))
    sus_degrees = list(sus_degrees.values())
    sus_network_reduced = reduce_graph_by_weight_threshold(
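# reduce_graph_by_weight_threshold is called above but not defined in this
# excerpt; the helper below is a hypothetical sketch of what it might look
# like, assuming edges carry a numeric 'weight' attribute and that edges
# below the threshold (and any nodes left isolated) should be dropped.
import networkx as nx


def reduce_graph_by_weight_threshold(graph, weight_threshold):
    reduced_graph = graph.copy()
    light_edges = [(u, v) for u, v, weight in
                   reduced_graph.edges(data='weight', default=0)
                   if weight < weight_threshold]
    reduced_graph.remove_edges_from(light_edges)
    reduced_graph.remove_nodes_from(list(nx.isolates(reduced_graph)))
    return reduced_graph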
def print_tweet_samples(tweet_samples):
    for tweet, predicted_class, correct_class in tweet_samples:
        print("Tweet:", tweet)
        print("Predicted class:", predicted_class)
        print("Tweet class:", correct_class)
        print()


if __name__ == "__main__":
    top_comment_features = load_object(
        "data/features/top/top_comment_features")
    print("Top comment features:", top_comment_features)
    top_query_features = load_object("data/features/top/top_query_features")
    print("Top query features:", top_query_features)
    top_support_deny_features = load_object(
        "data/features/top/top_support_deny_features")
    print("Top support vs deny features:", top_support_deny_features)
    top_veracity_features = load_object(
        "data/features/top/top_veracity_features")
    print("Top veracity features:", top_veracity_features)
    print()

    comment_feature_labels = load_object(
        'data/features/labels/task_a_comment_feature_labels')
    query_feature_labels = load_object(
        'data/features/labels/task_a_query_feature_labels')
import numpy as np
import pandas as pd

from file_utils import load_object
from graph_utils import get_average_distance_from_node_to_nodes

subreddits_dictionary = load_object(
    'data/dictionaries/subreddits_dictionary_reduced')
all_subreddits = list(subreddits_dictionary.keys())
users_dictionary = load_object('data/dictionaries/users_dictionary_reduced')
all_users = list(users_dictionary.keys())
involvement_dictionary = load_object(
    'data/dictionaries/involvement_dictionary_reduced')

sus_node_features = load_object("data/features/SuS_node_features")
usu_node_features = load_object("data/features/UsU_node_features")
sus_hop_count_matrix = load_object("data/distances/SuS_hop_count")
sus_weighted_distance_matrix = load_object("data/distances/SuS_weighted")
usu_hop_count_matrix = load_object("data/distances/UsU_hop_count")
usu_weighted_distance_matrix = load_object("data/distances/UsU_weighted")
sus_community_membership = load_object(
    "data/features/SuS_community_membership")
usu_community_membership = load_object(
    "data/features/UsU_community_membership")
sus_node_embeddings = load_object("data/features/SuS_node_embeddings")
usu_node_embeddings = load_object("data/features/UsU_node_embeddings")
subreddits_keyword_embeddings = load_object(
    "data/features/subreddits_keyword_embeddings")
users_keyword_embeddings = load_object(
    "data/features/users_keyword_embeddings")
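# load_object (and its save_object counterpart) are imported from file_utils
# but not shown in this excerpt; a minimal sketch of a pickle-based version,
# assuming every path above points to a pickled Python object on disk.
import pickle


def save_object(obj, path):
    with open(path, 'wb') as output_file:
        pickle.dump(obj, output_file, protocol=pickle.HIGHEST_PROTOCOL)


def load_object(path):
    with open(path, 'rb') as input_file:
        return pickle.load(input_file)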
    datasets.append(dataset_without_columns)
    dataset_without_columns = dataset.drop(extra_columns, axis=1)
    datasets.append(dataset_without_columns)
    return datasets


if __name__ == "__main__":
    if not isfile('scores/ablation/task_a_comment_scores.tsv'):
        print("Performing ablation experiment for comment model...")
        comment_dataset = pd.read_csv(
            'data/datasets/task_a_comment_dataset.tsv',
            sep='\t',
            index_col=False,
            header=0,
            encoding='utf-8')
        comment_class_labels = load_object(
            'data/class_labels/task_a_comment_class_labels')
        comment_ablation_scores = {}

        ablation_datasets = get_ablation_datasets(comment_dataset)
        del comment_dataset
        for ablation_label, ablation_dataset in zip(ablation_labels,
                                                    ablation_datasets):
            _, scores = train_model(ablation_dataset,
                                    comment_class_labels,
                                    optimize_parameters=False)
            comment_ablation_scores[ablation_label] = scores['Ensemble model']

        comment_ablation_scores = pd.DataFrame(np.array(list(
            comment_ablation_scores.values()), dtype='float'),
            index=ablation_labels,