from typing import List


def create_co_occurrence_matrix(interesting_words: List[str], filename: str = None):
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl", generate_article_ner_frame)
    data = entities[entities['Text'].isin(interesting_words)].groupby(
        by='ID_Article', as_index=False
    ).agg(lambda x: ' '.join(list(x)))[['ID_Article', 'Text']]
    interesting_articles = np.array(data['ID_Article'])
    percent_interesting_articles = (interesting_articles.size / np.unique(entities['ID_Article']).size) * 100
    print("We look at " + str(len(interesting_words)) + " entities and therefore at "
          + str(round(percent_interesting_articles, 2)) + "% of all articles for co-occurrence")

    count_model = CountVectorizer(ngram_range=(1, 1))  # default unigram model
    X = count_model.fit_transform(np.array(data['Text']))
    names = count_model.get_feature_names()
    # X[X > 0] = 1  # run this line if you don't want extra within-text co-occurrence (see below)
    Xc = (X.T * X)  # this is the co-occurrence matrix in sparse csr format
    Xc.setdiag(0)  # zero out same-word co-occurrence
    co_occurrences = pandas.DataFrame(data=Xc.toarray(), columns=names, index=names)
    if filename:
        co_occurrences.to_csv(dbase_helper.PKL_CACHE_FOLDER + '/' + filename, sep=',')
    return pandas.DataFrame(data=X.toarray(), columns=names, index=data.ID_Article.values), co_occurrences
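# Illustrative sketch (not part of the original pipeline): the sparse
# X.T * X product above is the whole co-occurrence trick, and it can be
# sanity-checked in isolation. The toy corpus and entity names below are
# invented; get_feature_names_out() assumes sklearn >= 1.0 (older versions
# use get_feature_names(), as in create_co_occurrence_matrix above).
def _demo_co_occurrence_trick():
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer

    # mimics the grouped 'Text' column: one string of entities per article
    docs = ["merkel kurz merkel", "kurz strache", "merkel strache kurz"]
    count_model = CountVectorizer(ngram_range=(1, 1))
    X = count_model.fit_transform(docs)  # articles x words count matrix
    # X[X > 0] = 1  # uncomment to count each pair at most once per article
    Xc = X.T * X  # words x words co-occurrence matrix (sparse matmul)
    Xc.setdiag(0)  # ignore words co-occurring with themselves
    names = count_model.get_feature_names_out()
    print(pd.DataFrame(Xc.toarray(), index=names, columns=names))
    # Without the binarization line, "merkel" appearing twice in the first
    # article contributes 2 (instead of 1) to the merkel/kurz cell.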
def create_co_occurrence_all():
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl", generate_article_ner_frame)

    num_top_entities = 50
    pandas.DataFrame(entities['Text'].value_counts().head(num_top_entities)).plot.bar()
    plt.title("Distribution of top " + str(num_top_entities) + " named entities over all "
              + str(entities['ID_Article'].nunique()) + " articles")  # distinct articles, not entity mentions
    plt.show()

    word_occurrences = pandas.DataFrame(entities['Text'].value_counts())
    word_occurrences = word_occurrences[word_occurrences['Text'] >= 10]
    word_occurrences = word_occurrences.rename(columns={'Text': 'NumOccurrences'})
    interesting_words = word_occurrences.index.values
    create_co_occurrence_matrix(interesting_words, 'article_co_occurrences.csv')

    entities_without_locations = entities[entities.Label != 'LOC']
    word_occurrences = pandas.DataFrame(entities_without_locations['Text'].value_counts())
    word_occurrences = word_occurrences[word_occurrences['Text'] >= 10]
    word_occurrences = word_occurrences.rename(columns={'Text': 'NumOccurrences'})
    interesting_words = word_occurrences.index.values
    create_co_occurrence_matrix(interesting_words, 'article_co_occurrences_without_locations.csv')
    print("done")
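# Illustrative sketch (invented mentions): the value_counts() threshold
# idiom used above, and again in encode_article_named_entities() and
# prepare_data() below, reduces to filtering a counts Series at a cutoff.
def _demo_occurrence_threshold():
    import pandas as pd

    mentions = pd.Series(["kurz", "kurz", "merkel", "kurz", "strache", "merkel"], name="Text")
    counts = mentions.value_counts()  # entity -> number of mentions
    interesting_words = counts[counts >= 2].index.values
    print(interesting_words)  # ['kurz' 'merkel']; 'strache' falls below the cutoff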
def prepare_data():
    posts = load_raw_posts()
    post_embeddings = load_or_create_post_embeddings(posts)
    data = {
        "stylometric": dbase_helper.generate_pkl_cached(
            "stylometric_features_with_headlines.pkl", compute_stylometric_features, posts=posts),
        "embedded_posts": load_or_embed_posts(posts, post_embeddings),
        "date_stats": compute_date_stats(posts),
        "article_stats": compute_article_category_stats(posts),
        "article_entities": encode_article_named_entities(posts),
        "post_ratings": load_post_ratings(posts),
        "parent_posts": load_parent_posts(posts),
        "targets": tf.keras.utils.to_categorical(posts["ID_User"].cat.codes),
    }
    return posts, data
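# Illustrative sketch (invented user IDs): the "targets" entry above relies
# on pandas categorical codes being dense 0..n-1 integers that line up with
# the one-hot columns produced by tf.keras.utils.to_categorical.
def _demo_categorical_targets():
    import pandas as pd
    import tensorflow as tf

    posts = pd.DataFrame({"ID_User": pd.Categorical([1001, 1003, 1001, 1002])})
    codes = posts["ID_User"].cat.codes  # [0, 2, 0, 1]
    print(tf.keras.utils.to_categorical(codes))
    # [[1. 0. 0.]
    #  [0. 0. 1.]
    #  [1. 0. 0.]
    #  [0. 1. 0.]]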
def encode_article_named_entities(posts):
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl", ner.generate_article_ner_frame)

    # Select named entities with a minimal number of occurrences
    minimal_number_word_occurrences = 20
    word_occurrences = pd.DataFrame(entities['Text'].value_counts())
    word_occurrences = word_occurrences[word_occurrences['Text'] >= minimal_number_word_occurrences]
    word_occurrences = word_occurrences.rename(columns={'Text': 'NumOccurrences'})

    entity_occurrences, co_occurrences = ner.create_co_occurrence_matrix(word_occurrences.index.values)
    num_articles = dbase_helper.query_to_data_frame(
        """
        SELECT MAX(Articles.ID_Article)
        FROM Articles;
        """, "number_articles.pkl")[0][0]
    entity_occurrences = entity_occurrences.reindex(index=range(num_articles), fill_value=0).astype('uint8')

    posts = posts[['ID_Post', 'ID_Article']]
    posts_entity_occurrences_in_article = posts.join(
        entity_occurrences, on='ID_Article').drop('ID_Article', axis=1)
    return posts_entity_occurrences_in_article.drop("ID_Post", axis=1)
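# Illustrative sketch (invented frames): the final join above maps one row
# of article-level entity counts onto every post via its ID_Article, using
# DataFrame.join's "on=" lookup against the feature frame's index.
def _demo_article_feature_join():
    import pandas as pd

    entity_occurrences = pd.DataFrame(
        {"kurz": [2, 0, 1], "merkel": [0, 1, 1]}, index=[0, 1, 2])
    posts = pd.DataFrame({"ID_Post": [10, 11, 12], "ID_Article": [2, 0, 2]})
    features = posts.join(entity_occurrences, on="ID_Article").drop("ID_Article", axis=1)
    print(features)
    #    ID_Post  kurz  merkel
    # 0       10     1       1
    # 1       11     2       0
    # 2       12     1       1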
def prepare_data():
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl", ner.generate_article_ner_frame)

    # Select named entities with a minimal number of occurrences
    minimal_number_word_occurrences = 5
    minimal_number_words_per_article = 5
    word_occurrences = pandas.DataFrame(entities['Text'].value_counts())
    word_occurrences = word_occurrences[word_occurrences['Text'] >= minimal_number_word_occurrences]
    word_occurrences = word_occurrences.rename(columns={'Text': 'NumOccurrences'})
    interesting_words = word_occurrences.index.values

    occurrences, co_occurrences = ner.create_co_occurrence_matrix(interesting_words)
    article_ids = occurrences.index.values

    data = data_analysis.generate_joined_rating_articles_frame()
    data = data[data.ID_Article.isin(article_ids)]

    # Keep only articles containing more than minimal_number_words_per_article interesting entities
    interesting_words_per_article = entities[entities['Text'].isin(interesting_words)].groupby(
        by='ID_Article', as_index=False
    ).agg(lambda x: len(list(x)))[['ID_Article', 'Text']]
    article_ids = interesting_words_per_article[
        interesting_words_per_article.Text > minimal_number_words_per_article].ID_Article
    data = data[data.ID_Article.isin(article_ids)]

    articles = data[['ID_Article', 'Title', 'MainCategory', 'SubCategory', 'RemainingPath']]
    ratings = data[['ID_Article', 'PositiveVotesCount', 'NegativeVotesCount']]

    # Plot the data we shall predict
    plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
    plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
    ax = plt.gca()
    ax.set_yscale('log')
    plt.legend()
    plt_helper.save_and_show_plot("Logarithmic Vote Distribution over Articles")

    plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
    plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
    plt.legend()
    plt_helper.save_and_show_plot("Vote Distribution over Articles")

    normalize = False
    if normalize:
        pos_mean = data.PositiveVotesCount.mean()
        pos_std = data.PositiveVotesCount.std()
        data.PositiveVotesCount = (data.PositiveVotesCount - pos_mean) / pos_std
        neg_mean = data.NegativeVotesCount.mean()
        neg_std = data.NegativeVotesCount.std()
        data.NegativeVotesCount = (data.NegativeVotesCount - neg_mean) / neg_std
        plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
        plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
        ax = plt.gca()
        ax.set_yscale('log')
        plt.title("Normalized Data")
        plt.legend()
        plt.show()

    # Sample without replacement so the 80% training split contains no duplicate IDs
    training_article_ids = np.random.choice(article_ids, round(len(article_ids) * 0.8), replace=False)
    training_data = {
        "articles": articles[articles.ID_Article.isin(training_article_ids)],
        "ratings": ratings[ratings.ID_Article.isin(training_article_ids)],
        "occurrences": occurrences[occurrences.index.isin(training_article_ids)],
    }
    test_article_ids = np.setdiff1d(article_ids, training_article_ids)
    test_data = {
        "articles": articles[articles.ID_Article.isin(test_article_ids)],
        "ratings": ratings[ratings.ID_Article.isin(test_article_ids)],
        "occurrences": occurrences[occurrences.index.isin(test_article_ids)],
    }
    return training_data, test_data
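# Illustrative sketch (invented IDs): the 80/20 split above depends on
# sampling without replacement; np.setdiff1d then yields a disjoint test set.
def _demo_train_test_split():
    import numpy as np

    article_ids = np.arange(100, 110)
    training_ids = np.random.choice(article_ids, round(len(article_ids) * 0.8), replace=False)
    test_ids = np.setdiff1d(article_ids, training_ids)
    assert len(training_ids) == 8 and len(test_ids) == 2
    assert not set(training_ids) & set(test_ids)  # splits are disjoint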
def ner_article_plots():
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl", generate_article_ner_frame)

    pandas.DataFrame(entities['Text'].value_counts().head(30)).plot.bar()
    plt_helper.save_and_show_plot("Entity Distribution")

    entities["Label"].value_counts().plot.bar()
    plt.ylabel("Number of Occurrences")
    plt_helper.save_and_show_plot("Entity Label Distribution")

    joined_article_categories = data_analysis.generate_joined_category_articles_frame()
    articles_time = joined_article_categories[['ID_Article', 'PublishingDate']]

    for label in set(entities["Label"]):
        print("Doing plots for: " + label)
        label_entities = entities[entities['Label'] == label]
        label_series = label_entities["Text"].value_counts().head(20)
        if label == "PER":
            print("For top person entries, try to unify first+last-name entries with first-name/last-name-only entries")
            persons = label_series.index.values
            for person in persons:
                for compare_person in persons:
                    # guard against compare_person having already been merged away
                    if (compare_person in person and person != compare_person
                            and compare_person in label_series.index):
                        print(str(compare_person) + " is a subset of " + str(person) + ", merging counts")
                        label_series[compare_person] += label_series[person]
                        label_series = label_series.drop(labels=[person])
                        break

        pandas.DataFrame(label_series.sort_values()).plot.barh()
        ax = plt.gca()
        ax.get_legend().remove()
        plt.xlabel("Number of Occurrences")
        plt_helper.save_and_show_plot("Entities - " + label + " Distribution")

        top_entities = label_series.sort_values(ascending=False).head(6).index.values
        top_entity_entries = []
        for entity in top_entities:
            if label == "PER":
                # match all name variants that contain the unified name
                entity_entries = label_entities[label_entities.Text.str.contains(entity)]
                entity_entries = entity_entries.assign(Text=entity)
            else:
                entity_entries = label_entities[label_entities.Text == entity]
            top_entity_entries.append(entity_entries)
        top_entity_entries = pandas.concat(top_entity_entries)
        top_entity_entries = pandas.merge(top_entity_entries, articles_time)

        plt.style.use('seaborn-deep')
        # restrict to articles published in 2015 and 2016
        year_entity_entries = top_entity_entries[top_entity_entries.PublishingDate.dt.year > 2014][
            ['PublishingDate', 'Text']]
        year_entity_entries.PublishingDate = year_entity_entries.PublishingDate.dt.date
        year_entity_entries['PublishingDate'].hist(
            by=year_entity_entries['Text'], histtype='bar', alpha=0.8, bins=12)
        fig = plt.gca().figure
        title = "Top Entities from " + label + " over time"
        fig.suptitle(title, y=0.99)
        plt_helper.save_and_show_plot(title, False)

        values = []
        labels = []
        for entity in top_entities:
            values.append(year_entity_entries[year_entity_entries.Text == entity]['PublishingDate'])
            labels.append(entity)
        plt.hist(values, label=labels)
        plt.legend()
        plt_helper.save_and_show_plot("Top Entities from " + label + " over time")
    print("done")
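# Illustrative sketch (invented names and counts): the person-name
# unification in ner_article_plots() merges a longer name's count into any
# shorter name it contains. Note the heuristic would also merge distinct
# people who happen to share a surname.
def _demo_person_name_merge():
    import pandas as pd

    label_series = pd.Series(
        {"Sebastian Kurz": 40, "Kurz": 25, "Christian Kern": 30, "Kern": 10})
    persons = label_series.index.values
    for person in persons:
        for compare_person in persons:
            if (compare_person in person and person != compare_person
                    and compare_person in label_series.index):
                label_series[compare_person] += label_series[person]
                label_series = label_series.drop(labels=[person])
                break
    print(label_series)  # Kurz 65, Kern 40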