from gabry_dataset_parser import get_labeled_instances
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from nltk.parse.corenlp import CoreNLPDependencyParser
from pycorenlp import StanfordCoreNLP
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import csv

import numpy as np
import matplotlib.pyplot as plt

labeled_instances = get_labeled_instances(
    "./train_set/instances_converted_small.pickle",
    "./train_set/truth_converted_small.pickle")

clickbait_df = labeled_instances[(labeled_instances.truthClass == 'clickbait')]
no_clickbait_df = labeled_instances[(
    labeled_instances.truthClass == 'no-clickbait')]

print(clickbait_df.columns)


def get_slang_words_list():
    """Read the backtick-delimited slang dictionary into a list of rows."""
    slang_data = []
    with open('slang_dict.doc', 'r') as exRtFile:
        exchReader = csv.reader(exRtFile,
                                delimiter='`',
                                quoting=csv.QUOTE_NONE)
        for row in exchReader:
            slang_data.append(row)
    return slang_data
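
# SentimentIntensityAnalyzer is imported above but unused in the visible
# snippet; a minimal sketch of how headline sentiment could be scored with it
# (polarity_scores is the real VADER API; using the per-title compound score
# as the feature is an assumption):
def get_sentiment_scores(titles):
    analyzer = SentimentIntensityAnalyzer()
    # polarity_scores returns a dict with 'neg', 'neu', 'pos', 'compound'
    return [analyzer.polarity_scores(title)['compound'] for title in titles]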

# ----- Example #2 -----
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_classif
from gabry_dataset_parser import get_labeled_instances

DATASET = 'big'

path = "../features/{}/pos_features_{}_targetTitle_normalized{}.csv"
POS_FEAT_PATH = path.format(DATASET, DATASET, "")
feat_data = pd.read_csv(POS_FEAT_PATH)

data_df = get_labeled_instances(
    "../train_set/instances_converted_{}.pickle".format(DATASET),
    "../train_set/truth_converted_{}.pickle".format(DATASET))[[
        'id', 'truthClass'
    ]]
print(
    f"Labeled instances loaded. Shape: {data_df.shape}. Only 'id' and 'truthClass' kept."
)
feat_data['id'] = feat_data['id'].astype(str)
data_df = pd.merge(data_df, feat_data, on=['id'])

le = preprocessing.LabelEncoder()
label_encoded = le.fit_transform(data_df['truthClass'])
label_encoded = [1 if lab == 0 else 0 for lab in list(label_encoded)]
print(
    f"Labels encoded. Class '{data_df['truthClass'][0]}' --> label '{label_encoded[0]}'"
)
label_encoded = pd.DataFrame(label_encoded, columns=['label'])

data_df = data_df.drop(['id', 'truthClass'], axis=1)
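
# `mutual_info_classif` is imported above but the visible snippet ends before
# using it; a minimal sketch of the likely next step (ranking the POS features
# by mutual information with the label -- the sorted printout is an assumption):
mi_scores = mutual_info_classif(data_df, label_encoded['label'])
for feat, score in sorted(zip(data_df.columns, mi_scores),
                          key=lambda pair: pair[1],
                          reverse=True):
    print(f"{feat}: {score:.4f}")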

# ----- Example #3 ----- (fragment; assumes `import nltk` at module level)
    # TODO: big targetTitle normalized      --> Done
    # TODO: big targetTitle no-normalized   --> Running

    DATASET = 'big'  # 'small' or 'big'
    target = "targetTitle"  # "postText" or "targetTitle"
    prefix = "PT" if target == "postText" else "TA"
    NORMALIZE = False

    FEATURES_DATA_PATH = r"../features/pos_features_{}_{}_{}.csv".format(
        DATASET, target, 'normalized' if NORMALIZE else "no-normalized")
    print(
        f"Generating POS features... it might take a while :P\n Path: '{FEATURES_DATA_PATH}' | {target} | {prefix}"
    )

    labeled_instances = get_labeled_instances(
        "../train_set/instances_converted_{}.pickle".format(DATASET),
        "../train_set/truth_converted_{}.pickle".format(DATASET))

    tagger = nltk.StanfordNERTagger(
        '../ner/english.all.3class.distsim.crf.ser.gz',
        '../ner/stanford-ner.jar',
        encoding='utf-8')

    tagset = nltk.load("help/tagsets/upenn_tagset.pickle")
    possible_tags = list(tagset.keys())

    ids = list(labeled_instances.id)
    if target == 'postText':
        # postText entries are single-element lists; unwrap them
        texts = [txt[0] for txt in list(labeled_instances.postText)]
    else:
        texts = list(labeled_instances.targetTitle)
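
    # The snippet stops before the tagging step; a minimal sketch of how
    # per-tag counts could be built over `possible_tags` (uses nltk.pos_tag
    # rather than the Stanford tagger loaded above -- an assumption, since
    # the original tagging code is not shown):
    pos_rows = []
    for text_id, text in zip(ids, texts):
        counts = {tag: 0 for tag in possible_tags}
        for _, tag in nltk.pos_tag(nltk.word_tokenize(text)):
            if tag in counts:
                counts[tag] += 1
        if NORMALIZE:
            total = sum(counts.values()) or 1
            counts = {tag: c / total for tag, c in counts.items()}
        counts['id'] = text_id
        pos_rows.append(counts)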

# ----- Example #4 -----
# Fold the per-key no-clickbait n-gram counts into the overall tally
# (`all_ngrams` and `ngrams_no_clickbait` are built earlier in the full script)
for k in ngrams_no_clickbait:
    for ngram in ngrams_no_clickbait[k]:
        all_ngrams[ngram] = all_ngrams.get(ngram, 0) + ngrams_no_clickbait[k][ngram]

THRESHOLD = 5

# Keep only n-grams seen at least THRESHOLD times in total
filtered_ngrams = {ngram: count
                   for ngram, count in all_ngrams.items()
                   if count >= THRESHOLD}
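
# `get_all_ngrams_for_post` is called in the loop below but is not defined in
# this fragment; a minimal sketch of a plausible implementation (the 1..3
# n-gram range and the tokenizer are assumptions):
from nltk.tokenize import word_tokenize
from nltk.util import ngrams as nltk_ngrams

def get_all_ngrams_for_post(text):
    tokens = word_tokenize(text.lower())
    grams = []
    for n in range(1, 4):
        grams.extend(' '.join(g) for g in nltk_ngrams(tokens, n))
    return grams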

labeled_instances = get_labeled_instances(
    "../train_set/instances_converted_big.pickle",
    "../train_set/truth_converted_big.pickle")[[
        'truthClass', 'postText', 'id'
    ]]

postTexts = list(labeled_instances.postText)
ids = list(labeled_instances.id)
dict_list = []
for idx, post_text in enumerate(postTexts):
    print(idx)  # progress indicator
    post_text = post_text[0]  # postText entries are single-element lists
    post_dict = {ngram: 0 for ngram in filtered_ngrams}
    post_dict['id'] = ids[idx]
    ngrams = get_all_ngrams_for_post(post_text)
    for ngram in ngrams:
        if ngram in post_dict:
            post_dict[ngram] += 1
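    # The fragment ends mid-loop and `dict_list` is never filled; a minimal
    # completion under that assumption (the append and the final feature
    # table are not shown in the original):
    dict_list.append(post_dict)

ngram_features_df = pd.DataFrame(dict_list)  # assumes pandas imported as pd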