Пример #1
0
def get_ngram_feature_vector(post, n, ngram_word_corpus: dict):
    """
    Build the n-gram count vector of a single post.

    :arg post: The post whose text (as returned by ``utils.article``) is
        split on whitespace and turned into n-grams
    :arg n: The n in n-gram
    :arg ngram_word_corpus: Mapping from known n-grams to their starting
        counts (expected to be all zeroes); it is copied, never modified

    :return: A copy of ``ngram_word_corpus`` where each known n-gram has been
        incremented once per occurrence in the post
    """
    # Work on a copy so the caller's template vector stays untouched
    feature_vector = ngram_word_corpus.copy()

    tokens = utils.article(post).split()

    for gram in ngrams(tokens, n):
        joined = " ".join(gram)
        # Normalise: drop everything but letters, digits and spaces, then lowercase
        key = re.sub(r'[^a-zA-Z0-9 ]+', '', joined).lower()

        # Only n-grams that are already part of the vector are counted
        if key in feature_vector:
            feature_vector[key] += 1

    return feature_vector
Пример #2
0
def get_no_of_characters_features(post):
    """
    Compute the four "Number of characters" features of a post.

    ``utils.len_characters`` is applied, in this order, to the values
    returned by ``utils.title``, ``utils.article``, ``utils.description``
    and ``utils.keywords``.

    :param post: the current post
    :return: a list with the four results, in the order given above
    """
    extractors = (utils.title, utils.article, utils.description,
                  utils.keywords)
    return [utils.len_characters(extract(post)) for extract in extractors]
Пример #3
0
def get_slang_words_feature(post):
    """
    Run the slang-word check on two fields of the post.

    The fields are the values returned by ``utils.title(post)`` (treated as
    the post's text) and ``utils.article(post)`` (treated as the article's
    title).

    :param post: the current post
    :return: a two-element list with the result of ``has_slang_words`` for
    each field, in the order given above
    """
    # Extract both fields first, then check each of them in turn
    fields = (utils.title(post), utils.article(post))
    return [has_slang_words(text) for text in fields]
Пример #4
0
def get_common_clickbait_phrases_feature(post):
    """
    Run the common words/phrases check on two fields of the post.

    The fields are the values returned by ``utils.title(post)`` (treated as
    the post's text) and ``utils.article(post)`` (treated as the article's
    title).

    :param post: the current post
    :return: a two-element list with the result of ``has_common_phrases``
    for each field, in the order given above
    """
    # Extract both fields first, then check each of them in turn
    fields = (utils.title(post), utils.article(post))
    return [has_common_phrases(text) for text in fields]
Пример #5
0
def get_ngram_corpus(n, lower_t, upper_t):
    """
    Build the zero-initialised n-gram feature vector of the dataset.

    Every post in 'dataset/instances.jsonl' is read, its n-grams are counted
    and the raw counts are dumped to 'dataset/<n>-gram_frequencies.csv' so
    that the distribution can be plotted afterwards.

    :arg n: The n in n-gram
    :arg lower_t: Lower threshold; n-grams whose count is not strictly greater than it are dropped
    :arg upper_t: Upper threshold; n-grams whose count is greater than or equal to it are dropped

    :return: A dictionary mapping every surviving n-gram to 0
    """
    occurrences = {}  # n-gram -> number of times it was seen in the dataset

    with open('dataset/instances.jsonl', 'rb') as f:
        for post in json_lines.reader(f):
            tokens = utils.article(post).split()

            for gram in ngrams(tokens, n):
                # Strip special characters but keep the case for the POS check
                cleaned = re.sub(r'[^a-zA-Z0-9 ]+', '', " ".join(gram))

                # Skip n-grams for which the NNP count is not zero
                if utils.POS_counts(cleaned)['NNP'] != 0:
                    continue

                key = cleaned.lower()
                occurrences[key] = occurrences.get(key, 0) + 1

    # Keep only the n-grams whose count lies between the two thresholds
    pruned = {}
    for gram, seen in occurrences.items():
        if seen > lower_t and not seen >= upper_t:
            pruned[gram] = 0

    # Persist the unpruned counts for later plotting
    frequencies = pd.DataFrame(occurrences.items(), columns=['gram', 'count'])
    frequencies.to_csv(f"dataset/{n!s}-gram_frequencies.csv", index=False)

    return pruned
Пример #6
0
def get_hyperbolic_words_feature(connection, post):
    """
    Run the hyperbolic-word check on two fields of the post.

    The fields are the values returned by ``utils.title(post)`` (treated as
    the post's text) and ``utils.article(post)`` (treated as the article's
    title). NOTE! This needs the NLP Stanford server to be up and running.

    :param connection: the connection to the stanford local server, passed
    through to ``has_hyperbolic_words``
    :param post: the current post
    :return: a two-element list with the result of ``has_hyperbolic_words``
    for each field, in the order given above
    """
    # Extract both fields first, then check each of them in turn
    fields = (utils.title(post), utils.article(post))
    return [has_hyperbolic_words(connection, text) for text in fields]
Пример #7
0
def get_no_of_characters_ratio_features(post):
    """
    Compute the "Number of characters ratio" features of a post.

    The character lengths of the values returned by ``utils.title``,
    ``utils.article``, ``utils.description`` and ``utils.keywords`` are
    collected in that order and handed to ``get_ratio_features_list``.

    :param post: the current post
    :return: whatever ``get_ratio_features_list`` returns for the four
    lengths (6 features according to the original documentation)
    """
    extractors = (utils.title, utils.article, utils.description,
                  utils.keywords)
    lengths = [utils.len_characters(extract(post)) for extract in extractors]
    return get_ratio_features_list(lengths)
Пример #8
0
def get_diff_between_no_of_characters_features(post):
    """
    Compute the "Difference between number of chars" features of a post.

    The character lengths of the values returned by ``utils.title``,
    ``utils.article``, ``utils.description`` and ``utils.keywords`` are
    collected in that order and handed to ``get_difference_features_list``.

    :param post: the current post
    :return: whatever ``get_difference_features_list`` returns for the four
    lengths (6 features according to the original documentation)
    """
    extractors = (utils.title, utils.article, utils.description,
                  utils.keywords)
    lengths = [utils.len_characters(extract(post)) for extract in extractors]
    return get_difference_features_list(lengths)
Пример #9
0
def get_sentiment_polarity_feature(post):
    """
    Compute the compound sentiment score of two fields of the post.

    The fields are the values returned by ``utils.title(post)`` (treated as
    the post's text) and ``utils.article(post)`` (treated as the article's
    title). A field that is a list is replaced by its first element before
    it is scored.

    :param post: the current post
    :return: a two-element list with the "compound" entry of
    ``analyser.polarity_scores`` for each field, in the order given above
    """
    texts = [utils.title(post), utils.article(post)]

    # A list-valued field is reduced to its first element
    texts = [text[0] if isinstance(text, list) else text for text in texts]

    # Score both texts before picking the compound value out of each result
    all_scores = [analyser.polarity_scores(text) for text in texts]
    return [scores["compound"] for scores in all_scores]
Пример #10
0
def main():
    """
    Extract the features of every post in 'dataset/instances.jsonl' and
    write them to 'dataset/features.csv'.

    For each post the individual feature extractors are called. The header
    row is written with the file opened in 'w' mode, after which ``headers``
    is set to True (presumably the header section is guarded by that flag);
    one row per post is then appended with the file opened in 'a' mode.

    NOTE(review): part of this function appears to be missing from this
    listing - the ``try:`` block is fused with the header string literal -
    so the definitions of ``feature_headers``, ``feature_output`` and the
    POS / n-gram variables used below are not visible here.
    """
    # Creating label dictionary
    labels = utils.get_label_dict()
    with open('dataset/instances.jsonl', 'rb') as f:
        headers = False
        count = 0  # elements processed
        for post in json_lines.reader(f):
            count += 1
            print('Sample', count)
            # Reading post/article elements
            post_id = utils.post_id(post)
            post_title = utils.title(post)
            article_title = utils.article(post)
            # Extracting sample label
            post_label = labels[post_id]
            # Presence of image in a post
            has_image = imf.image_presence(post)
            # Number of characters
            len_chars_post_title, len_chars_article_title, len_chars_article_desc, len_chars_article_keywords = \
                laf.get_no_of_characters_features(post)
            # Difference between number of characters
            diff_chars_post_title_article_title, diff_chars_post_title_article_desc, diff_chars_post_title_article_keywords, \
            diff_chars_article_title_article_desc, diff_chars_article_title_article_keywords, diff_chars_article_desc_article_keywords = \
                laf.get_diff_between_no_of_characters_features(post)
            # Number of characters ratio
            ratio_chars_post_title_article_title, ratio_chars_post_title_article_desc, ratio_chars_post_title_article_keywords, \
            ratio_chars_article_title_article_desc, ratio_chars_article_title_article_keywords, ratio_chars_article_desc_article_keywords = \
                laf.get_no_of_characters_ratio_features(post)
            # Number of Words
            # NOTE(review): the results are stored as word counts, but the call
            # below is get_no_of_characters_features (same as "Number of
            # characters" above) - looks like a copy-paste slip; confirm
            # whether a word-count extractor was intended.
            len_words_post_title, len_words_article_title, len_words_article_desc, len_words_article_keywords = \
                laf.get_no_of_characters_features(post)
            # Difference between number of words
            diff_words_post_title_article_title, diff_words_post_title_article_desc, diff_words_post_title_article_keywords, \
            diff_words_article_title_article_desc, diff_words_article_title_article_keywords, diff_words_article_desc_article_keywords = \
                laf.get_diff_between_no_of_words_features(post)
            # Number of words ratio
            ratio_words_post_title_article_title, ratio_words_post_title_article_desc, ratio_words_post_title_article_keywords, \
            ratio_words_article_title_article_desc, ratio_words_article_title_article_keywords, ratio_words_article_desc_article_keywords = \
                laf.get_no_of_words_ratio_features(post)
            # Post creation hour
            post_creation_hour = adf.get_post_creation_hour(post)
            # Number of signs
            post_title_no_signs = adf.get_no_signs(post_title)
            # Number of hashtags
            post_title_no_hashtags = adf.get_no_hashtags(post_title)
            # Number of exclamations
            post_title_no_exclamations = adf.get_no_exclamations(post_title)
            article_title_no_exclamations = adf.get_no_exclamations(
                article_title)
            # Number of question marks
            post_title_no_questionmarks = adf.get_no_question_marks(post_title)
            article_title_no_questionmarks = adf.get_no_question_marks(
                article_title)
            # Number of abbreviations
            post_title_no_abbreviations = adf.get_no_abbreviations(post_title)
            article_title_no_abbreviations = adf.get_no_abbreviations(
                article_title)
            # Number of ellipses
            post_title_no_ellipses = adf.get_no_ellipses(post_title)
            article_title_no_ellipses = adf.get_no_ellipses(article_title)
            # Number of dots
            post_title_no_dots = adf.get_no_dots(post_title)
            article_title_no_dots = adf.get_no_dots(article_title)
            # Begins with interrogative
            post_title_begins_with_interrogative = adf.get_begins_with_interrogative(
                post_title)
            article_title_begins_with_interrogative = adf.get_begins_with_interrogative(
                article_title)
            # Begins with number
            post_title_begins_with_number = adf.get_begins_with_number(
                post_title)
            article_title_begins_with_number = adf.get_begins_with_number(
                article_title)
            # Contains determiners and possessives
            post_title_determiners, post_title_possessives = laf.get_det_poses(
                post_title)
            article_title_determiners, article_title_possessives = laf.get_det_poses(
                article_title)
            # Contains hyperbolic words
            # NOTE(review): the listing is truncated/redacted from here on; the
            # statement below runs straight into the CSV header string.
            try:
                nlp = StanfordCoreNLP('http://*****:*****@,Post_Title_No_#,' \
                                  'Post_Title_No_Exclam,Article_Title_No_Exclam,Post_Title_No_Question,Article_Title_No_Question,Post_Title_No_Abbrev,' \
                                  'Article_Title_No_Abbrev,Post_Title_No_Ellipses,Article_Title_No_Ellipses,Post_Title_No_Dots,Article_Title_No_Dots'
                for key, value in counts_post_title_POS.items():
                    feature_headers += ',Post_Title_' + key
                for key, value in counts_article_title_POS.items():
                    feature_headers += ',Article_Title_' + key
                feature_headers += ',Post_Title_NNPV,Post_Title_NNPT'
                feature_headers += ',Article_Title_NNPV,Article_Title_NNPT'
                for key, value in unigrams.items():
                    feature_headers += ',' + key
                for key, value in bigrams.items():
                    feature_headers += ',' + key
                for key, value in trigrams.items():
                    feature_headers += ',' + key
                # Writing file headlines
                with open('dataset/features.csv',
                          encoding='utf8',
                          mode='w',
                          newline='') as features_file:
                    features_writer = csv.writer(features_file,
                                                 delimiter=',',
                                                 quotechar='"',
                                                 quoting=csv.QUOTE_MINIMAL)
                    # NOTE(review): feature_headers is one comma-joined string
                    # passed as a single field, so the csv writer will
                    # presumably quote the whole header line - confirm this is
                    # the intended output format.
                    features_writer.writerow([feature_headers])
                headers = True
            # Append the row of the current post to the features file
            with open('dataset/features.csv',
                      encoding='utf8',
                      mode='a',
                      newline='') as features_file:
                features_writer = csv.writer(features_file,
                                             delimiter=',',
                                             quotechar='"',
                                             quoting=csv.QUOTE_MINIMAL)
                # NOTE(review): feature_output is likewise written as a single
                # field; its construction is not visible in this listing.
                features_writer.writerow([feature_output])