Example #1
def count_offensive_words(docs, pickle_path_pattern=None):
    """Count the number of offensive words in the documents.
    This function counts the number of occurrences of all the expressions inside the Flame Dictionary.
    If the pickled results of the function (corresponding to the same input) already exist, the function is
    bypassed. If not, the results are pickled once the function is done.
    Args:
        - docs:                (list: strings) List of documents. Each element represents an author and contains one string.
        - pickle_path_pattern: (string) The path pattern for the pickle. This needs to include “<HASH>”, which will be
          replaced with the hash of the input of the function. Refer to the docstring of the *generate_pickle_path* function.
    Returns:
        - counts_of_expressions_dict: A Python dictionary
            • Keys:   (int)         Flame level
            • Values: (NumPy array) Counts of occurrences of expressions in that Flame level. Each row
                represents an author, and each column represents an expression in the Flame level of the key.
    Note: List of expressions can be accessed by calling *ProcessDataFiles.load_flame_dictionary*.
    """

    pickle_path = generate_pickle_path(docs, pickle_path_pattern)

    # Bypass: If the pickled results already exist, load (unpickle) and return them and skip the rest of the function
    if (pickle_path is not None) and os.path.isfile(pickle_path):
        with open(pickle_path, 'rb') as pickle_input_file:
            unpickled_object = pickle.load(pickle_input_file)
        print('Function bypassed: The counts of offensive words were loaded from pickle "%s" instead.'
              % pickle_path)
        return unpickled_object

    # Load the Flame Dictionary
    # %% TODO: Prevent loading the dictionary every time...
    flame_dictionary, flame_expressions_dict = process_data_files.load_flame_dictionary()
    '''
    ↳
    *flame_dictionary*
        Keys:   (string) Expression
        Values: (int)    Flame level
    *flame_expressions_dict*
        Keys:   (int)           Flame level
        Values: (list: strings) Expressions
    '''
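    # ↳ Hypothetical illustration of the two structures (the real expressions and levels come from the Flame
    # Dictionary file, not from this sketch):
    #     flame_dictionary:       {"some mild expression": 1, ..., "some highly offensive expression": 5}
    #     flame_expressions_dict: {1: ["some mild expression", ...], ..., 5: ["some highly offensive expression", ...]}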

    # keys_dict_view = flame_dictionary.keys()
    # expressions = list(keys_dict_view)

    # Preprocess the merged tweets of authors
    preprocessed_docs = []  # Create an empty list
    for author_index, doc in enumerate(docs):
        preprocessed_docs.append(preprocess_tweet(doc))

        # Log after preprocessing the merged tweets of every 200 authors
        if author_index % 200 == 0:
            logger.debug(
                "@ %.2f seconds, progress: Preprocessed the tweets of author_index = %d",
                time.process_time(), author_index)
    print("@ %.2f seconds: Finished preprocessing the tweets in *count_offensive_words()*"
          % time.process_time())

    # Create a dictionary of five NumPy arrays full of zeros
    counts_of_expressions_dict = {}  # Create an empty dictionary
    for flame_index in range(1, 6):
        counts_of_expressions_dict[flame_index] = np.zeros(
            (len(preprocessed_docs), len(flame_expressions_dict[flame_index])),
            dtype=int)

    # Compile regex patterns into regex objects for all expressions, and store them in five separate lists, based on
    # Flame level (similar to *flame_expressions_dict*).
    '''
    - Most regex operations are available as module-level functions as well as methods on compiled
    regular expressions. The functions are shortcuts that don’t require you to compile a regex object first,
    but miss some fine-tuning parameters.
    - Compiling a regex pattern and storing the resulting regex object for reuse is more efficient when the
    expression will be used several times in a single program. Even though the most recent patterns passed to
    re.compile() and the module-level matching functions are cached, the size of this cache is limited.
    More info: https://docs.python.org/3/library/re.html#re.compile
    Here, we are dealing with 2,600+ expressions, so the built-in cache cannot help. Storing the regex objects
    decreased the processing time of each author from 1.6 seconds to 0.7 seconds (on my machine).
    '''
    '''
    - In Python code, Regular Expressions will often be written using the raw string notation (r"text").
    Without it, every backslash in a regular expression would have to be prefixed with another one to escape it.
    - The shorthand \b matches a word boundary, without consuming any characters. Word boundary characters
    include space, . ! " ' - * and many more.
    - Some examples of matches of the /\bWORD\b/ pattern: WORD's, prefix-WORD, WORD-suffix, "WORD".
    %% TODO: To increase the performance of regex:
        1. You can combine the patterns using | for all expressions of the same level of Flame.
        https://stackoverflow.com/questions/1782586/speed-of-many-regular-expressions-in-python#comment1669596_1782712
        2. You can first use str.find to find potential matches, and then check those matches with regex. 
    '''
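    '''
    A minimal sketch of optimization idea 1 above, assuming *flame_index* and *merged_tweets_of_author* as in the
    loops below (illustration only, not used by this function). Note that re.escape() guards against regex
    metacharacters inside expressions, and that a combined pattern yields one total count per Flame level rather
    than the per-expression counts this function returns:
        combined_pattern = r'\b(?:' + '|'.join(
            re.escape(expression) for expression in flame_expressions_dict[flame_index]) + r')\b'
        combined_regex_object = re.compile(combined_pattern, re.IGNORECASE)
        total_count_for_level = len(combined_regex_object.findall(merged_tweets_of_author))
    '''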
    # Create a dictionary of 5 empty lists, one per Flame level
    regex_objects_dict = {1: [], 2: [], 3: [], 4: [], 5: []}
    for flame_index in range(1, 6):
        for expression in flame_expressions_dict[flame_index]:
            regex_pattern = r'\b' + expression + r'\b'
            regex_object = re.compile(regex_pattern, re.IGNORECASE)
            regex_objects_dict[flame_index].append(regex_object)
    print("@ %.2f seconds: Finished compiling the regex patterns into regex objects."
          % time.process_time())

    # Count the matches of each expression for each author
    for author_index, merged_tweets_of_author in enumerate(preprocessed_docs):
        for flame_index in range(1, 6):
            for expression_index in range(
                    len(flame_expressions_dict[flame_index])):
                # ↳ Note: We are assuming that the lists inside *flame_expressions_dict* have not been manipulated since
                # the lists inside *regex_objects_dict* were created.
                list_of_matches = regex_objects_dict[flame_index][
                    expression_index].findall(merged_tweets_of_author)
                count = len(list_of_matches)
                # count = merged_tweets_of_author.count(expression)
                counts_of_expressions_dict[flame_index][
                    author_index, expression_index] = count

        # Log after counting the offensive words for every 100 authors
        if author_index % 100 == 0:
            logger.debug(
                "@ %.2f seconds, progress: Counted (regex) the offensive words for author_index = %d",
                time.process_time(), author_index)

    print("@ %.2f seconds: Finished counting the occurrences of offensive words"
          % time.process_time())

    # Pickle the output variable
    if pickle_path is not None:
        # Create the directory if it does not exist.
        os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
        # Pickle
        with open(pickle_path, 'wb') as pickle_output_file:
            pickle.dump(counts_of_expressions_dict, pickle_output_file)
        print('The counts of offensive words were pickled to: "%s"' % pickle_path)

    return counts_of_expressions_dict
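
A minimal usage sketch (hypothetical: the documents and the demo pickle path below are made up; *generate_pickle_path* and the Flame Dictionary loader are assumed to be available in this module):

    docs = [
        "merged tweets of the first author ...",
        "merged tweets of the second author ...",
    ]
    counts_dict = count_offensive_words(
        docs, "pickles/counts_of_offensive_words_dict_demo, <HASH>.pickle")
    for flame_level, counts_array in counts_dict.items():
        # One row per author, one column per expression in that Flame level
        print(flame_level, counts_array.shape)
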
Example #2
def extract_features_offensive_words(docs_train, docs_test):
    """Extract offensive words features
    This function performs the following tasks for the training and test datasets:
        1. Gets the counts of offensive words from the *count_offensive_words()* function.
        2. Concatenates the count arrays for the desired Flame levels into X_train and X_test.
        3. Transforms the count matrix (NumPy array) to a normalized TF or TF-IDF representation.
        4. Performs dimensionality reduction on the normalized matrix using truncated SVD (aka LSA).
    Moreover, the function collects and returns the feature names (offensive expressions) for the desired Flame
    levels in the following format: “expression (flame level)”
    Important constants:
    DESIRED_FLAME_LEVELS: (Tuple: Ints) Desired flame levels. You can select any of the levels: 1, 2, 3, 4, and 5.
    """

    # Count the number of occurrences of all the offensive expressions in the training set and test set
    counts_of_offensive_words_dict_train = count_offensive_words(
        docs_train,
        "pickles/counts_of_offensive_words_dict_train, <HASH>.pickle")
    counts_of_offensive_words_dict_test = count_offensive_words(
        docs_test,
        "pickles/counts_of_offensive_words_dict_test, <HASH>.pickle")

    # Load the Flame Dictionary (to produce the list of feature names)
    flame_dictionary, flame_expressions_dict = process_data_files.load_flame_dictionary()
    '''
    ↳
    *flame_dictionary*
        Keys:   (string) Expression
        Values: (int)    Flame level
    *flame_expressions_dict*
        Keys:   (int)           Flame level
        Values: (list: strings) Expressions
    '''

    # Log the min, max, and shape of the offensive words count arrays (just to make sure the pickles were loaded
    # correctly).
    for flame_index in range(1, 6):
        array = counts_of_offensive_words_dict_train[flame_index]
        logger.debug("Flame level %d (train): min = %d | max = %-3d | shape = %s",
                     flame_index, array.min(), array.max(), array.shape)
    for flame_index in range(1, 6):
        array = counts_of_offensive_words_dict_test[flame_index]
        logger.debug("Flame level %d (test): min = %d | max = %-3d | shape = %s",
                     flame_index, array.min(), array.max(), array.shape)

    # Create empty lists
    arrays_list_train = []
    arrays_list_test = []
    feature_names_offensive_words = []

    # Concatenate the counts NumPy arrays and the feature names for the desired Flame levels
    DESIRED_FLAME_LEVELS = (1, 2, 3, 4, 5)
    for flame_index in DESIRED_FLAME_LEVELS:
        arrays_list_train.append(
            counts_of_offensive_words_dict_train[flame_index])
        arrays_list_test.append(
            counts_of_offensive_words_dict_test[flame_index])
        # Add the expressions to the list of feature names in the form: “expression (flame level)”
        for expression in flame_expressions_dict[flame_index]:
            feature_names_offensive_words.append("{} ({})".format(
                expression, flame_index))
    X_train_offensive_words_counts = np.concatenate(arrays_list_train, axis=1)
    X_test_offensive_words_counts = np.concatenate(arrays_list_test, axis=1)

    # • Transform the count matrix (NumPy array) to a normalized TF or TF-IDF representation
    # Build a TF-IDF transformer object
    tfidf_transformer = TfidfTransformer(norm='l2',
                                         use_idf=False,
                                         sublinear_tf=False)
    # ↳ With norm=None, use_idf=False, and sublinear_tf=False, the transformer would make no changes at all;
    # here, norm='l2' normalizes each row (author) to unit L2 norm.
    '''
    ↳ With normalization, each row (= author) is scaled so that its norm equals 1.
    L^1-norm: Sum of the absolute values of the numbers (here, the TF or TF-IDF values of the offensive expressions)
    L^2-norm: Square root of the sum of the squares of the numbers
    More info: http://www.chioka.in/differences-between-l1-and-l2-as-loss-function-and-regularization/
    '''
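    # ↳ Worked example (hypothetical counts): with norm='l2', a count row [3, 4] is scaled by
    # 1 / sqrt(3**2 + 4**2) = 1/5, giving [0.6, 0.8].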
    # Fit and transform
    X_train_offensive_words_tfidf = tfidf_transformer.fit_transform(
        X_train_offensive_words_counts)
    X_test_offensive_words_tfidf = tfidf_transformer.transform(
        X_test_offensive_words_counts)

    # • Dimensionality reduction using truncated SVD (aka LSA)
    # Build a truncated SVD (LSA) transformer object
    svd_offensive_words = TruncatedSVD(n_components=10, random_state=42)
    # Fit the LSI model and perform dimensionality reduction
    x_train_offensive_words_tfidf_reduced = svd_offensive_words.fit_transform(
        X_train_offensive_words_tfidf)
    print("@ %.2f seconds: Finished dimensionality reduction (LSA) in *extract_features_offensive_words()* "
          "on the training dataset" % time.process_time())
    x_test_offensive_words_tfidf_reduced = svd_offensive_words.transform(
        X_test_offensive_words_tfidf)
    print("@ %.2f seconds: Finished dimensionality reduction (LSA) in *extract_features_offensive_words()* "
          "on the test dataset" % time.process_time())

    return x_train_offensive_words_tfidf_reduced, x_test_offensive_words_tfidf_reduced, feature_names_offensive_words
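
A minimal usage sketch (hypothetical: the documents are made up; in a real run the training set should have at least as many authors as SVD components, i.e. 10 or more):

    docs_train = ["merged tweets of training author %d ..." % i for i in range(20)]
    docs_test = ["merged tweets of a held-out test author ..."]
    X_train, X_test, feature_names = extract_features_offensive_words(docs_train, docs_test)
    # X_train has one row per training author and 10 columns (the SVD components);
    # feature_names holds the offensive expressions as "expression (flame level)" strings.
    print(X_train.shape, X_test.shape, len(feature_names))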