Example #1
    def clean_text(document):

        stop_words_ = STOP_WORDS.union(stopwords.words('english'))
        stop_words = [unidecode(stop).lower() for stop in stop_words_]
        # Split and re-join to collapse whitespace
        tokens = document.split()
        document = ' '.join(tokens)
        # Remove accents
        document = unidecode(document)
        # Remove https, mentions, special characters, single character
        document = re.sub(
            r"(@[A-Za-z0-9]+)|(_[A-Za-z0-9]+)|(\w+:\/\/\S+)|(\W_)", " ",
            document).lower()
        # Remove punctuation
        document = re.sub('[' + string.punctuation + ']', '', document)
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)
        # Remove digits
        document = ''.join([i for i in document if not i.isdigit()])
        # Remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
        # Split
        tokens = document.split()
        # Stopwords
        tokens = [w for w in tokens if w not in stop_words]
        # Concatenate
        preprocessed_text = ' '.join(tokens)

        return preprocessed_text
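A minimal, self-contained sketch of the imports this snippet appears to rely on (the exact import locations are assumptions based on the names used: spaCy's STOP_WORDS, NLTK's stopwords corpus, unidecode, re and string), plus a hypothetical call to the function above:

import re
import string
from unidecode import unidecode
from nltk.corpus import stopwords          # requires nltk.download('stopwords') once
from spacy.lang.en.stop_words import STOP_WORDS

# hypothetical input; URL, mention, digits, punctuation and stop words are stripped
print(clean_text("Check https://example.com - it's 100% great, @user!"))  # -> 'check great'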
Example #2
    def _set_stopwords(self) -> 'KeywordRanking':
        stop_words = STOP_WORDS.union(self.stopwords) if self.stopwords else STOP_WORDS

        for word in stop_words:
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

        return self
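A small, self-contained illustration of the same lexeme-marking pattern, assuming a blank English pipeline; "graphql" is just an arbitrary extra stop word used for the demonstration:

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.blank("en")
for word in STOP_WORDS.union({"graphql"}):
    nlp.vocab[word].is_stop = True          # mark the lexeme as a stop word

doc = nlp("graphql is a query language")
print([t.text for t in doc if not t.is_stop])   # ['query', 'language']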
Example #3
def cluster_commonwords(texts, nwords=10, onlycorona="yes"):
    ignore_words = STOP_WORDS if onlycorona == "no" else STOP_WORDS.union(
        ['coronavirus', 'covid', 'covid19', 'covid-19'])
    allwords = [
        w for w in ' '.join(texts.str.lower()).split()
        if w not in ignore_words and re.search('[a-z]', w)
    ]
    return ', '.join(
        [word for word, cnt in Counter(allwords).most_common(nwords)])
Example #4
 def set_stopwords(self, stopwords):
     """Set stop words"""
     if self.language == "en":
         for word in STOP_WORDS.union(set(stopwords)):
             lexeme = self.nlp.vocab[word]
             lexeme.is_stop = True
     elif self.language == "de":
         for word in STOP_WORDS_DE.union(set(stopwords)):
             lexeme = self.nlp.vocab[word]
             lexeme.is_stop = True
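STOP_WORDS_DE is not a spaCy built-in name; most likely it is an alias for the German stop-word set, along these (assumed) lines:

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.de.stop_words import STOP_WORDS as STOP_WORDS_DE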
Example #5
def transform_matrix(content_body, tokenize_lemma):
    html_stop_words = get_html_stop_words(content_body, tokenize_lemma)
    stop_words_lemma_train = set(
        tokenize_lemma(' '.join(STOP_WORDS.union(set(html_stop_words)))))
    X = content_body
    tfidf_vectorizer = TfidfVectorizer(max_features=300,
                                       stop_words=stop_words_lemma_train,
                                       tokenizer=tokenize_lemma)
    tfidf_vectorizer = tfidf_vectorizer.fit(X)
    tfidf_matrix = tfidf_vectorizer.transform(X)
    return tfidf_matrix
Example #6
def remove_stopwords(content):
    custom_stopwords = ("feeling", "feel", "becaus", "want", "time", "realli",
                        "im", "think", "thing", "ive", "still", "littl", "one",
                        "life", "peopl", "need", "bit", "even", "much", "dont",
                        "look", "way", "love", "start", "s", "m", "quot",
                        "work", "get", "http", "go", "day", "com", "got", "see"
                        "4pm", "<BIAS>", "veri", "know", "t", "like", "someth",
                        "good", "going", "today", "u", "new", "cant", "people",
                        "little", "pretty", "things")
    return hero.remove_stopwords(content,
                                 spacy_stop_words.union(custom_stopwords))
Example #7
def scripts_to_tfidf(scripts):
    """Create Tfidf matrix from tokenized scripts."""
    # custom stop words for scripts
    film_stop_words = ['V.O.', "Scene", "CUT TO", "FADE IN"]
    stop_words = STOP_WORDS.union(film_stop_words)

    # vectorize scripts into Tfidf matrix
    vectorizer = TfidfVectorizer(input='content', stop_words=stop_words, min_df=0.2,
                                 ngram_range=(1, 2))  # terms appearing in fewer than 20% of scripts are dropped
    bow = vectorizer.fit_transform(scripts)
    vocab = vectorizer.get_feature_names()

    return bow, vocab
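This snippet targets an older scikit-learn API: get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out(), and the stop_words parameter is documented as a list, so recent releases may reject a plain set. A hedged adaptation for current versions (the toy scripts below are illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS

film_stop_words = ['V.O.', "Scene", "CUT TO", "FADE IN"]
stop_words = list(STOP_WORDS.union(film_stop_words))   # list, not set, for newer sklearn

scripts = ["FADE IN. A dark room. He waits.", "CUT TO: The same room, years later."]
vectorizer = TfidfVectorizer(input='content', stop_words=stop_words, min_df=0.2,
                             ngram_range=(1, 2))
bow = vectorizer.fit_transform(scripts)
vocab = vectorizer.get_feature_names_out()              # replaces get_feature_names()
print(bow.shape, len(vocab))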
Example #8
def stage2(process_folder, label):

    path_stage1 = process_folder + label + 'stage1.json'

    from spacy.lang.en.stop_words import STOP_WORDS

    path_stage2 = process_folder + label + 'stage2.json'

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(
                path_stage1, ranks, stopwords=STOP_WORDS.union(STOP_WORDS)):
            f.write("%s\n" % pretty_print(rl._asdict()))
Example #9
def get_baseline():
    print('Spacy:', len(STOP_WORDS))
    sw_sk = set(stop_words.ENGLISH_STOP_WORDS)
    print('sklearn', len(sw_sk))

    sw = set(
        pd.read_csv(
            'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
            header=None,
            squeeze=True).tolist())

    print('web', len(sw))

    all = STOP_WORDS.union(sw).union(sw_sk)
    print('all', len(all))
    pd.Series(sorted(list(all))).to_csv('baseline.csv', index=False)
Example #10
import re
import cloudpickle
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import os

nlp = English()
mystops = STOP_WORDS.union({
    'strain', 'strains', 'effect', 'effects', 'flavor', 'flavors', 'bud',
    'buds', ' ', '  ', '$', 'user', 'users', 'produce', 'produces', 'showing',
    'start', 'started', 'price', 'refers', 'packs', 'tends', 'stem', 'stems',
    'report', 'supposedly', 'breed', 'bred', 'seed', 'seeds', 'intermittent',
    'week', 'combine', 'combines', 'containing', '\xa0', 'smell', 'give',
    'gives', 'explanation', 'call', 'calls', 'match', 'matches', 'making',
    'tend', 'lineage', 'probably', 'especially', 'utilizing', 'offer',
    'offers', 'technique', 'techniques', 'like', 'including'
})


def token_str(s):
    if not isinstance(s, str):
        return list()

    s = s.lower()
    s = re.sub(r'[\.,!?\\\-\$_]', ' ', s)
    s = re.sub(r' +', ' ', s)
    s = s.strip()

    if s == 'None' or s == '':
        return list()
    return [
Example #11
 def set_stopwords(self, stopwords):
     """Set stop words"""
     for word in STOP_WORDS.union(set(stopwords)):
         lexeme = nlp.vocab[word]
         lexeme.is_stop = True
Example #12
    def __init__(self,
                 num_distinct_documents=5000,
                 replace_entities=True,
                 max_term_length=127,
                 remove_stopwords=True,
                 custom_stopwords=[
                     ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?',
                     'I', '(', ')'
                 ],
                 analyze=False,
                 document_table_name="documents",
                 sentence_table_name="sentences",
                 sentence_fields=OrderedDict({
                     "doc_id": "document_id",
                     "sen_id": "sentence_id",
                     "content": "sentence_text"
                 }),
                 term_table_name="terms",
                 term_sql_format=("term_id", "term_text", "is_entity"),
                 term_occurrence_table_name="term_occurrence",
                 term_occurrence_sql_format=("document_id", "sentence_id",
                                             "term_id"),
                 entity_table_name="entities",
                 entity_sql_format=("entity_id", "entity_type"),
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/TermGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes various parameters, registers logger and MongoConnector, and sets up the limit.
        :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries.
               For performance reasons, this should be limited during debugging/development.
               0 (Zero) represents no limit, in accordance with the MongoDB standard for .limit().
        :param replace_entities: (boolean) Whether or not the entities in the text should be replaced/recognised.
               The reason for this is that single terms might be merged together to one term, i.e. first and last name:
               "Dennis" "Aumiller" would be two separate terms in the traditional splitting (replace_entities=False),
               whereas - if set to true - "Dennis Aumiller" would represent only one entity.
        :param max_term_length: (int) Indicator of how long the terms are supposed to be (varchar property in table).
        :param remove_stopwords: (boolean) Determines whether or not stop words are removed. Currently, we are still
               deciding on the final set, but likely either one (or both) of NLTK and SpaCy's stop word lists.
        :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time.
        :param analyze: (boolean) Whether or not to include analytically relevant metrics.
        :param document_table_name: (str) Name of the table where the document information is stored.
        :param sentence_table_name: (str) Name of the table where the sentence information will be stored.
        :param sentence_fields: (OrderedDict) Structure of input to output values from MongoDB to postgres for the
               sentence table and its fields.
        :param term_table_name: (str) Name of the Postgres tables for the terms.
        :param term_sql_format: (tuple) Since those are generated locally, only a tuple of the PostgresColumns suffices.
        :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences
        :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences.
        :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information.
        :param entity_sql_format: (str) Same as term_sql_format, but for entities.
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        """
        # set up logger
        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info("Successfully registered logger to TermGenerator.")

        # register a MongoConnector
        self.mc = MongoConnector()
        self.logger.info(
            "Successfully registered MongoConnector to TermGenerator.")

        # PostgresConnector
        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to DocumentGenerator.")

        self.num_distinct_documents = num_distinct_documents
        # do this earlier since we need it already for the distinct documents.
        self.document_table_name = document_table_name
        # get the distinct IDs for the documents so we can match against them later
        # since we have removed parts of the document collection, we have to make sure to get this from Postgres.
        self.logger.info("Parsing relevant documents from Postgres...")
        with self.pc as open_pc:
            open_pc.cursor.execute("SELECT document_id FROM {}".format(
                self.document_table_name))
            self.first_distinct_documents = list(open_pc.cursor.fetchall())
            # extract from the tuple structure
            self.first_distinct_documents = [
                el[0] for el in self.first_distinct_documents
            ]
            self.logger.info("Retrieved all relevant documents from Postgres.")

        # additionally restrict if we want only a number of documents.
        if self.num_distinct_documents != 0:
            self.logger.info(
                "Non-zero limit detected. Limiting to the first N entries.")
            self.first_distinct_documents = \
                self.first_distinct_documents[:self.num_distinct_documents]

        self.replace_entities = replace_entities
        self.analyze = analyze

        self.max_term_length = max_term_length

        self.nlp = spacy.load("en")

        # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether
        # there are any entities in the current sentence with higher efficiency.
        self.occurrence_dict = {}
        self.occurring_entities = []

        # start building the term dictionary/set, as well as an occurrence map. Since terms will be "post-processed",
        # it is first created as a list and later cast to Counter and set.
        self.terms = []  # cast into a set later on.
        self.term_in_sentence = set()
        self.term_id = {}
        self.term_is_entity = {}
        if self.analyze:
            self.term_count = Counter()
            self.entity_count = Counter()

        self.entities = []
        self.sentences = []
        self.processed_sentences = []

        # Postgres tables
        if not sentence_fields:
            self.logger.error("No sentence fields specified!")
        self.sentence_table_name = sentence_table_name
        self.sentence_fields = sentence_fields
        if not term_sql_format:
            self.logger.error("No term fields specified!")
        self.term_table_name = term_table_name
        self.term_sql_format = ", ".join(term_sql_format)
        if not term_occurrence_sql_format:
            self.logger.error("No term occurrence fields specified!")
        self.term_occurrence_table_name = term_occurrence_table_name
        self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)
        if not entity_sql_format:
            self.logger.error("No entity fields specified!")
        self.entity_table_name = entity_table_name
        self.entity_sql_format = ", ".join(entity_sql_format)

        # value retrieving parse:
        self.sentence_values_to_retrieve = {
            key: 1
            for key in self.sentence_fields.keys()
        }
        # suppress _id if not present:
        if "_id" not in self.sentence_values_to_retrieve.keys():
            self.sentence_values_to_retrieve["_id"] = 0
        self.sentence_sql_format = ", ".join(
            [value for value in self.sentence_fields.values()])

        # create union of stop words, and add potentially custom stop words
        self.remove_stopwords = remove_stopwords
        self.removed_counter = 0
        self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
        # add custom stopwords.
        for word in custom_stopwords:
            self.stopwords.add(word)

        self.logger.info("Successfully initialized TermGenerator.")
Example #13
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import re
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import os
import json
from pathlib import Path

nlp = spacy.load('en_core_web_lg')

with open('stopwords.txt', 'r', encoding='utf-8') as f:
    STOPWORDS = f.readlines()
    STOPWORDS = set([item.strip(string.whitespace) for item in STOPWORDS])
    STOP_WORDS = STOP_WORDS.union(STOPWORDS)

# encodings:
replace_dict = {
    '\ufb01': 'fi',
    '\u2019': '',
    '\u00e9': 'e',
    '\u00a8': '',
    'ямБ': 'fi',  # mojibake form of the 'fi' ligature
}

documents = []  #  [ [token, token, token], [token, token, token], ...]

fp = '../data/LRECjson/'
for jsonfile in os.listdir(Path(fp)):
    #for jsonfile in ['../data/LRECjson/2018_1049.json']:
Example #14
import numpy as np
# from sklearn.externals.six import StringIO
# from sklearn.tree import export_graphviz
# import pydotplus
# from IPython.display import Image
from spellchecker import SpellChecker
import pickle

# using SMOTE to deal with class imbalance
#from imblearn.over_sampling import SMOTE
# these symbols seem to get higher weight in the final words when Naive Bayes is used,
# so adding them to the punctuation list to filter them out
punctuations = string.punctuation + "".join(
    ["...", "..........", "....", "--", "/"])
nlp = spacy.load("en_core_web_sm")
STOP_WORDS = STOP_WORDS.union(CUSTOM_STOP_WORDS)

#excluding NO from stopwords for our use
#STOP_WORDS.discard("no")
#STOP_WORDS.discard("not")
#STOP_WORDS.discard("off")

Urban_vocab = pd.read_csv("urbandict-word-def.csv")
Urban_vocab = Urban_vocab["WORD"].tolist()
# contraction_log = open("1_contractions.log", "w")
# slang_log = open("1_slang.log", "w")
out_vocab = open("1_o_vocab.log", "w")
corpus_vocab = open("1_corpus_vocab.log", "w")

parser = English()
p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.SMILEY, p.OPT.NUMBER,
Example #15
def get_stopwords():
    custom_stopwords = [
        'tarun', 'tathak', '*****@*****.**', '\\r', '\\n'
    ]
    stopwords = STOP_WORDS.union(set(punctuation)).union(set(custom_stopwords))
    return [x.lower() for x in list(stopwords)]
Example #16
textrank = tr.TextRank()
normalizer = lnormalizer.Lineal_Normalizer()

pos_list = ['NOUN', 'PROPN', 'ADJ', 'VERB', 'INTJ']
stopword_list = [
    "ANTECEDENTES", "ANTECEDENTE", "Antecedentes", "Antecedente", "OBJETIVOS",
    "OBJETIVO", "Objetivo", "Objetivos", "RESULTADOS", "RESULTADO",
    "Resultado", "Resultados", "MÉTODOS", "METODO", "Método", "Métodos",
    "CONCLUSIONES", "CONCLUSION", "Conclusiones", "Conclusion", "EVALUACIÓN",
    "evaluación"
    "ANTECEDENTES/OBJETIVO", "INTRODUCCIÓN", "Introduccion", "RESUMEN",
    "Resumen", "estudio", "año"
]
"""Set stop words"""
for word in STOP_WORDS.union(set(stopword_list)):
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True


def get_tokens(text, entity_map={}, lower=False):
    doc = nlp(text)
    sentences = []
    current_offset = 0

    #general entities
    for entity in doc.ents:
        if (":" in entity.text):
            continue
        entity_map[entity.start_char] = entity.text.replace(" ", "_")
Example #17
def make_alphabetic(text):
    """
    A helper function to remove numbers and punctuation before passing the
    data to my preprocessing pipeline
    """
    text = re.sub(r'[^A-Za-z\s]', '', text)
    return text.lower()


nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])  # n_process belongs to nlp.pipe, not spacy.load

custom_stopwords = ['bra', ' bra', 'bra ', 'bras', 'sport', 'sports',
    'a','aa', 'ab', 'b','c','cb','bc','d','dc','cd','dd','ddd', 
    'dddd', 'e', 'ee', 'f', 'ff', 'g', 'gg', 'h', 'hh', 'l', 'm', 'n', 'o', 'p',
    'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'x', 's', 'm', 'xs', 'l', 'xl',
    'xxl', 'lbs' , 'lb', '', ' ', '  ', '\n', '-PRON-', '\ufeff1'
    ]

stopwords_list=STOP_WORDS.union(custom_stopwords)

def lemmatize_pipe(doc):
    lemma_list = [str(tok.lemma_) for tok in doc
                 if tok.is_alpha and tok.text and tok.lemma_ not in stopwords_list]
    lem_string = " ".join(lemma_list)
    return lem_string.lower()

def preprocess_pipe(texts, batch_size=100):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=batch_size, n_threads=-1):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe
Example #18
 def my_stopwords(self, stopwords):
     for word in STOP_WORDS.union(set(stopwords)):
         lexeme = parser.vocab[word]
         lexeme.is_stop = True
Example #19
    def __init__(self):
        self.additional_stop_words = {"-PRON-"}
        self.stop_words = set(STOP_WORDS.union(self.additional_stop_words))
Example #20
from collections import OrderedDict
import numpy as np
import spacy  # NLP library that analyses text to extract keywords
from spacy.lang.en.stop_words import STOP_WORDS

from keyword_text_analyser.text_analyse_utils import sentence_segment, get_token_pairs, get_vocab, get_matrix

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)

custom_stopwords = ['use']
stopwords = STOP_WORDS.union(set(custom_stopwords))

window_size = 4
candidate_pos = ['NOUN', 'PROPN']


def set_stopwords():
    """Set stop words"""
    for word in stopwords:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True


class TextRank4Keyword:
    """Extract keywords from text """
    def __init__(self):
        # Set stop words
        set_stopwords()
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
Example #21
def load_stop_words(stopwords=""):
    stop_words = []
    for word in STOP_WORDS.union(set(stopwords.split(" "))):
        stop_words.append(word)
    return stop_words
Example #22
def plot_wordcloud(df,
                   name="unigram",
                   labels=None,
                   specific_label=None,
                   sentiments=None,
                   additional_stopwords=None,
                   types=None,
                   figsize=(15, 15),
                   max_words=20,
                   weights=None,
                   make_title=True):
    """
    This function creates wordclouds
    - when labels are passed and a specific label, only a wordcloud for this one 
    cluster is created 
    - when only labels are passed, a wordcloud for each cluster is created 
    - when neither labels nor specific_label is passed a wordcloud for the 
    complete df is created 
    Note that the labels and the sentiments need not have the length of the
    passed df, but can simply correspond to the original reviews

    :param df: the unigram dataframe 
    :param name: the column name where the unigrams are stored 
    :param labels: the cluster labels 
    :param specific_label: A specific label (if only a wordcloud for this
    cluster should be created)
    :param sentiments: sentiments belonging to the reviews  
    :param additional_stopwords: A set (or list) containing additional stopwords 
    that are not already in the spacy stopwords
    :param types: Specify a list with word-types that are allowed in the wordcloud 
    see https://spacy.io/api/annotation; can be e.g. "NOUN" or "VERB" or "ADJ"
    :param figsize: the figsize 
    :param max_words: the maximal number of words that are displayed in one 
    wordcloud 
    :param weights: optional weights used when computing the term frequencies
    :param make_title: whether to include a title 

    :return: figure
    """
    if additional_stopwords is None:
        additional_stopwords = set()

    stopwords = STOP_WORDS.union(additional_stopwords)

    # wordcloud for whole df----------------------------------------------------

    if labels is None and specific_label is None:
        term_frequency = get_term_frequency(
            df=df,
            name=name,
            types=types,
            additional_stopwords=additional_stopwords,
            weights=weights)

        wordcloud = WordCloud(background_color="white",
                              stopwords=stopwords,
                              max_words=max_words).generate_from_frequencies(
                                  frequencies=term_frequency)

        fig, axs = plt.subplots(figsize=figsize)
        title = f"n = {max(df['sentence_id'])}"
        if sentiments is not None:
            average_sentiment = np.round(np.mean(sentiments), 2)
            title = title + f", average sentiment = {average_sentiment}"
        if make_title:
            axs.set_title(title)
        axs.imshow(wordcloud)
        # remove x and y ticks
        axs.set_xticks([])
        axs.set_yticks([])
        return fig

    # wordcloud for a specific label--------------------------------------------

    if specific_label is not None:
        recycled_labels = recycle(labels, df["sentence_id"])
        term_frequency = get_term_frequency(
            df=df[recycled_labels == specific_label],
            name=name,
            types=types,
            additional_stopwords=additional_stopwords,
            weights=weights)

        wordcloud = WordCloud(background_color="white",
                              stopwords=stopwords,
                              max_words=max_words).generate_from_frequencies(
                                  frequencies=term_frequency)

        fig, axs = plt.subplots(figsize=figsize)
        cluster_size = sum(labels == specific_label)
        title = f"Wordcloud for cluster {specific_label}, n = {cluster_size}"
        if sentiments is not None:
            average_sentiment = np.round(
                np.mean(np.asarray(sentiments)[np.asarray(labels) == specific_label]), 2)
            title = title + f", average sentiment = {average_sentiment}"
        if make_title:
            axs.set_title(title)
        axs.imshow(wordcloud)
        # remove x and y ticks
        axs.set_xticks([])
        axs.set_yticks([])
        return fig

    # wordclouds for all clusters-----------------------------------------------

    # term frequency is a list with counter objects, one for each cluster
    term_frequency_list = get_term_frequency(
        df=df,
        name=name,
        types=types,
        labels=labels,
        additional_stopwords=additional_stopwords,
        weights=weights)

    def do_wc(term_frequency):
        wordcloud = WordCloud(background_color="white",
                              max_words=max_words).generate_from_frequencies(
                                  frequencies=term_frequency)

        return wordcloud

    # create wordcloud for each cluster
    wordclouds = [do_wc(term_frequency_list[i]) for i in \
                  range(len(term_frequency_list))]

    label_list = list(set(labels))
    n_clusters = len(label_list)
    ncol = 2
    nrow = int(np.ceil(n_clusters / ncol))

    fig, axs = plt.subplots(nrow, ncol, figsize=figsize)

    plt.subplots_adjust(hspace=0.3)

    for i in range(nrow):
        for j in range(ncol):
            if i * ncol + j + 1 <= n_clusters:  # if n_clusters is uneven one subplot
                # is empty
                index = i * ncol + j
                label = label_list[index]
                title = f"label = {label}, " \
                    f"n = {sum(labels == label)}"
                if sentiments is not None:
                    average_sentiment = round(
                        np.mean(sentiments[labels == label]), 2)
                    title = title + \
                        f", average_sentiment = {average_sentiment}"
                if make_title:
                    axs[i, j].set_title(title)
                axs[i, j].imshow(wordclouds[index])
                # remove x and y ticks
                axs[i, j].set_xticks([])
                axs[i, j].set_yticks([])
            else:
                fig.delaxes(axs[i, j])  # this is potentially the empty plot
                # if number of plots is uneven
    plt.tight_layout()
    plt.close()
    return fig
Example #23
def get_term_frequency(df,
                       name=None,
                       types=None,
                       labels=None,
                       weights=None,
                       additional_stopwords=None,
                       sorted_df=False):
    """
    This function creates a term frequency table for a dataframe that is created 
    by get_ngram_df. Either one for the whole df is created (when labels is None) 
    when one term frequency dict is created for each 
    cluster, i.e. the return is a dict
    where output[k] contains the term-frequencies for label k. 
    If weights are passed each objects is weighed according to it's weight and 
    not as 1 

    :param df: a ngram-dataframe as obtained by the function 
    get_ngram_df 
    :param name: the name of the column in df that contains the ngrams
    e.g. "unigram", "bigram" or "unigram_stem", if kept at None it is assigned 
    to "unigram", "bigram" or "ngram" if the name exists in df.columns
    :param types: df has a column "pos" that can be used to e.g. subset 
    words. So only the data for which data["pos"] is in types are considered
    when creating the term frequencies. If None all types are accepted 
    :param labels: cluster labels as numpy array or list 
    :param weights: a numpy array or list that contains the weights, if None
    simple counting is done
    :param additional_stopwords: basic stopwords from spacy are removed by 
    default. in some cases it is useful to remove further task-specific stopwords
    that can be passed as a set or list; None means no additional stopwords 
    :param sorted_df: If true the term-frequencies are a sorted dataframe, 
    otherwise they are a Counter object 

    :return: the term frequencies 
    """
    # input checking------------------------------------------------------------
    assert isinstance(df, pd.core.frame.DataFrame)
    assert name is None or isinstance(name, str)
    if name is not None:
        assert name in df.columns
    else:
        if "unigram" in df.columns:
            name = "unigram"
        elif "bigram" in df.columns:
            name = "bigram"
        elif "ngram" in df.columns:
            name = "ngram"
        else:
            raise Exception("name is none und was could not be identified")
    assert types is None or isinstance(types, (list, str, set))
    if isinstance(types, str):
        types = [types]
    if isinstance(types, list):
        assert "pos" in df.columns
    assert labels is None or isinstance(labels, (list, np.ndarray))
    # we have to ensure that weights and labels are numpy arrays because
    # if they remain a list we cannot subset via e.g. [2,5,7]
    if isinstance(labels, list):
        labels = np.array(labels)
    assert weights is None or isinstance(weights, (list, np.ndarray))
    if isinstance(weights, list):
        weights = np.array(weights)

    if additional_stopwords is None:
        additional_stopwords = set()
    assert isinstance(additional_stopwords, (set, list, str))
    if isinstance(additional_stopwords, str):
        additional_stopwords = [additional_stopwords]

    assert isinstance(sorted_df, bool)
    assert "sentence_id" in df.columns

    # recycle weights and labels if required------------------------------------
    if labels is not None and len(labels) != len(df):
        labels = recycle(labels, df["sentence_id"])
    if weights is not None and len(weights) != len(df):
        weights = recycle(weights, df["sentence_id"])

    # subset relevant ngrams----------------------------------------------------
    if name == "unigram":
        # in case of unigrams we include the standard stopwords from spacy
        stopwords = STOP_WORDS.union(additional_stopwords)
    else:
        stopwords = additional_stopwords

    relevant = df[name].apply(lambda x: x not in stopwords)

    if types is not None:
        relevant_type = df["pos"].apply(lambda x: x in types)
        relevant = relevant & relevant_type

    relevant_df = df[[name, "sentence_id"]][relevant]

    if labels is not None:
        labels = labels[relevant]
    if weights is not None:
        weights = weights[relevant]

    # calculate the term frequencies--------------------------------------------
    if labels is None:
        return get_tf(ngrams=relevant_df[name],
                      name=name,
                      weights=weights,
                      sorted_df=sorted_df)
    else:
        output = dict()
        current_weights = None
        for label in set(labels):
            # ATTENTION:
            # note that it is important here to subset the dataframe with a
            # boolean mask and not with indices; otherwise one would have to
            # reset the indices of the pandas dataframe when constructing
            # relevant_df
            if weights is not None:
                current_weights = weights[labels == label]
            output[label] = get_tf(ngrams=relevant_df[name][labels == label],
                                   name=name,
                                   weights=current_weights,
                                   sorted_df=sorted_df)

    return output
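A simplified, self-contained sketch of the filtering step above, using a toy unigram dataframe with the columns the function expects ("unigram", "pos", "sentence_id"); the data, the custom stop word and the type restriction are illustrative:

import pandas as pd
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS

df = pd.DataFrame({
    "unigram": ["the", "battery", "battery", "is", "great", "poor"],
    "pos": ["DET", "NOUN", "NOUN", "AUX", "ADJ", "ADJ"],
    "sentence_id": [0, 0, 1, 1, 0, 1],
})

stopwords = STOP_WORDS.union({"great"})                       # spaCy stop words + custom ones
relevant = df["unigram"].apply(lambda x: x not in stopwords)  # drop stop words
relevant &= df["pos"].apply(lambda x: x in ["NOUN", "ADJ"])   # restrict to word types

print(Counter(df["unigram"][relevant]))   # Counter({'battery': 2, 'poor': 1})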