# Imports this snippet appears to rely on (spaCy 2.0/2.1 legacy lemmatizer API):
#   from progress.bar import ChargingBar
#   from spacy.lemmatizer import Lemmatizer
#   from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
#   import spacy.lang.es
def lemmatize(data):
    output = []
    # English lemmatizer built from rule tables; the Spanish one adds a lookup table
    lemmatizerEn = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    lemmatizerEs = Lemmatizer(LEMMA_INDEX,
                              LEMMA_EXC,
                              LEMMA_RULES,
                              lookup=spacy.lang.es.LOOKUP)
    bar = ChargingBar('Lemmatizing\t\t\t\t', max=len(data))
    for instance in data:
        new_tweet = {
            'tweetid': instance['tweetid'],
            'tweet': instance['tweet'],
            'tokens': [],
            'langid': instance['langid'],
            'sentiment': instance['sentiment'],
        }
        # Lemmatize English (lang1) and Spanish (lang2) tokens; pass all other
        # tokens through unchanged.
        for i, word in enumerate(instance['tokens']):
            if instance['langid'][i] == 'lang1':
                new_tweet['tokens'].append(lemmatizerEn.lookup(word))
            elif instance['langid'][i] == 'lang2':
                new_tweet['tokens'].append(lemmatizerEs.lookup(word))
            else:
                new_tweet['tokens'].append(word)
        output.append(new_tweet)
        bar.next()
    bar.finish()
    return output


def solve(word_list):
    # Compare three normalizers: NLTK's Porter stemmer, NLTK's WordNet
    # lemmatizer, and spaCy's lookup lemmatizer.
    wnl = stem.WordNetLemmatizer()
    porter = stem.porter.PorterStemmer()
    a = [porter.stem(word) for word in word_list]
    b = [wnl.lemmatize(word) for word in word_list]
    lemmatizer = Lemmatizer()
    c = [lemmatizer.lookup(word) for word in word_list]
    return {'a': a, 'b': b, 'c': c}
Example #3
    def transform(self):
        df2 = self.dataframe.withColumn(
            "_2",
            regexp_replace(col("_2"), "[\"'./§$&+,:;=?@#–|'<>.^*()%!-]", ""))
        df = df2.withColumn("_2", regexp_replace(col("_2"), "\\s{2,}", ""))

        language_detect = udf(lambda x: detect(x), returnType=StringType())
        df3 = df.withColumn("lang", language_detect('_2'))

        lemmatizer = Lemmatizer(lookup=delook)
        lemmatizer1 = Lemmatizer(lookup=enlook)
        tokenizer = Tokenizer(inputCol="_2", outputCol="words")
        tokenized = tokenizer.transform(df3)
        # print(tokenized)

        # Lemmatize German rows with the German lookup table and everything else
        # with the English one, then join the lemmas back into a single string.
        lemma = udf(lambda x, lang: " ".join([lemmatizer.lookup(i) for i in x])
                    if lang == "de" else
                    " ".join([lemmatizer1.lookup(i) for i in x]),
                    returnType=StringType())

        lemmatized = tokenized.withColumn(
            "stemmed", lemma(col('words'),
                             col('lang'))).drop('words').drop('_2')
        tokenizer = Tokenizer(inputCol="stemmed", outputCol="words")
        tokenized = tokenizer.transform(lemmatized)
        remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        stopwords = remover.loadDefaultStopWords(
            "german") + remover.loadDefaultStopWords("english")
        remover = remover.setStopWords(stopwords)
        newDataSet = remover.transform(tokenized)

        test = newDataSet.withColumn("filtered", explode(col("filtered"))) \
            .groupBy("_1", "filtered") \
            .agg(func.count(func.lit(1)).alias("count")) \
            .sort(col("count").desc())

        return test
Example #4
    def get_cleaned_text(self):
        '''
        Clean the text: lowercase it, drop stop words and one-character
        tokens, then lemmatize.
        '''

        with gzip.open(self.path, 'rb') as f:
            self.text = f.read().decode('utf-8')

        #removing stop words and words with only one character
        nlp = spacy.load("en_core_web_sm")
        stop_words = set(nlp.Defaults.stop_words)
        self.text = self.text.lower().split(' ')
        self.text = [
            word for word in self.text
            if word not in stop_words and len(word) > 1
        ]

        #lemmatizing the words
        lemmatizer = Lemmatizer()
        self.text = [lemmatizer.lookup(word) for word in self.text]

        return self.text
Example #5
        docx = nlp(document1)
        # Otherwise, send the heading search result through nlp
    else:
        docx = nlp(returnedSearch)

    word_frequencies = {}  # how many times each word occurs in the document
    words = []  # every word in the document, stored at the same index as its frequency count

    #spacy lemmatizer to get root words
    lookups = Lookups()
    lemmatizer = Lemmatizer(lookups)

    for word in docx:  # go through every word in the document
        if word.text not in stopwords:  # skip stop words
            lemma = lemmatizer.lookup(word.text)
            if lemma not in word_frequencies:  # first time we see this lemma
                word_frequencies[lemma] = 1  # its frequency is one
                words.append(lemma)  # add it to words
            else:
                word_frequencies[lemma] += 1  # already seen, so increment its count

# Sort the arrays with bubble sort

    def bubble_sort(arrNum, arrStr):
        def swapNum(i, j):
            arrNum[i], arrNum[j] = arrNum[j], arrNum[i]
Example #6
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the input text
with open('./Books/text.txt', mode='r') as file:
    tokenized_text = tokenizer.tokenize(file.read())

vocabulary = pandas.read_csv("./Lexique382/lexique3_words_90_percentile.tsv",
                             sep='\t').lemme.to_list()
# Mask every alphabetic token whose lemma is not in the reference vocabulary
unknownVocab = {}
for index, token in enumerate(tokenized_text):
    lemma = lemmatizer.lookup(token)
    if lemma not in vocabulary and token.isalpha():
        unknownVocab[token] = index
        tokenized_text[index] = '[MASK]'

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
model.eval()
Example #7
# python -m spacy download en_core_web_sm
# ```

# %%
import spacy

# %%
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})

lemmatizer = Lemmatizer(lookups)

# %%
[lemmatizer.lookup(word) for word in word_list]

# %% [markdown]
# Spacy doesn't offer a stemmer (since lemmatization is considered better-- this is an example of being opinionated!)
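
# %% [markdown]
# For comparison, a minimal stemming sketch (this assumes NLTK is installed; `PorterStemmer` is NLTK's
# classic rule-based stemmer, used here only to contrast with the lookup lemmas above):

# %%
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
[porter.stem(word) for word in word_list]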

# %% [markdown]
# Stop words vary from library to library

# %%
nlp = spacy.load("en_core_web_sm")

# %%
sorted(list(nlp.Defaults.stop_words))[:20]

# %% [markdown]
# #### Exercise: What stop words appear in spacy but not in sklearn?
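
# %% [markdown]
# One way to start on this (a sketch, assuming scikit-learn is installed; its English stop-word list
# lives in `sklearn.feature_extraction.text.ENGLISH_STOP_WORDS`):

# %%
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

sorted(nlp.Defaults.stop_words - ENGLISH_STOP_WORDS)[:20]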
Example #8
class SpacyTokenizer:
    """
    Tokenize or tokenize a list of documents.
    Return list of tokens or lemmas, without sentencizing.
    Works only for English language.
    """
    def __init__(self,
                 disable: list = None,
                 stopwords: list = None,
                 batch_size: int = None,
                 ngram_range: Tuple[int, int] = None,
                 lemmas=False,
                 lowercase: bool = None,
                 alphas_only: bool = None):
        """
        :param disable: pipeline processors to omit; if nothing should be disabled,
         pass an empty list
        :param stopwords: a set of words to skip
        :param batch_size: a batch size for internal spaCy multi-threading
        :param ngram_range: range for producing ngrams, ex. for unigrams + bigrams should be set to
        (1, 2), for bigrams only should be set to (2, 2)
        :param lemmas: weather to perform lemmatizing or not while tokenizing, currently works only
        for the English language
        :param n_threads: a number of threads for internal spaCy multi-threading
        """
        if disable is None:
            disable = ['parser', 'ner']
        self._stopwords = stopwords or []

        self.model = spacy.load('en', disable=disable)
        self.tokenizer = Tokenizer(self.model.vocab)
        self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES,
                                     LOOKUP)
        self.batch_size = batch_size
        self.ngram_range = ngram_range
        self.lemmas = lemmas
        self.lowercase = lowercase
        self.alphas_only = alphas_only

    @property
    def stopwords(self):
        return self._stopwords

    @stopwords.setter
    def stopwords(self, stopwords: List[str]):
        self._stopwords = stopwords

    def tokenize(self,
                 data: List[str],
                 ngram_range=(1, 1),
                 lowercase=True) -> Generator[List[str], Any, None]:
        """
        Tokenize a list of documents.
        :param data: a list of documents to process
        :param ngram_range: range for producing ngrams, ex. for unigrams + bigrams should be set to
        (1, 2), for bigrams only should be set to (2, 2)
        :param lowercase: whether to perform lowercasing or not
        :return: a single processed doc generator
        """
        size = len(data)

        _ngram_range = self.ngram_range or ngram_range

        if self.lowercase is None:
            _lowercase = lowercase
        else:
            _lowercase = self.lowercase

        for i, doc in enumerate(data):
            spacy_doc = self.model(doc)
            logger.debug("Tokenize doc {} from {}".format(i, size))
            if _lowercase:
                tokens = [t.lower_ for t in spacy_doc]
            else:
                tokens = [t.text for t in spacy_doc]
            filtered = self._filter(tokens)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range)
            yield from processed_doc

    def lemmatize(self, data: List[str], ngram_range=(1, 1)) -> \
            Generator[List[str], Any, None]:
        """
        Lemmatize a list of documents.
        :param data: a list of documents to process
        :param ngram_range: range for producing ngrams, ex. for unigrams + bigrams should be set to
        (1, 2), for bigrams only should be set to (2, 2)
        :return: a single processed doc generator
        """
        size = len(data)

        _ngram_range = self.ngram_range or ngram_range

        for i, doc in enumerate(data):
            spacy_doc = self.model(doc)
            logger.debug("Lemmatize doc {} from {}".format(i, size))
            tokens = [t.lower_ for t in spacy_doc]
            lemmas = [self.lemmatizer.lookup(word) for word in tokens]
            filtered = self._filter(lemmas)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range)
            yield from processed_doc

    def _filter(self, items, alphas_only=True):
        """
        Make ngrams from a list of tokens/lemmas
        :param items: list of tokens, lemmas or other strings to form ngrams
        :param alphas_only: should filter numeric and alpha-numeric types or not
        :return: filtered list of tokens/lemmas
        """
        _alphas_only = self.alphas_only or alphas_only

        if _alphas_only:
            filter_fn = lambda x: x.isalpha() and x not in self._stopwords
        else:
            filter_fn = lambda x: x not in self._stopwords

        return list(filter(filter_fn, items))
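
# A hedged usage sketch of SpacyTokenizer (the documents and stopword list below are invented, and it
# assumes the module-level imports, `logger`, and the `ngramize` helper this class relies on are in place):
tokenizer = SpacyTokenizer(stopwords=['the', 'a'], lowercase=True)
docs = ["The cats are sleeping on the mats", "Dogs were barking loudly"]

# tokenize() and lemmatize() return generators; iterate them to consume the processed output
for tokens in tokenizer.tokenize(docs):
    print(tokens)
for lemmas in tokenizer.lemmatize(docs):
    print(lemmas)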