Пример #1
0
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
import spacy
from bs4 import BeautifulSoup
import unidecode
import re
import contractions
import unicodedata

# Shared NLP resources, created once when the module is imported.
nlp = spacy.load('en_core_web_lg')
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

# Extra expansions registered with the `contractions` package.
contractions.add("n't", "not")
contractions.add("1st", "first")
# English ordinals for 2 and 3 end in "nd" and "rd"; the earlier keys
# "2st" and "3th" were misspellings, so these expansions never applied to
# correctly written text.
contractions.add("2nd", "second")
contractions.add("3rd", "third")


def lemmatize_text(text):
    """
    Input: Text to lemmatize; it is passed as-is to the module-level ``nlp``
           pipeline.
    Process: Replace every token with its lemma. Tokens whose lemma is the
             placeholder '-PRON-' keep their original surface text.
             NOTE(review): '-PRON-' appears to be the pronoun placeholder
             of older spaCy releases -- confirm against the installed
             version.
    Output: Returns the lemmatized tokens joined by single spaces.
    """
    doc = nlp(text)
    # The joined string must be returned: previously it was only assigned to
    # a local name, so callers received None despite the documented output.
    return ' '.join(
        token.lemma_ if token.lemma_ != '-PRON-' else token.text
        for token in doc
    )
Пример #2
0
def expandText(txt):
    """Expand contractions and chat-style shorthand in ``txt``.

    A few apostrophe-less and informal forms are registered with the
    ``contractions`` package, then ``contractions.fix`` is applied.

    Args:
        txt: Text to expand.

    Returns:
        The result of ``contractions.fix(txt)``.
    """
    # Each key is listed once; "wont" used to be registered twice.
    extra_expansions = (
        ("wont", "will not"),
        ("dont", "do not"),
        ("doesnt", "does not"),
        # NOTE(review): "dn't" looks like a typo for "didnt" -- confirm
        # before changing the key.
        ("dn't", "did not"),
        ("cant", "can not"),
        # NOTE(review): "its" is also the possessive pronoun -- confirm that
        # rewriting it to "it is" is intended.
        ("its", "it is"),
        ("idk", "i do not know"),
    )
    for short_form, expansion in extra_expansions:
        contractions.add(short_form, expansion)
    return contractions.fix(txt)
def expand_contractions(tweets):
    """Expand common English and French contractions in a list of tweets.

    Each tweet is passed through ``contractions.fix`` (after a few French
    forms have been registered with the package), lower-cased, and then
    rewritten by four regular expressions that expand the elided French
    prefixes ``l'``, ``d'``, ``j'`` and ``n'``. Both the straight (')
    and the typographic (’) apostrophe are recognised.

    Args:
        tweets: list containing all tweets

    Returns:
        clean_tweets: list of lower-cased tweets with the contractions
            expanded

    References:
        https://github.com/kootenpv/contractions

    """
    # The library targets English, so the French forms are registered here.
    french_expansions = (
        ("c'est", "cest"),
        ("c’est", "cest"),
        ("qu'il", "que il"),
        ("qu’il", "que il"),
        ("s'il", "si il"),
        ("s’il", "si il"),
    )
    for short_form, expansion in french_expansions:
        contractions.add(short_form, expansion)

    clean_tweets = [contractions.fix(tweet).lower() for tweet in tweets]

    # Elided prefixes are expanded with regular expressions, applied in this
    # order. The character class lists only the two apostrophes: inside
    # [...] a "|" is a literal character, not an alternation, so including
    # it would also rewrite text such as "l|x".
    elision_rules = (
        (r"\bl['’](\S)", r"le \1"),  # l’intelligence --> le intelligence
        (r"\bd['’](\S)", r"de \1"),  # d’bananes --> de bananes
        (r"\bj['’](\S)", r"je \1"),  # j’avais --> je avais
        (r"\bn['’](\S)", r"ne \1"),  # n’aurait --> ne aurait
    )
    for pattern, replacement in elision_rules:
        clean_tweets = [re.sub(pattern, replacement, tweet)
                        for tweet in clean_tweets]

    return clean_tweets
Пример #4
0
def test_add():
    """A pair registered via contractions.add is applied by contractions.fix."""
    short_form, expansion = 'mychange', 'my change'
    contractions.add(short_form, expansion)
    result = contractions.fix(short_form)
    assert result == expansion
def update_acronyms():
    """Pass every (key, value) pair of ``common_acronym_list.acronym_list``
    to ``con.add`` and return True.

    NOTE(review): ``con`` and ``common_acronym_list`` are defined outside
    this snippet; ``con`` is presumably the ``contractions`` package under
    an alias -- confirm.
    """
    acronym_pairs = common_acronym_list.acronym_list.items()
    for short_form, expansion in acronym_pairs:
        con.add(short_form, expansion)
    return True