# Requires: spacy with the Italian model (python -m spacy download it_core_news_sm)
import it_core_news_sm
from spacy.symbols import NUM, VERB, AUX

def tokenization(frase):
    """Tokenize an Italian sentence, dropping stop words and numbers and lemmatizing verbs."""
    nlp = it_core_news_sm.load()
    stop_word = ['e', 'o', '', 'il', 'lo', 'la', 'un', 'uno', 'una', 'mia', 'mio', 'tuo',
                 'con', 'su', 'per', 'tra', 'fra', 'please', 'aiuto', 'urgente', 'help',
                 ',', '.', '..', '...', '....', ':', ';', '!', '(', ')', '-', ' ', '/', '"']
    frase = frase.lower()
    print("question =", frase)
    sentence = nlp(frase)

    vec = []
    for tok in sentence:
        word = tok.text.lower()
        keep = True
        # drop stop words
        if word in stop_word:
            keep = False
        # drop numbers
        elif tok.pos == NUM:
            keep = False
        # lemmatize verbs and auxiliaries
        elif tok.pos == VERB or tok.pos == AUX:
            word = tok.lemma_

        if keep:
            vec.append(word)
    return vec
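A brief usage sketch (the sample question is made up for illustration):

tokens = tokenization("Come posso resettare la mia password, per favore?")
print(tokens)  # stop words dropped, verbs lemmatized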
Example #2
import sys

import fr_core_news_sm
import it_core_news_sm


def spacy_analyze(fulltext, source_lang):
    """Use spaCy to analyze input text.

    Parameters:
    fulltext (string): text
    source_lang (string): language of the input text ('fr' or 'it')

    Returns:
    doc: spaCy Doc object, or None if the language is unsupported or loading fails

    """
    doc = None

    if source_lang == 'fr':
        try:
            nlp = fr_core_news_sm.load(disable=['parser', 'ner'])
            doc = nlp(fulltext)
        except Exception:
            print(sys.exc_info()[0])
    elif source_lang == 'it':
        try:
            nlp = it_core_news_sm.load(disable=['parser', 'ner'])
            doc = nlp(fulltext)
        except Exception:
            print(sys.exc_info()[0])

    return doc
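A brief usage sketch (the sample sentence is illustrative):

doc = spacy_analyze("Le chat dort sur le canapé.", "fr")
if doc is not None:
    print([(token.text, token.pos_) for token in doc])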
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''Return the spaCy nlp object corresponding to the language of a document (None if unsupported).'''
    nlp = None
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
    return nlp
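A minimal usage sketch; the supported-language list below is an assumption for illustration:

supported = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp = get_spacy_tokenizer("Italian", supported, bigmodel_required=False)
if nlp is not None:
    print([tok.text for tok in nlp("Questa è una frase di prova.")])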
Example #4
    def __init__(self):
        # Requires it_core_news_sm and spacymoji (spaCy v2-style pipeline API)
        self.nlp = it_core_news_sm.load()
        emoji = Emoji(self.nlp)
        sentencizer = self.nlp.create_pipe("sentencizer")

        # Add components to the pipeline
        # (hashtag_pipe is a custom component defined elsewhere in the module)
        self.nlp.add_pipe(emoji, first=True)
        self.nlp.add_pipe(hashtag_pipe, first=True)
        self.nlp.add_pipe(sentencizer)
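hashtag_pipe is not shown in this snippet; a rough sketch of what such a custom spaCy v2 component might look like (the merging logic is an assumption, not the original implementation):

def hashtag_pipe(doc):
    # Hypothetical component: merge '#' with the following token into a single hashtag token.
    merged = True
    while merged:
        merged = False
        for token in doc[:-1]:
            if token.text == '#':
                with doc.retokenize() as retokenizer:
                    retokenizer.merge(doc[token.i:token.i + 2])
                merged = True
                break
    return doc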
Example #5
import en_core_web_sm
import it_core_news_sm


def get_dependency_tree(sentence, language="en"):
    """Parse a sentence and return its dependency tree as a nested dictionary."""
    if language == "it":
        nlp = it_core_news_sm.load()
    else:
        nlp = en_core_web_sm.load()
    doc = nlp(sentence)

    root = None
    for token in doc:
        if token.dep_ == "ROOT":
            root = token

    result = create_dictionary(root)
    return result
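create_dictionary is defined elsewhere in the original project; a plausible sketch of a recursive version (the exact keys are an assumption):

def create_dictionary(token):
    # Hypothetical helper: map a token to its text, dependency label, and children.
    if token is None:
        return {}
    return {
        "text": token.text,
        "dep": token.dep_,
        "children": [create_dictionary(child) for child in token.children],
    }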
Example #6
def _nlp(spacy_module: str) -> Optional[NLP]:
    print(f"Loading spacy language model for '{spacy_module}'")
    if spacy_module == 'en':
        nlp = en_core_web_sm.load()
    elif spacy_module == 'es':
        nlp = es_core_news_sm.load()
    elif spacy_module == 'de':
        nlp = de_core_news_sm.load()
    elif spacy_module == 'fr':
        nlp = fr_core_news_sm.load()
    elif spacy_module == 'it':
        nlp = it_core_news_sm.load()
    elif spacy_module == 'pt':
        nlp = pt_core_news_sm.load()
    else:
        raise ValueError(f'Unsupported language {spacy_module}')
    return nlp
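A quick usage sketch (assumes the Italian model is installed):

nlp_it = _nlp('it')
doc = nlp_it("Oggi piove a Milano.")
print([(tok.text, tok.pos_) for tok in doc])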
Example #7
    def __init__(self, url):
        # Requires the newspaper3k package (Article, article.ArticleException),
        # scikit-learn's TfidfVectorizer, and re.
        try:
            pattern = re.compile(
                r"^(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$"
            )
            if not pattern.match(url):
                print(f"{url} is not a valid url")
            self.url = url
            self.article = Article(self.url)
            self.article.download()
            self.article.parse()
            self.author = self.article.authors
            self.oneline = self.article.summary
            self.text = self.article.text.replace("\n", ".")
            # str.find() returns -1 when not found, so compare explicitly
            if self.article.meta_lang == 'en' or (self.article.meta_lang == ''
                                                  and url.find("cnn.com", 0, 10) != -1):
                import en_core_web_sm
                self.model = en_core_web_sm.load()
            elif self.article.meta_lang == 'it':
                import it_core_news_sm
                self.model = it_core_news_sm.load()
            elif self.article.meta_lang == 'fr':
                import fr_core_news_sm
                self.model = fr_core_news_sm.load()
            elif self.article.meta_lang == 'es':
                import es_core_news_sm
                self.model = es_core_news_sm.load()
            elif self.article.meta_lang == 'pt':
                import pt_core_news_sm
                self.model = pt_core_news_sm.load()
            else:
                print(
                    f"The {self.article.meta_lang} language is not supported")
            self.data = []
            self.vectorizer = TfidfVectorizer(strip_accents='unicode')
            self.valid = True  # mark successful initialization
        except article.ArticleException:
            print(
                f"The url {url} is not supported, please write to [email protected] for further help"
            )
            self.valid = False
Example #8
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in [
            'nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other'
    ]:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)
            # Other Languages
            elif lang == 'other':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('rs')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('rs')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
Example #9
def check_spacy_models(main, lang, pipeline):
    if lang == 'other':
        lang = 'eng'

    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines = ['sbd']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)

        if 'sbd' in nlp_pipelines:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            if 'sbd' not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe('sentencizer'))
Example #10
# Requires: gensim (Word2Vec), numpy, spacy with the Italian model
import random as rn

import numpy as np
import it_core_news_sm
from gensim.models import Word2Vec
from spacy.symbols import NUM


def wordEmbedding(data):
    # word2vec model from Italian word embeddings
    model = Word2Vec.load('word2vec/common/word2vec/models/wiki_iter=5_algorithm=skipgram_window=10_size=300_neg-samples=10.m')
    nlp = it_core_news_sm.load()
    stop_word = ['e', 'o', '', 'il', 'lo', 'la', 'un', 'uno', 'una', 'mia', 'mio', 'tuo',
                 'con', 'su', 'per', 'tra', 'fra', 'please', 'aiuto', 'urgente', 'help',
                 ',', '.', '..', '...', '....', ':', ';', '!', '(', ')', '-', ' ', '/', '"']

    matrix3D = np.zeros((len(data), 50, 300))
    indexs = 0
    badToken = 0
    goodToken = 0
    nAns = 0
    for row in data:
        frase = row.lower()
        print("question =", frase)
        sentence = nlp(frase)
        tokIndex = 0
        nAns += 1

        for tok in sentence:
            print([(tok.text, tok.pos_)])
            word = tok.text.lower()
            g_vec = []
            flag = True

            # drop stop words
            if word in stop_word:
                flag = False
            # drop numbers
            elif tok.pos == NUM:
                flag = False
            elif word == 'è':
                word = tok.lemma_

            if flag:
                if word in model.wv:
                    g_vec = model.wv[word]
                    goodToken += 1
                else:  # use a random word embedding for unknown words
                    badToken += 1
                    g_vec = model.wv[rn.choice(model.wv.index2entity)]

                # store the vector for each token (questions are capped at 50 tokens)
                if tokIndex < 50:
                    matrix3D[indexs][tokIndex] = g_vec
                tokIndex += 1
        indexs += 1

    print('Tokens found:', goodToken)
    print('Tokens not found:', badToken)
    print('Vocabulary size:', len(model.wv.vocab))
    print('Questions processed:', nAns)

    print(matrix3D.shape)

    return matrix3D
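A brief usage sketch (the questions are made up; the word2vec model file must exist at the path used above):

questions = ["Come posso resettare la mia password?", "Dove trovo il manuale utente?"]
embeddings = wordEmbedding(questions)
print(embeddings.shape)  # (2, 50, 300)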
Example #11
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from html.parser import HTMLParser

import fr_core_news_sm
nlp_fr = fr_core_news_sm.load()

import en_core_web_sm
nlp_en = en_core_web_sm.load()

import de_core_news_sm
nlp_de = de_core_news_sm.load()

import es_core_news_sm
nlp_es = es_core_news_sm.load()

import it_core_news_sm
nlp_it = it_core_news_sm.load()

import pt_core_news_sm
nlp_pt = pt_core_news_sm.load()

import nl_core_news_sm
nlp_nl = nl_core_news_sm.load()

# global variables
wnl = WordNetLemmatizer()
html_parser = HTMLParser()
stopword_list = []
language = ""


def init_lib(lang):
Example #12
import it_core_news_sm


def lemmatize(text: str, nlp=it_core_news_sm.load()) -> str:
    """Convert words to their base form (the default argument loads the Italian model once, at definition time)."""
    doc = nlp(text)
    return " ".join([word.lemma_ if word.lemma_ != "-PRON-" else word.lower_ for word in doc])
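A short usage sketch:

print(lemmatize("Le ragazze stavano leggendo dei libri interessanti."))
# prints the lemmatized sentence; exact lemmas depend on the model version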
Example #13
def make_nlp():
    return it_core_news_sm.load()  # spacy.load('it_core_news_sm')
Example #14
from Preprocessing import *
import it_core_news_sm
import re
nlp = it_core_news_sm.load()
def tokenizer_FASTTEXT(doc):
    # Tokenize each verse, dropping punctuation tokens and lower-casing the rest.
    regex = re.compile(r'( +|\'|\-|\,|\!|\:|\;|\?|\.|\(|\)|\«|\»|\")')
    tokenize = []
    for x in doc:
        verse = nlp(x)
        new_verse = []
        for w in verse:
            if not regex.match(w.text):
                new_verse.append(w.text.casefold())
        tokenize.append(" ".join(new_verse))

    return tokenize


df_train = pd.concat([cv_text, dev_text])
train_emotion = np.concatenate([emotion, dev_emotion])


cv_tokenized = tokenizer_FASTTEXT(cv_text)
dev_tokenized = tokenizer_FASTTEXT(dev_text)
test_tokenized = tokenizer_FASTTEXT(test_text)
train_tokenized = tokenizer_FASTTEXT(df_train)


# prepare dataset for fastText
Example #15
from Preprocessing import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation as LDiA
from sklearn.preprocessing import StandardScaler
import it_core_news_sm
import re

sc = StandardScaler()
nlp = it_core_news_sm.load()  # the SpaCy Italian Tokenizer


def italian_tokenizer(verse):
    # Tokenize a verse with spaCy, dropping punctuation tokens and lower-casing the rest.
    regex = re.compile(r'( +|\'|\-|\,|\!|\:|\;|\?|\.|\(|\)|\«|\»|\")')
    tokenized = []
    doc = nlp(verse)
    for w in doc:
        if not regex.match(w.text):
            tokenized.append(w.text.casefold())
    return tokenized


# Words - TF-IDF
vectorizer = TfidfVectorizer(tokenizer=italian_tokenizer)
cv_tfidf = vectorizer.fit_transform(cv_text).toarray()
cv_tfidf = pd.DataFrame(cv_tfidf)

dev_tfidf = vectorizer.transform(dev_text).toarray()
dev_tfidf = pd.DataFrame(dev_tfidf)

vectorizer2 = TfidfVectorizer(tokenizer=italian_tokenizer)
Example #16
# Requires: gensim (fastText vectors), numpy, spacy with the Italian model
import random as rn

import numpy as np
import gensim as gs
import it_core_news_sm
from gensim.test.utils import datapath
from spacy.symbols import NUM


def wordEmbedding(data):
    # pre-trained Italian fastText vectors (cc.it.300.bin must be available under gensim's data path)
    cap_path = datapath('cc.it.300.bin')
    modelfast = gs.models.fasttext.load_facebook_vectors(cap_path)
    nlp = it_core_news_sm.load()
    stop_word = ['e', 'o', '', 'il', 'lo', 'la', 'un', 'uno', 'una', 'mia', 'mio', 'tuo',
                 'con', 'su', 'per', 'tra', 'fra', 'please', 'aiuto', 'urgente', 'help',
                 ',', '.', '..', '...', '....', ':', ';', '!', '(', ')', '-', ' ', '/', '"']

    matrix3D = np.zeros((len(data), 50, 300))
    indexs = 0
    badToken = 0
    goodToken = 0
    nAns = 0
    for row in data:
        frase = row.lower()
        print("question =", frase)
        sentence = nlp(frase)
        tokIndex = 0
        nAns += 1

        for tok in sentence:
            print([(tok.text, tok.pos_)])
            word = tok.text.lower()
            g_vec = []
            flag = True

            # drop stop words
            if word in stop_word:
                flag = False
            # drop numbers
            elif tok.pos == NUM:
                flag = False
            elif word == 'è':
                word = tok.lemma_

            if flag:
                if word in modelfast:
                    g_vec = modelfast[word]
                    goodToken += 1
                else:  # use a random word embedding for unknown words
                    badToken += 1
                    g_vec = modelfast[rn.choice(modelfast.index2entity)]

                # store the vector for each token (questions are capped at 50 tokens)
                if tokIndex < 50:
                    matrix3D[indexs][tokIndex] = g_vec
                tokIndex += 1
        indexs += 1

    print('Tokens found:', goodToken)
    print('Tokens not found:', badToken)
    print('Vocabulary size:', len(modelfast.vocab))
    print('Questions processed:', nAns)

    print(matrix3D.shape)

    return matrix3D