Example #1
def updateEventoKeyWordEntities(sender, instance, created, **kwargs):

    assunto = instance.assunto

    # load the Portuguese spaCy model
    nlp = pt_core_news_sm.load()
    doc = nlp(assunto)

    # tokenize and pre-process the subject text
    tokens = pre_processing(doc)

    # fetch the existing entity types from Dialogflow
    client = dialogflow_v2.EntityTypesClient()
    parent = client.project_agent_path(os.environ['PROJECT_ID'])
    list_entity_types_response = list(client.list_entity_types(parent))

    # pick the entity type that will receive the new entries (index 2 in the agent's list)
    entity_type = list_entity_types_response[2]

    # start from the entity type's current entries and append one entry per token,
    # using the lemma as the value and the surface form as a synonym
    entities = list(entity_type.entities)

    for token in tokens:
        entities.append({'value': token.lemma_, 'synonyms': [token.text]})

    # submit the updated entities to Dialogflow (a long-running operation)
    response = client.batch_update_entities(entity_type.name, entities)
    response.done()  # note: done() only checks completion; result() would block until finished

    # train the Dialogflow agent so the new entities take effect
    client = dialogflow_v2.AgentsClient()
    project_parent = client.project_path(os.environ['PROJECT_ID'])

    client.train_agent(project_parent)
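
This handler assumes module-level imports of os, dialogflow_v2 and pt_core_news_sm, plus a pre_processing helper. A minimal sketch of wiring it up as a Django post_save receiver (the Evento model and app path are assumptions, not part of the snippet):

from django.db.models.signals import post_save

from myapp.models import Evento  # hypothetical model with an "assunto" (subject) field

post_save.connect(updateEventoKeyWordEntities, sender=Evento)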
Example #2
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''Returns the spaCy nlp object corresponding to the language of a document.'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        nlp = None  # avoid a NameError on the return below
    return nlp
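
A minimal usage sketch (the small model packages must already be installed, e.g. via python -m spacy download pt_core_news_sm):

SUPPORTED = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp = get_spacy_tokenizer("Portuguese", SUPPORTED, bigmodel_required=False)
doc = nlp("Uma frase de exemplo em português.")
print([token.text for token in doc])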
Example #3
def get_pos_tags(tweet):
    """
    Takes a tweet as a single string and
    returns a list of its POS tags as strings.
    """
    # note: loading the model on every call is expensive; when tagging many tweets,
    # load pt_core_news_sm once at module level instead
    nlp = pt_core_news_sm.load()
    doc = nlp(tweet)
    tag_list = [w.pos_ for w in doc]
    return tag_list
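
A minimal usage sketch (assuming pt_core_news_sm is imported at module level, as the function requires):

import pt_core_news_sm

tags = get_pos_tags("O gato subiu no telhado.")
print(tags)  # coarse POS tags, e.g. ['DET', 'NOUN', 'VERB', ...]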
Example #4
def _nlp(spacy_module: str) -> Optional[NLP]:
    print("Loading spacy language model for '", spacy_module, "'")
    if spacy_module == 'en':
        nlp = en_core_web_sm.load()
    elif spacy_module == 'es':
        nlp = es_core_news_sm.load()
    elif spacy_module == 'de':
        nlp = de_core_news_sm.load()
    elif spacy_module == 'fr':
        nlp = fr_core_news_sm.load()
    elif spacy_module == 'it':
        nlp = it_core_news_sm.load()
    elif spacy_module == 'pt':
        nlp = pt_core_news_sm.load()
    else:
        raise ValueError(f'Unsupported language {spacy_module}')
    return nlp
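
A minimal usage sketch (the *_core_*_sm packages and the NLP type alias are assumed to be imported/defined at module level, as in the surrounding snippet):

nlp = _nlp('pt')
doc = nlp("Uma frase de exemplo.")
print([(token.text, token.pos_) for token in doc])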
Example #5
def obtemListasPortIng(lista):
  '''Splits a list of terms into Portuguese words, English words and unrecognized words;
  relies on the module-level vocabPortugues and vocabIngles word sets.'''
  listaPortAux = []
  listaIngAux = []
  listaPalavrasNaoReconAux = []
  nlp = pt_core_news_sm.load()
  for termo in lista:
    w = []
    doc = nlp(termo)
    w = [token.lemma_ for token in doc]
    if ((w[0] in vocabPortugues) or (nltk.PorterStemmer().stem(termo)) in vocabPortugues) and (nltk.corpus.wordnet.morphy(termo) not in vocabIngles):
      listaPortAux.append(termo)
    else:
      if ((w[0] not in vocabPortugues) or (nltk.PorterStemmer().stem(termo)) not in vocabPortugues) and (nltk.corpus.wordnet.morphy(termo) not in vocabIngles):
        listaPalavrasNaoReconAux.append(termo)
      else:
        listaIngAux.append(termo)
  return listaPortAux, listaIngAux, listaPalavrasNaoReconAux
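
A minimal usage sketch (vocabPortugues and vocabIngles must be defined in the enclosing module, as the function assumes):

portugues, ingles, nao_reconhecidas = obtemListasPortIng(["casa", "house", "xyzzy"])
print(portugues, ingles, nao_reconhecidas)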
Example #6
def __init__(self, url):
    try:
        # use a raw string so the regex escapes are not interpreted by Python
        pattern = re.compile(
            r"^(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$"
        )
        if not pattern.match(url):
            print(f"{url} is not a valid url")
        self.url = url
        self.article = Article(self.url)
        self.article.download()
        self.article.parse()
        self.author = self.article.authors
        self.oneline = self.article.summary
        self.text = self.article.text.replace("\n", ".")
        # str.find() returns -1 when the substring is absent (which is truthy), so compare explicitly
        if self.article.meta_lang == 'en' or (self.article.meta_lang == ''
                                              and url.find("cnn.com", 0, 10) != -1):
            import en_core_web_sm
            self.model = en_core_web_sm.load()
        elif self.article.meta_lang == 'it':
            import it_core_news_sm
            self.model = it_core_news_sm.load()
        elif self.article.meta_lang == 'fr':
            import fr_core_news_sm
            self.model = fr_core_news_sm.load()
        elif self.article.meta_lang == 'es':
            import es_core_news_sm
            self.model = es_core_news_sm.load()
        elif self.article.meta_lang == 'pt':
            import pt_core_news_sm
            self.model = pt_core_news_sm.load()
        else:
            print(
                f"The {self.article.meta_lang} language is not supported")
        self.data = []
        self.vectorizer = TfidfVectorizer(strip_accents='unicode')
    except article.ArticleException:
        print(
            f"The url {url} is not supported, please write to [email protected] for further help"
        )
        self.valid = False
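
A minimal usage sketch (assuming this __init__ belongs to a class, here called NewsDocument purely for illustration, that wraps the newspaper library's Article):

doc = NewsDocument("https://edition.cnn.com/some-story")  # hypothetical class name and URL
print(doc.text[:200])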
Example #7
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in [
            'nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other'
    ]:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)
            # Other Languages
            elif lang == 'other':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin); spaCy's language code for Serbian is 'sr'
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
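
A minimal usage sketch (main can be any object with a __dict__; a bare placeholder stands in for the application object the original code expects):

class _Main:  # placeholder for the application object (assumption)
    pass

main = _Main()
check_spacy_models(main, 'por', 'pos_tagging')
nlp_pt = main.__dict__['spacy_nlp_por']
print([token.lemma_ for token in nlp_pt('Uma frase de exemplo.')])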
Example #8
plt.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters

# REFERENCE: http://brandonrose.org/clustering

"""TextRank for Tweet Summarization"""

!pip install -U spacy
!pip install -U scikit-learn
!python -m spacy download pt_core_news_sm

# import libraries
import spacy
from spacy.lang.pt.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import pt_core_news_sm
nlp = pt_core_news_sm.load()
import os
path2 = 'drive/MyDrive/Project/Preprocess_Data'
for filename in os.listdir(path2):
  # if(filename=='.ipynb_checkpoints'):
  #   pass 
  if(filename == 'Pfizer.txt'):
    txt_file = path2 + "/" + filename
    with open(txt_file, "r", encoding="utf-8") as f:
        text = " ".join(f.readlines())
    doc = nlp(text)
    corpus = [sent.text.lower() for sent in doc.sents ]
    cv = CountVectorizer(stop_words=list(STOP_WORDS))
    cv_fit = cv.fit_transform(corpus)

    # note: get_feature_names() was removed in scikit-learn 1.2; newer versions use get_feature_names_out()
    word_list = cv.get_feature_names()
Example #9
def check_spacy_models(main, lang, pipeline):
    if lang == 'other':
        lang = 'eng'

    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines = ['sbd']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)

        if 'sbd' in nlp_pipelines:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            if 'sbd' not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe('sentencizer'))
Example #10
import fr_core_news_sm
nlp_fr = fr_core_news_sm.load()

import en_core_web_sm
nlp_en = en_core_web_sm.load()

import de_core_news_sm
nlp_de = de_core_news_sm.load()

import es_core_news_sm
nlp_es = es_core_news_sm.load()

import it_core_news_sm
nlp_it = it_core_news_sm.load()

import pt_core_news_sm
nlp_pt = pt_core_news_sm.load()

import nl_core_news_sm
nlp_nl = nl_core_news_sm.load()

# global variables
wnl = WordNetLemmatizer()
html_parser = HTMLParser()
stopword_list = []
language = ""


def init_lib(lang):
    global stopword_list, language

    nltk.download('stopwords')
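
A minimal sketch of dispatching on language code over the pipelines loaded above (the models mapping is an illustration, not part of the original snippet):

models = {'fr': nlp_fr, 'en': nlp_en, 'de': nlp_de, 'es': nlp_es,
          'it': nlp_it, 'pt': nlp_pt, 'nl': nlp_nl}
doc = models['pt']("Uma frase de exemplo.")
print([token.pos_ for token in doc])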
Example #11
def summarization(args):

    with open(args.original_text, "r", encoding="utf-8") as f:
        text = " ".join(f.readlines())

    if args.language == 'portuguese':
        import pt_core_news_sm
        nlp = pt_core_news_sm.load()
    else:
        import en_core_web_sm
        nlp = en_core_web_sm.load()

    doc = nlp(text)

    corpus = [sent.text.lower() for sent in doc.sents]

    cv = CountVectorizer(stop_words=list(STOP_WORDS))
    cv_fit = cv.fit_transform(corpus)
    word_list = cv.get_feature_names()
    count_list = cv_fit.toarray().sum(axis=0)
    """
    The zip(*iterables) function takes iterables as arguments and returns an iterator. 
    This iterator generates a series of tuples containing elements from each iterable. 
    Let's convert these tuples to {word:frequency} dictionary"""

    word_frequency = dict(zip(word_list, count_list))

    val = sorted(word_frequency.values())

    # Check words with higher frequencies
    higher_word_frequencies = [
        word for word, freq in word_frequency.items() if freq in val[-3:]
    ]
    print("\nWords with higher frequencies: ", higher_word_frequencies)

    # gets relative frequencies of words
    higher_frequency = val[-1]
    for word in word_frequency.keys():
        word_frequency[word] = (word_frequency[word] / higher_frequency)

    # SENTENCE RANKING: the rank of sentences is based on the word frequencies
    sentence_rank = {}
    for sent in doc.sents:
        for word in sent:
            if word.text.lower() in word_frequency.keys():
                if sent in sentence_rank.keys():
                    sentence_rank[sent] += word_frequency[word.text.lower()]
                else:
                    sentence_rank[sent] = word_frequency[word.text.lower()]
            else:
                continue

    top_sentences = (sorted(sentence_rank.values())[::-1])
    top_sent = top_sentences[:args.nb_sentences]

    # Mount summary
    summary = []
    for sent, strength in sentence_rank.items():
        if strength in top_sent:
            summary.append(sent)

    # return the original text and the summary
    return text, summary
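
A minimal usage sketch (the function only reads the three attributes used above, so a plain argparse.Namespace works; the file name is illustrative):

import argparse

args = argparse.Namespace(original_text="tweets.txt",  # illustrative path
                          language="portuguese",
                          nb_sentences=3)
text, summary = summarization(args)
print(" ".join(sent.text for sent in summary))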
Example #12
import pt_core_news_sm
import operator
from nltk.tokenize import sent_tokenize

dictEQ = {}  # entity -> count
dictEL = {}  # entity -> label
i = 1
link = ""
titulo = ""
data = ""
noticia = ""
parser = pt_core_news_sm.load()
with open("baseUnica.txt", "r", encoding="utf-8") as docs:
    for linha in docs:
        trx = linha.rstrip()

        if (i == 1):
            link = trx
        elif (i == 2):
            titulo = trx
        elif (i == 3):
            data = trx
        elif (trx == "YippieKiYay"):
            i = 0
            lista_tknzd = sent_tokenize(titulo)
            lista_tknzd += sent_tokenize(noticia)
            #print(lista_tknzd)
            noticia = ""
            for sents in lista_tknzd:
                parsedEx = parser(sents)
                ents = list(parsedEx.ents)
Example #13
#!/usr/bin/python3

import spacy
from spacy import displacy
import pt_core_news_sm
import de_core_news_sm
import en_core_web_sm
from fuzzywuzzy import fuzz, process

NAMED_ENTITY_MINIMUM_LENGTH = 3
SIMILARITY_RATIO_THRESHOLD = 70

NLP_PT = pt_core_news_sm.load()
NLP_DE = de_core_news_sm.load()
NLP_EN = en_core_web_sm.load()

MODELS = {
    'de': NLP_DE,
    'en': NLP_EN,
    'pt': NLP_PT,
}


def _get_nlp_model(language):
    return MODELS[language]


def _get_named_entities(text, language):
    nlp_model = _get_nlp_model(language)
    named_entities = nlp_model(text).ents
    return named_entities
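
A minimal usage sketch (the three small models above must be installed; the sentence is illustrative):

ents = _get_named_entities("Lisboa é a capital de Portugal.", "pt")
print([(ent.text, ent.label_) for ent in ents])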