Example #1
def check_packages():
    packages = ["embeddings2.pt", "pos2.pt", "ner2.pt", "sentiment2.pt"]
    for package in packages:
        if not downloader.is_installed(package):
            print("Downloading {0}".format(package))
            downloader.download(package)
    return True
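A minimal usage sketch for the helper above; the import is the usual Polyglot one, but the call site is assumed rather than taken from the original project:

from polyglot.downloader import downloader  # module-level Downloader instance used by check_packages()

if check_packages():
    print("All Portuguese (.pt) Polyglot resources are installed.")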
Example #2
def install_analyzers():
    """Download linguistic resources for the analyzers."""
    # ['pos2', 'ner2', 'morph2', 'tsne2', 'counts2', 'embeddings2',
    #  'sentiment2', 'sgns2', 'transliteration2']
    # "TASK:<name>" refers to a Polyglot collection that bundles the <name> packages across languages
    for task in ['embeddings2', 'ner2']:
        log.info("Downloading linguistic resources: %r...", task)
        downloader.download('TASK:%s' % task, quiet=True)
Example #3
def download_models(downloads):
    """Downloads the models specified in NLP_SERVICE_MODELS_JSON

    Args:
        downloads: dictionary containing all models to download
    """
    poly_downloads = []
    if 'polyglot' in downloads:
        for language in downloads['polyglot']:
            assert language in ALLOWED_POLY_LANGUAGES
            if not os.path.isdir(
                    f'{download_dir}/polyglot_data/embeddings2/{language}'):
                poly_downloads.append(f'embeddings2.{language}')
                poly_downloads.append(f'ner2.{language}')
            else:
                logging.info(f'skipping {language}, already installed')
        if poly_downloads:
            downloader.download(poly_downloads,
                                download_dir=f'{download_dir}/polyglot_data')

    if 'spacy' in downloads:
        for model in downloads['spacy']:
            if not spacy.util.is_package(model):
                logging.info(f'downloading spacy model {model}')
                model_name, model_type = model.rsplit('_', 1)
                assert model_name in ALLOWED_SPACY_MODELS
                assert model_type in ALLOWED_SPACY_MODEL_TYPES
                spacy.cli.download(model, False, False, '-t', '/data/spacy/',
                                   '--no-deps')
            else:
                logging.info(f'{model} is already installed')
Example #4
def parse_morpheme(request):
    request_json = request.get_json()
    response = Response()
    response.headers.add('Access-Control-Allow-Origin', '*')
    response.headers.add('Access-Control-Allow-Methods', 'POST')
    try:
        if request.method == 'OPTIONS':
            headers = request.headers.get('Access-Control-Request-Headers')

            if headers:
                response.headers['Access-Control-Allow-Headers'] = headers
                response.status_code = 200
                return response

        if request_json and 'text' in request_json:
            downloader.download("morph2.en")     
            text = request_json['text'].replace(" ", "")
            parsedText = Text(text)
            parsedText.language = "en"
            response.set_data(json.dumps({ "result": parsedText.morphemes }))
            response.status_code = 200
            return response

        else:
            response.set_data(json.dumps({ "error": 'invalid request' }))
            response.status_code = 400
            return response
    
    except:
        response.set_data(json.dumps({ "error": 'internal server error' }))
        response.status_code = 500
        return response
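The handler above relies on names imported elsewhere in its original module; a plausible reconstruction of those imports (assumed, not shown in the example):

import json
from flask import Response
from polyglot.text import Text
from polyglot.downloader import downloader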
Example #5
def dowload_languages(language_table):
    for _, row in language_table.iterrows():
        if isinstance(row['language'], str):  # i.e. not np.isnan(row['language'])
            code = row['code']
            print('downloading', code)
            # "LANG:<code>" is a Polyglot collection bundling all packages for that language
            downloader.download("LANG:" + code)
            print('downloaded', code)
Example #6
def download_polyglot_dicts():
    """Download dictionaries needed for Polyglot library.
    """
    langs = current_app.config['APP_LANGUAGES']
    dicts = current_app.config['APP_LANG_POLYGLOT_DICTS']

    for dic in dicts:
        for lang in langs:
            downloader.download('{dic}.{lang}'.format(dic=dic, lang=lang))
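A hypothetical configuration matching what the snippet above reads from current_app.config; the real values live in the application's settings and are not part of the example:

APP_LANGUAGES = ['en', 'de']                              # languages the app supports (assumed values)
APP_LANG_POLYGLOT_DICTS = ['embeddings2', 'sentiment2']   # Polyglot resource types to fetch (assumed values)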
Example #7
    def __init__(self, verbose=True, debug=False):

        self.verbose = verbose

        if debug:
            # self.logger is assumed to be set up elsewhere in the class
            self.logger.setLevel('DEBUG')

        # ensure polyglot supports NL
        downloader.download('sentiment2.nl')
Example #8
def download(lang=None):
    if lang is None:
        language = 'en'
    else:
        language = lang

    downloader.download("embeddings2." + language)
    supported_tasks = downloader.supported_tasks(lang=language)
    if "ner2" in supported_tasks:
        downloader.download("ner2." + language)
    return
Example #9
def check_and_download(package: str) -> bool:
    """
    Checks whether the given Polyglot package is installed; if not, it is installed automatically.
    :param package: package to look for
    :return: always True; if anything goes wrong an exception is raised
    """
    if not downloader.is_installed(package):
        log.info(package.split(".")[0] + " not installed, installing.")
        downloader.download(package)
        log.info(package.split(".")[0] + " installed.")
    return True
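A short usage sketch, assuming downloader and log are set up at module level as in the other examples on this page:

from polyglot.downloader import downloader

check_and_download("embeddings2.pt")
check_and_download("pos2.pt")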
Example #10
def get_polyglot_sentiment(text):
    text = clean_text(text)

    from polyglot.text import Text as T
    text = T(text)

    try:
        return text.polarity
    except:
        from polyglot.downloader import downloader
        downloader.download("sentiment2.{}".format(text.language.code))
        return text.polarity
Example #11
def download(langs=None):
    if langs is None:
        languages = ['en']
    else:
        languages = langs

    for language in languages:
        downloader.download("embeddings2." + language)
        supported_tasks = downloader.supported_tasks(lang=language)
        if "transliteration2" in supported_tasks:
            downloader.download("transliteration2." + language)
    return
Example #12
def download(library=None, lang=None):
    if lang is None:
        language = 'en'
    else:
        language = lang

    if library == 'stanza':
        stanza.download(language)
        return

    downloader.download("embeddings2." + language)
    supported_tasks = downloader.supported_tasks(lang=language)
    if "pos2" in supported_tasks:
        downloader.download("pos2." + language)
    return
Example #13
def getSentiments(text):
    # allOp, languageOp, textOp, fullOp, entityOp and findOp are option flags
    # assumed to be set at module level (e.g. from command-line arguments).
    parsedText = Text(text)

    lang = parsedText.language.code
    name = parsedText.language.name

    if parsedText.language.confidence > 95:
        if allOp is not None or languageOp is not None:
            print("Language detected: " + name)

        if allOp is not None or textOp is not None or fullOp is not None or entityOp is not None or findOp is not None:
            try:
                tasksSupported = downloader.supported_tasks(lang=lang)
            except:
                print("Language (" + name + ") not supported!")
                sys.exit(1)

        if allOp is not None or textOp is not None or fullOp is not None or entityOp is not None or findOp is not None:
            if "sentiment2" in tasksSupported:
                # download necessary files; quiet=True suppresses download progress on stdout
                downloader.download("sentiment2." + lang, quiet=True)
                if allOp is not None or textOp is not None or fullOp is not None:
                    (sumSentiment,
                     numberWords) = textSentiment(parsedText, lang)
            else:
                print("Language (" + name + ") not supported!")
                sys.exit(1)

        if allOp is not None or entityOp is not None or fullOp is not None or findOp is not None:
            if "ner2" in tasksSupported and "embeddings2" in tasksSupported:
                if 'sumSentiment' not in locals():
                    sumSentiment = 0
                if 'numberWords' not in locals():
                    numberWords = 0
                entitiesAndFinalSentiment(parsedText, lang, sumSentiment,
                                          numberWords)
            else:
                print("Language (" + name + ") not supported!")
                sys.exit(1)
    else:
        print("Can't detect the language reliably, or the language is not supported!")
        sys.exit(1)
Example #14
def entitiesAndFinalSentiment(parsedText, lang, sumSentiment, numberWords):
    # download necessary files; quiet=True suppresses download progress on stdout
    downloader.download("ner2." + lang, quiet=True)
    downloader.download("embeddings2." + lang, quiet=True)

    if entityOp is not None or allOp is not None:
        print(
            "\nSentiment associated with each entity occurrence, in order of appearance in the text:"
        )

    if findOp is not None:
        findsSent = 0
        findsOcur = 0
        findsOut = ""

    for entity in parsedText.entities:
        entitySent = entity.positive_sentiment - entity.negative_sentiment
        if fullOp is not None or allOp is not None:
            sumSentiment += entitySent

        if entityOp is not None or allOp is not None:
            print("\t" + str(" ".join(entity)) + ": " + str(entitySent))
        if findOp is not None:
            jEntity = " ".join(entity)
            if findOp in jEntity:
                findsOut += "\t" + jEntity + ": " + str(entitySent) + "\n"
                findsSent += entitySent
                findsOcur += 1

    if findOp is not None:
        print("\nOccurrences and sentiment of \"" + findOp + "\" entity:")
        print(findsOut, end='')  # print without a trailing newline
        print("\nTotal for \"" + findOp + "\" entity:")
        print("\tsum: " + str(findsSent))
        print("\tmean: " + str(findsSent / findsOcur))

    if fullOp is not None or allOp is not None:
        print("\nFinal sentiment of the text (including entity sentiment):")
        print("\tsum: " + str(sumSentiment))
        print("\tmean: " + str(sumSentiment / numberWords))
Example #15
def polyglot_ner_to_df(text, lang):
    """
    text (str): the text to run NER on
    lang (str): a two-letter language code

    Examples:
    >>> text = "Dette er en 'lang' tekst skrevet af Kenneth Enevoldsen fra Aarhus"
    >>> polyglot_ner_to_df(text, lang = "da")
    """
    from polyglot.downloader import downloader
    from polyglot.text import Text

    # make sure the ner2 and embeddings2 resources for lang are downloaded
    if downloader.download("ner2." + lang) and downloader.download("embeddings2." + lang):
        txt = Text(text, hint_language_code=lang)
        df = pd.DataFrame(list(txt.words), columns=["token"])
        df['ner'] = np.nan
        df['ner'] = df['ner'].astype(object)
        df['ner_ent_n'] = np.nan
        for ent_n, ent in enumerate(txt.entities):
            df.loc[ent.start:ent.end - 1, 'ner'] = ent.tag
            df.loc[ent.start:ent.end - 1, 'ner_ent_n'] = ent_n

        df['ner_conf'] = 1
        # Normalize tags
        di = {
            "I-LOC": "LOC",
            "B-LOC": "LOC",
            "I-PER": "PER",
            "B-PER": "PER",
            "I-ORG": "ORG",
            "B-ORG": "ORG"
        }
        df = df.replace({"ner": di})
        return df
    raise ValueError(f"{lang} is not in polyglot language directory.")
Example #16
def download():
    # downloader.download('sgns2.en', os.path.join(os.getcwd(), 'polyglot_data'))
    # downloader.download('unipos.en', os.path.join(os.getcwd(), 'polyglot_data'))
    # downloader.download('ner2.en', os.path.join(os.getcwd(), 'polyglot_data'))
    # downloader.download('counts2.en', os.path.join(os.getcwd(), 'polyglot_data'))
    downloader.download('embeddings2.en',
                        os.path.join(os.getcwd(), 'polyglot_data'))
    # downloader.download('uniemb.en', os.path.join(os.getcwd(), 'polyglot_data'))
    downloader.download('pos2.en', os.path.join(os.getcwd(), 'polyglot_data'))
    # downloader.download('sentiment2.en', os.path.join(os.getcwd(), 'polyglot_data'))
    # downloader.download('tsne2.en', os.path.join(os.getcwd(), 'polyglot_data'))
    downloader.download('morph2.en', os.path.join(os.getcwd(),
                                                  'polyglot_data'))
Example #17
from flair.models import SequenceTagger
from flair.data import Sentence
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import polyglot
from polyglot.downloader import downloader
from polyglot.text import Text
from polyglot.detect import Detector
import math
import time
# assumed, not in the original snippet: the Dash imports used by the app below
import dash
import dash_html_components as html

#from flask import Flask

model = SequenceTagger.load('ner')
downloader.download("TASK:transliteration2", quiet=True)

# Initialize the app
app = dash.Dash(__name__)
#application = app.server
#app = Flask(__name__)
app.config['suppress_callback_exceptions'] = True

app.layout = html.Div(
    children=[
        html.Div(className='row',
                 children=[
                    html.Div(className='three columns div-user-controls',
                             children=[
                                 html.H1('Automated Entity Recognition'),
                                 html.P('Classifying entities as person or organization'),
Example #18
def donwload_all_transliterators():
    from polyglot.downloader import downloader
    downloader.download("TASK:transliteration2", quiet=False)
Example #19
import os
import re
import tarfile
import polyglot
from polyglot.text import Text
from polyglot.downloader import downloader
downloader.download("TASK:embeddings2")
downloader.download("TASK:ner2")
import xml.etree.ElementTree as ET
import nltk
nltk.download('punkt')
from nltk import word_tokenize

#==============================================================#
#Unzips the archives and extracts all the text in each xml file#
#==============================================================#
class MyParser:
    def __init__(self,file=None):
        cwd = os.getcwd()
        self.my_string = None
        if file is not None:
            self.file = file
            try:
                f = tarfile.open(cwd + '/' + self.file,'r')
                f.extractall()  # extract all files to the current working directory
                names = f.getnames()
                corpus = []
                for name in names:
                    if (name).endswith("xml") is True:
                        tree = ET.parse(cwd + '/'+ name)
                        txt = ET.tostringlist(tree.getroot(), encoding='utf-8', method='text')
Example #20
# pickle and sqlite3 are used below but were not imported in the original snippet
import pickle
import sqlite3

from polyglot.downloader import downloader
from polyglot.mapping import Embedding

fileName = "dicPolicial.pickle"

# https://sites.google.com/site/rmyeid/projects/polyglot
# http://nbviewer.jupyter.org/gist/aboSamoor/6046170


def loadMyTagger(fileName):
    return pickle.load(open(fileName, "rb"))


embeddings = loadMyTagger(fileName)

downloader.download("embeddings2.pt")
downloader.download("pos2.pt")

# Application start

# Create the database
con = sqlite3.connect('./db/dadosDipol.db')
cur = con.cursor()

sql_create_miniFrase = 'CREATE TABLE IF NOT EXISTS miniFrases '\
'(id integer primary key AUTOINCREMENT, '\
'texto varchar(200), '\
'entidade varchar(50), '\
'arquivo varchar(140))'
cur.execute(sql_create_miniFrase)
sql_insert_miniFrase = 'insert into miniFrases (texto, entidade, arquivo) values (?, ?, ?)'
Example #21
#print(downloader.supported_languages_table("sentiment2", 3))

## download polyglot modules

from polyglot.downloader import downloader

downloader.download("sentiment2.de")
downloader.download("ner2.de")
downloader.download("embeddings2.de")
print("\n")
#downloader.list(show_packages=False)

from polyglot.text import Text
import re
import statistics
import sys

## extraction functions___________________


def read_file(path, file):
    """
    Read a .txt file and return its contents as a string.
    :param path: path to file
    :param file: file (.txt)
    :return: file_content as string
    """
    file_name = str(path) + str(file)

    with open(file_name + '.txt', 'r') as myfile:
        file_content = myfile.read()
Example #22
from polyglot.downloader import downloader
downloader.download("TASK:sentiment2", quiet=False)
Example #23
def install_analyzers():
    # ['pos2', 'ner2', 'morph2', 'tsne2', 'counts2', 'embeddings2',
    #  'sentiment2', 'sgns2', 'transliteration2']
    for task in ['embeddings2', 'ner2']:
        log.info("Downloading linguistic resources: %r...", task)
        downloader.download('TASK:%s' % task, quiet=True)
Example #24
# coding: utf-8

# This is the code used for UdL team at SemEval 2017 STS task EN-EN track
# Author: Hussein AL-NATSHEH <*****@*****.**>
# License: BSD 3 clause
# 2016, 2017

import pandas as pd
import argparse
import numpy as np
import pickle

from polyglot.downloader import downloader

downloader.download("embeddings2.en")
downloader.download("pos2.en")

from utils.polyglot import polyglot_words
from utils.polyglot import polyglot_nouns
from utils.polyglot import polyglot_proper_nouns
from utils.polyglot import polyglot_pronouns
from utils.polyglot import polyglot_verbs
from utils.polyglot import polyglot_auxiliary_verbs
from utils.polyglot import polyglot_adjectives
from utils.polyglot import polyglot_adverbs
from utils.polyglot import polyglot_numbers
from utils.polyglot import polyglot_punctuation
from utils.polyglot import polyglot_particle
from utils.polyglot import polyglot_determiner
from utils.polyglot import polyglot_interjection
from utils.polyglot import polyglot_coordinating_conjunction
Example #25
import stanfordnlp
from collections import Counter
from spacy_stanfordnlp import StanfordNLPLanguage
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from polyglot.text import Text, Word
from polyglot.downloader import downloader

downloader.download("sentiment2.fi")
entities = "/textdump/entities4.txt"
adjectives = []
verbs = []
sentiment = 0
raw = open('suomi24vuodet2/suomi24kommentit2017.txt').read()
sentences = sent_tokenize(raw)
stop_words = set(stopwords.words('finnish'))

# Top 20 most common named entities in the comments
words = [
    "asia", "Suomi", "Turkki", "Helsinki", "Thaimaa", "Kanaria", "Kreikka",
    "Australia", "USA", "Thaimaa", "Alanyassa", "Italia", "Bulgaria", "Intia",
    "Gambia", "Teneriffa", "Turku", "Tunisia", "Tampere", "Usa"
]

# initialize spaCy with the stanfordnlp pipeline model for Finnish
snlp = stanfordnlp.Pipeline(lang="fi", processors="tokenize,mwt,lemma,pos")
nlp = StanfordNLPLanguage(snlp)

#Find adjectives and verbs from the comments:

for sentence in sentences:
Example #26
    for i in range(len(words)):
        outFile.write(words[i])
        for val in vect[i]:
            outFile.write(' ' + str(val))
        outFile.write('\n')
    outFile.close()
    os.remove(dataDir + embeds)


for package in ['polyglot', 'pyicu', 'pycld2', 'dill']:
    if not pkgutil.find_loader(package):
        os.system('pip3 install --user ' + package)

from polyglot.downloader import downloader
#downloader.download("TASK:embeddings2")
downloader.download("embeddings2.nl")

dataDir = 'embeds/polyglot/'
if not os.path.exists(dataDir):
    if not os.path.exists('embeds'):
        os.mkdir('embeds')
    os.mkdir(dataDir)

homedir = expanduser("~")
polyDir = homedir + '/polyglot_data/embeddings2/'
for lang in os.listdir(polyDir):
    cmd = 'tar -xjf ' + polyDir + lang + '/embeddings_pkl.tar.bz2'
    os.system(cmd)
    os.rename('words_embeddings_32.pkl', dataDir + lang + '.pickle')
os.system('rm -rf ~/polyglot_data')
Example #27
def install_analyzers():
    """Download linguistic resources for the analyzers."""
    for task in ['embeddings2', 'ner2']:
        log.info("Downloading linguistic resources: %r...", task)
        downloader.download('TASK:%s' % task, quiet=True)
Example #28
    def _download_polyglot_languages(self):
        for lang in self.supported_languages:
            lang_resource = 'sentiment2.{}'.format(lang)
            if not downloader.is_installed(lang_resource):
                downloader.download(lang_resource)
Example #29
# COMMAND ----------

hist_df.describe(['countResult']).show()


# COMMAND ----------

# since the mean is 28.3 and the standard deviation is 11.6, the rule-of-thumb outlier cutoff is 28.3 + 2*11.6 ≈ 51
outliers = hist_df.filter("countResult > 51")
display(outliers)

# COMMAND ----------

from polyglot.downloader import downloader
downloader.download("embeddings2.en")
downloader.download("ner2.en")
downloader.download("embeddings2.it")
downloader.download("ner2.it")

# COMMAND ----------

from polyglot.text import Text
def extract_NN_by_lan(sent,lan):
  entities = []
  text = Text(sent,hint_language_code=lan)
  for sent in text.sentences:
    for entity in sent.entities:
      temp = ""
      for term in entity._collection:
        temp = temp+" " + term
Example #30
from polyglot.detect import Detector
from polyglot.text import Text
from polyglot.downloader import downloader
import csv
import tweepy
from flask import Flask, request
import pyodbc

downloader.download("sentiment2.en")
downloader.download("sentiment2.yo")

conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=THA-MACHINE;'
                      'Database=SentiAna;'
                      'Trusted_Connection=yes;')
cursor = conn.cursor()

app = Flask(__name__)


@app.route('/query-example')
def query_example():
    print('receiving incoming request...')
    value = request.args.get('language')
    return '''<h1> The language value is : {}<h1>'''.format(value)


@app.route('/form-example', methods=['GET', 'POST'])
def form_example():
    if request.method == 'POST':
        language = request.form.get('language')
Example #31
def download_language_model():
    from polyglot.downloader import downloader
    downloader.download("embeddings2.sv")