def check_packages():
    """Ensure the Portuguese Polyglot models are installed."""
    packages = ["embeddings2.pt", "pos2.pt", "ner2.pt", "sentiment2.pt"]
    for package in packages:
        if not downloader.is_installed(package):
            print("Downloading {0}".format(package))
            downloader.download(package)
    return True
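# Hypothetical usage sketch (not part of the original snippet): assumes the
# module-level import `from polyglot.downloader import downloader` that the
# function above relies on.
if __name__ == "__main__":
    from polyglot.text import Text

    check_packages()
    doc = Text("O Brasil é um país da América do Sul.")
    print(doc.pos_tags)  # served from the freshly installed .pt models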
def install_analyzers():
    """Download linguistic resources for the analyzers."""
    # Available tasks: ['pos2', 'ner2', 'morph2', 'tsne2', 'counts2',
    # 'embeddings2', 'sentiment2', 'sgns2', 'transliteration2']
    for task in ['embeddings2', 'ner2']:
        log.info("Downloading linguistic resources: %r...", task)
        downloader.download('TASK:%s' % task, quiet=True)
def download_models(downloads):
    """Download the models specified in NLP_SERVICE_MODELS_JSON.

    Args:
        downloads: dictionary containing all models to download
    """
    poly_downloads = []
    if 'polyglot' in downloads:
        for language in downloads['polyglot']:
            assert language in ALLOWED_POLY_LANGUAGES
            if not os.path.isdir(
                    f'{download_dir}/polyglot_data/embeddings2/{language}'):
                poly_downloads.append(f'embeddings2.{language}')
                poly_downloads.append(f'ner2.{language}')
            else:
                logging.info(f'skipping {language}, already installed')
        if poly_downloads:
            downloader.download(poly_downloads,
                                download_dir=f'{download_dir}/polyglot_data')
    if 'spacy' in downloads:
        for model in downloads['spacy']:
            if not spacy.util.is_package(model):
                logging.info(f'downloading spacy model {model}')
                model_name, model_type = model.rsplit('_', 1)
                assert model_name in ALLOWED_SPACY_MODELS
                assert model_type in ALLOWED_SPACY_MODEL_TYPES
                spacy.cli.download(model, False, False,
                                   '-t', '/data/spacy/', '--no-deps')
            else:
                logging.info(f'{model} is already installed')
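# A hypothetical `downloads` payload showing the structure the function
# parses (the 'polyglot'/'spacy' keys come from the snippet; the concrete
# languages and model names are invented for illustration):
downloads = {
    'polyglot': ['en', 'de'],     # two-letter polyglot language codes
    'spacy': ['en_core_web_sm'],  # full spaCy model names
}
download_models(downloads)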
def parse_morpheme(request):
    request_json = request.get_json()
    response = Response()
    response.headers.add('Access-Control-Allow-Origin', '*')
    response.headers.add('Access-Control-Allow-Methods', 'POST')
    try:
        if request.method == 'OPTIONS':
            headers = request.headers.get('Access-Control-Request-Headers')
            if headers:
                response.headers['Access-Control-Allow-Headers'] = headers
            response.status_code = 200
            return response
        if request_json and 'text' in request_json:
            downloader.download("morph2.en")
            text = request_json['text'].replace(" ", "")
            parsedText = Text(text)
            parsedText.language = "en"
            response.set_data(json.dumps({"result": parsedText.morphemes}))
            response.status_code = 200
            return response
        else:
            response.set_data(json.dumps({"error": 'invalid request'}))
            response.status_code = 400
            return response
    except Exception:
        response.set_data(json.dumps({"error": 'internal server error'}))
        response.status_code = 500
        return response
def download_languages(language_table):
    for row in language_table.iterrows():
        # skip NaN entries; only rows with a string language name are valid
        if isinstance(row[1]['language'], str):
            code = row[1]['code']
            print('downloading', code)
            downloader.download("LANG:" + code)
            print('downloaded', code)
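# Minimal sketch of the expected input (assumption: a pandas DataFrame with
# 'language' and 'code' columns, which is what iterrows() above unpacks):
import pandas as pd

table = pd.DataFrame({
    'language': ['Danish', float('nan')],  # NaN rows are skipped
    'code': ['da', 'xx'],
})
download_languages(table)  # downloads only LANG:da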
def download_polyglot_dicts():
    """Download dictionaries needed for the Polyglot library."""
    langs = current_app.config['APP_LANGUAGES']
    dicts = current_app.config['APP_LANG_POLYGLOT_DICTS']
    for dic in dicts:
        for lang in langs:
            downloader.download('{dic}.{lang}'.format(dic=dic, lang=lang))
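# Hypothetical Flask config values illustrating the inputs the function
# reads (the config keys come from the snippet; the values are invented):
from flask import Flask

app = Flask(__name__)
app.config['APP_LANGUAGES'] = ['en', 'de']
app.config['APP_LANG_POLYGLOT_DICTS'] = ['embeddings2', 'sentiment2']
# inside an app context, download_polyglot_dicts() then fetches
# embeddings2.en, embeddings2.de, sentiment2.en and sentiment2.de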
def __init__(self, verbose=True, debug=False):
    self.verbose = verbose
    if debug:
        self.logger.setLevel('DEBUG')
    # ensure polyglot supports NL
    downloader.download('sentiment2.nl')
def download(lang=None):
    language = 'en' if lang is None else lang
    downloader.download("embeddings2." + language)
    supported_tasks = downloader.supported_tasks(lang=language)
    if "ner2" in supported_tasks:
        downloader.download("ner2." + language)
def check_and_download(package: str) -> bool:
    """
    Check whether a given Polyglot package is installed; if not, install it
    automatically.

    :param package: package to look for
    :return: always True; if anything goes wrong, an exception is raised
    """
    if not downloader.is_installed(package):
        log.info(package.split(".")[0] + " not installed, installing.")
        downloader.download(package)
        log.info(package.split(".")[0] + " installed.")
    return True
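# Usage sketch (hypothetical): gate an analysis step on the check above.
if check_and_download("sentiment2.en"):
    from polyglot.text import Text
    print(Text("I love this library.").polarity)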
def get_polyglot_sentiment(text):
    from polyglot.text import Text

    text = clean_text(text)
    parsed = Text(text)
    try:
        return parsed.polarity
    except Exception:
        # the sentiment model for this language is missing; fetch it and retry
        from polyglot.downloader import downloader
        downloader.download("sentiment2.{}".format(parsed.language.code))
        return parsed.polarity
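# Standalone usage sketch: clean_text comes from elsewhere in the original
# project, so a pass-through stub is assumed here just to make the call run.
def clean_text(s):
    return s

print(get_polyglot_sentiment("This movie was surprisingly good!"))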
def download(langs=None):
    languages = ['en'] if langs is None else langs
    for language in languages:
        downloader.download("embeddings2." + language)
        supported_tasks = downloader.supported_tasks(lang=language)
        if "transliteration2" in supported_tasks:
            downloader.download("transliteration2." + language)
def download(library=None, lang=None):
    language = 'en' if lang is None else lang
    if library == 'stanza':
        stanza.download(language)
        return
    downloader.download("embeddings2." + language)
    supported_tasks = downloader.supported_tasks(lang=language)
    if "pos2" in supported_tasks:
        downloader.download("pos2." + language)
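# Hypothetical calls exercising both backends: stanza models for German,
# then polyglot's English embeddings and POS data via the default path.
download(library='stanza', lang='de')
download()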
def getSentiments(text):
    parsedText = Text(text)
    lang = parsedText.language.code
    name = parsedText.language.name
    if parsedText.language.confidence > 95:
        if allOp is not None or languageOp is not None:
            print("Language detected: " + name)
        if (allOp is not None or textOp is not None or fullOp is not None
                or entityOp is not None or findOp is not None):
            try:
                tasksSupported = downloader.supported_tasks(lang=lang)
            except Exception:
                print("Language (" + name + ") not supported!")
                sys.exit(1)
        if (allOp is not None or textOp is not None or fullOp is not None
                or entityOp is not None or findOp is not None):
            if "sentiment2" in tasksSupported:
                # download necessary files; quiet=True keeps download info
                # off stdout
                downloader.download("sentiment2." + lang, quiet=True)
                if allOp is not None or textOp is not None or fullOp is not None:
                    (sumSentiment, numberWords) = textSentiment(parsedText, lang)
            else:
                print("Language (" + name + ") not supported!")
                sys.exit(1)
        if (allOp is not None or entityOp is not None or fullOp is not None
                or findOp is not None):
            if "ner2" in tasksSupported and "embeddings2" in tasksSupported:
                if 'sumSentiment' not in locals():
                    sumSentiment = 0
                if 'numberWords' not in locals():
                    numberWords = 0
                entitiesAndFinalSentiment(parsedText, lang,
                                          sumSentiment, numberWords)
            else:
                print("Language (" + name + ") not supported!")
                sys.exit(1)
    else:
        print("Can't detect the language reliably, or the language is not supported!")
        sys.exit(1)
def entitiesAndFinalSentiment(parsedText, lang, sumSentiment, numberWords):
    # download necessary files; quiet=True keeps download info off stdout
    downloader.download("ner2." + lang, quiet=True)
    downloader.download("embeddings2." + lang, quiet=True)
    if entityOp is not None or allOp is not None:
        print("\nSentiment associated with each entity occurrence, "
              "in order of appearance in the text:")
    if findOp is not None:
        findsSent = 0
        findsOcur = 0
        findsOut = ""
    for entity in parsedText.entities:
        entitySent = entity.positive_sentiment - entity.negative_sentiment
        if fullOp is not None or allOp is not None:
            sumSentiment += entitySent
        if entityOp is not None or allOp is not None:
            print("\t" + " ".join(entity) + ": " + str(entitySent))
        if findOp is not None:
            jEntity = " ".join(entity)
            if findOp in jEntity:
                findsOut += "\t" + jEntity + ": " + str(entitySent) + "\n"
                findsSent += entitySent
                findsOcur += 1
    if findOp is not None:
        print("\nOccurrences and sentiment of \"" + findOp + "\" entity:")
        print(findsOut, end='')  # print without a trailing newline
        print("\nTotal for \"" + findOp + "\" entity:")
        print("\tsum: " + str(findsSent))
        print("\tmean: " + str(findsSent / findsOcur))
    if fullOp is not None or allOp is not None:
        print("\nFinal sentiment of text (with entity sentiment):")
        print("\tsum: " + str(sumSentiment))
        print("\tmean: " + str(sumSentiment / numberWords))
def polyglot_ner_to_df(text, lang):
    """Run polyglot NER on a text and return the result as a DataFrame.

    Args:
        text (str): the text to tag
        lang (str): a two-letter language code

    Examples:
        >>> text = "Dette er en 'lang' tekst skrevet af Kenneth Enevoldsen fra Aarhus"
        >>> polyglot_ner_to_df(text, lang="da")
    """
    import numpy as np
    import pandas as pd
    from polyglot.downloader import downloader
    from polyglot.text import Text

    # check that the models for lang are available; download them if not
    if downloader.download("ner2." + lang) and downloader.download("embeddings2." + lang):
        txt = Text(text, hint_language_code=lang)
        df = pd.DataFrame(list(txt.words), columns=["token"])
        df['ner'] = np.nan
        df['ner'] = df['ner'].astype(object)
        df['ner_ent_n'] = np.nan
        for ent_n, ent in enumerate(txt.entities):
            df.loc[ent.start:ent.end - 1, 'ner'] = ent.tag
            df.loc[ent.start:ent.end - 1, 'ner_ent_n'] = ent_n
        df['ner_conf'] = 1

        # normalize IOB tags to plain entity labels
        di = {"I-LOC": "LOC", "B-LOC": "LOC",
              "I-PER": "PER", "B-PER": "PER",
              "I-ORG": "ORG", "B-ORG": "ORG"}
        df = df.replace({"ner": di})
        return df
    raise ValueError(f"{lang} is not in the polyglot language directory.")
def download():
    data_dir = os.path.join(os.getcwd(), 'polyglot_data')
    # Other English packages that can be fetched the same way:
    # 'sgns2.en', 'unipos.en', 'ner2.en', 'counts2.en', 'uniemb.en',
    # 'sentiment2.en', 'tsne2.en'
    downloader.download('embeddings2.en', data_dir)
    downloader.download('pos2.en', data_dir)
    downloader.download('morph2.en', data_dir)
import math
import time

import dash
import dash_html_components as html
import numpy as np
from flair.data import Sentence
from flair.models import SequenceTagger
from fuzzywuzzy import fuzz, process
import polyglot
from polyglot.detect import Detector
from polyglot.downloader import downloader
from polyglot.text import Text
# from flask import Flask

model = SequenceTagger.load('ner')
downloader.download("TASK:transliteration2", quiet=True)

# Initialize the app
app = dash.Dash(__name__)
# application = app.server
# app = Flask(__name__)
app.config['suppress_callback_exceptions'] = True

app.layout = html.Div(children=[
    html.Div(className='row', children=[
        html.Div(className='three columns div-user-controls', children=[
            html.H1('Automated Entity Recognition'),
            html.P('Classifying entities as person or organization'),
def download_all_transliterators():
    from polyglot.downloader import downloader
    downloader.download("TASK:transliteration2", quiet=False)
import os
import re
import tarfile
import xml.etree.ElementTree as ET

import polyglot
from polyglot.text import Text
from polyglot.downloader import downloader

downloader.download("TASK:embeddings2")
downloader.download("TASK:ner2")

import nltk
nltk.download('punkt')
from nltk import word_tokenize


# ==============================================================#
# Unzips the archives and extracts all the text in each xml file#
# ==============================================================#
class MyParser:

    def __init__(self, file=None):
        cwd = os.getcwd()
        self.my_string = None
        if file is not None:
            self.file = file
            try:
                f = tarfile.open(cwd + '/' + self.file, 'r')
                f.extractall()  # extract everything to the current working directory
                names = f.getnames()
                corpus = []
                for name in names:
                    if name.endswith("xml"):
                        tree = ET.parse(cwd + '/' + name)
                        txt = ET.tostringlist(tree.getroot(), encoding='utf-8', method='text')
import pickle
import sqlite3

from polyglot.downloader import downloader
from polyglot.mapping import Embedding

fileName = "dicPolicial.pickle"

# https://sites.google.com/site/rmyeid/projects/polyglot
# http://nbviewer.jupyter.org/gist/aboSamoor/6046170


def loadMyTagger(fileName):
    return pickle.load(open(fileName, "rb"))


embeddings = loadMyTagger(fileName)

downloader.download("embeddings2.pt")
downloader.download("pos2.pt")

# Initialize the application
# Create the database
con = sqlite3.connect('./db/dadosDipol.db')
cur = con.cursor()

sql_create_miniFrase = 'CREATE TABLE IF NOT EXISTS miniFrases '\
                       '(id integer primary key AUTOINCREMENT, '\
                       'texto varchar(200), '\
                       'entidade varchar(50), '\
                       'arquivo varchar(140))'
cur.execute(sql_create_miniFrase)

sql_insert_miniFrase = 'insert into miniFrases (texto, entidade, arquivo) values (?, ?, ?)'
# print(downloader.supported_languages_table("sentiment2", 3))

# download polyglot modules
from polyglot.downloader import downloader
downloader.download("sentiment2.de")
downloader.download("ner2.de")
downloader.download("embeddings2.de")
print("\n")
# downloader.list(show_packages=False)

from polyglot.text import Text
import re
import statistics
import sys


# extraction functions ___________________
def read_file(path, file):
    """Read a .txt file and return its content as a string.

    :param path: path to the file
    :param file: file name (without the .txt extension)
    :return: file_content as string
    """
    file_name = str(path) + str(file)
    with open(file_name + '.txt', 'r') as myfile:
        file_content = myfile.read()
from polyglot.downloader import downloader downloader.download("TASK:sentiment2", quiet=False)
# coding: utf-8

# This is the code used by the UdL team at SemEval 2017, STS task, EN-EN track
# Author: Hussein AL-NATSHEH <*****@*****.**>
# License: BSD 3 clause
# 2016, 2017

import pandas as pd
import argparse
import numpy as np
import pickle

from polyglot.downloader import downloader
downloader.download("embeddings2.en")
downloader.download("pos2.en")

from utils.polyglot import polyglot_words
from utils.polyglot import polyglot_nouns
from utils.polyglot import polyglot_proper_nouns
from utils.polyglot import polyglot_pronouns
from utils.polyglot import polyglot_verbs
from utils.polyglot import polyglot_auxiliary_verbs
from utils.polyglot import polyglot_adjectives
from utils.polyglot import polyglot_adverbs
from utils.polyglot import polyglot_numbers
from utils.polyglot import polyglot_punctuation
from utils.polyglot import polyglot_particle
from utils.polyglot import polyglot_determiner
from utils.polyglot import polyglot_interjection
from utils.polyglot import polyglot_coordinating_conjunction
import stanfordnlp
from collections import Counter
from spacy_stanfordnlp import StanfordNLPLanguage
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from polyglot.text import Text, Word
from polyglot.downloader import downloader

downloader.download("sentiment2.fi")

entities = "/textdump/entities4.txt"
adjectives = []
verbs = []
sentiment = 0

raw = open('suomi24vuodet2/suomi24kommentit2017.txt').read()
sentences = sent_tokenize(raw)
stop_words = set(stopwords.words('finnish'))

# Top 20 most common named entities in the comments
words = [
    "asia", "Suomi", "Turkki", "Helsinki", "Thaimaa", "Kanaria", "Kreikka",
    "Australia", "USA", "Thaimaa", "Alanyassa", "Italia", "Bulgaria", "Intia",
    "Gambia", "Teneriffa", "Turku", "Tunisia", "Tampere", "Usa"
]

# initialize spaCy with a stanfordnlp pipeline model for the Finnish language
snlp = stanfordnlp.Pipeline(lang="fi", processors="tokenize,mwt,lemma,pos")
nlp = StanfordNLPLanguage(snlp)

# Find adjectives and verbs in the comments:
for sentence in sentences:
for i in range(len(words)):
    outFile.write(words[i])
    for val in vect[i]:
        outFile.write(' ' + str(val))
    outFile.write('\n')
outFile.close()
os.remove(dataDir + embeds)

# make sure the polyglot stack is importable before using it
for package in ['polyglot', 'pyicu', 'pycld2', 'dill']:
    if not pkgutil.find_loader(package):
        os.system('pip3 install --user ' + package)

from polyglot.downloader import downloader
# downloader.download("TASK:embeddings2")
downloader.download("embeddings2.nl")

dataDir = 'embeds/polyglot/'
if not os.path.exists(dataDir):
    if not os.path.exists('embeds'):
        os.mkdir('embeds')
    os.mkdir(dataDir)

homedir = expanduser("~")
polyDir = homedir + '/polyglot_data/embeddings2/'
for lang in os.listdir(polyDir):
    cmd = 'tar -xjf ' + polyDir + lang + '/embeddings_pkl.tar.bz2'
    os.system(cmd)
    os.rename('words_embeddings_32.pkl', dataDir + lang + '.pickle')
os.system('rm -rf ~/polyglot_data')
def _download_polyglot_languages(self):
    for lang in self.supported_languages:
        lang_resource = 'sentiment2.{}'.format(lang)
        if not downloader.is_installed(lang_resource):
            downloader.download(lang_resource)
# COMMAND ----------

hist_df.describe(['countResult']).show()

# COMMAND ----------

# since the mean is 28.3 and the standard deviation is 11.6, the
# rule-of-thumb outlier threshold is 28.3 + 2 * 11.6 = 51
outliers = hist_df.filter("countResult > 51")
display(outliers)

# COMMAND ----------

from polyglot.downloader import downloader
downloader.download("embeddings2.en")
downloader.download("ner2.en")
downloader.download("embeddings2.it")
downloader.download("ner2.it")

# COMMAND ----------

from polyglot.text import Text

def extract_NN_by_lan(sent, lan):
    entities = []
    text = Text(sent, hint_language_code=lan)
    for sent in text.sentences:
        for entity in sent.entities:
            temp = ""
            for term in entity._collection:
                temp = temp + " " + term
import csv

import pyodbc
import tweepy
from flask import Flask, request
from polyglot.detect import Detector
from polyglot.downloader import downloader
from polyglot.text import Text

downloader.download("sentiment2.en")
downloader.download("sentiment2.yo")

conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=THA-MACHINE;'
                      'Database=SentiAna;'
                      'Trusted_Connection=yes;')
cursor = conn.cursor()

app = Flask(__name__)


@app.route('/query-example')
def query_example():
    print('receiving incoming request...')
    value = request.args.get('language')
    return '''<h1> The language value is : {}</h1>'''.format(value)


@app.route('/form-example', methods=['GET', 'POST'])
def form_example():
    if request.method == 'POST':
        language = request.form.get('language')
def download_language_model():
    from polyglot.downloader import downloader
    downloader.download("embeddings2.sv")
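# Usage sketch (assumptions: the default ~/polyglot_data download location,
# and the Embedding loader seen in the other examples here; "hund" is just
# an illustrative query word):
import os
from polyglot.mapping import Embedding

download_language_model()
emb = Embedding.load(os.path.expanduser(
    "~/polyglot_data/embeddings2/sv/embeddings_pkl.tar.bz2"))
print(emb.nearest_neighbors("hund"))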