Example No. 1
def spacy_analyze(fulltext, source_lang):
    """Use spacy to analyze input text

    Parameters:
    fulltext (string): text
    source_lang (string): language of the input text

    Returns:
    nlp: nlp object

    """
    doc = None

    if (source_lang == 'fr'):
        try:
            nlp = fr_core_news_sm.load(disable=['parser', 'ner'])
            doc = nlp(fulltext)
        except Exception:
            print(sys.exc_info()[0])
    elif (source_lang == 'it'):
        try:
            nlp = it_core_news_sm.load(disable=['parser', 'ner'])
            doc = nlp(fulltext)
        except Exception:
            print(sys.exc_info()[0])

    return doc
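A minimal usage sketch for spacy_analyze (the sample sentence is illustrative, and the snippet assumes fr_core_news_sm, it_core_news_sm and sys are imported as the function requires):

# Usage sketch: analyze a French sentence and inspect token attributes.
# With only parser/ner disabled, the small model still provides POS tags and lemmas.
doc = spacy_analyze("Le chat dort sur le canapé.", "fr")
if doc is not None:
    for token in doc:
        print(token.text, token.lemma_, token.pos_)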
Example No. 2
def lemmatize(text):
    nlp = fr_core_news_sm.load()
    text = nlp(text)
    text = " ".join([
        word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text
    ])
    return text
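A short, hedged usage example for lemmatize (the input sentence is illustrative; note that the function reloads the model on every call, which is slow for repeated use):

# Illustrative call: returns the input as a space-joined string of lemmas.
print(lemmatize("Les chats dormaient sur le canapé."))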
Example No. 3
def main():
    parser = argparse.ArgumentParser(description="Prune the copula results")
    parser.add_argument("log")
    parser.add_argument("out")
    args = parser.parse_args()

    nlp = fr_core_news_sm.load()

    out = open(args.out, "w", encoding='UTF-8')

    with open(args.log, "r", encoding='UTF-8') as file:
        for line in file:
            subject, copula, attribute = line.split()

            # remove small words like c', ce, l', la
            if len(subject) <= 2:
                continue
            if len(attribute) <= 2:
                continue
            
            s = nlp(subject, disable=["ner", "parser"])[0]
            if not (s.pos_ == "NOUN" or s.pos_ == "PROPN"):
                continue

            a = nlp(attribute, disable=["ner", "parser"])[0]
            if not (a.pos_ == "NOUN" or a.pos_ == "PROPN"):
                continue
            
            print(subject, copula, attribute, file=out)
    out.close()
Example No. 4
 def __init__(self):
     self.nlp = spacy_lang.load()
     self.symbols = set(" ".join(string.punctuation + '0123456789' +
                                 '°').split(" "))
     self.stopwords = set(stopwords.words('french'))
     self.accepted_words = set(['pas'])
     self.pos_to_remove = ['PUNCT', 'SPACE', 'NUM', 'DET', 'PROPN']
Example No. 5
def LemmatizeWords(listWords):
    '''Lemmatization'''
    listLemmas = list()
    strWords = " ".join(str(word) for word in listWords)
    nlp = fr_core_news_sm.load()
    strLemmas = nlp(strWords)
    for frLemma in strLemmas:
        listLemmas.append(frLemma.lemma_)
    return listLemmas
Example No. 6
    def __init__(self, BATCH_SIZE, DEVICE):
        self.spacy_fr = fr_core_news_sm.load()
        self.spacy_en = en_core_web_sm.load()

        self.init_token = "<sos>"
        self.eos_token = "<eos>"
        self.pad_token = "<pad>"
        self.unk_token = "<unk>"

        self.BATCH_SIZE = BATCH_SIZE
        self.DEVICE = DEVICE
Example No. 7
def LemmatizeWords(listWords):
    '''Lemmatization
    :param listWords: a list of tokens
    :return: a list of lemmatized tokens
    :rtype: List
    '''
    listLemmas = list()
    strWords = " ".join(str(word) for word in listWords)
    nlp = fr_core_news_sm.load()
    strLemmas = nlp(strWords)
    for frLemma in strLemmas:
        listLemmas.append(frLemma.lemma_)
    return listLemmas
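A hedged usage sketch for LemmatizeWords (the token list below is hypothetical):

# Illustrative call: the tokens are joined into a string, re-tokenized by spaCy,
# and the lemma of each resulting token is returned.
print(LemmatizeWords(["les", "chats", "dormaient"]))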
Example No. 8
def profile(path):
    nlp = fr_core_news_sm.load()
    my_text = nlp(pdf2txt(path))
    nounchunks = list(my_text.noun_chunks)
    tokens = [token.text for token in my_text if not token.is_stop]
    data = panda.read_csv(os.path.join(os.path.dirname('data'), 'profile.csv'),encoding ='latin1')
    profiles = list(data.columns.values)
    profile_set = []
    for token in tokens:
        if token.lower() in profiles:
            profile_set.append(token)
    for token in nounchunks:
        token = token.text.lower().strip()
        if token in profiles:
            profile_set.append(token)
    return [i.capitalize() for i in set([i.lower() for i in profile_set])]
Example No. 9
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''returns the spacy nlp function corresponding to the language of a document'''
    if default_lingo in supported_languages:
        if bigmodel_required == False:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        nlp = None
    return nlp
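A minimal usage sketch for get_spacy_tokenizer (the language list and the sentence are illustrative; the corresponding model package must be installed):

# Illustrative call: load the small French model and tokenize a sentence.
supported = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp = get_spacy_tokenizer("French", supported, False)
print([token.text for token in nlp("Bonjour tout le monde !")])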
Example No. 10
def _nlp(spacy_module: str) -> Optional[NLP]:
    print("Loading spacy language model for '", spacy_module, "'")
    if spacy_module == 'en':
        nlp = en_core_web_sm.load()
    elif spacy_module == 'es':
        nlp = es_core_news_sm.load()
    elif spacy_module == 'de':
        nlp = de_core_news_sm.load()
    elif spacy_module == 'fr':
        nlp = fr_core_news_sm.load()
    elif spacy_module == 'it':
        nlp = it_core_news_sm.load()
    elif spacy_module == 'pt':
        nlp = pt_core_news_sm.load()
    else:
        raise ValueError(f'Unsupported language {spacy_module}')
    return nlp
Example No. 11
 def __init__(self, url):
     try:
         pattern = re.compile(
             "^(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$"
         )
         if not pattern.match(url):
             print(f"{url} is not a valid url")
         self.url = url
         self.article = Article(self.url)
         self.article.download()
         self.article.parse()
         self.author = self.article.authors
         self.oneline = self.article.summary
         self.text = self.article.text.replace("\n", ".")
         if self.article.meta_lang == 'en' or (self.article.meta_lang == ''
                                               and url.find(
                                                   "cnn.com", 0, 10)):
             import en_core_web_sm
             self.model = en_core_web_sm.load()
         elif self.article.meta_lang == 'it':
             import it_core_news_sm
             self.model = it_core_news_sm.load()
         elif self.article.meta_lang == 'fr':
             import fr_core_news_sm
             self.model = fr_core_news_sm.load()
         elif self.article.meta_lang == 'es':
             import es_core_news_sm
             self.model = es_core_news_sm.load()
         elif self.article.meta_lang == 'pt':
             import pt_core_news_sm
             self.model = pt_core_news_sm.load()
         else:
             print(
                 f"The {self.article.meta_lang} language is not supported")
         self.data = []
         self.vectorizer = TfidfVectorizer(strip_accents='unicode')
     except article.ArticleException:
         print(
             f"The url {url} is not supported, please write to [email protected] for further help"
         )
         self.valid = False
Example No. 12
    def __init__(self, train_path, sequence_length=70):

        # Storing some variables
        self.seq_len = sequence_length
        self.train_path = train_path
        self.pad_idx = 0  # putting padding index at 0

        print('Loading Spacy ....')
        # disable=['ner', 'tagger', 'parser'] for faster tokenization
        self.nlp = fr_core_news_sm.load(disable=['ner', 'tagger', 'parser'])

        self.cleaner = lambda x: [
            str(a.lemma_).lower() for a in self.nlp(x)
            if not (a.is_stop or not a.is_alpha)
        ]

        self.train = pd.read_csv(train_path)

        self.n_classes = len(np.unique(self.train['Label'].values))

        self.label_encode = LabelEncoder()
        self.label_encode.fit(self.train['Label'].values)

        print(f'Number of classes: {self.n_classes}')

        self.vectorizer = TfidfVectorizer()
        self.vectorizer.fit(self.train['Texte'].apply(lambda x: ' '.join([
            str(a.lemma_).lower() for a in self.nlp(x)
            if not (a.is_stop or not a.is_alpha)
        ])))

        self.id2word, self.word2id = self.get_vocab_dicts()

        self.num_words = len(self.id2word)

        print(f'Number of unique words: {len(self.id2word)}')
Example No. 13
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in [
            'nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other'
    ]:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)
            # Other Languages
            elif lang == 'other':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')  # 'sr' is the ISO 639-1 code for Serbian
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
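A hedged usage sketch for check_spacy_models: any object exposing a __dict__ can stand in for main (the _Main class below is hypothetical); the loaded pipeline is cached as an attribute named spacy_nlp_<lang>:

# Hypothetical caller object; check_spacy_models caches pipelines in its __dict__.
class _Main:
    pass

main = _Main()
check_spacy_models(main, 'fra', 'lemmatization')  # loads fr_core_news_sm with parser/ner disabled
doc = main.spacy_nlp_fra("Les chats dorment.")
print([(token.text, token.lemma_) for token in doc])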
Example No. 14
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import fr_core_news_sm
from dateutil.parser import parse
import spacy
from spacy import displacy
from collections import Counter
import random

avoid_tags = ["ADV_", "ADP_", "VERB", "PRON"]

filepath_wac_fr_data = "data/frWac_no_postag_no_phrase_700_skip_cut50.bin"
nlp_fr = fr_core_news_sm.load()
french_model = KeyedVectors.load_word2vec_format(filepath_wac_fr_data,
                                                 binary=True,
                                                 unicode_errors="ignore")


def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    """
    try:
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False
Example No. 15
#################    Preprocessing    #####################

# lowercase strings
X_train['designation'] = X_train['designation'].str.lower()


# remove non-alphanumeric characters
def remove_characters(string):
    string = re.sub(r"([^\w]|[\d_])+", " ", string)
    return string

X_train['designation'] = X_train['designation'].apply(remove_characters)

# define language detector
language_detector = LanguageDetector()
nlp_fr = fr_core_news_sm.load(disable=[
    "tagger", "parser", "ner", "entity_linker", "textcat", "entity_ruler",
    "sentencizer", "merge_noun_chunks", "merge_entities", "merge_subtokens"
])
nlp_fr.add_pipe(nlp_fr.create_pipe('sentencizer'))
nlp_fr.add_pipe(language_detector)


# add a column for languages
X_train['language'] = X_train['designation'].apply(
    lambda row: nlp_fr(row)._.language['language'])


# plot the different languages of the dataset
fig, axes = plt.subplots(1, 1, figsize = (10,5))

ax = sns.countplot(x="language", 
                   data=X_train,
                   order=['fr','en','it','ca']
                     )
Example No. 16
def check_spacy_models(main, lang, pipeline):
    if lang == 'other':
        lang = 'eng'

    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines = ['sbd']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)

        if 'sbd' in nlp_pipelines:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            if 'sbd' not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe('sentencizer'))
Example No. 17
import re
from selenium import webdriver
from selenium.webdriver.common import keys
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import en_core_web_sm
import fr_core_news_sm
# Import summarize from gensim
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords  # Import the library
# to convert MSword doc to txt for processing.
import docx2txt
nlp_job = fr_core_news_sm.load()
nlp_resume = fr_core_news_sm.load()
from dotenv import load_dotenv, dotenv_values
from getpass import getpass

load_dotenv()

config = dotenv_values(".env")


def get_jobs_links(job_query, user, pwd):
    """[summary]

    Args:
        job_query ([type]): [description]
    """
Example No. 18
either from texts or from documents.
'''

#importing necessary modules

from flask import Flask, render_template, url_for, request
from flask_bootstrap import Bootstrap
from collections import Counter
from docx2python import docx2python
from tika import parser
import spacy
import en_core_web_sm as en
import fr_core_news_sm as fr

#load the languages
nlp_fr = fr.load()
nlp_en = en.load()
pos_tag = ['NOUN', 'PROPN', 'VERB', 'ADJ']
'''
Define a function that checks the type of file uploaded by the client and read it accordingly
Define another function that does the counting of the words
'''


def DocType(source):
    result = source.filename
    #split filename with (.) to get the file extension
    result_splitted = result.split('.')
    file_extension = result_splitted[-1]
    #check the extension type and use appropriate method to read
    if file_extension == "docx":
Example No. 19
def main():
    nlp = fr_core_news_sm.load()  # Loading the model takes 10-20 seconds.
    print(nlp)
Example No. 20
# -*- coding: utf-8 -*-
"""
@author: LOX
"""
import spacy
from spacy import displacy
import requests
from bs4 import BeautifulSoup
import random
import fr_core_news_sm

# =======================================================
#                Sentence Analysis
# =======================================================
# nlp = spacy.load("fr_core_news_sm")
nlp = fr_core_news_sm.load()

#sentence = 'Mais, vous savez, moi je ne crois pas qu’il y ait de bonne ou de mauvaise situation. Moi, si je devais résumer ma vie aujourd’hui avec vous, je dirais que c’est d’abord des rencontres, des gens qui m’ont tendu la main, peut-être à un moment où je ne pouvais pas, où j’étais seul chez moi. Et c’est assez curieux de se dire que les hasards, les rencontres forgent une destinée… Parce que quand on a le goût de la chose, quand on a le goût de la chose bien faite, le beau geste, parfois on ne trouve pas l’interlocuteur en face, je dirais, le miroir qui vous aide à avancer. Alors ce n’est pas mon cas, comme je le disais là, puisque moi au contraire, j’ai pu ; et je dis merci à la vie, je lui dis merci, je chante la vie, je danse la vie… Je ne suis qu’amour ! Et finalement, quand beaucoup de gens aujourd’hui me disent : « Mais comment fais-tu pour avoir cette humanité ? » Eh bien je leur réponds très simplement, je leur dis que c’est ce goût de l’amour, ce goût donc qui m’a poussé aujourd’hui à entreprendre une construction mécanique, mais demain, qui sait, peut-être simplement à me mettre au service de la communauté, à faire le don, le don de soi...'
#sentence = "puceau moi ? serieusement ^^ haha on me l avait pas sortie celle la depuis loooongtemps 🙂 demande a mes potes si je suis puceau tu vas voir les reponses que tu vas te prendre XD rien que la semaine passee j ai niquer donc chuuuuut ferme la puceau de merde car oui toi tu m as tout l air d un bon puceau de merde car souvent vous etes frustrer de ne pas BAISER 🙂 ses agreable de se faire un missionnaire ou un amazone avec une meuf hein? tu peux pas repondre car tu ne sais pas ce que c ou alors tu le sais mais tu as du taper dans ta barre de recherche 'missionnaire sexe' ou 'amazone sexe' pour comprendre ce que c etait mdddrrr !! c est grave quoiquil en soit.... pour revenir a moi, je pense que je suis le mec le moins puceau de ma bande de 11 meilleurs amis pas psk j ai eu le plus de rapport intime mais psk j ai eu les plus jolie femme que mes amis :) ses pas moi qui le dit, ses eux qui commente sous mes photos insta 'trop belle la fille que tu as coucher avec hier en boite notamment!' donc apres si tu veux que sa parte plus loi sa peut partir vraiment loi j habite dans la banlieue de niort sa te parle steven sanchez ? ses juste un cousin donc OKLM hahaha on verra si tu parles encore le puceau de merde mdddrrr pk insulter qd on est soi meme puceau tu me feras toujour marrer!"
sentence = input()
doc = nlp(sentence)

# Text Preprocessing | Lemmatization
print("\n" + f"Token\t\tLemma\t\tStopword\tDEP\t\tPOS".format(
    'Token', 'Lemma', 'Stopword'))
print("-" * 70)
for token in doc:
    print(
        f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}\t\t{token.dep_}\t\t{token.pos_}"
    )
Example No. 21
def load_spacy():
    nlp = fr_core_news_sm.load(disable=["parser", "tagger"])
    suffixes = nlp.Defaults.suffixes + [r"\d*?[\.,]?\d*\%"]
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search
    return nlp
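A brief usage sketch for load_spacy (the sentence is illustrative); printing the tokens makes it easy to inspect the effect of the extra percentage suffix rule:

# Illustrative call: tokenize a sentence containing a percentage and print the tokens.
nlp = load_spacy()
print([token.text for token in nlp("Le taux a augmenté de 3,5% cette année.")])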
Example No. 22
def evaulate_parsers(article_de, article_en, article_fr):

    # Get all articles from the database
    # print(f"German Article to parse: {article_de}")
    # print(f"English Article to parse: {article_en}")
    # print(f"French Article to parse: {article_fr}")

    # TODO Create AllenNLP Parsing Function, and call the proper values....
    allen_scores = []
    allen_scores.append(50)
    allen_scores.append(60)
    allen_scores.append(70)

    # Stanford Parser Baseline CODE
    """
    The Stanford Parser is the baseline for this application; we compare every other parser against its output.
    Define models, assign dataframes to a list.
    """

    config = "tokenize,mwt,pos,lemma,depparse"

    nlp_en = stanza.Pipeline(lang='en', processors=config)
    nlp_de = stanza.Pipeline(lang='de', processors=config)
    nlp_fr = stanza.Pipeline(lang='fr', processors=config)

    df_stanford_en = stanford.parse_stan(article_en, nlp_en)
    df_stanford_de = stanford.parse_stan(article_de, nlp_de)
    df_stanford_fr = stanford.parse_stan(article_fr, nlp_fr)
    """
    Spacy Parser:
    Define Spacy Models, Assign Dataframe to List
    """

    df_spacy_de = spacyparser.parse_spacy(article_de, de_core_news_sm.load())
    df_spacy_en = spacyparser.parse_spacy(article_en, en_core_web_sm.load())
    df_spacy_fr = spacyparser.parse_spacy(article_fr, fr_core_news_sm.load())

    # Evaluate Parsers against each other....
    df_complete_de = pd.concat([df_stanford_de, df_spacy_de],
                               axis=1,
                               sort=False)
    df_complete_en = pd.concat([df_stanford_en, df_spacy_en],
                               axis=1,
                               sort=False)
    df_complete_fr = pd.concat([df_stanford_fr, df_spacy_fr],
                               axis=1,
                               sort=False)

    # print("German Dataframe Combined")
    df_complete_de['spacy_eval_upos'] = df_complete_de['upos'].str.lower(
    ) == df_complete_de['sp_upos'].str.lower()
    df_complete_de['spacy_eval_deprel'] = df_complete_de['deprel'].str.lower(
    ) == df_complete_de['sp_deprel'].str.lower()
    df_complete_de['spacy_eval'] = df_complete_de[
        'spacy_eval_upos'] == df_complete_de['spacy_eval_deprel']

    # print("English Dataframe Combined")
    df_complete_en['spacy_eval_upos'] = df_complete_en['upos'].str.lower(
    ) == df_complete_en['sp_upos'].str.lower()
    df_complete_en['spacy_eval_deprel'] = df_complete_en['deprel'].str.lower(
    ) == df_complete_en['sp_deprel'].str.lower()
    df_complete_en['spacy_eval'] = df_complete_en[
        'spacy_eval_upos'] == df_complete_en['spacy_eval_deprel']

    # print("French Dataframe Combined")
    df_complete_fr['spacy_eval_upos'] = df_complete_fr['upos'].str.lower(
    ) == df_complete_fr['sp_upos'].str.lower()
    df_complete_fr['spacy_eval_deprel'] = df_complete_fr['deprel'].str.lower(
    ) == df_complete_fr['sp_deprel'].str.lower()
    df_complete_fr['spacy_eval'] = df_complete_fr[
        'spacy_eval_upos'] == df_complete_fr['spacy_eval_deprel']

    # Evaluate the Parsers Against the Stanford Parse
    # print(df_stanford_de.equals(df_spacy_de))

    allen_scores = [0, 0, 0]
    spacy_scores = []
    stanford_scores = [1, 1, 1]

    spacy_de_scores = df_complete_de.spacy_eval.value_counts().tolist()
    spacy_en_scores = df_complete_en.spacy_eval.value_counts().tolist()
    spacy_fr_scores = df_complete_fr.spacy_eval.value_counts().tolist()

    spacy_de_score = calculate_score(spacy_de_scores[0],
                                     spacy_de_scores[0] + spacy_de_scores[1])
    spacy_en_score = calculate_score(spacy_en_scores[0],
                                     spacy_en_scores[0] + spacy_en_scores[1])
    spacy_fr_score = calculate_score(spacy_fr_scores[0],
                                     spacy_fr_scores[0] + spacy_fr_scores[1])

    spacy_scores.append(spacy_de_score)
    spacy_scores.append(spacy_en_score)
    spacy_scores.append(spacy_fr_score)

    # The report data sets the Stanford Parser output to 100 by default, as it is the parser we
    # want to compare against. The other parsers are scored by their true/false counts relative
    # to the Stanford parse.
    report_data = {
        'de_stan': stanford_scores[0],
        'en_stan': stanford_scores[1],
        'fr_stan': stanford_scores[2],
        'de_spacy': spacy_scores[0],
        'en_spacy': spacy_scores[1],
        'fr_spacy': spacy_scores[2],
        'de_allen': allen_scores[0],
        'en_allen': allen_scores[1],
        'fr_allen': allen_scores[2]
    }
    # report_data = {'de_stan': 100, 'en_stan': 100, 'fr_stan': 100, 'de_spacy': 93.2, 'en_spacy': 92.6, 'fr_spacy': 90.7, 'de_allen': 87.9, 'en_allen': 88.6, 'fr_allen': 90.2}
    return (report_data)
Example No. 23
#!/usr/bin/env python3
import sys
import re
import os
import argparse
import requests

from bs4 import BeautifulSoup, Comment
from random import shuffle

from utils import splitIntoWords, filter_numbers, maybe_normalize, extract_sentences, check_output_dir, set_custom_boundaries
import spacy
try:
    # If importing the packaged model fails, an alternative is
    # nlp = spacy.load('fr_core_news_sm'); see https://spacy.io/models/fr
    import fr_core_news_sm
    nlp = fr_core_news_sm.load()
except ModuleNotFoundError:
    from spacy.cli import download as spacy_model_download
    spacy_model_download('fr_core_news_sm')
    nlp = spacy.load('fr_core_news_sm')

    import nltk
    nltk.download('punkt')

# - prose
# - 19th + 20th century
LIBRETHEATRE_URL = 'https://data.libretheatre.fr/ajax?__fromnavigation=1&rql=DISTINCT+Any+X%2CA%2CX%2CG%2CX%2CF%2CM%2CW+ORDERBY+XAT+WHERE+X+genre+G%2C+A+author_of+X%2C+X+preferred_form+XA%2C+X+text_form+F%2C+XA+title+XAT%2C+X+nb_men+M%2C+X+nb_women+W%2C+X+text_form+%22Prose%22%2C+X+timespan+B%2C+B+eid+IN(1742%2C+3181)&__force_display=1&vid=table.work.no-filter&divid=table_work_no_filter_28fab344fb3a4775b10b359c84710a16&fname=view&pageid=1403154733050406ce179a062b74023961c80756d6f8349'
WORK_TEMPLATE = 'https://data.libretheatre.fr/work/%(workid)d'
PD_LICENCE = 'https://data.libretheatre.fr/license/1747'

mapping_specific = [
Example No. 24
from resources.config_provider import get_config_default
from viewer.spacy_viewer import view_spacy_docs
import fr_core_news_sm  # added by piter

warnings.filterwarnings('ignore')

config_training = get_config_default()
model_dir_path = config_training["model_dir_path"]
xml_dev_path = config_training["xml_dev_path"]
number_of_paragraph_to_display = int(
    config_training["number_of_paragraph_to_display"])

# nlp = get_empty_model(load_labels_for_training=False)
# nlp = nlp.from_disk(model_dir_path)

nlp = fr_core_news_sm.load()  # added by piter

DEV_DATA = get_paragraph_from_file(xml_dev_path,
                                   keep_paragraph_without_annotation=True)

all_docs_to_view: List[Doc] = list()
# last_case_spans = dict()
last_case_docs: List[Doc] = list()
former_case_id = None
entity_typename_builder = EntityTypename()

with tqdm(total=len(DEV_DATA[:number_of_paragraph_to_display]),
          unit=" paragraphs",
          desc="Find entities") as progress_bar:
    for (case_id, original_text, _,
         _) in DEV_DATA[:number_of_paragraph_to_display]: