Example No. 1
import spacy
from spacy.lang.fr import French


def get_french_distances(dataset_fn):
    # Blank French pipeline with a sentencizer for sentence splitting
    pipeline = French()
    sentencizer = pipeline.create_pipe('sentencizer')
    pipeline.add_pipe(sentencizer)

    questions_list, sentences_list, spans_list = compute_question_sentence(
        dataset_fn, pipeline)

    nlp_fr = spacy.load('fr_core_news_sm')

    all_distances = []
    error = 0
    error_anchor = 0
    no_pronouns = 0
    all_lexical_variation = []
    for i, question in enumerate(questions_list):
        try:
            print(questions_list[i], sentences_list[i], spans_list[i])
            distance, lexical_variation = get_anchor(questions_list[i],
                                                     sentences_list[i], nlp_fr,
                                                     spans_list[i])
            if distance is not None:
                if distance == -1:
                    error_anchor += 1
                elif distance == -2:
                    no_pronouns += 1
                else:
                    all_distances.append(distance)
                    all_lexical_variation.append(lexical_variation)
        except Exception:
            error += 1
            continue
    print(error, error_anchor, no_pronouns)
    return all_distances, all_lexical_variation
Example No. 2
    def __init__(self, language='en'):
        self.exclude = EXCLUDE
        self.language = language
        if language == 'fr':
            nlp = French()
        else:
            nlp = English()
        # nlp.add_pipe(nlp.create_pipe('sentencizer'))
        sbd = SentenceSegmenter(nlp.vocab, strategy=split_sents)
        nlp.add_pipe(sbd)
        self.nlp = nlp
Example No. 3
from spacy.lang.fr import French
from spacy.util import compile_infix_regex


def clean_text(txt):
    nlp = French()
    listcode = [x + 45 for x in range(99)]
    # Expand department codes to postal codes; the token text is a string, so
    # it must be converted to int before the membership test
    postalcod = lambda dd, liscode: (str(int(dd) * 1000)
                                     if dd.isdigit() and int(dd) in liscode
                                     else dd)
    customize_remove_PUNCT = ['%']
    for w in customize_remove_PUNCT:
        nlp.vocab[w].is_punct = False
    customize_add_PUNCT = [
        '>', '=', '$', '™', 'eee', 'ee', 'e', "EE", "EEE", "E", ":"
    ]
    for w in customize_add_PUNCT:
        nlp.vocab[w].is_punct = True
    reg = '(?<=[0-9])[+\\-\\*^](?=[0-9-])'
    list_infixes_defaults = list(nlp.Defaults.infixes)
    if reg in list_infixes_defaults:
        list_infixes_defaults.remove(reg)
    # Modify the infix patterns so hyphenated numbers (dd-dd-dd) are not split
    infixes = (list_infixes_defaults + [r"(?<=[0-9])[\+\*^](?=[0-9-])"])
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    doc = nlp(txt)
    tokens = [
        postalcod(w.text.lower(), listcode) for w in doc
        if w.text != 'n' and not w.is_punct and not w.is_space
        and not (w.like_num and len(w.text) > 5) and not len(w.text) > 11
        and not w.is_quote
    ]
    listToStr = ' '.join(map(str, tokens))

    return listToStr
Example No. 4
from spacy.lang.en import English
from spacy.lang.fr import French


def get_nlp(lang: str):
    if lang == "fr":
        return French()
    elif lang == "en":
        return English()
    else:
        raise ValueError("unknown lang: {}".format(lang))
Example No. 5
def read_mtl_file(domain, filename):
    X = []
    Y = []
    if domain == 'en':
        # tokenizer = WordPunctTokenizer()
        tokenizer = English().Defaults.create_tokenizer()
    elif domain == 'fr':
        # tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
        tokenizer = French().Defaults.create_tokenizer()
    elif domain == 'de':
        # tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
        tokenizer = German().Defaults.create_tokenizer()
    else:
        raise ValueError('unknown domain: {}'.format(domain))
    with open(filename, 'r', encoding='utf-8') as inf:
        for line in inf.readlines():
            parts = line.split('\t')
            if len(parts) == 3:  # labeled
                Y.append(int(float(parts[1])))
            elif len(parts) == 2:  # unlabeled
                Y.append(0)
            else:
                raise Exception('Unknown format')
            clean = clean_sentence(parts[-1])
            # if domain is 'en':
            #     words = word_tokenize(clean, language='english')
            # elif domain is 'fr':
            #     words = word_tokenize(clean, language='french')
            # elif domain is 'de':
            #     words = word_tokenize(clean, language='german')
            words = [str(e) for e in tokenizer(clean)]
            tmp = {}
            tmp['tokens'] = words
            tmp['sent'] = clean
            X.append(tmp)
    #Y = torch.LongTensor(Y).to(opt.device)
    return (X, Y)
Example No. 6
    def init_resources(self):
        self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
        self.stemmer = None
        stopwords_path = os.path.join(
            os.path.dirname(assistant_dialog_skill_analysis.__file__),
            "resources",
            self.language_code,
            "stopwords",
        )
        if self.language_code == "en":
            from spacy.lang.en import English

            self.tokenizer = Tokenizer(English().vocab)
            self.stemmer = SnowballStemmer(language="english")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "fr":
            from spacy.lang.fr import French

            self.tokenizer = Tokenizer(French().vocab)
            self.stemmer = SnowballStemmer(language="french")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "de":
            from spacy.lang.de import German

            self.tokenizer = Tokenizer(German().vocab)
            self.stemmer = SnowballStemmer(language="german")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "it":
            from spacy.lang.it import Italian

            self.tokenizer = Tokenizer(Italian().vocab)
            self.stemmer = SnowballStemmer(language="italian")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "cs":
            from spacy.lang.cs import Czech

            self.tokenizer = Tokenizer(Czech().vocab)
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "pt":
            from spacy.lang.pt import Portuguese

            self.tokenizer = Tokenizer(Portuguese().vocab)
            self.stemmer = SnowballStemmer(language="portuguese")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "es":
            from spacy.lang.es import Spanish

            self.tokenizer = Tokenizer(Spanish().vocab)
            self.stemmer = SnowballStemmer(language="spanish")
            self.stop_words = self.load_stop_words(stopwords_path)
        else:
            raise Exception("language code %s is not supported" %
                            self.language_code)
Example No. 7
def RecupererTextTokenSansPonctuation(fichier):
    # Load the French language class
    Langue = French()
    f = fichier
    tokenizer = RegexpTokenizer(r'\w+')
    doc = Langue(f.read())
    filtered_sent = []
    for word in doc:
        if word.text:
            filtered_sent.append(word)
    # Drop everything else and keep just the text
    return str(tokenizer.tokenize(str(filtered_sent)))
Example No. 8
def define_spacy_tokenizer(language):
    # Construction 1
    from spacy.tokenizer import Tokenizer
    if language == 'french':
        from spacy.lang.fr import French
        nlp = French()
    elif language == 'english':
        from spacy.lang.en import English
        nlp = English()
    else:
        raise ValueError('unknown language: {}'.format(language))
    # Create a blank Tokenizer with just the language vocab
    tokenizer = Tokenizer(nlp.vocab)

    return tokenizer
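
A quick usage sketch of the returned blank tokenizer (the sample sentence is arbitrary):

tokenizer = define_spacy_tokenizer('french')
# The Tokenizer is called directly on a string and yields Token objects
print([t.text for t in tokenizer("Bonjour tout le monde !")])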
Example No. 9
def initGlobal():
    global parser
    global fr_stop

    print("INITIALIZATION")
    print("Check downloads for nltk libs...")
    nltk.download('wordnet')
    nltk.download('stopwords')

    print("Parse into French")
    parser = French()
    fr_stop = set(nltk.corpus.stopwords.words('french'))

    print("DONE")
Example No. 10
from spacy.lang.en import English
from spacy.lang.fr import French
from spacy.lang.de import German
from spacy.lang.es import Spanish


def tokenize(document, language, punctuation):
    if language == 'fr':
        nlp = French()
    elif language == 'de':
        nlp = German()
    elif language == 'en':
        nlp = English()
    elif language == 'es':
        nlp = Spanish()
    else:
        raise ValueError('unknown language: {}'.format(language))
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(document)
    if punctuation:
        sentences = [[str(word) for word in sent if str(word) != '\n']
                     for sent in doc.sents]
    else:
        sentences = [[
            str(word) for word in sent
            if ((str(word) != '\n') and (str(word).isalpha()))
        ] for sent in doc.sents]
    return sentences
Example No. 11
    def get_tokenizers(self, lang):
        os.environ['TOKENIZERS_PARALLELISM'] = "True"
        if lang == 'de':
            nlp = German()
            bert = "deepset/gbert-base"
        elif lang == 'fr':
            nlp = French()
            bert = "camembert/camembert-base-ccnet"
        elif lang == 'it':
            nlp = Italian()
            bert = "dbmdz/bert-base-italian-cased"
        else:
            raise ValueError(
                f"Please choose one of the following languages: {self.languages}"
            )
        return nlp.tokenizer, AutoTokenizer.from_pretrained(bert)
Example No. 12
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        file = "\config_files\config_spacy_en.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        file = "\config_files\config_spacy_de.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        file = "\config_files\config_spacy_es.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        file = "\config_files\config_spacy_pt.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        file = "\config_files\config_spacy_fr.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        file = "\config_files\config_spacy_it.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        file = "\config_files\config_spacy_nl.yaml"
        configfile_path = os.getcwd() + file
    else:
        raise ValueError('unknown language: {}'.format(language))

    return parser, STOP_WORDS, configfile_path
Example No. 13
    def get_nlp(self, language):

        """"
        this method returns the corresponding spacy language model when 
        provided with a language. To do so it also does the required 
        import. This is certainly not the standard approach. 
        But as this endpoint will be deployed to Heroku (space limitation)
        and only be invoked rarely it is the fastest approach.
        """

        if language == "en":

            from spacy.lang.en import English
            return English()

        elif language == "fr":

            from spacy.lang.fr import French
            return French()

        elif language == "de":

            from spacy.lang.de import German
            return German()

        elif language == "es":

            from spacy.lang.es import Spanish
            return Spanish()

        elif language == "pt":

            from spacy.lang.pt import Portuguese
            return Portuguese()

        else:

            return {"error": "invalid or not supported language entered"}
Example No. 14
    def tokenize(self, dataset, language):
        """
        Articles will be processed in parallel
        """
        articles_iter = chunk(dataset, size=self.chunks)
        length = int(len(dataset) / self.chunks)
        if language == 'english':
            nlp_iter = repeat(English())
        else:
            nlp_iter = repeat(French())

        tokenized_questions = []
        with ProcessPoolExecutor() as executor:
            chunksize = int(max(length / (self.processes * self.parallelism), 1))
            i = 0
            for result in executor.map(_tokenize_questions, articles_iter,
                                        nlp_iter, chunksize=chunksize):
                for article in result:
                    tokenized_questions.append(article)
                    i += 1
                    if i % 10000 == 0:
                        print('Processed {} articles'.format(i))
        return tokenized_questions
Example No. 15
def preprocess_file(file_path):
    json_data = []
    with open(file_path, encoding="utf8") as json_file:
        json_data = json.load(json_file)

    # Filters the question to only take into account the ones that have answers
    response_data = []
    for contrib in json_data:
        for response in contrib["responses"]:
            # If the response is non-empty
            if response["value"] and response["formattedValue"]:
                # Flattens the responses and add it to the response data
                response_obj = dict(contrib)
                del response_obj["responses"]
                response_obj.update(response)
                response_data.append(response_obj)
    df_response_data = pd.DataFrame.from_records(response_data)

    df_response_data.to_json(
        os.path.join(data_dir, "response_" + os.path.basename(file_path)))

    # Loads the french model of spacy and adds some new stop words (could be extended)
    nlp = fr_core_news_md.load()
    tokenizer = French().Defaults.create_tokenizer(nlp)
    additional_stopwords = ["de", "le", "que", "ce", "l"]
    for stopword in additional_stopwords:
        nlp.Defaults.stop_words.add(stopword)

    # Creates a new column in the dataframe that contains each token lemma.
    # Punctuations, spaces and stopwords are removed
    df_response_data["lemmatizedValue"] = df_response_data["formattedValue"].\
        apply(lambda t: [token.lemma_ for token in tokenizer(t.lower()) if not token.is_stop and not token.is_punct and
                         not token.is_space])

    df_response_data.to_json(
        os.path.join(data_dir,
                     "response_lemmatized_" + os.path.basename(file_path)))
Example No. 16
    def __init__(self):
        self.nlp = French()
Example No. 17
def fr_nlp():
    return French()
Example No. 18
import json
from spacy.lang.fr import French
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/fr/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/fr/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = French()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create a Span entity with the label "GPE" for all the matches
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary
# of country capitals
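
The snippet is cut off at this point. A minimal sketch of how such a getter could be registered, assuming CAPITALS maps country names to capital names and using a hypothetical extension name "capital":

# Getter and custom Span extension (extension name is an assumption)
get_capital = lambda span: CAPITALS.get(span.text)
Span.set_extension("capital", getter=get_capital)

doc = nlp("La Tchéquie pourrait aider la Slovaquie à protéger son espace aérien")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])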
Example No. 19
import json
from spacy.matcher import Matcher
from spacy.lang.fr import French

with open("exercises/fr/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = French()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# A token whose lowercase form matches "iphone", followed by a number token
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add the patterns to the matcher and check the result
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
Example No. 20
                    get_stop_words("en") + STOP_LIST +
                    stopwords.words('english'))
        elif lang == "nl":
            return set(
                get_stop_words("nl") + stopwords.words('dutch') + STOP_LIST_NL)
    except Exception:
        print("warning: no stopwords were downloaded. check nltk corpora")
        print(format_exc())
        return set()


# load resources
_stop_words = load_stoplist()
print("Loading spacy model...")
_spacy = English()
_spacy_fr = French()
_spacy_nl = Dutch()
_spacy_it = Italian()


def get_stoplist():
    return _stop_words


def lemmatize(text, lowercase=True, lang="en"):
    """ Return lemmatized text """

    if lang == "en":
        tokens = _spacy(text)
    elif lang == "fr":
        tokens = _spacy_fr(text)
Example No. 21
"""
import re
from dataclasses import dataclass, field
from typing import Iterable, List, Set, Tuple, Dict

from spacy.lang.fr import French

SPLITTER_CHAR = {"(", ")", ",", ";", "[", "]", "-", "{", "}"}

# Food additives (EXXX) may be mistaken for one another because of their edit distance proximity
ADDITIVES_REGEX = re.compile(r"(?:E ?\d{3,5}[a-z]*)", re.IGNORECASE)

OffsetType = Tuple[int, int]

FR_NLP = French()


class TokenLengthMismatchException(Exception):
    pass


def normalize_ingredients(ingredients: str) -> str:
    normalized = ingredients.lower()
    normalized = normalized.replace("œu", "oeu")
    normalized = normalized.replace("’", "'")
    return normalized


def normalize_item_ingredients(item: Dict) -> Dict:
    item = item.copy()
Example No. 22
# Spacy
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################

reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
                password='******',
Example No. 23
import torch.nn.functional as F
import torch.optim as optim

import spacy
from spacy.lang.fr import French

# %%
# python -m spacy download fr_core_news_sm
spacy_fr = spacy.load("fr_core_news_sm")

# %% [markdown]
# ## Tokenizing the corpus

# %%
# Create a tokenizer for the french language
tokenizer = French().Defaults.create_tokenizer()

with open("data/20_000_lieues_sous_les_mers.txt", "r", encoding="utf-8") as f:
    document = tokenizer(f.read())

# Define a filtered set of tokens by iterating on `document`
tokens = ...

# Make a list of unique tokens and dictionary that maps tokens to
# their index in that list.
idx2tok = []
tok2idx = {}
...
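
# %%
# A minimal sketch of one way to complete the two steps above; the filtering
# criteria (keeping lowercased alphabetic tokens) are an assumption, not part
# of the original exercise:
#
#     tokens = [tok.text.lower() for tok in document if tok.is_alpha]
#     idx2tok = sorted(set(tokens))
#     tok2idx = {tok: i for i, tok in enumerate(idx2tok)}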

# %% [markdown]
# ## The continuous bag of words model
Example No. 24
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.it import Italian
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self,
                 language,
                 tokenizer_method='spacy',
                 remove_stopwords=True,
Example No. 25
    def __init__(self, lang='en'):
        if lang == "fr":
            self.nlp = French()
        else:
            self.nlp = English()
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
Example No. 26
def split_into_lemmas_spacy(desc):
    nlp = French()
    doc = nlp(desc)
    return [w.lemma_ for w in doc]
Example No. 27
from spacy.lang.fr import French

nlp = French()

# Import the Doc class
from ____ import ____

# Target text: "spaCy est cool."
words = ["spaCy", "est", "cool", "."]
spaces = [True, True, False, False]

# Create a Doc from the words and spaces
doc = ____(____, words=words, spaces=spaces)
print(doc.text)
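
For reference, one way the blanks above can be filled in, using the standard Doc constructor with the shared vocab, words and spaces:

from spacy.tokens import Doc

# Build the Doc manually from tokens and trailing-space flags
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)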
Example No. 28
import json
from spacy.lang.fr import French

with open("exercises/fr/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

nlp = French()
doc = nlp("La Tchéquie pourrait aider la Slovaquie à protéger son espace aérien")

# Import the PhraseMatcher and initialize it
from spacy.____ import ____

matcher = ____(____)

# Create Doc object patterns and add them to the matcher
# This is the fast version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the result
matches = ____(____)
print([doc[start:end] for match_id, start, end in matches])
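
For reference, the blanks above resolve to the standard PhraseMatcher API: import it from spacy.matcher, construct it with the shared vocab, then call it on the doc:

from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
matches = matcher(doc)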
Example No. 29
import numpy as np
from scipy import spatial
import sys
import unidecode
import spacy
import pickle
#from sklearn.decomposition import PCA
#QUERY  Neighbours Ids_and_Score_bool
directory = '../'
argv = sys.argv
nlp = spacy.load("fr_core_news_lg")
pca = pickle.load(open(directory + 'models/pca_30.pkl', 'rb'))
pca_space = np.load(directory + 'models/vectors_pca_30.npy', allow_pickle=True)
id_table = list(np.load(directory + '../data/id_table.npy', allow_pickle=True))
tree = spatial.KDTree(pca_space)
from spacy.lang.fr.stop_words import STOP_WORDS
from spacy.lang.fr import French
parser = French()
stopwords = list(STOP_WORDS)


def process_query(search_query):
    query = str(search_query).lower()
    clean_query = unidecode.unidecode(query)
    tokens = parser(clean_query)
    tokens = [word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stopwords]
    tokens = " ".join([i for i in tokens])
    return (tokens)


def query2vec(search_query):
    x = nlp(search_query).vector  #spacy 300d
Example No. 30
import json
from spacy.lang.fr import French
from spacy.tokens import Doc

with open("exercises/fr/bookquotes.json", encoding="utf8") as f:
    DATA = json.loads(f.read())

nlp = French()

# Déclare l'extension de Doc "author" (défaut None)
Doc.set_extension("author", default=None)

# Déclare l'extension de Doc "book" (default None)
Doc.set_extension("book", default=None)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Set the doc._.book and doc._.author attributes from the context
    doc._.book = context["book"]
    doc._.author = context["author"]

    # Print the text and the custom attribute data
    print(f"{doc.text}\n — '{doc._.book}' par {doc._.author}\n")