def __init__(self, dataset: t.Iterable) -> None:
    # os.system('python -m spacy download fr_core_news_md')
    self.nlp = fr_core_news_md.load()
    # self.nlp = spacy.load('fr_core_news_md')
    self.sentences = list(dataset['sentence'])
    self.intents = list(dataset['intent'])
    self.clean_text = []
    self.vectorized_text = []
    # Keep track of the removed sentences' intents during preprocessing
    self.intents_to_remove = []
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    """Return the spaCy nlp function corresponding to the language of a document."""
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        return None
    return nlp
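# Minimal usage sketch for get_spacy_tokenizer(); the list of supported languages
# below is an illustrative assumption, and the relevant model packages
# (here fr_core_news_md) must already be installed.
supported = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp = get_spacy_tokenizer("French", supported, bigmodel_required=True)
doc = nlp("Ceci est une phrase d'exemple.")
print([token.text for token in doc])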
def initialize_weighted_vectorizer(config, documents):
    print("Initializing weighted vectorizer.")
    # Load the scoring method, or create it if it does not exist yet
    scorer = load_scorer(config, documents)
    # Instantiate the spaCy pipeline
    if not spacy.util.is_package("fr_core_news_md"):
        print("Downloading fr_core_news_md spacy model for pos tagging...")
        spacy.cli.download('fr_core_news_md')
        print("done.")
    print("Loading fr_core_news_md Spacy model.")
    nlp_wv = fr_core_news_md.load()
    # NER is not needed here, and it seems not implemented for the French model
    nlp_wv.remove_pipe("ner")
    vectorizer_component = VectorizerComponent()
    vectorizer_component.add_scorer(scorer)
    nlp_wv.add_pipe(vectorizer_component)
    # Save the pipeline to disk
    nlp_wv.to_disk(config.model_dir)
    print("added to disk (TODO: delete this statement)")
    return nlp_wv
def lemmatizer(df):
    """Lemmatize a pandas Series of texts.

    Args:
        df (Series): texts to lemmatize

    Returns:
        list: lemmatized texts
    """
    df_tmp = []
    nlp = fr_core_news_md.load(disable=["ner", "parser"])
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    for doc in nlp.pipe(list(df), batch_size=2000):
        texts = [token.lemma_ for token in doc]
        tmp = " ".join(texts)
        df_tmp.append(tmp)
    return df_tmp
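# Minimal usage sketch for lemmatizer(), assuming spaCy 2.x (as in the snippet
# above) and the fr_core_news_md package installed; the sample sentence is an
# illustrative assumption, not taken from the project's data.
import pandas as pd

sample = pd.Series(["Les enfants jouaient dans le jardin."])
print(lemmatizer(sample))  # e.g. ['le enfant jouer dans le jardin .'], depending on the model version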
def preprocess_sentence(sentence: str) -> np.ndarray:
    nlp = fr_core_news_md.load()
    # Remove special characters
    clean_sentence = re.sub(r'[^ A-Za-z0-9éèàêî€]', '', sentence)
    doc_s = nlp(clean_sentence)
    # Remove determiners (pos_ == 'DET', i.e. pos == 90)
    remaining_words = list(filter(lambda x: x.pos_ != 'DET', doc_s))
    # Build a new doc object from the remaining tokens
    str_s = " ".join(list(map(lambda x: x.text, remaining_words)))
    final_sentence = nlp(str_s)
    # Fall back to a zero vector when the doc has no word vectors
    null_vector = np.zeros(300)
    vect = np.asarray(
        final_sentence.vector if final_sentence.has_vector else null_vector)
    return vect
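# Minimal usage sketch for preprocess_sentence(); the input sentence is an
# illustrative assumption. The returned vector has 300 dimensions, the size of
# the fr_core_news_md word vectors.
vect = preprocess_sentence("Le chat dort sur le canapé.")
print(vect.shape)  # (300,)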
def preprocess_file(file_path):
    json_data = []
    with open(file_path, encoding="utf8") as json_file:
        json_data = json.load(json_file)
    # Keep only the questions that actually have answers
    response_data = []
    for contrib in json_data:
        for response in contrib["responses"]:
            # If the response is not empty
            if response["value"] and response["formattedValue"]:
                # Flatten the response and add it to the response data
                response_obj = dict(contrib)
                del response_obj["responses"]
                response_obj.update(response)
                response_data.append(response_obj)
    df_response_data = pd.DataFrame.from_records(response_data)
    df_response_data.to_json(
        os.path.join(data_dir, "response_" + os.path.basename(file_path)))
    # Load the French spaCy model and add some new stop words (could be extended)
    nlp = fr_core_news_md.load()
    tokenizer = French().Defaults.create_tokenizer(nlp)
    additional_stopwords = ["de", "le", "que", "ce", "l"]
    for stopword in additional_stopwords:
        nlp.Defaults.stop_words.add(stopword)
    # Create a new column in the dataframe that contains each token's lemma.
    # Punctuation, spaces and stop words are removed.
    df_response_data["lemmatizedValue"] = df_response_data["formattedValue"].apply(
        lambda t: [token.lemma_ for token in tokenizer(t.lower())
                   if not token.is_stop and not token.is_punct and not token.is_space])
    df_response_data.to_json(
        os.path.join(data_dir, "response_lemmatized_" + os.path.basename(file_path)))
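# Minimal usage sketch for preprocess_file(); the file name is an illustrative
# assumption and must point to a JSON export whose entries each contain a
# "responses" list, with data_dir defined at module level as above.
preprocess_file(os.path.join(data_dir, "contributions.json"))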
""" repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') repl = r'\1\2\3' if wordnet.synsets(word): return word repl_word = repeat_regexp.sub(repl, word) if repl_word != word: return remove_duplication(repl_word) else: return repl_word remove_duplication("hello"),remove_duplication("blaaaaaabla") """### Lemmetization & Stemming""" # Les fonctions pour séparer les mots et les transformer vers leurs origin gramatical spacy_fr=fr_core_news_md.load() spacy_en = en_core_web_sm.load() # Convertir les francais mots vers leurs origin fr_lemmatizer = lambda w:spacy_fr(w)[0].lemma_ # Convertir les mots anglais vers leurs origin eng_lemmatizer = lambda w:spacy_en(w)[0].lemma_ # Convertir les mots arabe vers leurs origin ar_lemmatizer = ISRIStemmer().stem lemmatizer = lambda word: ar_lemmatizer(fr_lemmatizer(eng_lemmatizer(remove_duplication(word)))) lemmatizer("السلااام"),lemmatizer("donne"),lemmatizer("yeeux") """# Importing dataset""" #lire le fichier answers with open('intents.json', 'r', encoding='utf8', errors='ignore') as fin: data_file = fin.read().lower()
# import spacy
# nlp = spacy.load('fr')
import fr_core_news_md

nlp = fr_core_news_md.load()

# train_data = [
#     (u"quelle heure il est", {"cats": {"GET_TIME": 1, "HELLO": 0}}),
#     (u"c'est quoi l'heure", {"cats": {"GET_TIME": 1, "HELLO": 0}}),
#     (u"j'ai besoin de connaître l'heure", {"cats": {"GET_TIME": 1, "HELLO": 0}}),
#     (u"aurais-tu une idée de l'heure qu'il est", {"cats": {"GET_TIME": 1, "HELLO": 0}}),
#
#     (u"bonjour", {"cats": {"GET_TIME": 0, "HELLO": 1}}),
#     (u"salut", {"cats": {"GET_TIME": 0, "HELLO": 1}}),
#     (u"hello", {"cats": {"GET_TIME": 0, "HELLO": 1}}),
#     (u"salutations", {"cats": {"GET_TIME": 0, "HELLO": 1}}),
# ]
#
# textcat = nlp.create_pipe('textcat')
# nlp.add_pipe(textcat, last=True)
# textcat.add_label('GET_TIME')
# textcat.add_label('HELLO')
# optimizer = nlp.begin_training()
# for itn in range(5):
#     for doc, gold in train_data:
#         nlp.update([doc], [gold], sgd=optimizer)

doc = nlp(u"t'as pas l'heure s'il te plaît")
print(doc.cats)
doc = nlp(u'bonjour à vous')
print(doc.cats)
    'lang',
    type=str,
    help="The Wikipedia version we want to retrieve data from "
         "(e.g. 'fr' for French, 'en' for English, etc.)")
parser.add_argument('output', type=str, help='Output directory')
args = parser.parse_args()
check_output_dir(args.output)

# TODO: internationalize spacy & nlp imports
spacy_models = {"fr": "fr_core_news_md", "en": "en_core_web_md"}
try:
    if args.lang == "fr":
        import fr_core_news_md
        # If this doesn't work, an alternative is nlp = spacy.load('fr_core_news_sm');
        # see https://spacy.io/models/fr
        nlp = fr_core_news_md.load()
    # elif args.lang == "en":
    else:
        import en_core_web_md
        nlp = en_core_web_md.load()
except ImportError:
    from spacy.cli import download as spacy_model_download
    spacy_model_download(spacy_models['en'])  # [args.lang])
    nlp = spacy.load(spacy_models['en'])  # args.lang])

import nltk
nltk.download('punkt')

nlp.add_pipe(set_custom_boundaries, before='parser')
# tool = language_check.LanguageTool('fr-FR')  # TODO for later

mapping_specific = [
def __init__(self, stop_words=None):
    self.nlp = fr_core_news_md.load()
    self.stop_words = stop_words