def __init__(self, dataset: t.Iterable) -> None:

    # os.system('python -m spacy download fr_core_news_md')
    self.nlp = fr_core_news_md.load()
    # self.nlp = spacy.load('fr_core_news_md')

    self.sentences = list(dataset['sentence'])
    self.intents = list(dataset['intent'])
    self.clean_text = []
    self.vectorized_text = []

    # Keep track of the removed sentences' intents during preprocessing
    self.intents_to_remove = []
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''Returns the spaCy nlp object corresponding to the language of the document.'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        # Fail explicitly instead of falling through and returning an unbound variable
        raise ValueError("Not a supported language: {}".format(default_lingo))
    return nlp
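# A minimal usage sketch (assumption: the small French model fr_core_news_sm is
# installed); it returns a ready-to-use pipeline for tokenizing French text.
supported = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp_fr = get_spacy_tokenizer("French", supported, False)
doc = nlp_fr("Ceci est une phrase d'exemple.")
print([token.text for token in doc])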
Example #3
def initialize_weighted_vectorizer(config, documents):
    print("Initializing weighted vectorizer.")
    # load scoring method or create it if not existing
    scorer = load_scorer(config, documents)
    # instantiate
    if not spacy.util.is_package("fr_core_news_md"):
        print("Downloading fr_core_news_md spacy model for pos tagging...")
        spacy.cli.download('fr_core_news_md')
        print("done.")
    print("Loading fr_core_news_md Spacy model.")
    nlp_wv = fr_core_news_md.load()
    nlp_wv.remove_pipe("ner")  # NER is not needed here and appears unsupported by the French model
    vectorizer_component = VectorizerComponent()
    vectorizer_component.add_scorer(scorer)
    nlp_wv.add_pipe(vectorizer_component)
    # save
    nlp_wv.to_disk(config.model_dir)
    print("added to disk (TODO: delete this statement")
    return nlp_wv
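# Hedged usage sketch: `config` (with a `model_dir` attribute), `documents`,
# `load_scorer` and `VectorizerComponent` are project-specific and assumed to
# be defined elsewhere.
# nlp_wv = initialize_weighted_vectorizer(config, documents)
# doc = nlp_wv("Une phrase à vectoriser.")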
    def lemmatizer(df):
        """Lemmatize a dataframe Series of texts.

        Args:
            df (Series): texts to lemmatize

        Returns:
            list: lemmatized texts, one string per input text
        """
        df_tmp = []
        nlp = fr_core_news_md.load(disable=["ner", "parser"])
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
        for doc in nlp.pipe(list(df), batch_size=2000):
            lemmas = [token.lemma_ for token in doc]
            df_tmp.append(" ".join(lemmas))
        return df_tmp
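    # Hedged usage sketch (assumes fr_core_news_md is installed):
    # lemmas = lemmatizer(["Les chats mangent des pommes", "Nous avons mangé"])
    # print(lemmas)  # one lemmatized string per input sentence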
def preprocess_sentence(sentence: str) -> np.ndarray:

    nlp = fr_core_news_md.load()

    # remove special characters
    clean_sentence = re.sub(r'[^ A-Za-z0-9éèàêî€]', '', sentence)
    doc_s = nlp(clean_sentence)

    # remove determiners (tokens with pos_ == 'DET', i.e. spaCy pos id 90)
    remaining_words = list(filter(lambda x: x.pos_ != 'DET', doc_s))
    # build new doc object
    str_s = " ".join(list(map(lambda x: x.text, remaining_words)))
    final_sentence = nlp(str_s)

    null_vector = np.zeros(300)

    vect = np.asarray(
        final_sentence.vector if final_sentence.has_vector else null_vector)

    return vect
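# A quick usage sketch (assumes numpy, re and fr_core_news_md are importable):
# the function returns a 300-dimensional sentence vector, or a zero vector when
# the model has no vectors for the sentence.
vect = preprocess_sentence("Quelle heure est-il ?")
print(vect.shape)  # (300,)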
Example #6
def preprocess_file(file_path):
    json_data = []
    with open(file_path, encoding="utf8") as json_file:
        json_data = json.load(json_file)

    # Keep only the questions that actually have answers
    response_data = []
    for contrib in json_data:
        for response in contrib["responses"]:
            # If the response is non-empty
            if response["value"] and response["formattedValue"]:
                # Flattens the responses and add it to the response data
                response_obj = dict(contrib)
                del response_obj["responses"]
                response_obj.update(response)
                response_data.append(response_obj)
    df_response_data = pd.DataFrame.from_records(response_data)

    df_response_data.to_json(
        os.path.join(data_dir, "response_" + os.path.basename(file_path)))

    # Load the French spaCy model and add a few extra stop words (the list could be extended)
    nlp = fr_core_news_md.load()
    tokenizer = French().Defaults.create_tokenizer(nlp)
    additional_stopwords = ["de", "le", "que", "ce", "l"]
    for stopword in additional_stopwords:
        nlp.Defaults.stop_words.add(stopword)

    # Creates a new column in the dataframe that contains each token lemma.
    # Punctuations, spaces and stopwords are removed
    df_response_data["lemmatizedValue"] = df_response_data["formattedValue"].\
        apply(lambda t: [token.lemma_ for token in tokenizer(t.lower()) if not token.is_stop and not token.is_punct and
                         not token.is_space])

    df_response_data.to_json(
        os.path.join(data_dir,
                     "response_lemmatized_" + os.path.basename(file_path)))
Example #7
  """
  repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
  repl = r'\1\2\3'
  if wordnet.synsets(word):
    return word
  repl_word = repeat_regexp.sub(repl, word)
  if repl_word != word:
    return remove_duplication(repl_word)
  else:
    return repl_word
remove_duplication("hello"),remove_duplication("blaaaaaabla")

"""### Lemmetization & Stemming"""

# Les fonctions pour séparer les mots et les transformer  vers leurs origin gramatical 
spacy_fr=fr_core_news_md.load()
spacy_en = en_core_web_sm.load()
# Convertir les francais mots vers leurs origin
fr_lemmatizer = lambda w:spacy_fr(w)[0].lemma_
# Convertir les mots anglais vers leurs origin
eng_lemmatizer = lambda w:spacy_en(w)[0].lemma_
# Convertir les mots arabe vers leurs origin
ar_lemmatizer = ISRIStemmer().stem
lemmatizer = lambda word: ar_lemmatizer(fr_lemmatizer(eng_lemmatizer(remove_duplication(word))))

lemmatizer("السلااام"),lemmatizer("donne"),lemmatizer("yeeux")

"""# Importing dataset"""
# Read the answers file
with open('intents.json', 'r', encoding='utf8', errors='ignore') as fin:
    data_file = fin.read().lower()
Example #8
# import spacy
# nlp = spacy.load('fr')

import fr_core_news_md

nlp = fr_core_news_md.load()

# train_data = [
#     (u"quelle heure il est", {"cats": {"GET_TIME": 1, "HELLO": 0}}),
#     (u"c'est quoi l'heure", {"cats": {"GET_TIME": 1, "HELLO": 0}}),
#     (u"j'ai besoin de connaître l'heure", {"cats": {"GET_TIME": 1, "HELLO": 0}}),
#     (u"aurais-tu une idée de l'heure qu'il est", {"cats": {"GET_TIME": 1, "HELLO": 0}}),
#
#     (u"bonjour", {"cats": {"GET_TIME": 0, "HELLO": 1}}),
#     (u"salut", {"cats": {"GET_TIME": 0, "HELLO": 1}}),
#     (u"hello", {"cats": {"GET_TIME": 0, "HELLO": 1}}),
#     (u"salutations", {"cats": {"GET_TIME": 0, "HELLO": 1}}),
# ]
#
# textcat = nlp.create_pipe('textcat')
# nlp.add_pipe(textcat, last=True)
# textcat.add_label('GET_TIME')
# textcat.add_label('HELLO')
# optimizer = nlp.begin_training()
# for itn in range(5):
#     for doc, gold in train_data:
#         nlp.update([doc], [gold], sgd=optimizer)
doc = nlp(u"t'as pas l'heure s'il te plaît")
print(doc.cats)
doc = nlp(u'bonjour à vous')
print(doc.cats)
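# Note: with a freshly loaded fr_core_news_md pipeline (no trained textcat
# component, as in the commented-out block above), doc.cats is an empty dict.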
parser.add_argument(
    'lang',
    type=str,
    help="The Wikipedia version we want to retrieve data from (e.g. 'fr' for French, 'en' for English, etc.)"
)
parser.add_argument('output', type=str, help='Output directory')

args = parser.parse_args()
check_output_dir(args.output)

#TODO: internationalize spacy & nlp imports
spacy_models = {"fr": "fr_core_news_md", "en": "en_core_web_md"}
try:
    if args.lang == "fr":
        # If this import fails, an alternative is nlp = spacy.load('fr_core_news_sm'); see https://spacy.io/models/fr
        import fr_core_news_md
        nlp = fr_core_news_md.load()
    #elif args.lang == "en":
    else:
        import en_core_web_md
        nlp = en_core_web_md.load()

except ImportError:
    from spacy.cli import download as spacy_model_download
    spacy_model_download(spacy_models['en'])  # TODO: use spacy_models[args.lang]
    nlp = spacy.load(spacy_models['en'])  # TODO: use spacy_models[args.lang]
    import nltk
    nltk.download('punkt')

nlp.add_pipe(set_custom_boundaries, before='parser')
#tool = language_check.LanguageTool('fr-FR') #TODO for later
mapping_specific = [
Example #10
    def __init__(self, stop_words=None):
        self.nlp = fr_core_news_md.load()
        self.stop_words = stop_words