def __init__(self, args, lang):
    self.others = Strategy(args.others)            # valid values: 0, 1, 2
    self.emoji = Strategy(args.emoji)              # 0, 1 emoji, 2 (emoji), 3, 4, 5 (translation)
    self.emoticon = Strategy(args.emoticon)        # 0, 1 emoticon, 2 (emoticon), 3, 4, 5 (translation)
    self.url = Strategy(args.url)                  # 0, 1, 2, 3
    self.hashtag = Strategy(args.hashtag)          # 0, 1 = #hashtag, 2, 3 (#hashtag), 4, 5
    self.punctuation = Strategy(args.punctuation)  # valid values: 0, 3
    self.mention = Strategy(args.mention)          # 0, 1, 2, 3
    self.lower = args.lower                        # True or False
    self.lang = lang                               # 'EN' or 'IT'
    self.ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
    # The word-segmentation model is only needed for Italian text
    if self.lang == 'IT':
        self.lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')
    else:
        self.lm = None
    self.text_processor = TextPreProcessor(
        remove=[
            'email',    # raw or normalize
            'percent',  # raw or normalize; EN: percentage, IT: percentuale
            'money',    # raw or normalize; EN: money, IT: soldi. Check whether it also catches currencies
            'phone',    # raw or normalize; EN: phone, IT: telefono
            'time',     # raw or normalize; EN: time, IT: ore
            'date',     # raw or normalize; EN: date, IT: data
            'number',   # raw or normalize; EN: number, IT: numero
        ],
        annotate={},
        fix_html=True,
        unpack_hashtags=False,
        tokenizer=SocialTokenizer(lowercase=self.lower).tokenize,
        dicts=[emoticons])
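# A minimal construction sketch for the preprocessor above; the class name
# Preprocessor is an assumption, and the strategy codes follow the comments
# in __init__. Requires the repo's ./data files to be present.
from argparse import Namespace

args = Namespace(others=0, emoji=1, emoticon=1, url=0, hashtag=1,
                 punctuation=0, mention=1, lower=True)
preprocessor = Preprocessor(args, lang='IT')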
def fit(self, X, y=None):
    dtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({'words': [self.__keep_correctly_spelled(token, self.spell_) for token in X]})
    word_count = dict(Counter(" ".join(df['words']).split(" ")))
    word_count_df = pd.DataFrame.from_dict(word_count, orient='index').reset_index()
    word_count_df.columns = ['words', 'n_appearances']
    # Only keep actual words: at least 3 characters long, or a known stopword
    word_count_df['wordlength'] = word_count_df['words'].str.len()
    word_count_df = word_count_df[(word_count_df['wordlength'] >= 3) | (word_count_df['words'].isin(self.stopwords_list_))]
    word_count_df = word_count_df.sort_values('n_appearances', ascending=False).reset_index(drop=True)
    word_count_df['words'] = word_count_df['words'].str.lower()
    # wordninja expects a gzipped word list sorted by descending frequency
    lang_filepath = path.join(config_test['lang_path'], f'my_lang_{dtime}.txt.gz')
    word_count_df['words'].to_csv(lang_filepath, index=None, header=False, compression='gzip', encoding='utf-8')
    self.language_model_ = wordninja.LanguageModel(lang_filepath)
    return self
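# A minimal usage sketch for the fit above (the estimator class name is an
# assumption): fit on a token list, then segment concatenated words with the
# learned model. The printed split is indicative and depends on the corpus.
est = WordSegmenterEstimator()
est.fit(['covid', 'vaccine', 'lockdown', 'the', 'of'])
print(est.language_model_.split('covidlockdown'))  # indicative: ['covid', 'lockdown']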
def slice_word(word):
    # Note: loading the language model on every call is expensive; consider
    # hoisting it to module scope.
    wm = wordninja.LanguageModel('words.txt.gz')
    name_list = wm.split(word)
    # Drop single-character fragments left over from the split
    y = [s for s in name_list if not len(s) == 1]
    print(word, '-->', y)
    return y
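# Example call; the exact split depends on the frequencies in words.txt.gz,
# so the printed output is only indicative:
slice_word('johnsmith')   # johnsmith --> ['john', 'smith']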
def __init__(self, path_dict_zip=path_dict_zip__):
    self.name_class = 'TOKENIZER'
    self.status_update = False
    if path_dict_zip is not None and self.__check_valid_path(path_dict_zip):
        self.path_dict_zip = path_dict_zip
        logging.debug("The module loaded the input dictionary successfully")
    else:
        self.path_dict_zip = path_dict_zip_default__
        logging.debug(
            "If you don't change the input path_dict_zip, the module will use the default splitter"
        )
    self.spliter = wordninja.LanguageModel(self.path_dict_zip)
def test_custom_model(self):
    lm = wordninja.LanguageModel('test_lang.txt.gz')
    self.assertEqual(list(lm.split('derek')), ['der', 'ek'])
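# For context, a custom wordninja model file is just a gzip-compressed word
# list, one word per line, ordered from most to least frequent. The fixture
# above could be regenerated with a sketch like this (word list assumed):
import gzip

with gzip.open('test_lang.txt.gz', 'wt', encoding='utf-8') as f:
    f.write('\n'.join(['der', 'ek']))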
def update_spliter(self, path_dict_zip):
    # Bug fix: store and use the newly supplied path; the original ignored the
    # argument and reloaded the old self.path_dict_zip.
    self.path_dict_zip = path_dict_zip
    self.spliter = wordninja.LanguageModel(self.path_dict_zip)
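# Hypothetical usage of update_spliter (the class name Tokenizer and the
# dictionary path are assumptions; the split depends on the dictionary):
tok = Tokenizer()
tok.update_spliter('./dicts/custom_words.txt.gz')
print(tok.spliter.split('dictionaryswap'))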
import re

# Matches an uppercase letter that should have a space inserted before it
# (e.g. when expanding CamelCase hashtags)
pattern_space_before_capital = re.compile(r'((?<=[^\W[A-Z])[A-Z]|(?<=\S)[A-Z](?=[a-z]))')

from importlib import resources

############################
# Segmenter
###########################
import wordninja

with resources.path("src.resources", "italian_words.txt.gz") as italian_words_gz:
    segmenter = wordninja.LanguageModel(italian_words_gz)

############################
# unique_italian_words
###########################
with resources.path("src.resources", "parole_uniche.txt") as unique_italian_words_path:
    unique_italian_words = {word.rstrip().lower()
                            for word in open(unique_italian_words_path, 'r', encoding='utf8')
                            if word.rstrip().lower() != ''}

# from ekphrasis.classes.preprocessor import TextPreProcessor
# from ekphrasis.classes.tokenizer import SocialTokenizer
from src.data.preprocessing.dicts.emoticons import emoticons
from src.data.preprocessing.dicts.wrong_word import wrong_word
from src.data.preprocessing.dicts.abbreviations import abbr_word, acronyms

# import spacy_udpipe
# spacy_udpipe.download("it-postwita")
# nlp = spacy_udpipe.load("it-postwita")
# social_tokenizer = lambda text: [token.text for token in nlp(text)]
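# A minimal sketch of how the pieces above could be combined to expand an
# Italian hashtag; the function name and the fallback policy are assumptions,
# not part of the original module.
def split_hashtag(tag):
    # Insert a space before inner capitals, e.g. 'BuonaGiornata' -> 'Buona Giornata'
    spaced = pattern_space_before_capital.sub(r' \1', tag)
    if spaced != tag:
        return spaced
    # All-lowercase tags fall back to the statistical segmenter
    return ' '.join(segmenter.split(tag))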
import pandas as pd
import argparse
import emoji
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import re
import os
import json
import wordninja
import numpy as np

# comment by Rosario
ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')

def add_pred_pos(row, model, task):
    predictions, raw_outputs = model.predict([row.text_preprocessed])
    # Note: both branches currently return the same value; the 'opos'
    # distinction is kept for future task-specific handling.
    if task == 'opos':
        return predictions[0]
    else:
        return predictions[0]

def trainer(train_df, OUTPUT_DIR, preproc, args):
    script_dir = os.path.dirname(__file__)
    abs_file_path = os.path.join(script_dir, args.modelConf)
    with open(abs_file_path) as f:
        model_param = json.loads(f.read())
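# Hedged usage sketch for add_pred_pos: it expects a simpletransformers-style
# model whose .predict(list_of_texts) returns (predictions, raw_outputs). The
# DataFrame and model variable names here are assumptions.
df['prediction'] = df.apply(add_pred_pos, axis=1, model=model, task='opos')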
import pandas as pd
import wordninja as wn
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = set(nltk.corpus.words.words())
stop = stopwords.words('english')
# Replace wordninja's built-in English model with a custom COVID vocabulary
wn.DEFAULT_LANGUAGE_MODEL = wn.LanguageModel('covid_words.txt.gz')

## Deletes strings that aren't in English (including strings that aren't words at all)
def drop_nonword(word):
    word = " ".join(w for w in nltk.wordpunct_tokenize(word)
                    if w.lower() in words or not w.isalpha())
    return word

## Splits domains, removes stopwords, lemmatizes, and drops non-words
def cloud_prep(df):
    df = df.astype(str)
    df['Match'] = df['Match'].apply(wn.split)
    df['Match'] = df['Match'].apply(lambda x: [item for item in x if item not in stop])
    # Bug fix: explode returns a DataFrame, so reassign df rather than a column
    df = df.explode('Match', ignore_index=True)
    df = df.astype(str)
    df['Match'] = df['Match'].apply(lemmatizer.lemmatize)
    df['Match'] = df['Match'].apply(drop_nonword)
    return df  # assumed completion: hand the prepared frame back to the caller
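# A small smoke test for cloud_prep (the sample strings are made up, and the
# exact splits depend on covid_words.txt.gz and the installed NLTK corpora):
sample = pd.DataFrame({'Match': ['covidtracker', 'stayhomestaysafe']})
print(cloud_prep(sample))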