def __init__(self, args, lang):
    self.others = Strategy(args.others)            # valid values: 0, 1, 2
    self.emoji = Strategy(args.emoji)              # 0, 1 emoji, 2 (emoji), 3, 4, 5 (translation)
    self.emoticon = Strategy(args.emoticon)        # 0, 1 emoticon, 2 (emoticon), 3, 4, 5 (translation)
    self.url = Strategy(args.url)                  # 0, 1, 2, 3
    self.hashtag = Strategy(args.hashtag)          # 0, 1 = #hashtag, 2, 3 (#hashtag), 4, 5
    self.punctuation = Strategy(args.punctuation)  # valid values: 0, 3
    self.mention = Strategy(args.mention)          # 0, 1, 2, 3
    self.lower = args.lower                        # True or False
    self.lang = lang                               # 'EN' or 'IT'
    self.ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
    # The word-segmentation model is only needed for Italian text
    if self.lang == 'IT':
        self.lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')
    else:
        self.lm = None
    self.text_processor = TextPreProcessor(
        remove=[
            'email',    # raw or normalize
            'percent',  # raw or normalize; EN: percentage, IT: percentuale
            'money',    # raw or normalize; EN: money, IT: soldi. Check whether it also catches currencies
            'phone',    # raw or normalize; EN: phone, IT: telefono
            'time',     # raw or normalize; EN: time, IT: ore
            'date',     # raw or normalize; EN: date, IT: data
            'number',   # raw or normalize; EN: number, IT: numero
        ],
        annotate={},
        fix_html=True,
        unpack_hashtags=False,
        tokenizer=SocialTokenizer(lowercase=self.lower).tokenize,
        dicts=[emoticons])
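# A minimal construction sketch for the preprocessor above; the class name
# Preprocessor is an assumption, and the strategy codes follow the comments
# in __init__. Requires the repo's ./data files to be present.
from argparse import Namespace

args = Namespace(others=0, emoji=1, emoticon=1, url=0, hashtag=1,
                 punctuation=0, mention=1, lower=True)
preprocessor = Preprocessor(args, lang='IT')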
def fit(self, X, y=None):
    dtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({'words': [self.__keep_correctly_spelled(token, self.spell_) for token in X]})
    word_count = dict(Counter(" ".join(df['words']).split(" ")))
    word_count_df = pd.DataFrame.from_dict(word_count, orient='index').reset_index()
    word_count_df.columns = ['words', 'n_appearances']
    # Only keep actual words: at least 3 characters long, or a known stopword
    word_count_df['wordlength'] = word_count_df['words'].str.len()
    word_count_df = word_count_df[(word_count_df['wordlength'] >= 3) | (word_count_df['words'].isin(self.stopwords_list_))]
    word_count_df = word_count_df.sort_values('n_appearances', ascending=False).reset_index(drop=True)
    word_count_df['words'] = word_count_df['words'].str.lower()
    # wordninja expects a gzipped word list sorted by descending frequency
    lang_filepath = path.join(config_test['lang_path'], f'my_lang_{dtime}.txt.gz')
    word_count_df['words'].to_csv(lang_filepath, index=None, header=False, compression='gzip', encoding='utf-8')
    self.language_model_ = wordninja.LanguageModel(lang_filepath)
    return self
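# A minimal usage sketch for the fit above (the estimator class name is an
# assumption): fit on a token list, then segment concatenated words with the
# learned model. The printed split is indicative and depends on the corpus.
est = WordSegmenterEstimator()
est.fit(['covid', 'vaccine', 'lockdown', 'the', 'of'])
print(est.language_model_.split('covidlockdown'))  # indicative: ['covid', 'lockdown']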
def slice_word(word):
    # Note: loading the language model on every call is expensive; consider
    # hoisting it to module scope.
    wm = wordninja.LanguageModel('words.txt.gz')
    name_list = wm.split(word)
    # Drop single-character fragments left over from the split
    y = [s for s in name_list if not len(s) == 1]
    print(word, '-->', y)
    return y
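# Example call; the exact split depends on the frequencies in words.txt.gz,
# so the printed output is only indicative:
slice_word('johnsmith')   # johnsmith --> ['john', 'smith']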
def __init__(self, path_dict_zip=path_dict_zip__):
    self.name_class = 'TOKENIZER'
    self.status_update = False
    if path_dict_zip is not None and self.__check_valid_path(path_dict_zip):
        self.path_dict_zip = path_dict_zip
        logging.debug("The module loaded the input dictionary successfully")
    else:
        self.path_dict_zip = path_dict_zip_default__
        logging.debug(
            "If you don't change the input path_dict_zip, the module will use the default splitter"
        )
    self.spliter = wordninja.LanguageModel(self.path_dict_zip)
def test_custom_model(self):
    lm = wordninja.LanguageModel('test_lang.txt.gz')
    self.assertEqual(list(lm.split('derek')), ['der', 'ek'])
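# For context, a custom wordninja model file is just a gzip-compressed word
# list, one word per line, ordered from most to least frequent. The fixture
# above could be regenerated with a sketch like this (word list assumed):
import gzip

with gzip.open('test_lang.txt.gz', 'wt', encoding='utf-8') as f:
    f.write('\n'.join(['der', 'ek']))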
def update_spliter(self, path_dict_zip):
    # Bug fix: store and use the newly supplied path; the original ignored the
    # argument and reloaded the old self.path_dict_zip.
    self.path_dict_zip = path_dict_zip
    self.spliter = wordninja.LanguageModel(self.path_dict_zip)
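# Hypothetical usage of update_spliter (the class name Tokenizer and the
# dictionary path are assumptions; the split depends on the dictionary):
tok = Tokenizer()
tok.update_spliter('./dicts/custom_words.txt.gz')
print(tok.spliter.split('dictionaryswap'))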
import re

# Matches an uppercase letter that should have a space inserted before it
# (e.g. when expanding CamelCase hashtags)
pattern_space_before_capital = re.compile(r'((?<=[^\W[A-Z])[A-Z]|(?<=\S)[A-Z](?=[a-z]))')

from importlib import resources

############################
# Segmenter
###########################
import wordninja

with resources.path("src.resources", "italian_words.txt.gz") as italian_words_gz:
    segmenter = wordninja.LanguageModel(italian_words_gz)

############################
# unique_italian_words
###########################
with resources.path("src.resources", "parole_uniche.txt") as unique_italian_words_path:
    unique_italian_words = {word.rstrip().lower()
                            for word in open(unique_italian_words_path, 'r', encoding='utf8')
                            if word.rstrip().lower() != ''}

# from ekphrasis.classes.preprocessor import TextPreProcessor
# from ekphrasis.classes.tokenizer import SocialTokenizer
from src.data.preprocessing.dicts.emoticons import emoticons
from src.data.preprocessing.dicts.wrong_word import wrong_word
from src.data.preprocessing.dicts.abbreviations import abbr_word, acronyms

# import spacy_udpipe
# spacy_udpipe.download("it-postwita")
# nlp = spacy_udpipe.load("it-postwita")
# social_tokenizer = lambda text: [token.text for token in nlp(text)]
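# A minimal sketch of how the pieces above could be combined to expand an
# Italian hashtag; the function name and the fallback policy are assumptions,
# not part of the original module.
def split_hashtag(tag):
    # Insert a space before inner capitals, e.g. 'BuonaGiornata' -> 'Buona Giornata'
    spaced = pattern_space_before_capital.sub(r' \1', tag)
    if spaced != tag:
        return spaced
    # All-lowercase tags fall back to the statistical segmenter
    return ' '.join(segmenter.split(tag))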
import pandas as pd
import argparse
import emoji
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import re
import os
import json
import wordninja
import numpy as np

# comment by Rosario
ita_moji = pd.read_csv('./data/italianMoji.csv', sep=';')
lm = wordninja.LanguageModel('./data/words.last_all.txt.gz')

def add_pred_pos(row, model, task):
    predictions, raw_outputs = model.predict([row.text_preprocessed])
    # Note: both branches currently return the same value; the 'opos'
    # distinction is kept for future task-specific handling.
    if task == 'opos':
        return predictions[0]
    else:
        return predictions[0]

def trainer(train_df, OUTPUT_DIR, preproc, args):
    script_dir = os.path.dirname(__file__)
    abs_file_path = os.path.join(script_dir, args.modelConf)
    with open(abs_file_path) as f:
        model_param = json.loads(f.read())
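# Hedged usage sketch for add_pred_pos: it expects a simpletransformers-style
# model whose .predict(list_of_texts) returns (predictions, raw_outputs). The
# DataFrame and model variable names here are assumptions.
df['prediction'] = df.apply(add_pred_pos, axis=1, model=model, task='opos')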
import pandas as pd
import wordninja as wn
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = set(nltk.corpus.words.words())
stop = stopwords.words('english')
# Replace wordninja's built-in English model with a custom COVID vocabulary
wn.DEFAULT_LANGUAGE_MODEL = wn.LanguageModel('covid_words.txt.gz')

## Deletes strings that aren't in English (including strings that aren't words at all)
def drop_nonword(word):
    word = " ".join(w for w in nltk.wordpunct_tokenize(word)
                    if w.lower() in words or not w.isalpha())
    return word

## Splits domains, removes stopwords, lemmatizes, and drops non-words
def cloud_prep(df):
    df = df.astype(str)
    df['Match'] = df['Match'].apply(wn.split)
    df['Match'] = df['Match'].apply(lambda x: [item for item in x if item not in stop])
    # Bug fix: explode returns a DataFrame, so reassign df rather than a column
    df = df.explode('Match', ignore_index=True)
    df = df.astype(str)
    df['Match'] = df['Match'].apply(lemmatizer.lemmatize)
    df['Match'] = df['Match'].apply(drop_nonword)
    return df  # assumed completion: hand the prepared frame back to the caller
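# A small smoke test for cloud_prep (the sample strings are made up, and the
# exact splits depend on covid_words.txt.gz and the installed NLTK corpora):
sample = pd.DataFrame({'Match': ['covidtracker', 'stayhomestaysafe']})
print(cloud_prep(sample))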