def char_grams(data_texts, n):
    """Build a character n-gram frequency vector for each text.

    Args:
        data_texts: Iterable of token lists; each list is joined into a
            single string with no separator before extracting n-grams.
        n (int): Size of the character n-gram window.

    Returns:
        list: One frequency vector per input text, as produced by
        ``freq_dict`` over the per-text n-gram counts.
    """
    vectors = []
    for text in data_texts:
        # Join the token list into one string without spaces.
        text_union = "".join(text)
        # Slide a window of size n over the string.
        ngrams = [text_union[i:i + n] for i in range(len(text_union) - n + 1)]
        # Fresh count dictionary per text; keys are the known
        # alphanumeric n-grams provided by Utils.ngrams(n).
        dict_alpha_num = Utils.ngrams(n)
        for gram in ngrams:
            # N-grams outside the known alphabet are silently ignored.
            # (Removed leftover debug prints that dumped every n-gram,
            # every count, and a "value not in dictionary" message.)
            if gram in dict_alpha_num:
                dict_alpha_num[gram] += 1
        vectors.append(freq_dict(dict_alpha_num))
    return vectors
from spacymoji import Emoji from tweetlib.singleton import Utils from tweetlib.definitions import TaggingMethod # import es_core_news_md # from collections import Counter # from spacy.tokens import retokenize # TODO: use Singleton of nlp # nlp = Singleton.get_nlp() nlp = Utils.load_nlp(TaggingMethod.SPACY) # text = ['🤾', '🏻\u200d', '♀', '️', 'La', 'selección', 'española', 'vence', 'a', 'Noruega', 'y', 'se', 'coloca', 'a', 'un', 'paso','de','ganar','el','Mundial','femenino','de','balonmano','👏','🏻','\n','¡','Muc','…','https://t.co/IbVGXUz8Lb'] # text = [ '\n', '🤾','♀', '👏', '🏻\u200d', '️', 'La', 'selección', 'española', 'vence', 'a', 'Noruega', 'y', '', '🏻','\n','¡','Muc','…', 'https://t.co/IbVGXUz8Lb'] # text = ['🤾', '🏻\u200d', '♀', '️', 'La', 'selección', 'española'] # text = ['🤾', '♀', '️👏', '️👏', 'La', 'selección', 'española'] # TODO: use snake case (snake_case vs. CamlCase) for function identifiers def rm_emoticons(text: list): """Elimina los emoji de un corpus Args: text (list[str]): Lista de tokens. nlp ([type]): Biblioteca de spacy, para utilizar sus atributos, en este caso "emoji". Returns: list[str]: Lista sin emoji. """ # nlp = es_core_news_lg.load() # list_within_emoji = [] # emoji = Emoji(nlp)
def run(self):
    """Execute the configured task: model validation, storage, or prediction.

    Reads the dataset and pipeline configuration, applies the configured
    preprocessing methods to the texts, encodes them into feature vectors,
    and dispatches to the task-specific routine resolved from ``dict_task``.
    """
    # Data and target classes from the dataset.
    data = self.dataset.get_data()
    y = self.dataset.get_y()

    # Pipeline configuration.
    preprocessing_list = self.config.get_preprocessing_methods()
    encoding = self.config.get_encoding_method()
    classifier_type = self.config.get_classification_method()
    tagging_method_type = self.config.get_tagging_method()

    # Initialize the tagging library to use.
    nlp = Utils.load_nlp(tagging_method_type)

    # Work on a copy so the original dataset is not mutated.
    data_texts = data.copy()

    # Resolve the callable implementing the requested task.
    type_task = dict_task[self.task]

    if self.task.name in ('VALIDATE_MODEL', 'MODEL_STORAGE', 'PREDICTION'):
        if self.task.name == 'PREDICTION':
            # Predictions run on caller-supplied text, not the dataset.
            data_texts = self.text

        # Apply each configured preprocessing method in place.
        # NOTE(review): the original had two byte-identical branches here
        # (one for LOWERCASE/REMOVE_STOP_WORDS/MENTIONS, one for the rest);
        # the dead conditional is collapsed into a single loop.
        for preprocessing in preprocessing_list:
            prep_method = dict_preprocessing[preprocessing]
            for idx, text in enumerate(data_texts):
                data_texts[idx] = prep_method(text)

        # Encode the texts into feature vectors. Character n-gram
        # encodings take only the texts; the remaining (POS-based)
        # encodings also need the tagging method name and the nlp object.
        # NOTE(review): BIGRAM/TRIGRAM/CUATRIGRAM and ALL_CHARGRAM called
        # the encoder identically, as did POS_ALL_CHARGRAM and the final
        # else — the redundant branches are merged.
        encoding_method = dict_encoding[encoding]
        if encoding.name in ('BIGRAM', 'TRIGRAM', 'CUATRIGRAM', 'ALL_CHARGRAM'):
            vector_encoding = encoding_method(data_texts)
        else:
            # POS_ALL_CHARGRAM and the POS-tagging encodings.
            vector_encoding = encoding_method(
                data_texts, tagging_method_type.name, nlp)

        # Stack per-text vectors into the feature matrix.
        X = np.vstack(vector_encoding)

        if self.task.name == 'VALIDATE_MODEL':
            # Returns a tuple: the list of accuracies and their average.
            validate_model = type_task(X, y, classifier_type)
        elif self.task.name == 'PREDICTION':
            # n_value: number of best-precision classes to report.
            predict_method = type_task(self.model, X, self.n_value)
        else:
            # MODEL_STORAGE: persist the trained model and its config.
            model_storage = type_task(
                self.id_model, self.config, X, y, classifier_type)