def __init__(self, language='en', database_name='memory', memory_table='memory',
             listen_log_table='listen_log', speak_log_table='speak_log'):
    super().__init__(language, database_name, memory_table, listen_log_table, speak_log_table)
    try:
        # Load the serialized Keras model and its weights, then compile it.
        json_file = open('modelo_gustos.json', 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        self.model = model_from_json(loaded_model_json)
        self.model.load_weights("modelo_gustos.h5")
        self.model.compile(loss='mean_squared_error', optimizer='adam',
                           metrics=['binary_accuracy'])
    except Exception:
        print('****ERROR: Error cargando modelo...****')
    self.stemmer = SpanishStemmer()
    self.words = [
        '¿qu', '?', 'peli', 'pelis', 'color', 'favorit', 'leer', 'libr', 'novel',
        'ver', 'prefier', 'gust', 'pelicul', 'jug', '¿cual', 'prefer', 'jueg',
        'com', 'plat', 'animal', 'videojueg'
    ]
    self.classes = ['comida', 'color', 'animal', 'juego', 'libro', 'película']
def process_violence(lang, data_path, stopword_path, save_path):
    if lang == "English":
        stemmer = EnglishStemmer()
    elif lang == "Spanish":
        stemmer = SpanishStemmer()
    else:
        stemmer = None
    print("loading dataset")
    line_sentences = ProcessLineSentence(dataPath=data_path,
                                         label="violence",
                                         stopwordPath=stopword_path,
                                         stemmer=stemmer)
    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        for sentence, label in line_sentences:
            if label == "no":
                l = [0]
            elif label == "violence":
                l = [1]
            elif label == "malpractice":
                l = [2]
            else:
                raise Exception("Wrong label: {}".format(label))
            writer.writerow(l + sentence)
def __init__(self, min_long=5):
    """
    To initialize a `Tokenizer` we need the minimum number of characters
    `min_long` that constitute a valid word.

    :param min_long: an integer. Defaults to five (5).
    """
    self.stemmer = SpanishStemmer()
    self.min_long = min_long
def find_top_N_words(lang_entries, top_N, lang):
    dictionary = Lang_Dictionary({}, lang)
    for player in lang_entries:
        for chat in player.c:
            language = {'eng': 0, 'spn': 0, 'other': 0, 'tot': 0}
            sentence = player.c[chat]
            newlist = player.c[chat].strip().split(' ')
            newlist = [x.strip("''") for x in newlist]
            # count how many tokens look English, Spanish, or other
            for word in newlist:
                language['tot'] += 1
                if word.lower() not in Lang_dicts.lang_index:
                    language['other'] += 1
                else:
                    word = Lang_dicts.lang_index[word.lower()]
                    if word == "english":
                        language['eng'] += 1
                    elif word == "spanish":
                        language['spn'] += 1
                    else:
                        language['other'] += 1
            if language['other'] < 2 * (language['spn'] + language['eng']):
                print(sentence)
            if language['spn'] > language['eng']:
                print("SPANISH")
                stemmer = SpanishStemmer()
            else:
                print("ENGLISH")
                stemmer = EnglishStemmer()
            aslist = []
            aslist += sentence
            sentence = ""
            j = ''.join(aslist)
            words = j.split(' ')
            for line in words:
                line = str(line).replace('\'', '')
                line = line.replace('""', '')
                line = line.replace('"', '')
                if len(line) > 0:
                    if language["other"] < 2 * (language['spn'] + language["eng"]):
                        sentence += stemmer.stem(line) + " "
                        print(sentence)
                    ## INEFFICIENT - looking through dictionary each time?
                    if line.lower() not in dictionary.d:
                        dictionary.d[line.lower()] = 0
                    dictionary.d[line.lower()] += 1
    ### wthCounts is a list of the word and its count
    wthCounts = []
    for (w, c) in dictionary.d.items():
        wthCounts += [(c, w)]
    ## wc is the wthCounts list, only sorted
    wc = sorted(wthCounts, reverse=True)
    return wc[:top_N]
def build_paragraph_inv_index(paragraphs, stem):
    p_index = {}
    stemmer = SpanishStemmer()
    for i, paragraph in enumerate(paragraphs):
        words = [word for word in paragraph.split() if word not in STOP_WORDS]
        for word in words:
            if stem:
                word = stemmer.stem(word)
            if word not in p_index:
                p_index[word] = []
            p_index[word].append(i)
    return p_index
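# Hypothetical usage sketch (not part of the original source). STOP_WORDS is the
# module-level stopword collection the function reads; a toy set is used here.
STOP_WORDS = {'la', 'de', 'el'}
paragraphs = ['la ley de aguas', 'reforma de la ley']
print(build_paragraph_inv_index(paragraphs, stem=True))
# maps each (optionally stemmed) term to the indices of the paragraphs containing it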
def __init__(self, lemma=False, stem=False):
    self.extra_dicts = Dicts()
    self.english_dict = enchant.Dict("en_EN")
    self.spanish_dict = enchant.Dict("es_AR")
    self.lemma = lemma
    self.stem = stem
    self.VARIANT_CLASS = 0
    self.SPANISH_CLASS = 1
    self.FOREIGN_CLASS = 2
    if lemma:
        self.lemmatizer = Lemmatizer()
    if stem:
        self.stemmer = SpanishStemmer()
def __init__(self, stem=False):
    dictionaries = dicts()
    path = '/home/alangb/TWPP'  # path to TreeTagger installation directory
    self.english_dict = enchant.Dict("en_EN")
    self.spanish_dict = enchant.Dict("es_ES")
    self.ND = dictionaries.norm
    self.SD = dictionaries.lemario
    self.PND = dictionaries.names
    self.stem = stem
    if stem:
        self.stemmer = SpanishStemmer()
    else:
        self.tagger = TreeTagger(TAGLANG='es', TAGDIR=path)
def __init__(self):
    self.reglasEntities.append(EmailRegla())
    self.reglasEntities.append(UrlRegla())
    self.reglasEntities.append(FechasRegla())
    self.reglasEntities.append(TelefonosRegla())
    self.reglasEntities.append(AbreviaturasRegla())
    self.reglasEntities.append(NombresPropiosRegla())
    self.reglasEntities.append(NumerosRegla())
    self.reglasDocumento.append(MinusculasRegla())
    self.reglasDocumento.append(TranslateRegla())
    self.reglasDocumento.append(LimpiarHtmlTagsRegla())
    self.reglasDocumento.append(LimpiadoBasicoRegla())
    self.reglasTokens.append(MinMaxCaracteresRegla())
    self.stemmer = SpanishStemmer()
def run_BM25_collection(output_dir, documents, queries, qrels, train, validation, test, k, language):
    if language == 'en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    elif language == 'fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    elif language == 'es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
    elif language == 'it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    corpus = []
    doc_indexes = []
    for key, value in documents.items():
        doc_indexes.append(key)
        doc = [stemmer.stem(elem) for elem in value.split(" ") if elem not in stop_words]
        # index the stemmed, stopword-filtered document, consistent with the
        # query-side processing in run_BM25_query
        corpus.append(doc)
    bm25 = BM25Okapi(corpus)
    print("Running BM25", flush=True)
    results = dict()
    for i, elem in enumerate(train):
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
        if i % 1000 == 0:
            print('Processing query', i, '/', len(train), flush=True)
    save_BM25_res(output_dir + '/training/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/training/BM25.qrels.csv', results, qrels, True)
    results = dict()
    for elem in validation:
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
    save_BM25_res(output_dir + '/validation/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/validation/BM25.qrels.csv', results, qrels, False)
    results = dict()
    for elem in test:
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
    save_BM25_res(output_dir + '/test/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/test/BM25.qrels.csv', results, qrels, False)
def __init__(self):
    self.tweets = 0
    self.related_tweets = 0
    self.stopwords = {}
    self.stemmers = {}
    self.stemmers["es"] = SpanishStemmer()
    self.stemmers["en"] = PorterStemmer()
    self.stemmers["fr"] = FrenchStemmer()
    self.stemmers["de"] = GermanStemmer()
    self.stopwords["es"] = self.load_stopwords_file("spanish_stopwords.txt")
    self.stopwords["en"] = self.load_stopwords_file("english_stopwords.txt")
    self.stopwords["fr"] = self.load_stopwords_file("french_stopwords.txt")
    # note: German stopwords are keyed "ge", unlike the "de" stemmer key
    self.stopwords["ge"] = self.load_stopwords_file("german_stopwords.txt")
    self.output_file = open(sys.argv[2], 'a')
def __init__(self, question, words, stem):
    self.question = question
    self.stem = stem
    self.stemmer = SpanishStemmer()
    self.words = words
    self.stemmed_words = self.stem_words(self.words)
    self.path_pfx = os.getcwd()
    self.inverted_index = self.load_doc_inverted_index()
    self.doc_names = self.init_doc_names()
    self.paragraph_indices = {}
    self.paragraph_inverted_indices = {}
    self.results = pd.DataFrame(columns=['text', 'law', 'score'])
    self.load_paragraph_indices()
    self.L = 23055.676666666666  # Manually obtained using bash
    self.scores = {'tf': {}, 'idf': {}, 'tfidf': {}, 'n_containing': {}, 'score': {}}
def generate_stopwords(stopname='stopSpanish.pkl'):
    """Build, pickle, and return a Spanish stopword list (stemmed forms included)."""
    stemmer = SpanishStemmer()
    stopwords_es = set(stopwords.words('spanish'))
    stopwords_es_sw = set(get_stop_words('spanish'))
    stopSpanishBeta = list(set(stopwords_es.union(stopwords_es_sw)))
    stopSpanish = set(stopwords_es.union(stopwords_es_sw))
    for stopWord in stopSpanishBeta:
        stopSpanish.add(stemmer.stem(stopWord))
    stopSpanish = list(stopSpanish)
    stopSpanish.extend(['tra', 'd', 'desc'])  # Add stopwords not present in the standard lists
    stopSpanish.remove('no')  # Keep 'no' to help identify negative categories
    with open(f'{resource_path}/{stopname}', 'wb') as f:
        pickle.dump(stopSpanish, f)
    return stopSpanish
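# Hypothetical usage sketch (not part of the original source). It assumes the
# module-level `resource_path` points to an existing, writable directory; the
# function both returns the combined stopword list and pickles it there.
stopSpanish = generate_stopwords('stopSpanish.pkl')
print(len(stopSpanish), 'no' in stopSpanish)  # 'no' was removed on purpose, so this prints False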
def build_index_from_words(words, stem):
    '''
    Takes:
      - words, a list of strings
      - stem, a boolean; if True each word is stemmed before counting

    Returns:
      - index, a dictionary with a count of times a word appears in the document
    '''
    index = {}
    stemmer = SpanishStemmer()
    for word in words:
        if word not in STOP_WORDS:
            if stem:
                word = stemmer.stem(word)
            if word not in index:
                index[word] = 0
            index[word] += 1
    return index
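# Hypothetical usage sketch (not part of the original source); STOP_WORDS is
# again assumed to be the module-level stopword set, here a toy one.
STOP_WORDS = {'la', 'de', 'el', 'y'}
print(build_index_from_words('la ley regula la pesca y la caza'.split(), stem=False))
# -> {'ley': 1, 'regula': 1, 'pesca': 1, 'caza': 1}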
class GigawordParser(StreamParser):
    STEMMERS = {
        "eng": PorterStemmer(ignore_stopwords=False),
        "spa": SpanishStemmer(),
    }

    def __init__(self, language):
        self.next_id = 0
        self.language = language
        self.stemmer = self.STEMMERS.get(language)
        if self.stemmer is None:
            raise Exception("Unsupported language %s" % language)

    def init_id_counter(self, initial):
        self.next_id = initial

    def new_id(self):
        new_id = self.next_id
        self.next_id += 1
        return new_id

    def parse_raw(self, xml_str):
        xml = minidom.parseString(xml_str)
        if self.language == "es":
            try:
                url = "gigaword:" + xml.getElementsByTagName(
                    "DOC")[0].attributes["id"].value
                title = xml.getElementsByTagName(
                    "HEADLINE")[0].firstChild.nodeValue
            except:
                url = "<NONE>"
                title = "<NONE>"
        else:
            url = "<NONE>"
            title = "<NONE>"
        text = stringio.StringIO()
        for node in xml.getElementsByTagName("TEXT")[0].childNodes:
            if len(node.childNodes) > 0:
                text.write(node.firstChild.nodeValue)
        content = text.getvalue()
        terms = text_to_terms(content, self.language)
        return RuwacDocument(self.new_id(), url, title, content, terms)
def lemmatize(self, text, lang):
    # spacy.prefer_gpu()
    # nlp = spacy.load(lang)  # en fr "en_core_web_sm"
    if lang == "fr":
        stemmer = FrenchStemmer()
    elif lang == "es":
        stemmer = SpanishStemmer()
    else:
        stemmer = EnglishStemmer()
    stemmed = []
    for word in text.split(" "):
        stemmed.append(stemmer.stem(word))
    # doc = nlp(u"" + text)
    # lem_terms = []
    # for token in doc:
    #     lem_terms.append(token.lemma_)
    return " ".join(stemmed)
def run_BM25_query(query, bm25, doc_indexes, k, language):
    if language == 'en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    elif language == 'fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    elif language == 'es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
    elif language == 'it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    tokenized_query = [stemmer.stem(elem) for elem in query.split(" ") if elem not in stop_words]
    doc_scores = bm25.get_scores(tokenized_query)
    top_k = np.argsort(doc_scores)[::-1][:k]
    results = [[doc_indexes[key], doc_scores[key]] for key in top_k]
    return results
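# Hypothetical usage sketch (not part of the original source): builds a tiny
# BM25Okapi index (rank_bm25) over two stemmed documents and scores one Spanish
# query. Assumes the NLTK 'stopwords' corpus has been downloaded.
from rank_bm25 import BM25Okapi
from nltk.stem.snowball import SpanishStemmer
from nltk.corpus import stopwords

_stop = set(stopwords.words('spanish'))
_stem = SpanishStemmer()
docs = {'d1': 'reforma de la ley de aguas', 'd2': 'presupuesto anual de salud'}
doc_indexes = list(docs.keys())
corpus = [[_stem.stem(w) for w in text.split(" ") if w not in _stop]
          for text in docs.values()]
bm25 = BM25Okapi(corpus)
print(run_BM25_query('ley de aguas', bm25, doc_indexes, k=2, language='es'))
# -> [[doc_id, score], ...] with the water-law document ranked first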
def process_election(lang, data_path, stopword_path, save_path):
    if lang == "English":
        stemmer = EnglishStemmer()
    elif lang == "Spanish":
        stemmer = SpanishStemmer()
    else:
        stemmer = None
    print("loading dataset")
    line_sentences = ProcessLineSentence(dataPath=data_path,
                                         label="election",
                                         stopwordPath=stopword_path,
                                         stemmer=stemmer)
    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        for sentence, label in line_sentences:
            if label == "yes":
                l = [1]
            else:
                l = [0]
            row = [w.encode('utf-8') for w in sentence]
            writer.writerow(l + row)
def getfeats(fields, o):
    """ This takes the word in question and the offset with respect to the instance word """
    word = fields[0]
    stemmer = SpanishStemmer()
    with_hyphen = 0
    if "-" in word:
        with_hyphen = 1
    with_apostrophe = 0
    if "'" in word:
        with_apostrophe = 1
    o = str(o)
    features = [
        (o + "word", word),
        (o + 'pos', fields[1]),
        #(o + 'prefix1', word[:1]),
        (o + 'prefix2', word[:2]),
        (o + 'prefix3', word[:3]),
        (o + 'prefix4', word[:4]),
        #(o + 'suffix1', word[-1:]),
        (o + 'suffix2', word[-2:]),
        (o + 'suffix3', word[-3:]),
        (o + 'suffix4', word[-4:]),
        (o + 'is_upper', word.isupper()),
        (o + 'is_title', word.istitle()),
        (o + 'is_digit', word.isdigit()),
        (o + 'with_hypen', with_hyphen),
        (o + 'with_apostrophe', with_apostrophe),
        (o + 'spanich_stem', stemmer.stem(word)),
        # (o + 'word_shape', word_shape(word))
    ]
    return features
def spanish_swadesh_list(stemmed=True):
    """
    Helper function that returns a list of strings with the stems of the
    Spanish Swadesh entries.
    """
    try:
        stemmer = SpanishStemmer(True)
    except:
        log.warn("Spanish stemmer could not be loaded!")
        return
    swadesh_entries = []
    for line in util.read_text_file(
            util.data_path('swadesh', 'swadesh_spa.txt'), lines=True):
        line = line.strip()
        for e in line.split(","):
            e = e.strip()
            if stemmed:
                stem = stemmer.stem(e)
                swadesh_entries.append(stem)
            else:
                swadesh_entries.append(e)
    return swadesh_entries
def tokenizer_stemmer_global(document):
    stemmer = SpanishStemmer()
    my_tokenizer = RegexpTokenizer(r"[\w']+")
    return [stemmer.stem(token) for token in my_tokenizer.tokenize(document)]
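# Hypothetical usage sketch (not part of the original source): tokenizes a raw
# Spanish sentence and returns one Snowball stem per token.
print(tokenizer_stemmer_global("Los niños estaban jugando en el parque"))
# e.g. 'jugando' is reduced to the stem 'jug'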
def __init__(self):
    super(StemmerProcessor, self).__init__()
    self.stemmer = SpanishStemmer()
There is a multitude of "stemmers"; here I will pick the Spanish one.
'''
from nltk.stem.snowball import SnowballStemmer, SpanishStemmer

# As with punkt, a data package has to be downloaded first
download('stopwords')

# If we do not know the language beforehand:
# SnowballStemmer(language, ignore_stopwords=False)
spanish_stem = SnowballStemmer("spanish", True)

# If we know the language in advance, we can import it directly:
# SpanishStemmer(ignore_stopwords=False)
spanish_stem = SpanishStemmer(True)

print(spanish_stem.stem("Comiendo"), spanish_stem.stem("Bailando"),
      spanish_stem.stem("bailar"), spanish_stem.stem("estantería"))

'''################################
# Recovering the original verb    #
####################################

Known as lemmatization. NLTK does not offer this for Spanish, only for English.
'''
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

# As with punkt, a data package has to be downloaded first
download('wordnet')
import csv
import collections
import operator
import unicodedata
import os

cwd = os.getcwd()
root = os.path.dirname(cwd)
lematizador_dir = os.path.join(root, "data", "lematizador", "lematizador.csv")
stopwords_dir = os.path.join(root, "data", "stopwords")

from spellchecker import SpellChecker  # https://pypi.org/project/pyspellchecker/

# Stemmer
from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer()

# Object for spelling correction
spell = SpellChecker(language="es")
metodo_desconocidas = spell.unknown
metodo_correccion = spell.correction

lista_blanca_regiones = [
    "Arica", "Parinacota", "Tarapacá", "Antofagasta", "Atacama", "Coquimbo",
    "Valparaíso", "Metropolitana", "Santiago", "Libertador", "General",
    "Bernardo", "O’Higgins", "Maule", "Ñuble", "Biobío", "Araucanía", "Ríos",
    "Lagos", "Aysén", "General", "Carlos", "Ibáñez", "Campo", "Magallanes",
    "Antártica"
]
lista_blanca_telecom = [
    'lte', 'whatsapp', 'instagram', 'telegram', 'youtube', 'facebook', 'entel',
    'bafi', 'resetea', 'samsung', 'huawei', 'iphone', 'kb', 'mb', 'pixi',
def do_stemmer(df, stop_language='spanish'):
    """Apply stop words and stemming."""
    ## Since tickets arrive in two languages, we add the words of both languages
    stop = get_stop_words(stop_language) + get_stop_words('english')
    ## We add our own words
    stop += [
        "buenas", "buenos", "cid", "dias", "gracias", "hola", "mucho", "mucha",
        "poder", "proyecto", "please", "saludo", "tardes", "www", "habia"
    ]
    stop += [
        'ahora', 'algun', 'alguna', 'amanecia interrumpio',
        'amanecia interrumpio relato', 'amanecia interrumpio relato habian',
        'amanecia interrumpio relato habian dado', 'aquel', 'asi', 'aun',
        'cada', 'vez', 'mas', 'cualquier', 'cosa', 'cuanto', 'dado', 'darse',
        'debe', 'debia', 'despues', 'dia noche', 'dia siguiente', 'diez años',
        'diez mil', 'dijo', 'dijo', 'dio', 'habia', 'mas', 'podia', 'podian',
        'mismo', 'si', 'tal', 'tan', 'puede', 'pueden ser', 'pues', 'puso',
        'toda', 'todas', 'vease tambien', 'primer lugar', 'varias', 'dos',
        'largo', 'hacia', 'uno', 'una', 'unos', 'una', 'aquella', 'aquello',
        'aquel', 'hace', 'muchas', 'mucho', 'muchos', 'mucha', 'pueden',
        'puedo', 'unas', 'abrio puerta', 'arriba abajo', 'aqui alla', 'habian',
        'doña', 'don', 'señor', 'señora', 'hizo', 'quedo', 'fuerza sino',
        'quedo perplejo', 'parece haber', 'parece ser', 'parecia haber',
        'mayor parte', 'mañana siguiente', 'media hora', 'hoy dia', 'iba ser',
        'iii pag', 'haber hecho', 'habria podido', 'hacer cosas',
        'hacia arriba', 'hacia atras', 'hacia puerta', 'hacia tiempo',
        'decir verdad', 'dejo caer', 'demasiado tarde', 'derecha izquierda',
        'di cuenta', 'dia anterior', 'dia noche', 'dia siguiente',
        'casi siempre', 'cierto dia', 'cierto modo', 'cinco años', 'aqui alla',
        'arriba abajo', 'aunque solo', 'año nuevo', 'años edad', 'buena parte',
        'ninguna parte', 'noche anterior', 'noche dia', 'nunca visto',
        'partido comunista', 'podria haber', 'podria ser', 'press cambridge',
        'primer lugar', 'quiere decir', 'quiero decir', 'sentido comun',
        'seria mejor', 'tras haber', 'tres años', 'tres cuatro', 'tres meses',
        'voz alta', 'voz baja',
    ]
    stop_words_generated_tokens = [
        'abajo', 'abrio', 'alla', 'alta', 'amanecia', 'anterior', 'aqui',
        'aren', 'arriba', 'atras', 'aunque', 'año', 'años', 'baja', 'buena',
        'caer', 'cambridge', 'can', 'casi', 'cierto', 'cinco', 'comun',
        'cosas', 'couldn', 'cuatro', 'cuenta', 'decir', 'dejo', 'demasiado',
        'di', 'dia', 'didn', 'diez', 'doesn', 'edad', 'haber', 'habria',
        'hacer', 'hacia', 'hadn', 'hasn', 'haven', 'hecho', 'hora', 'hoy',
        'iba', 'iii', 'isn', 'let', 'll', 'lugar', 'mayor', 'mañana', 'media',
        'mejor', 'meses', 'modo', 'mustn', 'ninguna', 'noche', 'nuevo',
        'nunca', 'pag', 'parece', 'parecia', 'parte', 'partido', 'podido',
        'podria', 'puerta', 'quiere', 'quiero', 're', 'relato', 'sentido',
        'ser', 'seria', 'shan', 'shouldn', 'siempre', 'siguiente', 'sino',
        'solo', 'tambien', 'tarde', 'tiempo', 'tras', 'tres', 've', 'vease',
        'visto', 'wasn', 'weren', 'won', 'wouldn'
    ]
    stop += stop_words_generated_tokens

    ps = SpanishStemmer()
    a = []
    df["stem"] = "n"
    for i, row in df.iterrows():
        a.append(
            ps.stem(row["text"]).replace('fuerza sino', '').replace(
                'acceder', 'acceso').replace('user', 'usuario').replace(
                    'access', 'acceso').replace('usuarios', 'usuario').replace(
                        'abrio puerta', '').replace('acto seguido', ''))
    df["stem"] = a
    return df, stop
# The next step is to connect Spanish translations that contain the same stem. For this we first remove certain stop words from the translation (list of stopwords from NLTK). There are two cases then: just one word remains, or more than one word remains.
#
# We have two options for what to do with the latter: either they are not connected with anything at all (default behaviour), or each word is stemmed and the translation is connected with every other translation that contains the same stems. Right now this results in many connections that do not look very useful. This should be done in a more intelligent way in the future (for example, find the heads of phrases in multiword expressions and only connect those; split the weight of the connections between all stems and work with weighted graphs from this step on; ...).
#
# To connect the Spanish translations the script adds additional "stem nodes" to the graph. The name of these nodes consists of a Spanish word stem plus a pipe symbol plus the string "stem". These nodes look like this in a dot file:
#
# > "tom|stem" [is_stem=True];
#
# The introduction of these nodes later facilitates the output of translation matrices, as you can just search for stems within the graph and only output direct neighbours with Spanish translations. It would also be possible to directly connect the Spanish translations if they have a matching stem, but then the graph traversal to find matching translations and their heads is a bit more complex later.
#
# First we create a stemmer object from the SpanishStemmer in NLTK:

# <codecell>

from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer(True)

# <markdowncell>

# We create the list of stopwords and encode them as unicode strings:

# <codecell>

combined_graph_stemmed = copy.deepcopy(combined_graph)
stopwords = nltk.corpus.stopwords.words("spanish")
stopwords = [w.decode("utf-8") for w in stopwords]

# <markdowncell>

# Then we loop through all the nodes of the merged graph and add the stem nodes to each Spanish node. If the node has only one word (after stopword removal) we will use the NLTK stemmer; otherwise we just leave the phrase as it is:
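# <markdowncell>

# Below is a minimal sketch of that loop (the original cell is not shown here). It assumes `combined_graph_stemmed` is a networkx graph and that Spanish translation nodes can be recognized somehow; the `is_spanish_node` helper is hypothetical and stands in for whatever check the real notebook uses.

# <codecell>

for node in list(combined_graph_stemmed.nodes()):
    if not is_spanish_node(node):  # hypothetical check for Spanish translation nodes
        continue
    words = [w for w in node.split() if w not in stopwords]
    if len(words) == 1:
        # single remaining word: attach a "<stem>|stem" node as described above
        stem_node = stemmer.stem(words[0]) + "|stem"
        combined_graph_stemmed.add_node(stem_node, is_stem=True)
        combined_graph_stemmed.add_edge(stem_node, node)
    # multiword phrases are left as they are (default behaviour)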
def stemmer_all(tweet):
    stm = SpanishStemmer()
    split_tweet = [word for word in tweet.lower().split(' ') if word.strip()]
    return ' '.join([stm.stem(word.strip()) for word in split_tweet])
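# Hypothetical usage sketch (not part of the original source): extra whitespace
# is dropped by the strip() check and every remaining token is stemmed.
print(stemmer_all("Me  ENCANTA   la música latina"))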
def remove_stopwords(text, stopSpanish):
    stemmer = SpanishStemmer()
    textList = text.split()
    textList = [word for word in textList if word not in stopSpanish]
    return ' '.join([stemmer.stem(word) for word in textList])
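# Hypothetical usage sketch (not part of the original source), using a tiny
# hand-made stopword list instead of the pickled one from generate_stopwords():
stopSpanish = ['me', 'la', 'de']
print(remove_stopwords("No me gusta la nueva factura", stopSpanish))
# stopwords are dropped and the remaining words are stemmed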
# -*- coding: utf-8 -*-
"""
Created on Mon May 6 16:07:59 2019

@author: Turing
"""
from bs4 import BeautifulSoup as Soup
from _pickle import dump
from nltk.stem.snowball import SpanishStemmer

handler = open('senticon.es.xml', encoding="utf-8").read()
soup = Soup(handler, 'lxml')
diccionario_polaridad = {}
ss = SpanishStemmer()

for lemma in soup.find_all('lemma'):
    palabra = lemma.get_text()
    polaridad = float(lemma.attrs["pol"])
    diccionario_polaridad[ss.stem(palabra.replace(' ', '')).lower()] = polaridad

output = open("diccionario_polaridades.pk1", "wb")
dump(diccionario_polaridad, output, -1)
output.close()
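# Hypothetical usage sketch (not part of the original source): reload the
# pickled polarity dictionary and look up the polarity of a stemmed lemma;
# .get() returns None if the lemma is not in the lexicon.
from _pickle import load
with open("diccionario_polaridades.pk1", "rb") as f:
    polaridades = load(f)
print(polaridades.get(ss.stem("alegría").lower()))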
def get_vector_matrix(self, freq_floor=50, context_words=3):
    nlp = es_core_web_md.load()
    STOPWORDS = spacy.es.STOP_WORDS

    def _clean_sent(sent):
        clean_sent = []
        # remove stopwords and digits
        for word in sent:
            word = word.lower()
            if not word in STOPWORDS:
                if not word.isdigit():
                    clean_sent.append(word)
        return clean_sent

    def _update_feature(word, feature_name, features):
        " dirty update of features "
        counts = 1
        if word in vectors:
            if feature_name in vectors[word]:
                counts = vectors[word][feature_name] + 1
        features[feature_name] = counts
        return features

    def _update_counts(feature_name, f_counts):
        counts = 1
        if feature_name in f_counts:
            counts = f_counts[feature_name] + 1
        f_counts[feature_name] = counts
        return f_counts

    sents = self.corpus.get_sents()
    stemmer = SpanishStemmer()
    # will use the words as keys and dict of features as values
    vectors = {}
    #freq_counts = {}
    for sent in sents:
        # TODO: PARALLELIZE!!
        #for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
        # drop stopwords to build the pool of context_words
        cleaned_sent = _clean_sent(sent)
        doc = nlp(' '.join(sent))
        for word_idx in range(len(doc)):
            # get the word and the POS tag
            spacy_word = doc[word_idx]
            word = spacy_word.text.lower()
            pos_tag = spacy_word.pos_
            if len(word) <= 2:
                continue
            if word in STOPWORDS:
                continue
            if word.isdigit():
                continue
            # if the word has not been seen yet
            if not word in vectors:
                features = {}
            else:
                features = vectors[word]
            # counts of frequency, to normalize later
            #freq_counts = _update_counts(pos_tag, freq_counts)

            # context-related features (POS tags and stemmed words)
            features = _update_feature(word, pos_tag, features)
            if word_idx > 0:
                prev_tag = doc[word_idx - 1].pos_
                feature_name = prev_tag + '_pos_prev'
                features = _update_feature(word, feature_name, features)
            if word_idx < len(sent) - 1:
                post_tag = doc[word_idx + 1].pos_
                feature_name = post_tag + '_pos_post'
                features = _update_feature(word, feature_name, features)

            # dependency features; the head of the dependency is stemmed
            dep_type = spacy_word.dep_
            if dep_type != 'ROOT':
                dep_obj = stemmer.stem(spacy_word.head.text.lower())
                feature_name = 'DEP:' + dep_type + '-' + dep_obj
                features = _update_feature(word, feature_name, features)

            # take n words from the context as features (stemmed...!)
            for i in range(context_words):
                ctxt_word = (random.choice(cleaned_sent))
                feature_word = stemmer.stem(ctxt_word)
                feature_name = ctxt_word + '_ctxt_word'
                features = _update_feature(word, feature_name, features)

            # TODO: add a synset (wordnet) feature
            features['word'] = word
            # frequency counting
            features = _update_feature(word, 'freq', features)
            vectors[word] = features

    # drop words whose frequency is below freq_floor
    words_to_pop = set()
    for word, f_dict in vectors.items():
        if f_dict['freq'] <= freq_floor:
            words_to_pop.add(word)
    for word in words_to_pop:
        vectors.pop(word)
    for word, f_dict in vectors.items():
        #print(word, f_dict)
        f_dict['freq'] = 0
        vectors[word] = f_dict  # delete an irrelevant dimension!

    # TODO: normalize the POS contexts
    #for word, f_dict in vectors.items():
    #    f_dict[]
    # TODO: add the context word... LEMMATIZED!
    # TODO: NORMALIZE ALL THE CONTEXTS! -> dictionary of frequencies of ALL the
    # features that occurred

    self.words = list(vectors.keys())  # thankfully in the same order as vectors.values
    vectorizer = DictVectorizer(dtype=numpy.int32)
    vec_matrix = vectorizer.fit_transform(list(vectors.values()))
    vectors_shape = vec_matrix.get_shape()
    print(vectors_shape)
    """
    freqs_vector = vectorizer.transform(freq_counts)
    vec_matrix = vstack([freqs_vector, vec_matrix])
    print(s.get_shape)
    print(s)
    print(vectorizer.inverse_transform(s))
    """
    # normalization
    vec_matrix = normalize(vec_matrix, copy=False)

    ####### unsupervised dimensionality reduction
    # reduce dimensionality with a variance threshold
    #selector = VarianceThreshold(threshold=0.0)
    #vec_matrix = selector.fit_transform(vec_matrix)

    # SVD (PCA)
    Trunc_svd = TruncatedSVD(n_components=1500)
    vec_matrix = Trunc_svd.fit_transform(vec_matrix)

    # reduce dimensionality keeping a percentile of the variance
    #selected = SelectPercentile(chi2, percentile=10)
    #word_vecs_new = selected.fit_transform(new_word_vecs, target_vec)

    print(vectorizer.inverse_transform(vec_matrix))  # -> to see the features!
    return self.words, vec_matrix
def configurar_modelo(X, y, idioma='SPANISH', fitness=accuracy_score):
    """
    Finds the best support vector machine for the problem.

    Parameters
    ----------
    X : list of str
        List with the texts of the tweets to train on.
    y : list of int
        Corresponding list with the class of each element of the sample.
    idioma : str
        'SPANISH' or 'ENGLISH', depending on the language of the sample.
    fitness : function(y1, y2)
        Function used to compute the fitness of the classifiers. It receives two
        parameters, where the first corresponds to the true class labels and the
        second to the predicted ones. It must return a numeric value, where
        larger numbers mean better fitness.
    """
    # Rounding constant for printed scores
    round_num = 4

    # Prepare the data
    # Tokenizer
    my_tokenizer = RegexpTokenizer(r"[\w']+")
    if idioma == 'SPANISH':
        stemmer = SpanishStemmer()
    elif idioma == 'ENGLISH':
        stemmer = EnglishStemmer()

    # Stemming tokenizer
    def tokenizer_stemmer(document):
        return [stemmer.stem(token) for token in my_tokenizer.tokenize(document)]

    hash_vectorizer = HashingVectorizer(analyzer="word",
                                        tokenizer=tokenizer_stemmer,
                                        preprocessor=None,
                                        #stop_words=stopwords.words(idioma),
                                        n_features=10000,
                                        strip_accents='ascii',
                                        encoding='utf-8',
                                        ngram_range=(1, 3))
    count_vectorizer = CountVectorizer(analyzer="word",
                                       tokenizer=tokenizer_stemmer,
                                       preprocessor=None,
                                       #stop_words=stopwords.words(idioma),
                                       strip_accents='ascii',
                                       encoding='utf-8',
                                       ngram_range=(1, 3))
    # NOTE: the hashing vectorizer is replaced by the count vectorizer here
    hash_vectorizer = count_vectorizer
    X_svm = hash_vectorizer.fit_transform(X)
    X_mnb = count_vectorizer.fit_transform(X)

    # Parameters ('prueba' is a module-level flag selecting a reduced test setup)
    if not prueba:
        option_machines = ['linear', 'polynomial', 'rbf', 'sigmoid', 'bayes']
        num_ite = 5
        C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
        coef = [-10, 0, 10]
        degrees = [2, 3, 4, 5]
        gamma = [0.1, 1, 10]
        alpha = [0, 0.5, 1, 5, 10]
    else:
        option_machines = ['linear', 'bayes']
        num_ite = 2
        C = [10]
        coef = [0]
        degrees = [2]
        gamma = [0.1]
        alpha = [1]

    machines = {}
    final_scores = {}

    if 'linear' in option_machines:
        # Linear kernel
        configurations = list(itertools.product(C))
        scores = {}
        print('Linear Kernel')
        for i in range(num_ite):
            print(' Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            # Split the data
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X_svm, y, test_size=0.2)
            for conf in configurations:
                clf = svm.SVC(C=conf[0], kernel='linear', gamma='scale',
                              decision_function_shape='ovo')
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_test)
                score = fitness(y_test, y_predicted)
                if conf not in scores:
                    scores[conf] = 0
                scores[conf] = scores[conf] + score
                print(' Configuration: C = ' + str(conf[0]) +
                      ' -- Score: ' + str(np.round(score, round_num)))
        scores = {k: v / num_ite for k, v in scores.items()}
        print(' ')
        conf = max(scores.items(), key=operator.itemgetter(1))[0]
        print(' Max Score: ' + str(scores[conf]))
        final_scores['linear'] = scores[conf]
        machines['linear'] = svm.SVC(C=conf[0], kernel='linear', gamma='scale',
                                     decision_function_shape='ovo')
        print(' ')

    if 'polynomial' in option_machines:
        # Polynomial kernel
        configurations = list(itertools.product(C, degrees, coef))
        scores = {}
        print('Polynomial Kernel')
        for i in range(num_ite):
            print(' Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            # Split the data
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X_svm, y, test_size=0.2)
            for conf in configurations:
                clf = svm.SVC(C=conf[0], kernel='poly', gamma='scale',
                              decision_function_shape='ovo', degree=conf[1],
                              coef0=conf[2])
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_test)
                score = fitness(y_test, y_predicted)
                if conf not in scores:
                    scores[conf] = 0
                scores[conf] = scores[conf] + score
                print(' Configuration: C = ' + str(conf[0]) + ', deg = ' +
                      str(conf[1]) + ' coef = ' + str(conf[2]) +
                      ' -- Score: ' + str(np.round(score, round_num)))
        scores = {k: v / num_ite for k, v in scores.items()}
        print(' ')
        conf = max(scores.items(), key=operator.itemgetter(1))[0]
        print(' Max Score: ' + str(scores[conf]))
        final_scores['polynomial'] = scores[conf]
        machines['polynomial'] = svm.SVC(C=conf[0], kernel='poly', gamma='scale',
                                         decision_function_shape='ovo',
                                         degree=conf[1], coef0=conf[2])
        print(' ')

    if 'rbf' in option_machines:
        # RBF kernel
        configurations = list(itertools.product(C, gamma))
        scores = {}
        print('RBF Kernel')
        for i in range(num_ite):
            print(' Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            # Split the data
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X_svm, y, test_size=0.2)
            for conf in configurations:
                clf = svm.SVC(C=conf[0], kernel='rbf', gamma=conf[1],
                              decision_function_shape='ovo')
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_test)
                score = fitness(y_test, y_predicted)
                if conf not in scores:
                    scores[conf] = 0
                scores[conf] = scores[conf] + score
                print(' Configuration: C = ' + str(conf[0]) + ', gamma = ' +
                      str(conf[1]) + ' -- Score: ' + str(np.round(score, round_num)))
        scores = {k: v / num_ite for k, v in scores.items()}
        print(' ')
        conf = max(scores.items(), key=operator.itemgetter(1))[0]
        print(' Max Score: ' + str(scores[conf]))
        final_scores['rbf'] = scores[conf]
        machines['rbf'] = svm.SVC(C=conf[0], kernel='rbf', gamma=conf[1],
                                  decision_function_shape='ovo')
        print(' ')

    if 'sigmoid' in option_machines:
        # Sigmoid kernel
        configurations = list(itertools.product(C, coef))
        scores = {}
        print('Sigmoid Kernel')
        for i in range(num_ite):
            print(' Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            # Split the data
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X_svm, y, test_size=0.2)
            for conf in configurations:
                clf = svm.SVC(C=conf[0], kernel='sigmoid', gamma='scale',
                              decision_function_shape='ovo', coef0=conf[1])
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_test)
                score = fitness(y_test, y_predicted)
                if conf not in scores:
                    scores[conf] = 0
                scores[conf] = scores[conf] + score
                print(' Configuration: C = ' + str(conf[0]) + ' coef = ' +
                      str(conf[1]) + ' -- Score: ' + str(np.round(score, round_num)))
        scores = {k: v / num_ite for k, v in scores.items()}
        print(' ')
        conf = max(scores.items(), key=operator.itemgetter(1))[0]
        print(' Max Score: ' + str(scores[conf]))
        final_scores['sigmoid'] = scores[conf]
        machines['sigmoid'] = svm.SVC(C=conf[0], kernel='sigmoid', gamma='scale',
                                      decision_function_shape='ovo', coef0=conf[1])
        print(' ')

    if 'bayes' in option_machines:
        # Multinomial naive Bayes
        configurations = list(itertools.product(alpha))
        scores = {}
        print('Multinomial Bayes')
        for i in range(num_ite):
            print(' Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            # Split the data
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X_mnb, y, test_size=0.2)
            for conf in configurations:
                clf = MultinomialNB(alpha=conf[0])
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_test)
                score = fitness(y_test, y_predicted)
                if conf not in scores:
                    scores[conf] = 0
                scores[conf] = scores[conf] + score
                print(' Configuration: alpha = ' + str(conf[0]) +
                      ' -- Score: ' + str(np.round(score, round_num)))
        scores = {k: v / num_ite for k, v in scores.items()}
        print(' ')
        conf = max(scores.items(), key=operator.itemgetter(1))[0]
        print(' Max Score: ' + str(scores[conf]))
        final_scores['bayes'] = scores[conf]
        machines['bayes'] = MultinomialNB(alpha=conf[0])
        print(' ')

    mac = max(final_scores.items(), key=operator.itemgetter(1))[0]
    final_machine = machines[mac]
    print('')
    print('Best Machine: ' + str(mac))
    print('Score: ' + str(final_scores[mac]))
    if mac == 'bayes':
        final_vectorizer = count_vectorizer
    else:
        final_vectorizer = hash_vectorizer
    clasificador = EncapsularClasificador(final_machine)

    class_results = {}
    class_results['accuracy'] = []
    unique_classes = np.unique(y).tolist()
    for cla in unique_classes:
        class_results[cla] = {}
        class_results[cla]['precision'] = []
        class_results[cla]['recall'] = []

    # Constructs final statistics
    print('')
    print('Constructing Final Statistics')
    print('')
    for i in range(num_ite):
        print('Iteration ' + str(i + 1) + ' of ' + str(num_ite))
        maquina = EncapsularClasificador.clone(clasificador)
        X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
            X, y, test_size=0.2)
        maquina.fit(X_train, y_train)
        y_predicted = maquina.predict(X_test)
        class_results['accuracy'].append(accuracy_score(y_test, y_predicted))
        for cla in unique_classes:
            # precision
            sub_test = y_test[y_predicted == cla]
            precision = np.sum(sub_test == cla) / (max(len(sub_test), 1))
            #print('Precision for class ' + str(cla) + ': ' + str(precision))
            class_results[cla]['precision'].append(precision)
            # recall
            sub_test = y_predicted[y_test == cla]
            recall = np.sum(sub_test == cla) / (max(len(sub_test), 1))
            #print('Recall for class ' + str(cla) + ': ' + str(recall))
            class_results[cla]['recall'].append(recall)

    class_results_consolidated = {}
    class_results_consolidated['accuracy'] = np.round(
        100 * np.mean(class_results['accuracy']), 3)
    for cla in unique_classes:
        class_results_consolidated[cla] = {}
        precision = np.round(100 * np.mean(class_results[cla]['precision']), 3)
        recall = np.round(100 * np.mean(class_results[cla]['recall']), 3)
        class_results_consolidated[cla]['precision'] = precision
        class_results_consolidated[cla]['recall'] = recall
    pprint.pprint(class_results_consolidated, width=1)

    return sklearn.base.clone(final_machine)