Example #1
    def __init__(self,
                 language='en',
                 database_name='memory',
                 memory_table='memory',
                 listen_log_table='listen_log',
                 speak_log_table='speak_log'):
        super().__init__(language, database_name, memory_table,
                         listen_log_table, speak_log_table)

        try:
            with open('modelo_gustos.json', 'r') as json_file:
                loaded_model_json = json_file.read()

            self.model = model_from_json(loaded_model_json)
            self.model.load_weights("modelo_gustos.h5")
            self.model.compile(loss='mean_squared_error',
                               optimizer='adam',
                               metrics=['binary_accuracy'])
        except Exception:
            print('****ERROR: Error cargando modelo...****')

        self.stemmer = SpanishStemmer()
        self.words = [
            '¿qu', '?', 'peli', 'pelis', 'color', 'favorit', 'leer', 'libr',
            'novel', 'ver', 'prefier', 'gust', 'pelicul', 'jug', '¿cual',
            'prefer', 'jueg', 'com', 'plat', 'animal', 'videojueg'
        ]
        self.classes = [
            'comida', 'color', 'animal', 'juego', 'libro', 'película'
        ]
def process_violence(lang, data_path, stopword_path, save_path):
    if lang == "English":
        stemmer = EnglishStemmer()
    elif lang == "Spanish":
        stemmer = SpanishStemmer()
    else:
        stemmer = None

    print("loading dataset")
    line_sentences = ProcessLineSentence(dataPath=data_path,
                                         label="violence",
                                         stopwordPath=stopword_path,
                                         stemmer=stemmer)

    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        for sentence, label in line_sentences:
            if label == "no":
                l = [0]
            elif label == "violence":
                l = [1]
            elif label == "malpractice":
                l = [2]
            else:
                raise Exception("Wrong label: {}".format(label))

            writer.writerow(l + sentence)
Example #3
 def __init__(self, min_long=5):
     """
     To initialize a `Tokenizer` you need to know the minimum number of characters,
     `min_long`, that make up a valid word.
     :param min_long: an integer. Defaults to five (5).
     """
     self.stemmer = SpanishStemmer()
     self.min_long = min_long
Example #4
def find_top_N_words(lang_entries, top_N, lang):
    dictionary = Lang_Dictionary({}, lang)
    for player in lang_entries:
        for chat in player.c:
            language = {'eng': 0, 'spn': 0, 'other': 0, 'tot': 0}
            sentence = player.c[chat]
            newlist = player.c[chat].strip().split(' ')
            newlist = [x.strip("''") for x in newlist]
            for word in newlist:
                language['tot'] += 1
                if word.lower() not in Lang_dicts.lang_index:
                    language['other'] += 1
                else:
                    word = Lang_dicts.lang_index[word.lower()]
                    if word == "english":
                        language['eng'] += 1
                    elif word == "spanish":
                        language['spn'] += 1
                    else:
                        language['other'] += 1
            if language['other'] < 2 * (language['spn'] + language['eng']):
                print(sentence)
                if language['spn'] > language['eng']:
                    print("SPANISH")
                    stemmer = SpanishStemmer()
                else:
                    print("ENGLISH")
                    stemmer = EnglishStemmer()
            aslist = []
            aslist += sentence
            sentence =""
            j = ''.join(aslist)
            words = j.split(' ')
            for line in words:
                line = str(line).replace('\'', '')
                line = line.replace('""', '')
                line = line.replace('"', '')
                if len(line) > 0:
                    if language["other"] < 2 * (language['spn'] + language["eng"]):
                        sentence += stemmer.stem(line.encode(sys.stdout.encoding, errors = 'replace')) + " "
                        print(sentence)
                    ##INEFFICIENT - looking through dictionary each time?
                    if line.lower() not in dictionary.d:
                        dictionary.d[line.lower()] = 0
                    dictionary.d[line.lower()] += 1

    ###wthCounts is a list of the word and its count
    wthCounts = []
    for (w, c) in dictionary.d.items():
        wthCounts += [(c,w)]
    ##wc is the wthCounts list only sorted
    wc = sorted(wthCounts, reverse=True)
    return wc[:top_N]
Example #5
def build_paragraph_inv_index(paragraphs, stem):
    p_index = {}
    stemmer = SpanishStemmer()
    for i, paragraph in enumerate(paragraphs):
        words = [word for word in paragraph.split() if word not in STOP_WORDS]
        for word in words:
            if stem:
                word = stemmer.stem(word)
            if word not in p_index:
                p_index[word] = []
            p_index[word].append(i)
    return p_index
Example #6
 def __init__(self, lemma=False, stem=False):
     self.extra_dicts = Dicts()
     self.english_dict = enchant.Dict("en_EN")
     self.spanish_dict = enchant.Dict("es_AR")
     self.lemma = lemma
     self.stem = stem
     self.VARIANT_CLASS = 0
     self.SPANISH_CLASS = 1
     self.FOREIGN_CLASS = 2
     if lemma:
         self.lemmatizer = Lemmatizer()
     if stem:
         self.stemmer = SpanishStemmer()
Example #7
 def __init__(self, stem=False):
     dictionaries = dicts()
     path = '/home/alangb/TWPP'  # path to TreeTagger installation directory
     self.english_dict = enchant.Dict("en_EN")
     self.spanish_dict = enchant.Dict("es_ES")
     self.ND = dictionaries.norm
     self.SD = dictionaries.lemario
     self.PND = dictionaries.names
     self.stem = stem
     if stem:
         self.stemmer = SpanishStemmer()
     else:
         self.tagger = TreeTagger(TAGLANG='es', TAGDIR=path)
    def __init__(self):

        self.reglasEntities.append(EmailRegla())
        self.reglasEntities.append(UrlRegla())
        self.reglasEntities.append(FechasRegla())
        self.reglasEntities.append(TelefonosRegla())
        self.reglasEntities.append(AbreviaturasRegla())
        self.reglasEntities.append(NombresPropiosRegla())
        self.reglasEntities.append(NumerosRegla())
        self.reglasDocumento.append(MinusculasRegla())
        self.reglasDocumento.append(TranslateRegla())
        self.reglasDocumento.append(LimpiarHtmlTagsRegla())
        self.reglasDocumento.append(LimpiadoBasicoRegla())
        self.reglasTokens.append(MinMaxCaracteresRegla())
        self.stemmer = SpanishStemmer()
Example #9
def run_BM25_collection(output_dir,documents,queries,qrels,train,validation,test,k,language):
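    # select the language-specific stop word list and stemmer; languages other than en/fr/es/it are not handled below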
    
    if language=='en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    
    elif language=='fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    
    elif language=='es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
        
    elif language=='it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    
    
    corpus = [] 
    doc_indexes = []
    for key,value in documents.items():
        doc_indexes.append(key)
        doc = [stemmer.stem(elem) for elem in value.split(" ") if elem not in stop_words]
        corpus.append(doc)
    bm25 = BM25Okapi(corpus)
    
    print("Running BM25",flush=True)
    
    results = dict()
    for i,elem in enumerate(train):
        results[elem] = run_BM25_query(queries[elem],bm25,doc_indexes,k,language)
        if i%1000==0:
            print('Processing query',i,'/',len(train),flush=True)
    save_BM25_res(output_dir+'/training/BM25.res',results)
    save_BM25_qrels_dataframe(output_dir + '/training/BM25.qrels.csv',results,qrels,True)
    
    results = dict()
    for elem in validation:
        results[elem] = run_BM25_query(queries[elem],bm25,doc_indexes,k,language)
    save_BM25_res(output_dir+'/validation/BM25.res',results)
    save_BM25_qrels_dataframe(output_dir + '/validation/BM25.qrels.csv',results,qrels,False)
    
    results = dict()
    for elem in test:
        results[elem] = run_BM25_query(queries[elem],bm25,doc_indexes,k,language)
    save_BM25_res(output_dir+'/test/BM25.res',results)
    save_BM25_qrels_dataframe(output_dir + '/test/BM25.qrels.csv',results,qrels,False)
Example #10
 def __init__(self):
     self.tweets = 0
     self.related_tweets = 0
     self.stopwords = {}
     self.stemmers = {}
     self.stemmers["es"] = SpanishStemmer()
     self.stemmers["en"] = PorterStemmer()
     self.stemmers["fr"] = FrenchStemmer()
     self.stemmers["de"] = GermanStemmer()
     self.stopwords["es"] = self.load_stopwords_file(
         "spanish_stopwords.txt")
     self.stopwords["en"] = self.load_stopwords_file(
         "english_stopwords.txt")
     self.stopwords["fr"] = self.load_stopwords_file("french_stopwords.txt")
     self.stopwords["de"] = self.load_stopwords_file("german_stopwords.txt")
     self.output_file = open(sys.argv[2], 'a')
    def __init__(self, question, words, stem):
        self.question = question
        self.stem = stem
        self.stemmer = SpanishStemmer()
        self.words = words
        self.stemmed_words = self.stem_words(self.words)
        self.path_pfx = os.getcwd()

        self.inverted_index = self.load_doc_inverted_index()
        self.doc_names = self.init_doc_names()
        self.paragraph_indices = {}
        self.paragraph_inverted_indices = {}
        self.results = pd.DataFrame(columns=['text', 'law', 'score'])
        self.load_paragraph_indices()

        self.L = 23055.676666666666  #Manually obtained using bash
        self.scores = {'tf': {}, 'idf':{}, 'tfidf':{},'n_containing':{},\
              'score':{}}
def generate_stopwords(stopname='stopSpanish.pkl'):
    """ Remove stop words, and apply stemming """
    stemmer=SpanishStemmer()
    stopwords_es = set(stopwords.words('spanish'))
    stopwords_es_sw = set(get_stop_words('spanish'))
    stopSpanishBeta = list(set(stopwords_es.union(stopwords_es_sw)))

    stopSpanish = set(stopwords_es.union(stopwords_es_sw))
    for stopWord in stopSpanishBeta:
        stopSpanish.add(stemmer.stem(stopWord))

    stopSpanish = list(stopSpanish)
    stopSpanish.extend(['tra', 'd', 'desc']) # Adding stopwords not present in the standard stopwords
    stopSpanish.remove('no')  # Keep to help identify negative categories

    with open(f'{resource_path}/{stopname}', 'wb') as f:
        pickle.dump(stopSpanish, f)

    return stopSpanish
Example #13
def build_index_from_words(words, stem):
    '''
    Takes:
    - words, a list of strings

    Returns:
    - index, a dictionary with a count of times a word appears
      in the document
    '''
    index = {}
    stemmer = SpanishStemmer()
    for word in words:
        if word not in STOP_WORDS:
            if stem:
                word = stemmer.stem(word)
            if word not in index:
                index[word] = 0
            index[word] += 1
    return index
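A minimal usage sketch, assuming SpanishStemmer is imported from nltk.stem.snowball as in the other examples, and stubbing the module-level STOP_WORDS set that the function references:

STOP_WORDS = {'de', 'la', 'el'}  # placeholder for the real module-level stop word set
palabras = ['la', 'casa', 'las', 'casas', 'de', 'colores']
print(build_index_from_words(palabras, stem=True))
# with stem=True, 'casa' and 'casas' collapse to the same stem, so that stem is counted twice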
Example #14
class GigawordParser(StreamParser):
    STEMMERS = {
        "eng": PorterStemmer(ignore_stopwords=False),
        "spa": SpanishStemmer(),
    }

    def __init__(self, language):
        self.next_id = 0
        self.language = language
        self.stemmer = self.STEMMERS.get(language)
        if self.stemmer is None:
            raise Exception("Unsupported language %s" % language)

    def init_id_counter(self, initial):
        self.next_id = initial

    def new_id(self):
        new_id = self.next_id
        self.next_id += 1
        return new_id

    def parse_raw(self, xml_str):
        xml = minidom.parseString(xml_str)
        if self.language == "es":
            try:
                url = "gigaword:" + xml.getElementsByTagName(
                    "DOC")[0].attributes["id"].value
                title = xml.getElementsByTagName(
                    "HEADLINE")[0].firstChild.nodeValue
            except:
                url = "<NONE>"
                title = "<NONE>"
        else:
            url = "<NONE>"
            title = "<NONE>"
        text = stringio.StringIO()
        for node in xml.getElementsByTagName("TEXT")[0].childNodes:
            if len(node.childNodes) > 0:
                text.write(node.firstChild.nodeValue)
        content = text.getvalue()
        terms = text_to_terms(content, self.language)
        return RuwacDocument(self.new_id(), url, title, content, terms)
Example #15
    def lemmatize(self, text, lang):

        # spacy.prefer_gpu()
        # nlp = spacy.load(lang) # en fr "en_core_web_sm"
        if lang == "fr":
            stemmer = FrenchStemmer()
        elif lang == "es":
            stemmer = SpanishStemmer()
        else:
            stemmer = EnglishStemmer()

        stemmed = []
        for word in text.split(" "):
            stemmed.append(stemmer.stem(word))

        # doc = nlp(u""+text)
        # lem_terms = []
        # for token in doc:
        #     lem_terms.append(token.lemma_)

        return " ".join(stemmed)
Example #16
def run_BM25_query(query,bm25,doc_indexes,k,language):
    
    if language=='en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    
    elif language=='fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    
    elif language=='es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
        
    elif language=='it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    
    tokenized_query = [stemmer.stem(elem) for elem in query.split(" ") if elem not in stop_words]
    doc_scores = bm25.get_scores(tokenized_query)
    top_k = np.argsort(doc_scores)[::-1][:k]
    results = [[doc_indexes[key],doc_scores[key]] for key in top_k]
    return results
def process_election(lang, data_path, stopword_path, save_path):
    if lang == "English":
        stemmer = EnglishStemmer()
    elif lang == "Spanish":
        stemmer = SpanishStemmer()
    else:
        stemmer = None

    print("loading dataset")
    line_sentences = ProcessLineSentence(dataPath=data_path,
                                         label="election",
                                         stopwordPath=stopword_path,
                                         stemmer=stemmer)

    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        for sentence, label in line_sentences:
            if label == "yes":
                l = [1]
            else:
                l = [0]
            row = [w.encode('utf-8') for w in sentence]
            writer.writerow(l + row)
Example #18
def getfeats(fields, o):
    """ This takes the word in question and
    the offset with respect to the instance
    word """
    word = fields[0]
    stemmer = SpanishStemmer()

    with_hyphen = 0
    if "-" in word:
        with_hyphen = 1

    with_apostrophe = 0
    if "'" in word:
        with_apostrophe = 1

    o = str(o)
    features = [
        (o + "word", word),
        (o + 'pos', fields[1]),
        #(o + 'prefix1', word[:1]),
        (o + 'prefix2', word[:2]),
        (o + 'prefix3', word[:3]),
        (o + 'prefix4', word[:4]),
        #(o + 'suffix1', word[-1:]),
        (o + 'suffix2', word[-2:]),
        (o + 'suffix3', word[-3:]),
        (o + 'suffix4', word[-4:]),
        (o + 'is_upper', word.isupper()),
        (o + 'is_title', word.istitle()),
        (o + 'is_digit', word.isdigit()),
        (o + 'with_hyphen', with_hyphen),
        (o + 'with_apostrophe', with_apostrophe),
        (o + 'spanish_stem', stemmer.stem(word)),
        # (o + 'word_shape', word_shape(word))
    ]

    return features
Example #19
def spanish_swadesh_list(stemmed=True):
    """
    Helper function that returns a list of strings with the stems of the
    Spanish Swadesh entries.

    """
    try:
        stemmer = SpanishStemmer(True)
    except:
        log.warn("Spanish stemmer could not be loaded!")
        return

    swadesh_entries = []
    for line in util.read_text_file(
            util.data_path('swadesh', 'swadesh_spa.txt'), lines=True):
        line = line.strip()
        for e in line.split(","):
            e = e.strip()
            if stemmed:
                stem = stemmer.stem(e)
                swadesh_entries.append(stem)
            else:
                swadesh_entries.append(e)
    return swadesh_entries
Example #20
 def tokenizer_stemmer_global(document):
     stemmer = SpanishStemmer()
     my_tokenizer = RegexpTokenizer(r"[\w']+")
     return [
         stemmer.stem(token) for token in my_tokenizer.tokenize(document)
     ]
Example #21
 def __init__(self):
     super(StemmerProcessor, self).__init__()
     self.stemmer = SpanishStemmer()
Example #22
   There are plenty of "stemmers"; I am going to pick the Spanish one
'''

from nltk.stem.snowball import SnowballStemmer, SpanishStemmer

#As with punkt, a package has to be downloaded
download('stopwords')

#If we do not know the language a priori.
#SnowballStemmer(language, ignore_stopwords=False)
spanish_stem = SnowballStemmer("spanish", True)

# If we know the language beforehand, we can import it directly
#SpanishStemmer(ignore_stopwords=False)
spanish_stem = SpanishStemmer(True)
print(spanish_stem.stem("Comiendo"), spanish_stem.stem("Bailando"),
      spanish_stem.stem("bailar"), spanish_stem.stem("estantería"))
'''################################
   #  Getting the original verb  #
   ################################

   Known as lemmatization.

   NLTK does not have this for Spanish, only English.
'''

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
#As with punkt, a package has to be downloaded
download('wordnet')
Example #23
import csv
import collections
import operator
import unicodedata

import os
cwd = os.getcwd()
root = os.path.dirname(cwd)
lematizador_dir = os.path.join(root, "data", "lematizador", "lematizador.csv")
stopwords_dir = os.path.join(root, "data", "stopwords")

from spellchecker import SpellChecker  #https://pypi.org/project/pyspellchecker/

#Stemmer
from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer()

#Create the object for spelling correction
spell = SpellChecker(language="es")
metodo_desconocidas = spell.unknown
metodo_correccion = spell.correction
lista_blanca_regiones = [
    "Arica", "Parinacota", "Tarapacá", "Antofagasta", "Atacama", "Coquimbo",
    "Valparaíso", "Metropolitana", "Santiago", "Libertador", "General",
    "Bernardo", "O’Higgins", "Maule", "Ñuble", "Biobío", "Araucanía", "Ríos",
    "Lagos", "Aysén", "General", "Carlos", "Ibáñez", "Campo", "Magallanes",
    "Antártica"
]
lista_blanca_telecom = [
    'lte', 'whatsapp', 'instagram', 'telegram', 'youtube', 'facebook', 'entel',
    'bafi', 'resetea', 'samsung', 'huawei', 'iphone', 'kb', 'mb', 'pixi',
Example #24
def do_stemmer(df, stop_language='spanish'):
    """Apply stop words and Stemmers"""
    ##  Since tickets arrive in two languages, we add the words of both languages
    stop = get_stop_words(stop_language) + get_stop_words('english')
    ## We add our own words
    stop += [
        "buenas", "buenos", "cid", "dias", "gracias", "hola", "mucho", "mucha",
        "poder", "proyecto", "please", "saludo", "tardes", "www", "habia"
    ]
    stop += [
        'ahora',
        'algun',
        'alguna',
        'amanecia interrumpio',
        'amanecia interrumpio relato',
        'amanecia interrumpio relato habian',
        'amanecia interrumpio relato habian dado',
        'aquel',
        'asi',
        'aun',
        'cada',
        'vez',
        'mas',
        'cualquier',
        'cosa',
        'cuanto',
        'dado',
        'darse',
        'debe',
        'debia',
        'despues',
        'dia noche',
        'dia siguiente',
        'diez años',
        'diez mil',
        'dijo',
        'dijo',
        'dio',
        'habia',
        'mas',
        'podia',
        'podian',
        'mismo',
        'si',
        'tal',
        'tan',
        'puede',
        'pueden ser',
        'pues',
        'puso',
        'toda',
        'todas',
        'vease tambien',
        'primer lugar',
        'varias',
        'dos',
        'largo',
        'hacia',
        'uno',
        'una',
        'unos',
        'una',
        'aquella',
        'aquello',
        'aquel',
        'hace',
        'muchas',
        'mucho',
        'muchos',
        'mucha',
        'pueden',
        'puedo',
        'unas',
        'abrio puerta',
        'arriba abajo',
        'aqui alla',
        'habian',
        'doña',
        'don',
        'señor',
        'señora',
        'hizo',
        'quedo',
        'fuerza sino',
        'quedo perplejo',
        'parece haber',
        'parece ser',
        'parecia haber',
        'mayor parte',
        'mañana siguiente',
        'media hora',
        'hoy dia',
        'iba ser',
        'iii pag',
        'haber hecho',
        'habria podido',
        'hacer cosas',
        'hacia arriba',
        'hacia atras',
        'hacia puerta',
        'hacia tiempo',
        'decir verdad',
        'dejo caer',
        'demasiado tarde',
        'derecha izquierda',
        'di cuenta',
        'dia anterior',
        'dia noche',
        'dia siguiente',
        'casi siempre',
        'cierto dia',
        'cierto modo',
        'cinco años',
        'aqui alla',
        'arriba abajo',
        'aunque solo',
        'año nuevo',
        'años edad',
        'buena parte',
        'ninguna parte',
        'noche anterior',
        'noche dia',
        'nunca visto',
        'partido comunista',
        'podria haber',
        'podria ser',
        'press cambridge',
        'primer lugar',
        'quiere decir',
        'quiero decir',
        'sentido comun',
        'seria mejor',
        'tras haber',
        'tres años',
        'tres cuatro',
        'tres meses',
        'voz alta',
        'voz baja',
    ]
    stop_words_generated_tokens = [
        'abajo', 'abrio', 'alla', 'alta', 'amanecia', 'anterior', 'aqui',
        'aren', 'arriba', 'atras', 'aunque', 'año', 'años', 'baja', 'buena',
        'caer', 'cambridge', 'can', 'casi', 'cierto', 'cinco', 'comun',
        'cosas', 'couldn', 'cuatro', 'cuenta', 'decir', 'dejo', 'demasiado',
        'di', 'dia', 'didn', 'diez', 'doesn', 'edad', 'haber', 'habria',
        'hacer', 'hacia', 'hadn', 'hasn', 'haven', 'hecho', 'hora', 'hoy',
        'iba', 'iii', 'isn', 'let', 'll', 'lugar', 'mayor', 'mañana', 'media',
        'mejor', 'meses', 'modo', 'mustn', 'ninguna', 'noche', 'nuevo',
        'nunca', 'pag', 'parece', 'parecia', 'parte', 'partido', 'podido',
        'podria', 'puerta', 'quiere', 'quiero', 're', 'relato', 'sentido',
        'ser', 'seria', 'shan', 'shouldn', 'siempre', 'siguiente', 'sino',
        'solo', 'tambien', 'tarde', 'tiempo', 'tras', 'tres', 've', 'vease',
        'visto', 'wasn', 'weren', 'won', 'wouldn'
    ]
    stop += stop_words_generated_tokens
    ps = SpanishStemmer()

    a = []
    df["stem"] = "n"
    for i, row in df.iterrows():
        a.append(
            ps.stem(row["text"]).replace('fuerza sino', '').replace(
                'acceder', 'acceso').replace('user', 'usuario').replace(
                    'access', 'acceso').replace('usuarios', 'usuario').replace(
                        'abrio puerta', '').replace('acto seguido', ''))
    df["stem"] = a
    return df, stop
# The next step is to connect Spanish translations that contain the same stem. For this we first remove certain stop words from the translation (list of stopwords from NLTK). There are two cases then: just one word remains, or more than one word remains.
#
# We now have two options for what to do with the latter: either they are not connected with anything at all (default behaviour), or each word is stemmed and the translation is connected with every other translation that contains the same stems. Right now this results in many connections that do not look very useful. This should be done in a more intelligent way in the future (for example, find the heads of phrases in multiword expressions and only connect those; split the weight of the connections between all stems and work with weighted graphs from this step on; ...).
#
# To connect the Spanish translations the script adds additional "stem nodes" to the graph. The name of these nodes consists of a Spanish word stem plus a pipe symbol plus the string "stem". These nodes look like this in a dot file:
#
# > "tom|stem" [is_stem=True];
#
# The introduction of these nodes later facilitates the output of translation matrices, as you can just search for stems within the graph and only output direct neighbours with Spanish translations. It would also be possible to directly connect the Spanish translations if they have a matching stem, but then the graph traversal to find matching translations and their heads is a bit more complex later.
#
# First we create a stemmer object from the SpanishStemmer in NLTK:

# <codecell>

from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer(True)

# <markdowncell>

# We create the list of stopwords and encode them as unicode strings:

# <codecell>

combined_graph_stemmed = copy.deepcopy(combined_graph)
stopwords = nltk.corpus.stopwords.words("spanish")
stopwords = [w.decode("utf-8") for w in stopwords]

# <markdowncell>

# Then we loop through all the nodes of the merged graph and add the stem nodes to each Spanish node. If the node has only one word (after stopword removal) we will use the NLTK stemmer; otherwise we just leave the phrase as it is:
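#
# A minimal sketch of that loop (the original code is not shown in this excerpt), assuming `combined_graph_stemmed` is a networkx graph whose nodes are the translation strings; how Spanish nodes are marked is not visible here, so the `is_es` check below is a placeholder:

# <codecell>

for node, data in list(combined_graph_stemmed.nodes(data=True)):
    if not data.get("is_es", False):  # placeholder test for Spanish translation nodes
        continue
    words = [w for w in node.split() if w not in stopwords]
    if len(words) == 1:
        # one word left after stopword removal: attach a "<stem>|stem" node
        stem_node = stemmer.stem(words[0]) + "|stem"
        combined_graph_stemmed.add_node(stem_node, is_stem=True)
        combined_graph_stemmed.add_edge(node, stem_node)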
Example #26
def stemmer_all(tweet):
    stm = SpanishStemmer()
    split_tweet = [word for word in tweet.lower().split(' ') if word.strip()]
    return ' '.join([stm.stem(word.strip()) for word in split_tweet])
def remove_stopwords(text, stopSpanish):
    stemmer=SpanishStemmer()
    textList = text.split()
    textList = [word for word in textList if word not in stopSpanish]
    return ' '.join([stemmer.stem(word) for word in textList])
Example #28
# -*- coding: utf-8 -*-
"""
Created on Mon May  6 16:07:59 2019

@author: Turing
"""

from bs4 import BeautifulSoup as Soup
from _pickle import dump
from nltk.stem.snowball import SpanishStemmer

handler = open('senticon.es.xml', encoding="utf-8").read()
soup = Soup(handler, 'lxml')
diccionario_polaridad = {}
ss = SpanishStemmer()
for lemma in soup.find_all('lemma'):
    palabra = lemma.get_text()
    polaridad = float(lemma.attrs["pol"])
    diccionario_polaridad[ss.stem(palabra.replace(' ',
                                                  '')).lower()] = polaridad

output = open("diccionario_polaridades.pk1", "wb")
dump(diccionario_polaridad, output, -1)
output.close()
Example #29
    def get_vector_matrix(self, freq_floor=50, context_words=3):

        nlp = es_core_web_md.load()
        STOPWORDS = spacy.es.STOP_WORDS

        def _clean_sent(sent):
            clean_sent = []
            # remove stopwords
            for word in sent:
                word = word.lower()
                if not word in STOPWORDS:
                    if not word.isdigit():
                        clean_sent.append(word)
            return clean_sent

        def _update_feature(word, feature_name, features):
            " dirty update of features "
            counts = 1
            if word in vectors:
                if feature_name in vectors[word]:
                    counts = vectors[word][feature_name] + 1
            features[feature_name] = counts
            return features

        def _update_counts(feature_name, f_counts):
            counts = 1
            if feature_name in f_counts:
                counts = f_counts[feature_name] + 1
            f_counts[feature_name] = counts
            return f_counts

        sents = self.corpus.get_sents()
        stemmer = SpanishStemmer()

        # will use the words as keys and dict of features as values
        vectors = {}
        #freq_counts = {}
        for sent in sents:
            # TODO: PARALELLIZE!!
            #for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
            # take off stopwords && to get context_words!
            cleaned_sent = _clean_sent(sent)
            doc = nlp(' '.join(sent))
            for word_idx in range(len(doc)):
                # get the word and the pos tag
                spacy_word = doc[word_idx]
                word = spacy_word.text.lower()

                pos_tag = spacy_word.pos_

                if len(word) <= 2:
                    continue
                if word in STOPWORDS:
                    continue
                if word.isdigit():
                    continue

                # if not seen word
                if not word in vectors:
                    features = {}
                else:
                    features = vectors[word]

                # counts of frequency to normalize later
                #freq_counts = _update_counts(pos_tag, freq_counts)

                # context related (POS and words stemmed)
                features = _update_feature(word, pos_tag, features)
                if word_idx > 0:
                    prev_tag = doc[word_idx - 1].pos_
                    feature_name = prev_tag + '_pos_prev'
                    features = _update_feature(word, feature_name, features)
                if word_idx < len(sent) - 1:
                    post_tag = doc[word_idx + 1].pos_
                    feature_name = post_tag + '_pos_post'
                    features = _update_feature(word, feature_name, features)

                # dependency features. the objective of the dep is stemmed!
                dep_type = spacy_word.dep_
                if dep_type != 'ROOT':
                    dep_obj = stemmer.stem(spacy_word.head.text.lower())
                    feature_name = 'DEP:' + dep_type + '-' + dep_obj
                    features = _update_feature(word, feature_name, features)

                # get n words from context as features (stemmed...!)
                for i in range(context_words):
                    ctxt_word = random.choice(cleaned_sent)
                    feature_word = stemmer.stem(ctxt_word)
                    feature_name = feature_word + '_ctxt_word'
                    features = _update_feature(word, feature_name, features)
                # add a synset (wordnet) feature
                features['word'] = word

                # frequency counting
                features = _update_feature(word, 'freq', features)

                vectors[word] = features

        # drop words with 'freq' at or below the floor
        words_to_pop = set()
        for word, f_dict in vectors.items():
            if f_dict['freq'] <= freq_floor:
                words_to_pop.add(word)
        for word in words_to_pop:
            vectors.pop(word)

        for word, f_dict in vectors.items():
            #print(word, f_dict)
            f_dict['freq'] = 0
            vectors[word] = f_dict  # delete an irrelevant dimension!
        # normalize the POS contexts
        #for word, f_dict in vectors.items():
        #    f_dict[]

        # add the context word ... LEMMATIZED!

        # NORMALIZE ALL THE CONTEXTS! -> frequency dictionary of ... ALL the features that occurred
        self.words = list(
            vectors.keys())  # thankfully in the same order as vectors.values

        vectorizer = DictVectorizer(dtype=numpy.int32)
        vec_matrix = vectorizer.fit_transform(list(vectors.values()))
        vectors_shape = vec_matrix.get_shape()
        print(vectors_shape)
        """
        freqs_vector = vectorizer.transform(freq_counts)

        vec_matrix = vstack([freqs_vector, vec_matrix])
        print(s.get_shape)
        print(s)
        print(vectorizer.inverse_transform(s))
        """

        # normalization
        vec_matrix = normalize(vec_matrix, copy=False)

        ####### unsupervised dimensionality reduction
        # reduce dimensionality with a variance threshold
        #selector = VarianceThreshold(threshold = 0.0)
        #vec_matrix = selector.fit_transform(vec_matrix)

        # SVD (PCA)
        Trunc_svd = TruncatedSVD(n_components=1500)
        vec_matrix = Trunc_svd.fit_transform(vec_matrix)

        # reduce dimensionality with a variance percentile
        #selected = SelectPercentile(chi2, percentile = 10)
        #word_vecs_new=selected.fit_transform(new_word_vecs,target_vec)

        print(vectorizer.inverse_transform(vec_matrix))  # -> to see features!

        return self.words, vec_matrix
Example #30
    def configurar_modelo(X, y, idioma='SPANISH', fitness=accuracy_score):
        """
        Method that finds the best support vector machine for the problem.


        Parameters
        ----------
        X : list of String
            List with the texts of the tweets to train on

        y : list of int
            Corresponding list with the classes of each of the elements in the sample

        idioma : String
            SPANISH or ENGLISH depending on the language of the sample

        fitness : funcion(y1,y2)
            Function to compute the fitness of the classifiers. It receives two parameters,
            where the first corresponds to the true class labels and the second to the
            predicted ones. It must return a numeric value, where larger numbers mean
            better fitness
        """

        # Constant for rounding the reported scores
        round_num = 4

        #Prepare the data

        #Tokenizer
        my_tokenizer = RegexpTokenizer(r"[\w']+")

        if (idioma == 'SPANISH'):
            stemmer = SpanishStemmer()
        elif (idioma == 'ENGLISH'):
            stemmer = EnglishStemmer()

        #Stemmer function
        def tokenizer_stemmer(document):
            return [
                stemmer.stem(token)
                for token in my_tokenizer.tokenize(document)
            ]

        hash_vectorizer = HashingVectorizer(
            analyzer="word",
            tokenizer=tokenizer_stemmer,
            preprocessor=None,
            #stop_words = stopwords.words(idioma),
            n_features=10000,
            strip_accents='ascii',
            encoding='utf-8',
            ngram_range=(1, 3))

        count_vectorizer = CountVectorizer(
            analyzer="word",
            tokenizer=tokenizer_stemmer,
            preprocessor=None,
            #stop_words = stopwords.words(idioma),
            strip_accents='ascii',
            encoding='utf-8',
            ngram_range=(1, 3))

        hash_vectorizer = count_vectorizer
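        # note: hash_vectorizer now refers to the same CountVectorizer instance, so X_svm and X_mnb below are both count-based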

        X_svm = hash_vectorizer.fit_transform(X)
        X_mnb = count_vectorizer.fit_transform(X)

        #Parameters
        if (not prueba):
            option_machines = []
            option_machines.append('linear')
            option_machines.append('polynomial')
            option_machines.append('rbf')
            option_machines.append('sigmoid')
            option_machines.append('bayes')
            num_ite = 5
            C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
            coef = [-10, 0, 10]
            degrees = [2, 3, 4, 5]
            gamma = [0.1, 1, 10]
            alpha = [0, 0.5, 1, 5, 10]
        else:
            option_machines = []
            option_machines.append('linear')
            option_machines.append('bayes')
            num_ite = 2
            C = [10]
            coef = [0]
            degrees = [2]
            gamma = [0.1]
            alpha = [1]

        machines = {}
        final_scores = {}

        if ('linear' in option_machines):
            #Linear kernel
            configurations = list(itertools.product(C))
            scores = {}

            print('Linear Kernel')
            for i in range(num_ite):
                print('     Iteration ' + str(i + 1) + ' of ' + str(num_ite))

                #Split the data
                X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                    X_svm, y, test_size=0.2)

                for conf in configurations:

                    clf = svm.SVC(C=conf[0],
                                  kernel='linear',
                                  gamma='scale',
                                  decision_function_shape='ovo')
                    clf.fit(X_train, y_train)

                    y_predicted = clf.predict(X_test)

                    score = fitness(y_test, y_predicted)

                    if (conf not in scores):
                        scores[conf] = 0

                    scores[conf] = scores[conf] + score

                    print('         Configuration: C = ' + str(conf[0]) +
                          ' -- Score: ' + str(np.round(score, round_num)))

            scores = {k: v / num_ite for k, v in scores.items()}

            print(' ')

            conf = max(scores.items(), key=operator.itemgetter(1))[0]
            print(' Max Score: ' + str(scores[conf]))
            final_scores['linear'] = scores[conf]
            machines['linear'] = svm.SVC(C=conf[0],
                                         kernel='linear',
                                         gamma='scale',
                                         decision_function_shape='ovo')

            print(' ')

        if ('polynomial' in option_machines):
            #Polynomial kernel
            configurations = list(itertools.product(C, degrees, coef))
            scores = {}

            print('Polynomial Kernel')

            for i in range(num_ite):
                print('     Iteration ' + str(i + 1) + ' of ' + str(num_ite))

                #Split the data
                X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                    X_svm, y, test_size=0.2)

                for conf in configurations:

                    clf = svm.SVC(C=conf[0],
                                  kernel='poly',
                                  gamma='scale',
                                  decision_function_shape='ovo',
                                  degree=conf[1],
                                  coef0=conf[2])
                    clf.fit(X_train, y_train)

                    y_predicted = clf.predict(X_test)

                    score = fitness(y_test, y_predicted)

                    if (conf not in scores):
                        scores[conf] = 0

                    scores[conf] = scores[conf] + score

                    print('         Configuration: C = ' + str(conf[0]) +
                          ', deg = ' + str(conf[1]) + ' coef = ' +
                          str(conf[2]) + ' -- Score: ' +
                          str(np.round(score, round_num)))

            scores = {k: v / num_ite for k, v in scores.items()}

            print(' ')

            conf = max(scores.items(), key=operator.itemgetter(1))[0]
            print(' Max Score: ' + str(scores[conf]))
            final_scores['polynomial'] = scores[conf]
            machines['polynomial'] = svm.SVC(C=conf[0],
                                             kernel='poly',
                                             gamma='scale',
                                             decision_function_shape='ovo',
                                             degree=conf[1],
                                             coef0=conf[2])

            print(' ')

        if ('rbf' in option_machines):
            #RBF kernel
            configurations = list(itertools.product(C, gamma))
            scores = {}

            print('RBF Kernel')

            for i in range(num_ite):
                print('     Iteration ' + str(i + 1) + ' of ' + str(num_ite))

                #Split the data
                X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                    X_svm, y, test_size=0.2)

                for conf in configurations:

                    clf = svm.SVC(C=conf[0],
                                  kernel='rbf',
                                  gamma=conf[1],
                                  decision_function_shape='ovo')
                    clf.fit(X_train, y_train)

                    y_predicted = clf.predict(X_test)

                    score = fitness(y_test, y_predicted)

                    if (conf not in scores):
                        scores[conf] = 0

                    scores[conf] = scores[conf] + score

                    print('         Configuration: C = ' + str(conf[0]) +
                          ', gamma = ' + str(conf[1]) + ' -- Score: ' +
                          str(np.round(score, round_num)))

            scores = {k: v / num_ite for k, v in scores.items()}

            print(' ')

            conf = max(scores.items(), key=operator.itemgetter(1))[0]
            print(' Max Score: ' + str(scores[conf]))
            final_scores['rbf'] = scores[conf]
            machines['rbf'] = svm.SVC(C=conf[0],
                                      kernel='rbf',
                                      gamma=conf[1],
                                      decision_function_shape='ovo')

            print(' ')

        if ('sigmoid' in option_machines):
            #Sigmoid kernel
            configurations = list(itertools.product(C, coef))
            scores = {}

            print('Sigmoid Kernel')

            for i in range(num_ite):
                print('     Iteration ' + str(i + 1) + ' of ' + str(num_ite))

                #Split the data
                X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                    X_svm, y, test_size=0.2)

                for conf in configurations:

                    clf = svm.SVC(C=conf[0],
                                  kernel='sigmoid',
                                  gamma='scale',
                                  decision_function_shape='ovo',
                                  coef0=conf[1])
                    clf.fit(X_train, y_train)

                    y_predicted = clf.predict(X_test)

                    score = fitness(y_test, y_predicted)

                    if (conf not in scores):
                        scores[conf] = 0

                    scores[conf] = scores[conf] + score

                    print('         Configuration: C = ' + str(conf[0]) +
                          ' coef = ' + str(conf[1]) + ' -- Score: ' +
                          str(np.round(score, round_num)))

            scores = {k: v / num_ite for k, v in scores.items()}

            print(' ')

            conf = max(scores.items(), key=operator.itemgetter(1))[0]
            print(' Max Score: ' + str(scores[conf]))
            final_scores['sigmoid'] = scores[conf]
            machines['sigmoid'] = svm.SVC(C=conf[0],
                                          kernel='sigmoid',
                                          gamma='scale',
                                          decision_function_shape='ovo',
                                          coef0=conf[1])

            print(' ')

        if ('bayes' in option_machines):

            #Bayes
            configurations = list(itertools.product(alpha))
            scores = {}

            print('Multinomial Bayes')

            for i in range(num_ite):
                print('     Iteration ' + str(i + 1) + ' of ' + str(num_ite))

                #Split the data
                X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                    X_mnb, y, test_size=0.2)

                for conf in configurations:

                    clf = MultinomialNB(alpha=conf[0])
                    clf.fit(X_train, y_train)

                    y_predicted = clf.predict(X_test)

                    score = fitness(y_test, y_predicted)

                    if (conf not in scores):
                        scores[conf] = 0

                    scores[conf] = scores[conf] + score

                    print('         Configuration: alpha = ' + str(conf[0]) +
                          ' -- Score: ' + str(np.round(score, round_num)))

            scores = {k: v / num_ite for k, v in scores.items()}

            print(' ')

            conf = max(scores.items(), key=operator.itemgetter(1))[0]
            print(' Max Score: ' + str(scores[conf]))
            final_scores['bayes'] = scores[conf]
            machines['bayes'] = MultinomialNB(alpha=conf[0])

            print(' ')

        mac = max(final_scores.items(), key=operator.itemgetter(1))[0]

        final_machine = machines[mac]

        print('')
        print('Best Machine: ' + str(mac))
        print('Score: ' + str(final_scores[mac]))

        if (mac == 'bayes'):
            final_vectorizer = count_vectorizer
        else:
            final_vectorizer = hash_vectorizer

        clasificador = EncapsularClasificador(final_machine)

        class_results = {}
        class_results['accuracy'] = []

        unique_classes = np.unique(y).tolist()

        for cla in unique_classes:
            class_results[cla] = {}
            class_results[cla]['precision'] = []
            class_results[cla]['recall'] = []

        #Constructs final statistics
        print('')
        print('Constructing Final Statistics')
        print('')
        for i in range(num_ite):

            print('Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            maquina = EncapsularClasificador.clone(clasificador)
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X, y, test_size=0.2)

            maquina.fit(X_train, y_train)
            y_predicted = maquina.predict(X_test)

            class_results['accuracy'].append(
                accuracy_score(y_test, y_predicted))

            for cla in unique_classes:
                #precision
                sub_test = y_test[y_predicted == cla]
                precision = np.sum(sub_test == cla) / (max(len(sub_test), 1))
                #print('Precision for class ' + str(cla) + ': '+ str(precision))
                class_results[cla]['precision'].append(precision)

                #recall
                sub_test = y_predicted[y_test == cla]
                recall = np.sum(sub_test == cla) / (max(len(sub_test), 1))
                #print('Recall for class ' + str(cla) + ': '+ str(recall))
                class_results[cla]['recall'].append(recall)

        class_results_consolidated = {}
        class_results_consolidated['accuracy'] = np.round(
            100 * np.mean(class_results['accuracy']), 3)

        for cla in unique_classes:
            class_results_consolidated[cla] = {}

            precision = np.round(
                100 * np.mean(class_results[cla]['precision']), 3)
            recall = np.round(100 * np.mean(class_results[cla]['recall']), 3)

            class_results_consolidated[cla]['precision'] = precision
            class_results_consolidated[cla]['recall'] = recall

        pprint.pprint(class_results_consolidated, width=1)

        return (sklearn.base.clone(final_machine))