def __init__(self, language='en', database_name='memory', memory_table='memory',
             listen_log_table='listen_log', speak_log_table='speak_log'):
    super().__init__(language, database_name, memory_table, listen_log_table, speak_log_table)
    try:
        # Load the serialized Keras model and its weights, then compile it.
        json_file = open('modelo_gustos.json', 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        self.model = model_from_json(loaded_model_json)
        self.model.load_weights("modelo_gustos.h5")
        self.model.compile(loss='mean_squared_error', optimizer='adam',
                           metrics=['binary_accuracy'])
    except Exception:
        print('****ERROR: Error cargando modelo...****')
    self.stemmer = SpanishStemmer()
    self.words = [
        '¿qu', '?', 'peli', 'pelis', 'color', 'favorit', 'leer', 'libr', 'novel',
        'ver', 'prefier', 'gust', 'pelicul', 'jug', '¿cual', 'prefer', 'jueg',
        'com', 'plat', 'animal', 'videojueg'
    ]
    self.classes = ['comida', 'color', 'animal', 'juego', 'libro', 'película']
def process_violence(lang, data_path, stopword_path, save_path):
    if lang == "English":
        stemmer = EnglishStemmer()
    elif lang == "Spanish":
        stemmer = SpanishStemmer()
    else:
        stemmer = None
    print("loading dataset")
    line_sentences = ProcessLineSentence(dataPath=data_path,
                                         label="violence",
                                         stopwordPath=stopword_path,
                                         stemmer=stemmer)
    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        for sentence, label in line_sentences:
            if label == "no":
                l = [0]
            elif label == "violence":
                l = [1]
            elif label == "malpractice":
                l = [2]
            else:
                raise Exception("Wrong label: {}".format(label))
            writer.writerow(l + sentence)
def __init__(self, min_long=5):
    """
    To initialize a `Tokenizer` we need the minimum number of characters
    `min_long` that constitute a valid word.

    :param min_long: an integer. Defaults to five (5).
    """
    self.stemmer = SpanishStemmer()
    self.min_long = min_long
def find_top_N_words(lang_entries, top_N, lang):
    dictionary = Lang_Dictionary({}, lang)
    for player in lang_entries:
        for chat in player.c:
            language = {'eng': 0, 'spn': 0, 'other': 0, 'tot': 0}
            sentence = player.c[chat]
            newlist = player.c[chat].strip().split(' ')
            newlist = [x.strip("''") for x in newlist]
            # count how many tokens look English, Spanish, or other
            for word in newlist:
                language['tot'] += 1
                if word.lower() not in Lang_dicts.lang_index:
                    language['other'] += 1
                else:
                    word = Lang_dicts.lang_index[word.lower()]
                    if word == "english":
                        language['eng'] += 1
                    elif word == "spanish":
                        language['spn'] += 1
                    else:
                        language['other'] += 1
            if language['other'] < 2 * (language['spn'] + language['eng']):
                print(sentence)
            if language['spn'] > language['eng']:
                print("SPANISH")
                stemmer = SpanishStemmer()
            else:
                print("ENGLISH")
                stemmer = EnglishStemmer()
            aslist = []
            aslist += sentence
            sentence = ""
            j = ''.join(aslist)
            words = j.split(' ')
            for line in words:
                line = str(line).replace('\'', '')
                line = line.replace('""', '')
                line = line.replace('"', '')
                if len(line) > 0:
                    if language["other"] < 2 * (language['spn'] + language["eng"]):
                        sentence += stemmer.stem(line) + " "
                        print(sentence)
                    ## INEFFICIENT - looking through dictionary each time?
                    if line.lower() not in dictionary.d:
                        dictionary.d[line.lower()] = 0
                    dictionary.d[line.lower()] += 1
    ### wthCounts is a list of the word and its count
    wthCounts = []
    for (w, c) in dictionary.d.items():
        wthCounts += [(c, w)]
    ## wc is the wthCounts list, only sorted
    wc = sorted(wthCounts, reverse=True)
    return wc[:top_N]
def build_paragraph_inv_index(paragraphs, stem):
    p_index = {}
    stemmer = SpanishStemmer()
    for i, paragraph in enumerate(paragraphs):
        words = [word for word in paragraph.split() if word not in STOP_WORDS]
        for word in words:
            if stem:
                word = stemmer.stem(word)
            if word not in p_index:
                p_index[word] = []
            p_index[word].append(i)
    return p_index
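# Hypothetical usage sketch (not part of the original source). STOP_WORDS is the
# module-level stopword collection the function reads; a toy set is used here.
STOP_WORDS = {'la', 'de', 'el'}
paragraphs = ['la ley de aguas', 'reforma de la ley']
print(build_paragraph_inv_index(paragraphs, stem=True))
# maps each (optionally stemmed) term to the indices of the paragraphs containing it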
def __init__(self, lemma=False, stem=False):
    self.extra_dicts = Dicts()
    self.english_dict = enchant.Dict("en_EN")
    self.spanish_dict = enchant.Dict("es_AR")
    self.lemma = lemma
    self.stem = stem
    self.VARIANT_CLASS = 0
    self.SPANISH_CLASS = 1
    self.FOREIGN_CLASS = 2
    if lemma:
        self.lemmatizer = Lemmatizer()
    if stem:
        self.stemmer = SpanishStemmer()
def __init__(self, stem=False):
    dictionaries = dicts()
    path = '/home/alangb/TWPP'  # path to TreeTagger installation directory
    self.english_dict = enchant.Dict("en_EN")
    self.spanish_dict = enchant.Dict("es_ES")
    self.ND = dictionaries.norm
    self.SD = dictionaries.lemario
    self.PND = dictionaries.names
    self.stem = stem
    if stem:
        self.stemmer = SpanishStemmer()
    else:
        self.tagger = TreeTagger(TAGLANG='es', TAGDIR=path)
def __init__(self):
    self.reglasEntities.append(EmailRegla())
    self.reglasEntities.append(UrlRegla())
    self.reglasEntities.append(FechasRegla())
    self.reglasEntities.append(TelefonosRegla())
    self.reglasEntities.append(AbreviaturasRegla())
    self.reglasEntities.append(NombresPropiosRegla())
    self.reglasEntities.append(NumerosRegla())
    self.reglasDocumento.append(MinusculasRegla())
    self.reglasDocumento.append(TranslateRegla())
    self.reglasDocumento.append(LimpiarHtmlTagsRegla())
    self.reglasDocumento.append(LimpiadoBasicoRegla())
    self.reglasTokens.append(MinMaxCaracteresRegla())
    self.stemmer = SpanishStemmer()
def run_BM25_collection(output_dir, documents, queries, qrels, train, validation, test, k, language):
    if language == 'en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    elif language == 'fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    elif language == 'es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
    elif language == 'it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    corpus = []
    doc_indexes = []
    for key, value in documents.items():
        doc_indexes.append(key)
        doc = [stemmer.stem(elem) for elem in value.split(" ") if elem not in stop_words]
        # index the stemmed, stopword-filtered document, consistent with the
        # query-side processing in run_BM25_query
        corpus.append(doc)
    bm25 = BM25Okapi(corpus)
    print("Running BM25", flush=True)
    results = dict()
    for i, elem in enumerate(train):
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
        if i % 1000 == 0:
            print('Processing query', i, '/', len(train), flush=True)
    save_BM25_res(output_dir + '/training/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/training/BM25.qrels.csv', results, qrels, True)
    results = dict()
    for elem in validation:
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
    save_BM25_res(output_dir + '/validation/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/validation/BM25.qrels.csv', results, qrels, False)
    results = dict()
    for elem in test:
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
    save_BM25_res(output_dir + '/test/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/test/BM25.qrels.csv', results, qrels, False)
def __init__(self):
    self.tweets = 0
    self.related_tweets = 0
    self.stopwords = {}
    self.stemmers = {}
    self.stemmers["es"] = SpanishStemmer()
    self.stemmers["en"] = PorterStemmer()
    self.stemmers["fr"] = FrenchStemmer()
    self.stemmers["de"] = GermanStemmer()
    self.stopwords["es"] = self.load_stopwords_file("spanish_stopwords.txt")
    self.stopwords["en"] = self.load_stopwords_file("english_stopwords.txt")
    self.stopwords["fr"] = self.load_stopwords_file("french_stopwords.txt")
    # note: German stopwords are keyed "ge", unlike the "de" stemmer key
    self.stopwords["ge"] = self.load_stopwords_file("german_stopwords.txt")
    self.output_file = open(sys.argv[2], 'a')
def __init__(self, question, words, stem):
    self.question = question
    self.stem = stem
    self.stemmer = SpanishStemmer()
    self.words = words
    self.stemmed_words = self.stem_words(self.words)
    self.path_pfx = os.getcwd()
    self.inverted_index = self.load_doc_inverted_index()
    self.doc_names = self.init_doc_names()
    self.paragraph_indices = {}
    self.paragraph_inverted_indices = {}
    self.results = pd.DataFrame(columns=['text', 'law', 'score'])
    self.load_paragraph_indices()
    self.L = 23055.676666666666  # Manually obtained using bash
    self.scores = {'tf': {}, 'idf': {}, 'tfidf': {}, 'n_containing': {}, 'score': {}}
def generate_stopwords(stopname='stopSpanish.pkl'):
    """Build, pickle, and return a Spanish stopword list (stemmed forms included)."""
    stemmer = SpanishStemmer()
    stopwords_es = set(stopwords.words('spanish'))
    stopwords_es_sw = set(get_stop_words('spanish'))
    stopSpanishBeta = list(set(stopwords_es.union(stopwords_es_sw)))
    stopSpanish = set(stopwords_es.union(stopwords_es_sw))
    for stopWord in stopSpanishBeta:
        stopSpanish.add(stemmer.stem(stopWord))
    stopSpanish = list(stopSpanish)
    stopSpanish.extend(['tra', 'd', 'desc'])  # Add stopwords not present in the standard lists
    stopSpanish.remove('no')  # Keep 'no' to help identify negative categories
    with open(f'{resource_path}/{stopname}', 'wb') as f:
        pickle.dump(stopSpanish, f)
    return stopSpanish
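# Hypothetical usage sketch (not part of the original source). It assumes the
# module-level `resource_path` points to an existing, writable directory; the
# function both returns the combined stopword list and pickles it there.
stopSpanish = generate_stopwords('stopSpanish.pkl')
print(len(stopSpanish), 'no' in stopSpanish)  # 'no' was removed on purpose, so this prints False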
def build_index_from_words(words, stem):
    '''
    Takes:
      - words, a list of strings
      - stem, a boolean; if True each word is stemmed before counting

    Returns:
      - index, a dictionary with a count of times a word appears in the document
    '''
    index = {}
    stemmer = SpanishStemmer()
    for word in words:
        if word not in STOP_WORDS:
            if stem:
                word = stemmer.stem(word)
            if word not in index:
                index[word] = 0
            index[word] += 1
    return index
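# Hypothetical usage sketch (not part of the original source); STOP_WORDS is
# again assumed to be the module-level stopword set, here a toy one.
STOP_WORDS = {'la', 'de', 'el', 'y'}
print(build_index_from_words('la ley regula la pesca y la caza'.split(), stem=False))
# -> {'ley': 1, 'regula': 1, 'pesca': 1, 'caza': 1}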
class GigawordParser(StreamParser):
    STEMMERS = {
        "eng": PorterStemmer(ignore_stopwords=False),
        "spa": SpanishStemmer(),
    }

    def __init__(self, language):
        self.next_id = 0
        self.language = language
        self.stemmer = self.STEMMERS.get(language)
        if self.stemmer is None:
            raise Exception("Unsupported language %s" % language)

    def init_id_counter(self, initial):
        self.next_id = initial

    def new_id(self):
        new_id = self.next_id
        self.next_id += 1
        return new_id

    def parse_raw(self, xml_str):
        xml = minidom.parseString(xml_str)
        if self.language == "es":
            try:
                url = "gigaword:" + xml.getElementsByTagName(
                    "DOC")[0].attributes["id"].value
                title = xml.getElementsByTagName(
                    "HEADLINE")[0].firstChild.nodeValue
            except:
                url = "<NONE>"
                title = "<NONE>"
        else:
            url = "<NONE>"
            title = "<NONE>"
        text = stringio.StringIO()
        for node in xml.getElementsByTagName("TEXT")[0].childNodes:
            if len(node.childNodes) > 0:
                text.write(node.firstChild.nodeValue)
        content = text.getvalue()
        terms = text_to_terms(content, self.language)
        return RuwacDocument(self.new_id(), url, title, content, terms)
def lemmatize(self, text, lang):
    # spacy.prefer_gpu()
    # nlp = spacy.load(lang)  # en fr "en_core_web_sm"
    if lang == "fr":
        stemmer = FrenchStemmer()
    elif lang == "es":
        stemmer = SpanishStemmer()
    else:
        stemmer = EnglishStemmer()
    stemmed = []
    for word in text.split(" "):
        stemmed.append(stemmer.stem(word))
    # doc = nlp(u"" + text)
    # lem_terms = []
    # for token in doc:
    #     lem_terms.append(token.lemma_)
    return " ".join(stemmed)
def run_BM25_query(query, bm25, doc_indexes, k, language):
    if language == 'en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    elif language == 'fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    elif language == 'es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
    elif language == 'it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    tokenized_query = [stemmer.stem(elem) for elem in query.split(" ") if elem not in stop_words]
    doc_scores = bm25.get_scores(tokenized_query)
    top_k = np.argsort(doc_scores)[::-1][:k]
    results = [[doc_indexes[key], doc_scores[key]] for key in top_k]
    return results
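# Hypothetical usage sketch (not part of the original source): builds a tiny
# BM25Okapi index (rank_bm25) over two stemmed documents and scores one Spanish
# query. Assumes the NLTK 'stopwords' corpus has been downloaded.
from rank_bm25 import BM25Okapi
from nltk.stem.snowball import SpanishStemmer
from nltk.corpus import stopwords

_stop = set(stopwords.words('spanish'))
_stem = SpanishStemmer()
docs = {'d1': 'reforma de la ley de aguas', 'd2': 'presupuesto anual de salud'}
doc_indexes = list(docs.keys())
corpus = [[_stem.stem(w) for w in text.split(" ") if w not in _stop]
          for text in docs.values()]
bm25 = BM25Okapi(corpus)
print(run_BM25_query('ley de aguas', bm25, doc_indexes, k=2, language='es'))
# -> [[doc_id, score], ...] with the water-law document ranked first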
def process_election(lang, data_path, stopword_path, save_path):
    if lang == "English":
        stemmer = EnglishStemmer()
    elif lang == "Spanish":
        stemmer = SpanishStemmer()
    else:
        stemmer = None
    print("loading dataset")
    line_sentences = ProcessLineSentence(dataPath=data_path,
                                         label="election",
                                         stopwordPath=stopword_path,
                                         stemmer=stemmer)
    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        for sentence, label in line_sentences:
            if label == "yes":
                l = [1]
            else:
                l = [0]
            row = [w.encode('utf-8') for w in sentence]
            writer.writerow(l + row)
def getfeats(fields, o):
    """ This takes the word in question and the offset with respect to the instance word """
    word = fields[0]
    stemmer = SpanishStemmer()
    with_hyphen = 0
    if "-" in word:
        with_hyphen = 1
    with_apostrophe = 0
    if "'" in word:
        with_apostrophe = 1
    o = str(o)
    features = [
        (o + "word", word),
        (o + 'pos', fields[1]),
        #(o + 'prefix1', word[:1]),
        (o + 'prefix2', word[:2]),
        (o + 'prefix3', word[:3]),
        (o + 'prefix4', word[:4]),
        #(o + 'suffix1', word[-1:]),
        (o + 'suffix2', word[-2:]),
        (o + 'suffix3', word[-3:]),
        (o + 'suffix4', word[-4:]),
        (o + 'is_upper', word.isupper()),
        (o + 'is_title', word.istitle()),
        (o + 'is_digit', word.isdigit()),
        (o + 'with_hypen', with_hyphen),
        (o + 'with_apostrophe', with_apostrophe),
        (o + 'spanich_stem', stemmer.stem(word)),
        # (o + 'word_shape', word_shape(word))
    ]
    return features
def spanish_swadesh_list(stemmed=True):
    """
    Helper function that returns a list of strings with the stems of the
    Spanish Swadesh entries.
    """
    try:
        stemmer = SpanishStemmer(True)
    except:
        log.warn("Spanish stemmer could not be loaded!")
        return
    swadesh_entries = []
    for line in util.read_text_file(
            util.data_path('swadesh', 'swadesh_spa.txt'), lines=True):
        line = line.strip()
        for e in line.split(","):
            e = e.strip()
            if stemmed:
                stem = stemmer.stem(e)
                swadesh_entries.append(stem)
            else:
                swadesh_entries.append(e)
    return swadesh_entries
def tokenizer_stemmer_global(document):
    stemmer = SpanishStemmer()
    my_tokenizer = RegexpTokenizer(r"[\w']+")
    return [stemmer.stem(token) for token in my_tokenizer.tokenize(document)]
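# Hypothetical usage sketch (not part of the original source): tokenizes a raw
# Spanish sentence and returns one Snowball stem per token.
print(tokenizer_stemmer_global("Los niños estaban jugando en el parque"))
# e.g. 'jugando' is reduced to the stem 'jug'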
def __init__(self):
    super(StemmerProcessor, self).__init__()
    self.stemmer = SpanishStemmer()
There is a multitude of "stemmers"; here I will pick the Spanish one.
'''
from nltk.stem.snowball import SnowballStemmer, SpanishStemmer

# As with punkt, a data package has to be downloaded first
download('stopwords')

# If we do not know the language beforehand:
# SnowballStemmer(language, ignore_stopwords=False)
spanish_stem = SnowballStemmer("spanish", True)

# If we know the language in advance, we can import it directly:
# SpanishStemmer(ignore_stopwords=False)
spanish_stem = SpanishStemmer(True)

print(spanish_stem.stem("Comiendo"), spanish_stem.stem("Bailando"),
      spanish_stem.stem("bailar"), spanish_stem.stem("estantería"))

'''################################
# Recovering the original verb    #
####################################

Known as lemmatization. NLTK does not offer this for Spanish, only for English.
'''
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

# As with punkt, a data package has to be downloaded first
download('wordnet')
import csv
import collections
import operator
import unicodedata
import os

cwd = os.getcwd()
root = os.path.dirname(cwd)
lematizador_dir = os.path.join(root, "data", "lematizador", "lematizador.csv")
stopwords_dir = os.path.join(root, "data", "stopwords")

from spellchecker import SpellChecker  # https://pypi.org/project/pyspellchecker/

# Stemmer
from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer()

# Object for spelling correction
spell = SpellChecker(language="es")
metodo_desconocidas = spell.unknown
metodo_correccion = spell.correction

lista_blanca_regiones = [
    "Arica", "Parinacota", "Tarapacá", "Antofagasta", "Atacama", "Coquimbo",
    "Valparaíso", "Metropolitana", "Santiago", "Libertador", "General",
    "Bernardo", "O’Higgins", "Maule", "Ñuble", "Biobío", "Araucanía", "Ríos",
    "Lagos", "Aysén", "General", "Carlos", "Ibáñez", "Campo", "Magallanes",
    "Antártica"
]
lista_blanca_telecom = [
    'lte', 'whatsapp', 'instagram', 'telegram', 'youtube', 'facebook', 'entel',
    'bafi', 'resetea', 'samsung', 'huawei', 'iphone', 'kb', 'mb', 'pixi',
def do_stemmer(df, stop_language='spanish'):
    """Apply stop words and stemming."""
    ## Since tickets arrive in two languages, we add the words of both languages
    stop = get_stop_words(stop_language) + get_stop_words('english')
    ## We add our own words
    stop += [
        "buenas", "buenos", "cid", "dias", "gracias", "hola", "mucho", "mucha",
        "poder", "proyecto", "please", "saludo", "tardes", "www", "habia"
    ]
    stop += [
        'ahora', 'algun', 'alguna', 'amanecia interrumpio',
        'amanecia interrumpio relato', 'amanecia interrumpio relato habian',
        'amanecia interrumpio relato habian dado', 'aquel', 'asi', 'aun',
        'cada', 'vez', 'mas', 'cualquier', 'cosa', 'cuanto', 'dado', 'darse',
        'debe', 'debia', 'despues', 'dia noche', 'dia siguiente', 'diez años',
        'diez mil', 'dijo', 'dijo', 'dio', 'habia', 'mas', 'podia', 'podian',
        'mismo', 'si', 'tal', 'tan', 'puede', 'pueden ser', 'pues', 'puso',
        'toda', 'todas', 'vease tambien', 'primer lugar', 'varias', 'dos',
        'largo', 'hacia', 'uno', 'una', 'unos', 'una', 'aquella', 'aquello',
        'aquel', 'hace', 'muchas', 'mucho', 'muchos', 'mucha', 'pueden',
        'puedo', 'unas', 'abrio puerta', 'arriba abajo', 'aqui alla', 'habian',
        'doña', 'don', 'señor', 'señora', 'hizo', 'quedo', 'fuerza sino',
        'quedo perplejo', 'parece haber', 'parece ser', 'parecia haber',
        'mayor parte', 'mañana siguiente', 'media hora', 'hoy dia', 'iba ser',
        'iii pag', 'haber hecho', 'habria podido', 'hacer cosas',
        'hacia arriba', 'hacia atras', 'hacia puerta', 'hacia tiempo',
        'decir verdad', 'dejo caer', 'demasiado tarde', 'derecha izquierda',
        'di cuenta', 'dia anterior', 'dia noche', 'dia siguiente',
        'casi siempre', 'cierto dia', 'cierto modo', 'cinco años', 'aqui alla',
        'arriba abajo', 'aunque solo', 'año nuevo', 'años edad', 'buena parte',
        'ninguna parte', 'noche anterior', 'noche dia', 'nunca visto',
        'partido comunista', 'podria haber', 'podria ser', 'press cambridge',
        'primer lugar', 'quiere decir', 'quiero decir', 'sentido comun',
        'seria mejor', 'tras haber', 'tres años', 'tres cuatro', 'tres meses',
        'voz alta', 'voz baja',
    ]
    stop_words_generated_tokens = [
        'abajo', 'abrio', 'alla', 'alta', 'amanecia', 'anterior', 'aqui',
        'aren', 'arriba', 'atras', 'aunque', 'año', 'años', 'baja', 'buena',
        'caer', 'cambridge', 'can', 'casi', 'cierto', 'cinco', 'comun',
        'cosas', 'couldn', 'cuatro', 'cuenta', 'decir', 'dejo', 'demasiado',
        'di', 'dia', 'didn', 'diez', 'doesn', 'edad', 'haber', 'habria',
        'hacer', 'hacia', 'hadn', 'hasn', 'haven', 'hecho', 'hora', 'hoy',
        'iba', 'iii', 'isn', 'let', 'll', 'lugar', 'mayor', 'mañana', 'media',
        'mejor', 'meses', 'modo', 'mustn', 'ninguna', 'noche', 'nuevo',
        'nunca', 'pag', 'parece', 'parecia', 'parte', 'partido', 'podido',
        'podria', 'puerta', 'quiere', 'quiero', 're', 'relato', 'sentido',
        'ser', 'seria', 'shan', 'shouldn', 'siempre', 'siguiente', 'sino',
        'solo', 'tambien', 'tarde', 'tiempo', 'tras', 'tres', 've', 'vease',
        'visto', 'wasn', 'weren', 'won', 'wouldn'
    ]
    stop += stop_words_generated_tokens

    ps = SpanishStemmer()
    a = []
    df["stem"] = "n"
    for i, row in df.iterrows():
        a.append(
            ps.stem(row["text"]).replace('fuerza sino', '').replace(
                'acceder', 'acceso').replace('user', 'usuario').replace(
                    'access', 'acceso').replace('usuarios', 'usuario').replace(
                        'abrio puerta', '').replace('acto seguido', ''))
    df["stem"] = a
    return df, stop
# The next step is to connect Spanish translations that contain the same stem. For this we first remove certain stop words from the translation (list of stopwords from NLTK). There are two cases then: just one word remains, or more than one word remains.
#
# We have two options for what to do with the latter: either they are not connected with anything at all (default behaviour), or each word is stemmed and the translation is connected with every other translation that contains the same stems. Right now this results in many connections that do not look very useful. This should be done in a more intelligent way in the future (for example, find the heads of phrases in multiword expressions and only connect those; split the weight of the connections between all stems and work with weighted graphs from this step on; ...).
#
# To connect the Spanish translations the script adds additional "stem nodes" to the graph. The name of these nodes consists of a Spanish word stem plus a pipe symbol plus the string "stem". These nodes look like this in a dot file:
#
# > "tom|stem" [is_stem=True];
#
# The introduction of these nodes later facilitates the output of translation matrices, as you can just search for stems within the graph and only output direct neighbours with Spanish translations. It would also be possible to directly connect the Spanish translations if they have a matching stem, but then the graph traversal to find matching translations and their heads is a bit more complex later.
#
# First we create a stemmer object from the SpanishStemmer in NLTK:

# <codecell>

from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer(True)

# <markdowncell>

# We create the list of stopwords and encode them as unicode strings:

# <codecell>

combined_graph_stemmed = copy.deepcopy(combined_graph)
stopwords = nltk.corpus.stopwords.words("spanish")
stopwords = [w.decode("utf-8") for w in stopwords]

# <markdowncell>

# Then we loop through all the nodes of the merged graph and add the stem nodes to each Spanish node. If the node has only one word (after stopword removal) we will use the NLTK stemmer; otherwise we just leave the phrase as it is:
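# <markdowncell>

# Below is a minimal sketch of that loop (the original cell is not shown here). It assumes `combined_graph_stemmed` is a networkx graph and that Spanish translation nodes can be recognized somehow; the `is_spanish_node` helper is hypothetical and stands in for whatever check the real notebook uses.

# <codecell>

for node in list(combined_graph_stemmed.nodes()):
    if not is_spanish_node(node):  # hypothetical check for Spanish translation nodes
        continue
    words = [w for w in node.split() if w not in stopwords]
    if len(words) == 1:
        # single remaining word: attach a "<stem>|stem" node as described above
        stem_node = stemmer.stem(words[0]) + "|stem"
        combined_graph_stemmed.add_node(stem_node, is_stem=True)
        combined_graph_stemmed.add_edge(stem_node, node)
    # multiword phrases are left as they are (default behaviour)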
def stemmer_all(tweet):
    stm = SpanishStemmer()
    split_tweet = [word for word in tweet.lower().split(' ') if word.strip()]
    return ' '.join([stm.stem(word.strip()) for word in split_tweet])
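# Hypothetical usage sketch (not part of the original source): extra whitespace
# is dropped by the strip() check and every remaining token is stemmed.
print(stemmer_all("Me  ENCANTA   la música latina"))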
def remove_stopwords(text, stopSpanish):
    stemmer = SpanishStemmer()
    textList = text.split()
    textList = [word for word in textList if word not in stopSpanish]
    return ' '.join([stemmer.stem(word) for word in textList])
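# Hypothetical usage sketch (not part of the original source), using a tiny
# hand-made stopword list instead of the pickled one from generate_stopwords():
stopSpanish = ['me', 'la', 'de']
print(remove_stopwords("No me gusta la nueva factura", stopSpanish))
# stopwords are dropped and the remaining words are stemmed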
# -*- coding: utf-8 -*-
"""
Created on Mon May 6 16:07:59 2019

@author: Turing
"""
from bs4 import BeautifulSoup as Soup
from _pickle import dump
from nltk.stem.snowball import SpanishStemmer

handler = open('senticon.es.xml', encoding="utf-8").read()
soup = Soup(handler, 'lxml')
diccionario_polaridad = {}
ss = SpanishStemmer()

for lemma in soup.find_all('lemma'):
    palabra = lemma.get_text()
    polaridad = float(lemma.attrs["pol"])
    diccionario_polaridad[ss.stem(palabra.replace(' ', '')).lower()] = polaridad

output = open("diccionario_polaridades.pk1", "wb")
dump(diccionario_polaridad, output, -1)
output.close()
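# Hypothetical usage sketch (not part of the original source): reload the
# pickled polarity dictionary and look up the polarity of a stemmed lemma;
# .get() returns None if the lemma is not in the lexicon.
from _pickle import load
with open("diccionario_polaridades.pk1", "rb") as f:
    polaridades = load(f)
print(polaridades.get(ss.stem("alegría").lower()))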
def get_vector_matrix(self, freq_floor=50, context_words=3):
    nlp = es_core_web_md.load()
    STOPWORDS = spacy.es.STOP_WORDS

    def _clean_sent(sent):
        clean_sent = []
        # remove stopwords and digits
        for word in sent:
            word = word.lower()
            if not word in STOPWORDS:
                if not word.isdigit():
                    clean_sent.append(word)
        return clean_sent

    def _update_feature(word, feature_name, features):
        " dirty update of features "
        counts = 1
        if word in vectors:
            if feature_name in vectors[word]:
                counts = vectors[word][feature_name] + 1
        features[feature_name] = counts
        return features

    def _update_counts(feature_name, f_counts):
        counts = 1
        if feature_name in f_counts:
            counts = f_counts[feature_name] + 1
        f_counts[feature_name] = counts
        return f_counts

    sents = self.corpus.get_sents()
    stemmer = SpanishStemmer()
    # will use the words as keys and dict of features as values
    vectors = {}
    #freq_counts = {}
    for sent in sents:
        # TODO: PARALLELIZE!!
        #for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
        # drop stopwords to build the pool of context_words
        cleaned_sent = _clean_sent(sent)
        doc = nlp(' '.join(sent))
        for word_idx in range(len(doc)):
            # get the word and the POS tag
            spacy_word = doc[word_idx]
            word = spacy_word.text.lower()
            pos_tag = spacy_word.pos_
            if len(word) <= 2:
                continue
            if word in STOPWORDS:
                continue
            if word.isdigit():
                continue
            # if the word has not been seen yet
            if not word in vectors:
                features = {}
            else:
                features = vectors[word]
            # counts of frequency, to normalize later
            #freq_counts = _update_counts(pos_tag, freq_counts)

            # context-related features (POS tags and stemmed words)
            features = _update_feature(word, pos_tag, features)
            if word_idx > 0:
                prev_tag = doc[word_idx - 1].pos_
                feature_name = prev_tag + '_pos_prev'
                features = _update_feature(word, feature_name, features)
            if word_idx < len(sent) - 1:
                post_tag = doc[word_idx + 1].pos_
                feature_name = post_tag + '_pos_post'
                features = _update_feature(word, feature_name, features)

            # dependency features; the head of the dependency is stemmed
            dep_type = spacy_word.dep_
            if dep_type != 'ROOT':
                dep_obj = stemmer.stem(spacy_word.head.text.lower())
                feature_name = 'DEP:' + dep_type + '-' + dep_obj
                features = _update_feature(word, feature_name, features)

            # take n words from the context as features (stemmed...!)
            for i in range(context_words):
                ctxt_word = (random.choice(cleaned_sent))
                feature_word = stemmer.stem(ctxt_word)
                feature_name = ctxt_word + '_ctxt_word'
                features = _update_feature(word, feature_name, features)

            # TODO: add a synset (wordnet) feature
            features['word'] = word
            # frequency counting
            features = _update_feature(word, 'freq', features)
            vectors[word] = features

    # drop words whose frequency is below freq_floor
    words_to_pop = set()
    for word, f_dict in vectors.items():
        if f_dict['freq'] <= freq_floor:
            words_to_pop.add(word)
    for word in words_to_pop:
        vectors.pop(word)
    for word, f_dict in vectors.items():
        #print(word, f_dict)
        f_dict['freq'] = 0
        vectors[word] = f_dict  # delete an irrelevant dimension!

    # TODO: normalize the POS contexts
    #for word, f_dict in vectors.items():
    #    f_dict[]
    # TODO: add the context word... LEMMATIZED!
    # TODO: NORMALIZE ALL THE CONTEXTS! -> dictionary of frequencies of ALL the
    # features that occurred

    self.words = list(vectors.keys())  # thankfully in the same order as vectors.values
    vectorizer = DictVectorizer(dtype=numpy.int32)
    vec_matrix = vectorizer.fit_transform(list(vectors.values()))
    vectors_shape = vec_matrix.get_shape()
    print(vectors_shape)
    """
    freqs_vector = vectorizer.transform(freq_counts)
    vec_matrix = vstack([freqs_vector, vec_matrix])
    print(s.get_shape)
    print(s)
    print(vectorizer.inverse_transform(s))
    """
    # normalization
    vec_matrix = normalize(vec_matrix, copy=False)

    ####### unsupervised dimensionality reduction
    # reduce dimensionality with a variance threshold
    #selector = VarianceThreshold(threshold=0.0)
    #vec_matrix = selector.fit_transform(vec_matrix)

    # SVD (PCA)
    Trunc_svd = TruncatedSVD(n_components=1500)
    vec_matrix = Trunc_svd.fit_transform(vec_matrix)

    # reduce dimensionality keeping a percentile of the variance
    #selected = SelectPercentile(chi2, percentile=10)
    #word_vecs_new = selected.fit_transform(new_word_vecs, target_vec)

    print(vectorizer.inverse_transform(vec_matrix))  # -> to see the features!
    return self.words, vec_matrix
def configurar_modelo(X, y, idioma='SPANISH', fitness=accuracy_score):
    """
    Finds the best support vector machine for the problem.

    Parameters
    ----------
    X : list of str
        List with the texts of the tweets to train on.
    y : list of int
        Corresponding list with the class of each element of the sample.
    idioma : str
        'SPANISH' or 'ENGLISH', depending on the language of the sample.
    fitness : function(y1, y2)
        Function used to compute the fitness of the classifiers. It receives two
        parameters, where the first corresponds to the true class labels and the
        second to the predicted ones. It must return a numeric value, where
        larger numbers mean better fitness.
    """
    # Rounding constant for printed scores
    round_num = 4

    # Prepare the data
    # Tokenizer
    my_tokenizer = RegexpTokenizer(r"[\w']+")
    if idioma == 'SPANISH':
        stemmer = SpanishStemmer()
    elif idioma == 'ENGLISH':
        stemmer = EnglishStemmer()

    # Stemming tokenizer
    def tokenizer_stemmer(document):
        return [stemmer.stem(token) for token in my_tokenizer.tokenize(document)]

    hash_vectorizer = HashingVectorizer(analyzer="word",
                                        tokenizer=tokenizer_stemmer,
                                        preprocessor=None,
                                        #stop_words=stopwords.words(idioma),
                                        n_features=10000,
                                        strip_accents='ascii',
                                        encoding='utf-8',
                                        ngram_range=(1, 3))
    count_vectorizer = CountVectorizer(analyzer="word",
                                       tokenizer=tokenizer_stemmer,
                                       preprocessor=None,
                                       #stop_words=stopwords.words(idioma),
                                       strip_accents='ascii',
                                       encoding='utf-8',
                                       ngram_range=(1, 3))
    # NOTE: the hashing vectorizer is replaced by the count vectorizer here
    hash_vectorizer = count_vectorizer
    X_svm = hash_vectorizer.fit_transform(X)
    X_mnb = count_vectorizer.fit_transform(X)

    # Parameters ('prueba' is a module-level flag selecting a reduced test setup)
    if not prueba:
        option_machines = ['linear', 'polynomial', 'rbf', 'sigmoid', 'bayes']
        num_ite = 5
        C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
        coef = [-10, 0, 10]
        degrees = [2, 3, 4, 5]
        gamma = [0.1, 1, 10]
        alpha = [0, 0.5, 1, 5, 10]
    else:
        option_machines = ['linear', 'bayes']
        num_ite = 2
        C = [10]
        coef = [0]
        degrees = [2]
        gamma = [0.1]
        alpha = [1]

    machines = {}
    final_scores = {}

    if 'linear' in option_machines:
        # Linear kernel
        configurations = list(itertools.product(C))
        scores = {}
        print('Linear Kernel')
        for i in range(num_ite):
            print(' Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            # Split the data
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X_svm, y, test_size=0.2)
            for conf in configurations:
                clf = svm.SVC(C=conf[0], kernel='linear', gamma='scale',
                              decision_function_shape='ovo')
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_test)
                score = fitness(y_test, y_predicted)
                if conf not in scores:
                    scores[conf] = 0
                scores[conf] = scores[conf] + score
                print(' Configuration: C = ' + str(conf[0]) +
                      ' -- Score: ' + str(np.round(score, round_num)))
        scores = {k: v / num_ite for k, v in scores.items()}
        print(' ')
        conf = max(scores.items(), key=operator.itemgetter(1))[0]
        print(' Max Score: ' + str(scores[conf]))
        final_scores['linear'] = scores[conf]
        machines['linear'] = svm.SVC(C=conf[0], kernel='linear', gamma='scale',
                                     decision_function_shape='ovo')
        print(' ')

    if 'polynomial' in option_machines:
        # Polynomial kernel
        configurations = list(itertools.product(C, degrees, coef))
        scores = {}
        print('Polynomial Kernel')
        for i in range(num_ite):
            print(' Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            # Split the data
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X_svm, y, test_size=0.2)
            for conf in configurations:
                clf = svm.SVC(C=conf[0], kernel='poly', gamma='scale',
                              decision_function_shape='ovo', degree=conf[1],
                              coef0=conf[2])
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_test)
                score = fitness(y_test, y_predicted)
                if conf not in scores:
                    scores[conf] = 0
                scores[conf] = scores[conf] + score
                print(' Configuration: C = ' + str(conf[0]) + ', deg = ' +
                      str(conf[1]) + ' coef = ' + str(conf[2]) +
                      ' -- Score: ' + str(np.round(score, round_num)))
        scores = {k: v / num_ite for k, v in scores.items()}
        print(' ')
        conf = max(scores.items(), key=operator.itemgetter(1))[0]
        print(' Max Score: ' + str(scores[conf]))
        final_scores['polynomial'] = scores[conf]
        machines['polynomial'] = svm.SVC(C=conf[0], kernel='poly', gamma='scale',
                                         decision_function_shape='ovo',
                                         degree=conf[1], coef0=conf[2])
        print(' ')

    if 'rbf' in option_machines:
        # RBF kernel
        configurations = list(itertools.product(C, gamma))
        scores = {}
        print('RBF Kernel')
        for i in range(num_ite):
            print(' Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            # Split the data
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X_svm, y, test_size=0.2)
            for conf in configurations:
                clf = svm.SVC(C=conf[0], kernel='rbf', gamma=conf[1],
                              decision_function_shape='ovo')
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_test)
                score = fitness(y_test, y_predicted)
                if conf not in scores:
                    scores[conf] = 0
                scores[conf] = scores[conf] + score
                print(' Configuration: C = ' + str(conf[0]) + ', gamma = ' +
                      str(conf[1]) + ' -- Score: ' + str(np.round(score, round_num)))
        scores = {k: v / num_ite for k, v in scores.items()}
        print(' ')
        conf = max(scores.items(), key=operator.itemgetter(1))[0]
        print(' Max Score: ' + str(scores[conf]))
        final_scores['rbf'] = scores[conf]
        machines['rbf'] = svm.SVC(C=conf[0], kernel='rbf', gamma=conf[1],
                                  decision_function_shape='ovo')
        print(' ')

    if 'sigmoid' in option_machines:
        # Sigmoid kernel
        configurations = list(itertools.product(C, coef))
        scores = {}
        print('Sigmoid Kernel')
        for i in range(num_ite):
            print(' Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            # Split the data
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X_svm, y, test_size=0.2)
            for conf in configurations:
                clf = svm.SVC(C=conf[0], kernel='sigmoid', gamma='scale',
                              decision_function_shape='ovo', coef0=conf[1])
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_test)
                score = fitness(y_test, y_predicted)
                if conf not in scores:
                    scores[conf] = 0
                scores[conf] = scores[conf] + score
                print(' Configuration: C = ' + str(conf[0]) + ' coef = ' +
                      str(conf[1]) + ' -- Score: ' + str(np.round(score, round_num)))
        scores = {k: v / num_ite for k, v in scores.items()}
        print(' ')
        conf = max(scores.items(), key=operator.itemgetter(1))[0]
        print(' Max Score: ' + str(scores[conf]))
        final_scores['sigmoid'] = scores[conf]
        machines['sigmoid'] = svm.SVC(C=conf[0], kernel='sigmoid', gamma='scale',
                                      decision_function_shape='ovo', coef0=conf[1])
        print(' ')

    if 'bayes' in option_machines:
        # Multinomial naive Bayes
        configurations = list(itertools.product(alpha))
        scores = {}
        print('Multinomial Bayes')
        for i in range(num_ite):
            print(' Iteration ' + str(i + 1) + ' of ' + str(num_ite))
            # Split the data
            X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
                X_mnb, y, test_size=0.2)
            for conf in configurations:
                clf = MultinomialNB(alpha=conf[0])
                clf.fit(X_train, y_train)
                y_predicted = clf.predict(X_test)
                score = fitness(y_test, y_predicted)
                if conf not in scores:
                    scores[conf] = 0
                scores[conf] = scores[conf] + score
                print(' Configuration: alpha = ' + str(conf[0]) +
                      ' -- Score: ' + str(np.round(score, round_num)))
        scores = {k: v / num_ite for k, v in scores.items()}
        print(' ')
        conf = max(scores.items(), key=operator.itemgetter(1))[0]
        print(' Max Score: ' + str(scores[conf]))
        final_scores['bayes'] = scores[conf]
        machines['bayes'] = MultinomialNB(alpha=conf[0])
        print(' ')

    mac = max(final_scores.items(), key=operator.itemgetter(1))[0]
    final_machine = machines[mac]
    print('')
    print('Best Machine: ' + str(mac))
    print('Score: ' + str(final_scores[mac]))
    if mac == 'bayes':
        final_vectorizer = count_vectorizer
    else:
        final_vectorizer = hash_vectorizer
    clasificador = EncapsularClasificador(final_machine)

    class_results = {}
    class_results['accuracy'] = []
    unique_classes = np.unique(y).tolist()
    for cla in unique_classes:
        class_results[cla] = {}
        class_results[cla]['precision'] = []
        class_results[cla]['recall'] = []

    # Constructs final statistics
    print('')
    print('Constructing Final Statistics')
    print('')
    for i in range(num_ite):
        print('Iteration ' + str(i + 1) + ' of ' + str(num_ite))
        maquina = EncapsularClasificador.clone(clasificador)
        X_train, X_test, y_train, y_test = EntrenarModelo.balanced_split(
            X, y, test_size=0.2)
        maquina.fit(X_train, y_train)
        y_predicted = maquina.predict(X_test)
        class_results['accuracy'].append(accuracy_score(y_test, y_predicted))
        for cla in unique_classes:
            # precision
            sub_test = y_test[y_predicted == cla]
            precision = np.sum(sub_test == cla) / (max(len(sub_test), 1))
            #print('Precision for class ' + str(cla) + ': ' + str(precision))
            class_results[cla]['precision'].append(precision)
            # recall
            sub_test = y_predicted[y_test == cla]
            recall = np.sum(sub_test == cla) / (max(len(sub_test), 1))
            #print('Recall for class ' + str(cla) + ': ' + str(recall))
            class_results[cla]['recall'].append(recall)

    class_results_consolidated = {}
    class_results_consolidated['accuracy'] = np.round(
        100 * np.mean(class_results['accuracy']), 3)
    for cla in unique_classes:
        class_results_consolidated[cla] = {}
        precision = np.round(100 * np.mean(class_results[cla]['precision']), 3)
        recall = np.round(100 * np.mean(class_results[cla]['recall']), 3)
        class_results_consolidated[cla]['precision'] = precision
        class_results_consolidated[cla]['recall'] = recall
    pprint.pprint(class_results_consolidated, width=1)

    return sklearn.base.clone(final_machine)