def ProcessCorpus(V, L):
    try:
        stem = SpanishStemmer()
        for l in stdin:
            l = l.split()
            if len(l) < 3: 
                stderr.write('Warning: Short line: \"%s\"\n' % ' '.join(l))
                continue
            tid = l[0]
            uid = l[1]
            lv = [0 for w in V]
            for w in l[2:]:
                w = stem.stem(w.decode('utf-8'))
                d = V.get(w, None)
                if d is None: 
                    #stderr.write('Warning: \"%s\" not in the lexicon\n' % w);
                    continue
                lv[d] = lv[d] + 1
            if sum(lv) == 0:
                stderr.write('Warning: %s with null vector. Label: %d\n' % (tid, L[l[0]]) )
            stdout.write('%d ' % L[l[0]])
            for i in range(len(lv)):
                stdout.write('%d:%d ' % (i+1, lv[i]))
            stdout.write('# %s %s\n' % (tid, uid))
        return 0
    except Exception as ex:
        stderr.write('Exception: %s\n' % repr(ex))
        return 1
Example #2
    def __init__(self,
                 language='en',
                 database_name='memory',
                 memory_table='memory',
                 listen_log_table='listen_log',
                 speak_log_table='speak_log'):
        super().__init__(language, database_name, memory_table,
                         listen_log_table, speak_log_table)

        try:
            json_file = open('modelo_gustos.json', 'r')
            loaded_model_json = json_file.read()
            json_file.close()

            self.model = model_from_json(loaded_model_json)
            self.model.load_weights("modelo_gustos.h5")
            self.model.compile(loss='mean_squared_error',
                               optimizer='adam',
                               metrics=['binary_accuracy'])
        except Exception:
            print('****ERROR: Error cargando modelo...****')

        self.stemmer = SpanishStemmer()
        self.words = [
            '¿qu', '?', 'peli', 'pelis', 'color', 'favorit', 'leer', 'libr',
            'novel', 'ver', 'prefier', 'gust', 'pelicul', 'jug', '¿cual',
            'prefer', 'jueg', 'com', 'plat', 'animal', 'videojueg'
        ]
        self.classes = [
            'comida', 'color', 'animal', 'juego', 'libro', 'película'
        ]
Example #3
 def __init__(self, min_long=5):
     """
     Para inicializar un `Tokenizer` es necesario saber el tamaño mínimo de caracteres
     `min_long`. que constituyen una palabra válidad
     :param min_long: un entero. Por defecto igual a cinco (5)
     """
     self.stemmer = SpanishStemmer()
     self.min_long = min_long
Example #4
def find_top_N_words(lang_entries, top_N, lang):
    dictionary = Lang_Dictionary({}, lang)
    for player in lang_entries:
        for chat in player.c:
            language = {'eng': 0, 'spn': 0, 'other': 0, 'tot': 0}
            sentence = player.c[chat]
            newlist = player.c[chat].strip().split(' ')
            newlist = [x.strip("''") for x in newlist]
            for word in newlist:
                language['tot'] += 1
                if word.lower() not in Lang_dicts.lang_index:
                    language['other'] += 1
                else:
                    word = Lang_dicts.lang_index[word.lower()]
                    if word == "english":
                        language['eng'] += 1
                    elif word == "spanish":
                        language['spn'] += 1
                    else:
                        language['other'] += 1
            if language['other'] < 2 * (language['spn'] + language['eng']):
               print(sentence)
               if language['spn'] > language['eng']:
                   print("SPANISH")
                   stemmer = SpanishStemmer()
               else:
                   print("ENGLISH")
                   stemmer = EnglishStemmer()
            aslist = []
            aslist += sentence
            sentence =""
            j = ''.join(aslist)
            words = j.split(' ')
            for line in words:
                line = str(line).replace('\'', '')
                line = line.replace('""', '')
                line = line.replace('"', '')
                if len(line) > 0:
                    if language["other"] < 2 * (language['spn'] + language["eng"]):
                        sentence += stemmer.stem(line) + " "
                        print(sentence)
                    ##INEFFICIENT - looking through dictionary each time?
                    if line.lower() not in dictionary.d:
                        dictionary.d[line.lower()] = 0
                    dictionary.d[line.lower()] += 1

    ###wthCounts is a list of the word and its count
    wthCounts = []
    for (w, c) in dictionary.d.items():
        wthCounts += [(c,w)]
    ##wc is the wthCounts list only sorted
    wc = sorted(wthCounts, reverse=True)
    return wc[:top_N]
def build_paragraph_inv_index(paragraphs, stem):
    p_index = {}
    stemmer = SpanishStemmer()
    for i, paragraph in enumerate(paragraphs):
        words = [word for word in paragraph.split() if word not in STOP_WORDS]
        for word in words:
            if stem:
                word = stemmer.stem(word)
            if word not in p_index:
                p_index[word] = []
            p_index[word].append(i)
    return p_index
Example #6
 def __init__(self, lemma=False, stem=False):
     self.extra_dicts = Dicts()
     self.english_dict = enchant.Dict("en_EN")
     self.spanish_dict = enchant.Dict("es_AR")
     self.lemma = lemma
     self.stem = stem
     self.VARIANT_CLASS = 0
     self.SPANISH_CLASS = 1
     self.FOREIGN_CLASS = 2
     if lemma:
         self.lemmatizer = Lemmatizer()
     if stem:
         self.stemmer = SpanishStemmer()
Example #7
 def __init__(self, stem=False):
     dictionaries = dicts()
     path = '/home/alangb/TWPP'  # path to TreeTagger installation directory
     self.english_dict = enchant.Dict("en_EN")
     self.spanish_dict = enchant.Dict("es_ES")
     self.ND = dictionaries.norm
     self.SD = dictionaries.lemario
     self.PND = dictionaries.names
     self.stem = stem
     if stem:
         self.stemmer = SpanishStemmer()
     else:
         self.tagger = TreeTagger(TAGLANG='es', TAGDIR=path)
    def __init__(self):
        # The rule lists are assumed to start empty (their creation is not shown in the original fragment)
        self.reglasEntities = []
        self.reglasDocumento = []
        self.reglasTokens = []

        self.reglasEntities.append(EmailRegla())
        self.reglasEntities.append(UrlRegla())
        self.reglasEntities.append(FechasRegla())
        self.reglasEntities.append(TelefonosRegla())
        self.reglasEntities.append(AbreviaturasRegla())
        self.reglasEntities.append(NombresPropiosRegla())
        self.reglasEntities.append(NumerosRegla())
        self.reglasDocumento.append(MinusculasRegla())
        self.reglasDocumento.append(TranslateRegla())
        self.reglasDocumento.append(LimpiarHtmlTagsRegla())
        self.reglasDocumento.append(LimpiadoBasicoRegla())
        self.reglasTokens.append(MinMaxCaracteresRegla())
        self.stemmer = SpanishStemmer()
def process_violence(lang, data_path, stopword_path, save_path):
    if lang == "English":
        stemmer = EnglishStemmer()
    elif lang == "Spanish":
        stemmer = SpanishStemmer()
    else:
        stemmer = None

    print "loading dataset"
    line_sentences = ProcessLineSentence(dataPath=data_path,
                                         label="violence",
                                         stopwordPath=stopword_path,
                                         stemmer=stemmer)

    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        for sentence, label in line_sentences:
            if label == "no":
                l = [0]
            elif label == "violence":
                l = [1]
            elif label == "malpractice":
                l = [2]
            else:
                raise (Exception("Wrong label: {}".format(label)))

            writer.writerow(l + sentence)
Example #10
class Tokenizer(object):
    """
    This class is in charge of obtaining the words from the
    documents retrieved by the `Crawler`
    """

    def __init__(self, min_long=5):
        """
        Para inicializar un `Tokenizer` es necesario saber el tamaño mínimo de caracteres
        `min_long`. que constituyen una palabra válidad
        :param min_long: un entero. Por defecto igual a cinco (5)
        """
        self.stemmer = SpanishStemmer()
        self.min_long = min_long

    def obtener_palabras(self, contenido):
        """
        Este método devuelve una lista de palabras recuperadas del `contenido`
        :param contenido: una cadena con el contenido de texto del documento
        :return: una lista de cadenas de caracteres representando las palabras
        """
        # Realizo el stemming en todo el contenido. Eliminando acentos mayusculas y dejando las raices.
        cont_stemed = self.stemmer.stem(contenido)
        # Divido el texto por palabras eliminando las repetidas
        conjunto_palabras = set(re.split(r'\W+', cont_stemed))
        # Elimino Stopwords, palabras menores a min_long y retorno lista
        return [palabra for palabra in conjunto_palabras if
                palabra not in stopwords.words('spanish') and not len(palabra) < self.min_long]
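
# A minimal usage sketch of Tokenizer (an illustrative addition: it assumes the imports the class
# relies on, namely re, nltk.corpus.stopwords and SpanishStemmer, are in place and that the NLTK
# 'stopwords' corpus has been downloaded; the sample sentence is arbitrary):
tokenizer = Tokenizer(min_long=5)
palabras = tokenizer.obtener_palabras("Los documentos recuperados contienen demasiadas palabras repetidas")
print(palabras)  # roughly: the stemmed words of at least 5 characters, with stopwords removed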
Example #11
class ConceptComparerSpanishStem(ConceptComparerBase):
    """
    Implementation of a concept comparer based on a stemmer for spanish.

    Parameters
    ----------
    None.

    Notes
    -----
    This is a sub-class of
    :py:class:`~lingpy.meaning.concepts.ConceptComparerBase`. It uses a simple
    match of the stem of a given (spanish) string against a given context
    (that is supposed to be a stemmed spanish word stem).

    See also
    --------
    ConceptComparerBase
    ConceptGraph
    """

    def __init__(self):
        self.stemmer = SpanishStemmer(True)
        self.re_brackets = re.compile(r" ?\([^)]*\)")

    def compare_to_concept(self, element, concept):
        """Compares a given element to a concept.

        Parameters
        ----------
        element : str
            The string (for example a lexical item: head or translation) to
            compare to the concept.
        concept : str or object
            The concept to compare to.

        Return
        ------
        match : bool
            True if element matches the given concept, False otherwise.

        Notes
        -----
        The `element` is supposed to be a spanish word, the concept a stemmed
        entry of the spanish Swadesh List.

        See also
        --------
        spanish_swadesh_list

        """
        element = self.re_brackets.sub("", element)
        element = element.strip()
        if not " " in element:
            stem = self.stemmer.stem(element)
            if stem == concept:
                return True
        return False
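
# A minimal usage sketch (hedged: it assumes SpanishStemmer is imported as above, and that the
# concept passed in is already a stemmed Spanish Swadesh entry, e.g. one produced by
# spanish_swadesh_list()):
comparer = ConceptComparerSpanishStem()
concept = SpanishStemmer(True).stem("beber")                  # stemmed concept, e.g. 'beb'
print(comparer.compare_to_concept("beber", concept))          # True: single word, stems match
print(comparer.compare_to_concept("dar de beber", concept))   # False: multi-word elements never match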
    def __init__(self, question, words, stem):
        self.question = question
        self.stem = stem
        self.stemmer = SpanishStemmer()
        self.words = words
        self.stemmed_words = self.stem_words(self.words)
        self.path_pfx = os.getcwd()

        self.inverted_index = self.load_doc_inverted_index()
        self.doc_names = self.init_doc_names()
        self.paragraph_indices = {}
        self.paragraph_inverted_indices = {}
        self.results = pd.DataFrame(columns=['text', 'law', 'score'])
        self.load_paragraph_indices()

        self.L = 23055.676666666666  #Manually obtained using bash
        self.scores = {'tf': {}, 'idf':{}, 'tfidf':{},'n_containing':{},\
              'score':{}}
Example #13
class OOVclassifier(object):
    def __init__(self, stem=False):
        dictionaries = dicts()
        path = '/home/alangb/TWPP'  # path to TreeTagger installation directory
        self.english_dict = enchant.Dict("en_EN")
        self.spanish_dict = enchant.Dict("es_ES")
        self.ND = dictionaries.norm
        self.SD = dictionaries.lemario
        self.PND = dictionaries.names
        self.stem = stem
        if stem:
            self.stemmer = SpanishStemmer()
        else:
            self.tagger = TreeTagger(TAGLANG='es', TAGDIR=path)

    def dictionary_lookup(self, word):
        result = (word in self.SD or word in self.PND
                  or word in self.ND.values())
        return result

    def affix_check(self, word):
        result = False
        if word.islower() or word.istitle():
            if self.stem:
                n = len(word)
                stem = self.stemmer.stem(word)
                # compare with first substring of length n of each word in SD
                for w in [x[:n] for x in self.SD if len(x) >= n]:
                    result = (word == w)
                    if result:
                        break
            else:
                lemma = make_tags(self.tagger.tag_text(word))[0].lemma
                result = self.dictionary_lookup(lemma)
        return result

    def check(self, word):
        result = self.spanish_dict.check(word)
        if not result:
            result = self.dictionary_lookup(word) or self.affix_check(word)
        return result

    def check_NoES(self, word):
        result = False
        if len(word) > 1:
            result = self.english_dict.check(word)
        return result

    def classify(self, word):
        if self.check(word):
            result = 1
        elif self.check_NoES(word):
            result = 2
        else:
            result = 0
        return result
def build_index_from_words(words, stem):
    '''
    Takes:
    - words, a list of strings
    - stem, a boolean; if True, each word is stemmed before counting

    Returns:
    - index, a dictionary with a count of the times each (non-stopword)
      word appears in the document
    '''
    index = {}
    stemmer = SpanishStemmer()
    for word in words:
        if word not in STOP_WORDS:
            if stem:
                word = stemmer.stem(word)
            if word not in index:
                index[word] = 0
            index[word] += 1
    return index
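
# A minimal usage sketch (assumes STOP_WORDS and SpanishStemmer are defined/imported in the
# module as the function expects; the exact counts depend on what STOP_WORDS contains):
words = "la reforma tributaria y la reforma laboral".split()
print(build_index_from_words(words, stem=True))
# e.g. {'reform': 2, 'tributari': 1, 'laboral': 1, ...}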
Example #15
class StemmerProcessor(DocumentAtATimeCorpusProcessor):
    def __init__(self):
        super(StemmerProcessor, self).__init__()
        self.stemmer = SpanishStemmer()

    def process_document(self, document):
        processed_document = []
        for word in document:
            processed_document.append(self.stemmer.stem(word))
        return processed_document
Example #16
class SpanishStemmer(Normalizer):

    def __init__(self, next_normalizer=None):
        super(SpanishStemmer, self).__init__(next_normalizer)
        self._stemmer = NLTKSpanishStemmer()

    def _apply_normalizer(self, data):
        stem_word = lambda x: self._stemmer.stem(x)
        stem_word_list = lambda xl: [stem_word(w) for w in xl]
        return stem_word(data) if not isinstance(data, (list, tuple)) else stem_word_list(data)
class StemmerProcessor(DocumentAtATimeCorpusProcessor):
    def __init__(self):
        super(StemmerProcessor, self).__init__()
        self.stemmer = SpanishStemmer()

    def process_document(self, document):
        processed_document = []
        for word in document:
            processed_document.append(self.stemmer.stem(word))
        return processed_document
def generate_stopwords(stopname='stopSpanish.pkl'):
    """ Remove stop words, and apply stemming """
    stemmer=SpanishStemmer()
    stopwords_es = set(stopwords.words('spanish'))
    stopwords_es_sw = set(get_stop_words('spanish'))
    stopSpanishBeta = list(set(stopwords_es.union(stopwords_es_sw)))

    stopSpanish = set(stopwords_es.union(stopwords_es_sw))
    for stopWord in stopSpanishBeta:
        stopSpanish.add(stemmer.stem(stopWord))

    stopSpanish = list(stopSpanish)
    stopSpanish.extend(['tra', 'd', 'desc']) # Adding stopwords not present in the standard stopwords
    stopSpanish.remove('no')  # Keep to help identify negative categories

    with open(f'{resource_path}/{stopname}', 'wb') as f:
        pickle.dump(stopSpanish, f)

    return stopSpanish
def getfeats(fields, o):
    """ This takes the word in question and
    the offset with respect to the instance
    word """
    word = fields[0]
    stemmer = SpanishStemmer()

    with_hyphen = 0
    if "-" in word:
        with_hyphen = 1

    with_apostrophe = 0
    if "'" in word:
        with_apostrophe = 1

    o = str(o)
    features = [
        (o + "word", word),
        (o + 'pos', fields[1]),
        #(o + 'prefix1', word[:1]),
        (o + 'prefix2', word[:2]),
        (o + 'prefix3', word[:3]),
        (o + 'prefix4', word[:4]),
        #(o + 'suffix1', word[-1:]),
        (o + 'suffix2', word[-2:]),
        (o + 'suffix3', word[-3:]),
        (o + 'suffix4', word[-4:]),
        (o + 'is_upper', word.isupper()),
        (o + 'is_title', word.istitle()),
        (o + 'is_digit', word.isdigit()),
        (o + 'with_hyphen', with_hyphen),
        (o + 'with_apostrophe', with_apostrophe),
        (o + 'spanish_stem', stemmer.stem(word)),
        # (o + 'word_shape', word_shape(word))
    ]

    return features
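
# A minimal usage sketch (hypothetical input: fields is a [word, POS] pair taken from a
# CoNLL-style token row, and o is the offset relative to the current instance word):
fields = ["corriendo", "VERB"]
for name, value in getfeats(fields, 0):
    print(name, value)   # '0word corriendo', '0pos VERB', prefix/suffix features, the stem, etc.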
Example #20
def spanish_swadesh_list(stemmed=True):
    """
    Helper function that returns a list of strings with the stems of the
    spanish Swadesh entries.

    """
    try:
        stemmer = SpanishStemmer(True)
    except:
        log.warn("Spanish stemmer could not be loaded!")
        return

    swadesh_entries = []
    for line in util.read_text_file(
            util.data_path('swadesh', 'swadesh_spa.txt'), lines=True):
        line = line.strip()
        for e in line.split(","):
            e = e.strip()
            if stemmed:
                stem = stemmer.stem(e)
                swadesh_entries.append(stem)
            else:
                swadesh_entries.append(e)
    return swadesh_entries
Example #21
def run_BM25_collection(output_dir,documents,queries,qrels,train,validation,test,k,language):
    
    if language=='en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    
    elif language=='fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    
    elif language=='es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
        
    elif language=='it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    
    
    corpus = [] 
    doc_indexes = []
    for key,value in documents.items():
        doc_indexes.append(key)
        doc = [stemmer.stem(elem) for elem in value.split(" ") if elem not in stop_words]
        corpus.append(doc)
    bm25 = BM25Okapi(corpus)
    
    print("Running BM25",flush=True)
    
    results = dict()
    for i,elem in enumerate(train):
        results[elem] = run_BM25_query(queries[elem],bm25,doc_indexes,k,language)
        if i%1000==0:
            print('Processing query',i,'/',len(train),flush=True)
    save_BM25_res(output_dir+'/training/BM25.res',results)
    save_BM25_qrels_dataframe(output_dir + '/training/BM25.qrels.csv',results,qrels,True)
    
    results = dict()
    for elem in validation:
        results[elem] = run_BM25_query(queries[elem],bm25,doc_indexes,k,language)
    save_BM25_res(output_dir+'/validation/BM25.res',results)
    save_BM25_qrels_dataframe(output_dir + '/validation/BM25.qrels.csv',results,qrels,False)
    
    results = dict()
    for elem in test:
        results[elem] = run_BM25_query(queries[elem],bm25,doc_indexes,k,language)
    save_BM25_res(output_dir+'/test/BM25.res',results)
    save_BM25_qrels_dataframe(output_dir + '/test/BM25.qrels.csv',results,qrels,False)
Example #22
 def __init__(self):
     self.tweets = 0
     self.related_tweets = 0
     self.stopwords = {}
     self.stemmers = {}
     self.stemmers["es"] = SpanishStemmer()
     self.stemmers["en"] = PorterStemmer()
     self.stemmers["fr"] = FrenchStemmer()
     self.stemmers["de"] = GermanStemmer()
     self.stopwords["es"] = self.load_stopwords_file(
         "spanish_stopwords.txt")
     self.stopwords["en"] = self.load_stopwords_file(
         "english_stopwords.txt")
     self.stopwords["fr"] = self.load_stopwords_file("french_stopwords.txt")
     self.stopwords["ge"] = self.load_stopwords_file("german_stopwords.txt")
     self.output_file = open(sys.argv[2], 'a')
Example #23
class GigawordParser(StreamParser):
    STEMMERS = {
        "eng": PorterStemmer(ignore_stopwords=False),
        "spa": SpanishStemmer(),
    }

    def __init__(self, language):
        self.next_id = 0
        self.language = language
        self.stemmer = self.STEMMERS.get(language)
        if self.stemmer is None:
            raise Exception("Unsupported language %s" % language)

    def init_id_counter(self, initial):
        self.next_id = initial

    def new_id(self):
        new_id = self.next_id
        self.next_id += 1
        return new_id

    def parse_raw(self, xml_str):
        xml = minidom.parseString(xml_str)
        if self.language == "es":
            try:
                url = "gigaword:" + xml.getElementsByTagName(
                    "DOC")[0].attributes["id"].value
                title = xml.getElementsByTagName(
                    "HEADLINE")[0].firstChild.nodeValue
            except:
                url = "<NONE>"
                title = "<NONE>"
        else:
            url = "<NONE>"
            title = "<NONE>"
        text = stringio.StringIO()
        for node in xml.getElementsByTagName("TEXT")[0].childNodes:
            if len(node.childNodes) > 0:
                text.write(node.firstChild.nodeValue)
        content = text.getvalue()
        terms = text_to_terms(content, self.language)
        return RuwacDocument(self.new_id(), url, title, content, terms)
Example #24
    def lemmatize(self, text, lang):

        # spacy.prefer_gpu()
        # nlp = spacy.load(lang) # en fr "en_core_web_sm"
        if lang == "fr":
            stemmer = FrenchStemmer()
        elif lang == "es":
            stemmer = SpanishStemmer()
        else:
            stemmer = EnglishStemmer()

        stemmed = []
        for word in text.split(" "):
            stemmed.append(stemmer.stem(word))

        # doc = nlp(u""+text)
        # lem_terms = []
        # for token in doc:
        #     lem_terms.append(token.lemma_)

        return " ".join(stemmed)
Example #25
def run_BM25_query(query,bm25,doc_indexes,k,language):
    
    if language=='en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    
    elif language=='fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    
    elif language=='es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
        
    elif language=='it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()
    
    tokenized_query = [stemmer.stem(elem) for elem in query.split(" ") if elem not in stop_words]
    doc_scores = bm25.get_scores(tokenized_query)
    top_k = np.argsort(doc_scores)[::-1][:k]
    results = [[doc_indexes[key],doc_scores[key]] for key in top_k]
    return results
def process_election(lang, data_path, stopword_path, save_path):
    if lang == "English":
        stemmer = EnglishStemmer()
    elif lang == "Spanish":
        stemmer = SpanishStemmer()
    else:
        stemmer = None

    print "loading dataset"
    line_sentences = ProcessLineSentence(dataPath=data_path,
                                         label="election",
                                         stopwordPath=stopword_path,
                                         stemmer=stemmer)

    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        for sentence, label in line_sentences:
            if label == "yes":
                l = [1]
            else:
                l = [0]
            row = list(sentence)
            writer.writerow(l + row)
Example #27
 def __init__(self):
     self.lemmatizer = treetaggerwrapper.TreeTagger(TAGLANG='es')
     self.stopEnglish = stopwords.words('english')
     self.stopSpanish = stopwords.words('spanish')
     self.stopSpanish.append('y/o')
     self.spanishStemmer=SpanishStemmer()
Example #28
import pickle

def stopwords_from_file(stopwords_filepath = "data/stopwords/spa.txt"):
    stopwords = codecs.open(stopwords_filepath, "r", "utf-8")
    ret = set()
    for line in stopwords:
        word = line.rstrip("\n")
        word = regex.sub(" *\|.*$", "", word)
        if regex.search("[^\s]", word):
            word = unicodedata.normalize("NFD", word)
            ret.add(word)
    return ret

tokenizer = nltk.load("tokenizers/punkt/spanish.pickle")
stopwords = stopwords_from_file("../../src/qlc/data/stopwords/spa.txt")
stemmer = SpanishStemmer()

doc = ""
doc_id = 0
sentence_id = 0

sentences_for_stem = collections.defaultdict(set)
docs_for_stem = collections.defaultdict(set)

for l in fileinput.input("/Users/ramon/qlc-github/data/eswiki/AA/wiki00"):
    l = l.strip()
    l = l.decode("utf-8")
    l = unicodedata.normalize("NFD", l)
    
    if l.startswith("</doc>"):
        sentences = tokenizer.tokenize(doc)
Example #29
   There are a multitude of "stemmers"; I am going to pick the Spanish one
'''

from nltk.stem.snowball import SnowballStemmer, SpanishStemmer

# As with punkt, a package has to be downloaded
download('stopwords')

# If we do not know the language in advance.
#SnowballStemmer(language, ignore_stopwords=False)
spanish_stem = SnowballStemmer("spanish", True)

# If we know the language beforehand, we can import it directly
#SpanishStemmer(ignore_stopwords=False)
spanish_stem = SpanishStemmer(True)
print(spanish_stem.stem("Comiendo"), spanish_stem.stem("Bailando"),
      spanish_stem.stem("bailar"), spanish_stem.stem("estantería"))
'''###############################
   # Obtaining the original verb #
   ###############################

   Known as lemmatization.

   NLTK does not have this for Spanish, only English.
'''

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
# As with punkt, a package has to be downloaded
download('wordnet')
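
# A brief English-only sketch, since (as the note above says) NLTK's WordNet lemmatizer
# does not cover Spanish:
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("eating", pos=wordnet.VERB))  # -> 'eat'
print(lemmatizer.lemmatize("shelves"))                   # -> 'shelf' (default POS is noun)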
Example #30
import csv
import collections
import operator
import unicodedata

import os
cwd = os.getcwd()
root = os.path.dirname(cwd)
lematizador_dir = os.path.join(root, "data", "lematizador", "lematizador.csv")
stopwords_dir = os.path.join(root, "data", "stopwords")

from spellchecker import SpellChecker  #https://pypi.org/project/pyspellchecker/

#Stemmer
from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer()

# Create the object for spelling correction
spell = SpellChecker(language="es")
metodo_desconocidas = spell.unknown
metodo_correccion = spell.correction
lista_blanca_regiones = [
    "Arica", "Parinacota", "Tarapacá", "Antofagasta", "Atacama", "Coquimbo",
    "Valparaíso", "Metropolitana", "Santiago", "Libertador", "General",
    "Bernardo", "O’Higgins", "Maule", "Ñuble", "Biobío", "Araucanía", "Ríos",
    "Lagos", "Aysén", "General", "Carlos", "Ibáñez", "Campo", "Magallanes",
    "Antártica"
]
lista_blanca_telecom = [
    'lte', 'whatsapp', 'instagram', 'telegram', 'youtube', 'facebook', 'entel',
    'bafi', 'resetea', 'samsung', 'huawei', 'iphone', 'kb', 'mb', 'pixi',
Example #31
def main(argv):
    log = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    
    conf = appconfig('config:development.ini', relative_to='.')
    config = None
    if not pylons.test.pylonsapp:
        config = load_environment(conf.global_conf, conf.local_conf)

    stemmer = SpanishStemmer(True)

    # load swadesh list
    swadesh_file = codecs.open(os.path.join(os.path.dirname(
        os.path.realpath(
            __file__)), "swadesh_spa.txt"), "r", "utf-8")

    swadesh_entries = []
    for line in swadesh_file:
        line = line.strip()
        for e in line.split(","):
            stem = stemmer.stem(e)
            swadesh_entries.append(stem)

    for b in quanthistling.dictdata.books.list:
        #if b['bibtex_key'] != "thiesen1998":
        #    continue

        book = model.meta.Session.query(model.Book).filter_by(bibtex_key=b['bibtex_key']).first()
        
        if book:

            print "Filtering entries in %s..." % b['bibtex_key']

            for dictdata in book.dictdata:

                entries = model.meta.Session.query(model.Entry).filter(model.Entry.dictdata_id==dictdata.id).order_by("startpage", "pos_on_page").all()

                annotations = model.meta.Session.query(model.Annotation).join(model.Entry, model.Annotation.entry_id==model.Entry.id).filter(model.Entry.dictdata_id==dictdata.id).all()
                dict_annotations = collections.defaultdict(list)
                for a in annotations:
                    dict_annotations[a.entry_id].append(a)

                for e in entries:
                    if b['bibtex_key'] == "thiesen1998":
                        e.filtered = False
                    else:
                        e.filtered = True
                        for a in dict_annotations[e.id]:
                            if a.value == "iso-639-3" and a.string == "spa":
                                for a2 in dict_annotations[e.id]:
                                    if (a2.value == "head" or a2.value == "translation") and a2.start == a.start:
                                        phrase = re.sub(" ?\([^)]\)", "", a2.string)
                                        phrase = phrase.strip()
                                        if not " " in phrase:
                                            stem = stemmer.stem(phrase)
                                            if stem in swadesh_entries:
                                                e.filtered = False
#                                                if e.is_subentry:
#                                                    e.mainentry().filtered = False

                Session.commit()
def stemmer_all(tweet):
    stm = SpanishStemmer()
    split_tweet = [word for word in tweet.lower().split(' ') if word.strip()]
    return ' '.join([stm.stem(word.strip())
                     for word in split_tweet])
 def __init__(self):
     super(StemmerProcessor, self).__init__()
     self.stemmer = SpanishStemmer()
Example #34
def export_swadesh_entries(input_path, output_path=None):

    print("Input: {0}".format(input_path))
    print("Ouput: {0}".format(output_path))

    cr = CorpusReaderDict(input_path)
    print("Data loaded")

    files = [ "book.csv",
          "component.csv",
          "corpusversion.csv",
          "dictdata.csv",
          "language_iso.csv",
          "language_bookname.csv",
          "language_src.csv",
          "language_tgt.csv",
          "nondictdata.csv",
          "wordlistdata.csv",
          "wordlistconcept.csv"
        ]
    
    for f in files:
        shutil.copyfile(os.path.join(
            input_path, f), os.path.join(output_path, f))
    
    from nltk.stem.snowball import SpanishStemmer
    stemmer = SpanishStemmer()
    import qlc.utils

    #get stopwords
    stopwords = qlc.utils.stopwords_from_file(os.path.join(os.path.dirname(
        os.path.realpath(
            __file__)), "data", "stopwords", "spa.txt"))

    # load swadesh list
    swadesh_file = codecs.open(os.path.join(os.path.dirname(
        os.path.realpath(
            __file__)), "data", "swadesh", "spa.txt"), "r", "utf-8")

    swadesh_entries = []
    for line in swadesh_file:
        line = line.strip()
        for e in line.split(","):
            stem = stemmer.stem(e)
            swadesh_entries.append(stem)

    # find all entries that contain one of the swadesh words
    # save entry ids to list
    entry_ids = []

    dictdata_ids = cr.dictdata_string_ids
    for dictdata_id in dictdata_ids:
        src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id)
        # is there some spanish?
        if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']):
            continue

        for entry_id, head, translation in \
                cr.ids_with_heads_with_translations_for_dictdata_id(
                    dictdata_id):
            if src_language_iso == [ 'spa' ]:
                (head, translation) = (translation, head)

            translation = re.sub(" ?\([^)]\)", "", translation)
            if translation in stopwords:
                entry_ids.append(entry_id)
            else:
                translation = qlc.utils.remove_stopwords(translation, stopwords)
                phrase_stems = qlc.utils.stem_phrase(translation, stemmer, True)
                for stem in phrase_stems:
                    if stem in swadesh_entries:
                        entry_ids.append(entry_id)

    #print(len(entry_ids))
    #return

    input_entry_csv = os.path.join(input_path, "entry.csv")
    output_entry_csv = os.path.join(output_path, "entry.csv")

    input_annotation_csv = os.path.join(input_path, "annotation.csv")
    output_annotation_csv = os.path.join(output_path, "annotation.csv")

    output_annotation = codecs.open(output_annotation_csv, "w", "utf-8")

    annotation_dict = collections.defaultdict(list)

    # cache annotations for lookup
    for i, line in enumerate(fileinput.input(
            input_annotation_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output_annotation.write(line)
            continue
        data = line.strip().split("\t")
        annotation_dict[
            data[_annotation_table_columns['entry_id'] + 1]].append(line)
    
    fileinput.nextfile()

    output = codecs.open(output_entry_csv, "w", "utf-8")
    
    count_entries = 0
    for i, line in enumerate(fileinput.input(
            input_entry_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output.write(line)
            continue
        data = line.strip().split("\t")
        if data[0] in entry_ids:
            output.write(line)
            for annotation_line in annotation_dict[data[0]]:
                output_annotation.write(annotation_line)

    fileinput.nextfile()
    output.close()
    output_annotation.close()
    
    # Wordlists
    cr = CorpusReaderWordlist(sys.argv[1])
    print("Data loaded")

    # find all entries that contain one of the swadesh words
    # save entry ids to list
    wordlistdata_ids = cr.wordlistdata_string_ids
    bibtex_keys = collections.defaultdict(list)
    for wid in wordlistdata_ids:
        wordlistdata_string = cr.wordlistdata_string_ids[wid]
        bibtex_key = wordlistdata_string.split("_")[0]
        bibtex_keys[bibtex_key].append(wid)

    wordlistentry_ids = []
    for bibtex_key in bibtex_keys:
        # first collect all concepts in this book where the spanish counterpart
        # has one of the swadesh words
        concepts = []
        for wordlistentry_id in wordlistentry_ids:
            language_iso = cr.get_language_code_for_wordlistdata_id(
                wordlistdata_id)
            # is there some spanish?
            if language_iso != ['spa']:
                continue

            for entry_id, concept, counterpart in \
                    cr.ids_with_concepts_with_counterparts_for_dictdata_id(
                        dictdata_id):

                counterpart = re.sub(" ?\([^)]\)", "", counterpart)
                if counterpart in stopwords:
                    entry_ids.append(entry_id)
                else:
                    counterpart = qlc.utils.remove_stopwords(
                        counterpart, stopwords)
                    phrase_stems = qlc.utils.stem_phrase(
                        counterpart, stemmer, True)
                    for stem in phrase_stems:
                        if stem in swadesh_entries:
                            concepts.append(concept)

        # now collect the entry ids for those concepts
        for wordlistentry_id in wordlistentry_ids:

            for entry_id, concept, counterpart in \
                    cr.ids_with_concepts_with_counterparts_for_dictdata_id(
                        dictdata_id):
                if concept in concepts:
                    wordlistentry_ids.append(entry_id)
    
    input_entry_csv = os.path.join(input_path, "wordlistentry.csv")
    output_entry_csv = os.path.join(output_path, "wordlistentry.csv")

    input_annotation_csv = os.path.join(input_path, "wordlistannotation.csv")
    output_annotation_csv = os.path.join(output_path, "wordlistannotation.csv")

    output_annotation = codecs.open(output_annotation_csv, "w", "utf-8")

    annotation_dict = collections.defaultdict(list)

    for i, line in enumerate(fileinput.input(input_annotation_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output_annotation.write(line)
            continue
        data = line.strip().split("\t")
        annotation_dict[data[_wordlistannotation_table_columns['entry_id'] + 1]].append(line)
    
    fileinput.nextfile()

    output = codecs.open(output_entry_csv, "w", "utf-8")
    count_entries = 0
    for i, line in enumerate(fileinput.input(input_entry_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output.write(line)
            continue
        data = line.strip().split("\t")
        if data[0] in entry_ids:
            output.write(line)
            for annotation_line in annotation_dict[data[0]]:
                output_annotation.write(annotation_line)

    fileinput.nextfile()
    output.close()
    output_annotation.close()    
# The next step is to connect spanish translations that contain the same stem. For this we first remove certain stop words from the translation (list of stopwords from NLTK). There are two cases then: just one word remains, or more than one word remains.
# 
# We now have two options for the latter: either they are not connected with anything at all (the default behaviour), or each word is stemmed and the translation is connected with every other translation that contains the same stems. Right now this results in many connections that do not look very useful. This should be done in a more intelligent way in the future (for example, find the heads of phrases in multiword expressions and only connect those; split the weight of the connections between all stems and work with weighted graphs from this step on; ...).
# 
# To connect the spanish translations the script adds additional "stem nodes" to the graph. The name of these nodes consists of a spanish word stem plus a pipe symbol plus the string "stem". These nodes look like this in a dot file:
# 
# > "tom|stem" [is_stem=True];
# 
# The introduction of these nodes later facilitates the output of translation matrices, as you can just search for stems within the graph and only output direct neighbours with Spanish translations. It would also be possible to directly connect the Spanish translations if they have a matching stem, but then the graph traversal to find matching translations and their heads is a bit more complex later.
# 
# First we create a stemmer object from the SpanishStemmer in NLTK:

# <codecell>

from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer(True)

# <markdowncell>

# We create the list of stopwords and decode them to unicode strings:

# <codecell>

combined_graph_stemmed = copy.deepcopy(combined_graph)
stopwords = nltk.corpus.stopwords.words("spanish")
stopwords = [w.decode("utf-8") for w in stopwords]

# <markdowncell>

# Then we loop through all the nodes of the merged graph and add the stem nodes to each Spanish node. If the node has only one word (after stopword removal) we will use the NLTK stemmer; otherwise we just leave the phrase as it is:
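
# A minimal sketch of that loop, assuming combined_graph_stemmed is a networkx graph and that
# Spanish translation nodes can be recognised by a node attribute (taken here to be 'is_spa';
# the original notebook may mark them differently):

# <codecell>

for node, data in list(combined_graph_stemmed.nodes(data=True)):
    if not data.get("is_spa"):
        continue
    # drop stopwords from the Spanish phrase
    content_words = [w for w in node.split(" ") if w not in stopwords]
    if len(content_words) == 1:
        # a single word remains: add a "stem node" and connect it to the translation
        stem_node = stemmer.stem(content_words[0]) + "|stem"
        combined_graph_stemmed.add_node(stem_node, is_stem=True)
        combined_graph_stemmed.add_edge(stem_node, node)
    # otherwise the phrase is left as it is (the default behaviour described above)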
Example #36
def stemmer_all(tweet):
    stm = SpanishStemmer()
    split_tweet = [word for word in tweet.lower().split(' ') if word.strip()]
    return ' '.join([stm.stem(word.strip()) for word in split_tweet])
def remove_stopwords(text, stopSpanish):
    stemmer=SpanishStemmer()
    textList = text.split()
    textList = [word for word in textList if word not in stopSpanish]
    return ' '.join([stemmer.stem(word) for word in textList])
Example #38
import pickle
import codecs
import os

"""The application's model objects"""
from quanthistling.model.meta import Session, metadata
from sqlalchemy import schema, types
from sqlalchemy import orm, func
from sqlalchemy import and_

from webhelpers.html import literal
from operator import attrgetter
from pylons import config

from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer(True)

# load swadesh list
swadesh_file = codecs.open(os.path.join(os.path.dirname(
    os.path.realpath(
        __file__)), "spa.txt"), "r", "utf-8")

swadesh_list = []
for line in swadesh_file:
    line = line.strip()
    for e in line.split(","):
        stem = stemmer.stem(e)
        swadesh_list.append(stem)

def init_model(engine):
    """Call me before using any of the tables or classes in the model"""
Example #39
    def get_vector_matrix(self, freq_floor=50, context_words=3):

        nlp = es_core_web_md.load()
        STOPWORDS = spacy.es.STOP_WORDS

        def _clean_sent(sent):
            clean_sent = []
            # remove stopwords
            for word in sent:
                word = word.lower()
                if not word in STOPWORDS:
                    if not word.isdigit():
                        clean_sent.append(word)
            return clean_sent

        def _update_feature(word, feature_name, features):
            " dirty update of features "
            counts = 1
            if word in vectors:
                if feature_name in vectors[word]:
                    counts = vectors[word][feature_name] + 1
            features[feature_name] = counts
            return features

        def _update_counts(feature_name, f_counts):
            counts = 1
            if feature_name in f_counts:
                counts = f_counts[feature_name] + 1
            f_counts[feature_name] = counts
            return f_counts

        sents = self.corpus.get_sents()
        stemmer = SpanishStemmer()

        # will use the words as keys and dict of features as values
        vectors = {}
        #freq_counts = {}
        for sent in sents:
            # TODO: PARALELLIZE!!
            #for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
            # take off stopwords && to get context_words!
            cleaned_sent = _clean_sent(sent)
            doc = nlp(' '.join(sent))
            for word_idx in range(len(doc)):
                # get the word and the pos tag
                spacy_word = doc[word_idx]
                word = spacy_word.text.lower()

                pos_tag = spacy_word.pos_

                if len(word) <= 2:
                    continue
                if word in STOPWORDS:
                    continue
                if word.isdigit():
                    continue

                # if not seen word
                if not word in vectors:
                    features = {}
                else:
                    features = vectors[word]

                # counts of frequency to normalze later
                #freq_counts = _update_counts(pos_tag, freq_counts)

                # context related (POS and words stemmed)
                features = _update_feature(word, pos_tag, features)
                if word_idx > 0:
                    prev_tag = doc[word_idx - 1].pos_
                    feature_name = prev_tag + '_pos_prev'
                    features = _update_feature(word, feature_name, features)
                if word_idx < len(sent) - 1:
                    post_tag = doc[word_idx + 1].pos_
                    feature_name = post_tag + '_pos_post'
                    features = _update_feature(word, feature_name, features)

                # dependency features. the objective of the dep is stemmed!
                dep_type = spacy_word.dep_
                if dep_type != 'ROOT':
                    dep_obj = stemmer.stem(spacy_word.head.text.lower())
                    feature_name = 'DEP:' + dep_type + '-' + dep_obj
                    features = _update_feature(word, feature_name, features)

                # get n words from context as features (stemmed...!)
                for i in range(context_words):
                    ctxt_word = (random.choice(cleaned_sent))
                    feature_word = stemmer.stem(ctxt_word)
                    feature_name = feature_word + '_ctxt_word'
                    features = _update_feature(word, feature_name, features)
                # add a synset (wordnet) feature :0
                features['word'] = word

                # frequency counting
                features = _update_feature(word, 'freq', features)

                vectors[word] = features

        # remove words with 'freq' below the floor
        words_to_pop = set()
        for word, f_dict in vectors.items():
            if f_dict['freq'] <= freq_floor:
                words_to_pop.add(word)
        for word in words_to_pop:
            vectors.pop(word)

        for word, f_dict in vectors.items():
            #print(word, f_dict)
            f_dict['freq'] = 0
            vectors[word] = f_dict  # delete an irrelevant dimension!
        # normalize the POS contexts
        #for word, f_dict in vectors.items():
        #    f_dict[]

        # add a context word... LEMMATIZED!

        # NORMALIZE ALL THE CONTEXTS! -> dictionary of frequencies of ... ALL the features that occurred
        self.words = list(
            vectors.keys())  # thankfully in the same order as vectors.values

        vectorizer = DictVectorizer(dtype=numpy.int32)
        vec_matrix = vectorizer.fit_transform(list(vectors.values()))
        vectors_shape = vec_matrix.get_shape()
        print(vectors_shape)
        """
        freqs_vector = vectorizer.transform(freq_counts)

        vec_matrix = vstack([freqs_vector, vec_matrix])
        print(s.get_shape)
        print(s)
        print(vectorizer.inverse_transform(s))
        """

        # normalization
        vec_matrix = normalize(vec_matrix, copy=False)

        ####### reduccion de dim no sup
        # reducir dimensionalidad con variance treshold
        #selector = VarianceThreshold(threshold = 0.0)
        #vec_matrix = selector.fit_transform(vec_matrix)

        # SVD (PCA)
        Trunc_svd = TruncatedSVD(n_components=1500)
        vec_matrix = Trunc_svd.fit_transform(vec_matrix)

        # reducir dimensionalidad con percentile de varianza
        #selected = SelectPercentile(chi2, percentile = 10)
        #word_vecs_new=selected.fit_transform(new_word_vecs,target_vec)

        print(vectorizer.inverse_transform(vec_matrix))  # -> to see features!

        return self.words, vec_matrix
Example #40
 def __init__(self):
     super(StemmerProcessor, self).__init__()
     self.stemmer = SpanishStemmer()
Example #41
class MLAssistant(Assistant):
    def __init__(self,
                 language='en',
                 database_name='memory',
                 memory_table='memory',
                 listen_log_table='listen_log',
                 speak_log_table='speak_log'):
        super().__init__(language, database_name, memory_table,
                         listen_log_table, speak_log_table)

        try:
            json_file = open('modelo_gustos.json', 'r')
            loaded_model_json = json_file.read()
            json_file.close()

            self.model = model_from_json(loaded_model_json)
            self.model.load_weights("modelo_gustos.h5")
            self.model.compile(loss='mean_squared_error',
                               optimizer='adam',
                               metrics=['binary_accuracy'])
        except Exception:
            print('****ERROR: Error cargando modelo...****')

        self.stemmer = SpanishStemmer()
        self.words = [
            '¿qu', '?', 'peli', 'pelis', 'color', 'favorit', 'leer', 'libr',
            'novel', 'ver', 'prefier', 'gust', 'pelicul', 'jug', '¿cual',
            'prefer', 'jueg', 'com', 'plat', 'animal', 'videojueg'
        ]
        self.classes = [
            'comida', 'color', 'animal', 'juego', 'libro', 'película'
        ]

    def main(self, initial_sentence='¿Qué deseas?'):
        self.speak(initial_sentence, remember=False)
        self.listen()
        self.process_orders(self.last_recognised)
        self.adjust_for_ambient_noise()

    def process_orders(self, sentence):
        _class = self.classify_sentence(sentence)

        if not _class:
            self.speak('no estoy segura de lo que me quieres preguntar')
        else:
            if _class == 'comida':
                self.speak(
                    'Sin lugar a dudas mi comida preferida son los nachos con queso'
                )
            if _class == 'color':
                self.speak('Mi color preferido es el escarlata.')
            if _class == 'animal':
                self.speak(
                    'Me gustan mucho los grandes felinos, pero mi animal preferido es una perra que se llama'
                    ' Arale.')
            if _class == 'juego':
                self.speak('¡Me encanta Hollywood Monsters!')
            if _class == 'libro':
                self.speak(
                    'No queda muy bien decirlo, pero me han programado para decir siempre la verdad. No tengo'
                    ' tiempo para leer, y por tanto no tengo libro preferido.')
            if _class == 'película':
                self.speak(
                    'No tengo una película preferida, pero me gustan especialmente las películas de Disney y'
                    ' las del Studio Ghibli.')

    def classify_sentence(self, sentence, min_val=0.5):
        results = self._get_classification(sentence)
        if float(results[0][1]) < min_val:
            return None
        else:
            return results[0][0]

    def _clean_up_sentence(self, sentence):
        sentence_words = nltk.word_tokenize(sentence)
        sentence_words = [
            self.stemmer.stem(word.lower()) for word in sentence_words
        ]
        return sentence_words

    def _bow(self, sentence, words):
        sentence_words = self._clean_up_sentence(sentence)
        bag = [0] * len(words)
        for s in sentence_words:
            for i, w in enumerate(words):
                if w == s:
                    bag[i] = 1
        return np.array(bag)

    def _get_classification(self, sentence):
        array = [self._bow(sentence, self.words)]
        np_array = np.array(array, "float32")
        prediction = self.model.predict(np_array).round(2)[0]
        result = dict(zip(self.classes, prediction))
        return sorted(result.items(), key=operator.itemgetter(1))[::-1]
Example #42
class TextProcessor:
    lemmatizer=None
    stopEnglish=None
    stopSpanish=None
    spanishStemmer=None

    def __init__(self):
        self.lemmatizer = treetaggerwrapper.TreeTagger(TAGLANG='es')
        self.stopEnglish = stopwords.words('english')
        self.stopSpanish = stopwords.words('spanish')
        self.stopSpanish.append('y/o')
        self.spanishStemmer=SpanishStemmer()

    def _remove_numbers(self, text):
        "Elimina los números del texto"

        return ''.join([letter for letter in text if not letter.isdigit()])

    def _remove_punctuation(self, text):
        "Elimina los signos de puntuacion del texto"

        regex = re.compile('[%s]' % re.escape(string.punctuation))
        return regex.sub(' ', text)

    def preprocessText(self,text):
        text=text.lower()
        text=self._remove_punctuation(text)
        text=self._remove_numbers(text)
        return text

    def lematizeText(self,text):
        newText = ""
        firstElement = 0
        firstWord=True
        for word in text.split():
            if word not in self.stopEnglish and word not in self.stopSpanish:
                word = word.replace("\ufeff", "")
                lemmaResult = self.lemmatizer.tag_text(word)
                # Return [[word,type of word, lemma]]
                if (len(lemmaResult) != 0):
                    word = lemmaResult[firstElement].split()[2]
                    if firstWord:
                        newText += word
                        firstWord = False
                    else:
                        newText += " " + word
        return newText

    def stemText(self,text):
        newText = ""
        firstWord = True
        for word in text.split():
            if word not in self.stopEnglish and word not in self.stopSpanish:
                word = word.replace("\ufeff", "")
                wordStemmed = self.spanishStemmer.stem(word)
                if firstWord:
                    newText += wordStemmed
                    firstWord = False
                else:
                    newText += " " + wordStemmed
        return newText
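
# A minimal usage sketch of TextProcessor (it needs a local TreeTagger installation for the
# lemmatizer plus the NLTK stopword lists; the sample text and the stems shown are illustrative):
tp = TextProcessor()
texto = tp.preprocessText("Ellos estaban revisando los documentos.")
print(tp.stemText(texto))   # e.g. 'revis document'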
Example #43
 def __init__(self, next_normalizer=None):
     super(SpanishStemmer, self).__init__(next_normalizer)
     self._stemmer = NLTKSpanishStemmer()