def ProcessCorpus(V, L):
    try:
        stem = SpanishStemmer()
        for l in stdin:
            l = l.split()
            if len(l) < 3:
                stderr.write('Warning: Short line: "%s"\n' % ' '.join(l))
                continue
            tid = l[0]
            uid = l[1]
            lv = [0 for w in V]
            for w in l[2:]:
                w = stem.stem(w.decode('utf-8'))
                d = V.get(w, None)
                if d is None:
                    #stderr.write('Warning: "%s" not in the lexicon\n' % w)
                    continue
                lv[d] = lv[d] + 1
            if sum(lv) == 0:
                stderr.write('Warning: %s with null vector. Label: %d\n' % (tid, L[l[0]]))
            stdout.write('%d ' % L[l[0]])
            for i in range(len(lv)):
                stdout.write('%d:%d ' % (i + 1, lv[i]))
            stdout.write('# %s %s\n' % (tid, uid))
        return 0
    except Exception as ex:
        stderr.write('Exception: %s\n' % repr(ex))
        return 1
def find_top_N_words(lang_entries, top_N, lang):
    dictionary = Lang_Dictionary({}, lang)
    for player in lang_entries:
        for chat in player.c:
            language = {'eng': 0, 'spn': 0, 'other': 0, 'tot': 0}
            sentence = player.c[chat]
            newlist = player.c[chat].strip().split(' ')
            newlist = [x.strip("''") for x in newlist]
            for word in newlist:
                language['tot'] += 1
                if word.lower() not in Lang_dicts.lang_index:
                    language['other'] += 1
                else:
                    word = Lang_dicts.lang_index[word.lower()]
                    if word == "english":
                        language['eng'] += 1
                    elif word == "spanish":
                        language['spn'] += 1
                    else:
                        language['other'] += 1

            if language['other'] < 2 * (language['spn'] + language['eng']):
                print(sentence)
                if language['spn'] > language['eng']:
                    print("SPANISH")
                    stemmer = SpanishStemmer()
                else:
                    print("ENGLISH")
                    stemmer = EnglishStemmer()

            aslist = []
            aslist += sentence
            sentence = ""
            j = ''.join(aslist)
            words = j.split(' ')
            for line in words:
                line = str(line).replace('\'', '')
                line = line.replace('""', '')
                line = line.replace('"', '')
                if len(line) > 0:
                    if language["other"] < 2 * (language['spn'] + language["eng"]):
                        sentence += stemmer.stem(line.encode(sys.stdout.encoding, errors='replace')) + " "
                        print(sentence)
                    ##INEFFICIENT - looking through dictionary each time?
                    if line.lower() not in dictionary.d:
                        dictionary.d[line.lower()] = 0
                    dictionary.d[line.lower()] += 1

    ###wthCounts is a list of the word and its count
    wthCounts = []
    for (w, c) in dictionary.d.iteritems():
        wthCounts += [(c, w)]

    ##wc is the wthCounts list only sorted
    wc = sorted(wthCounts, reverse=True)
    return wc[:top_N]
def build_paragraph_inv_index(paragraphs, stem):
    p_index = {}
    stemmer = SpanishStemmer()
    for i, paragraph in enumerate(paragraphs):
        words = [word for word in paragraph.split() if word not in STOP_WORDS]
        for word in words:
            if stem:
                word = stemmer.stem(word)
            if word not in p_index:
                p_index[word] = []
            p_index[word].append(i)
    return p_index
def __init__(self, lemma=False, stem=False):
    self.extra_dicts = Dicts()
    self.english_dict = enchant.Dict("en_EN")
    self.spanish_dict = enchant.Dict("es_AR")
    self.lemma = lemma
    self.stem = stem
    self.VARIANT_CLASS = 0
    self.SPANISH_CLASS = 1
    self.FOREIGN_CLASS = 2
    if lemma:
        self.lemmatizer = Lemmatizer()
    if stem:
        self.stemmer = SpanishStemmer()
def __init__(self):
    self.reglasEntities.append(EmailRegla())
    self.reglasEntities.append(UrlRegla())
    self.reglasEntities.append(FechasRegla())
    self.reglasEntities.append(TelefonosRegla())
    self.reglasEntities.append(AbreviaturasRegla())
    self.reglasEntities.append(NombresPropiosRegla())
    self.reglasEntities.append(NumerosRegla())
    self.reglasDocumento.append(MinusculasRegla())
    self.reglasDocumento.append(TranslateRegla())
    self.reglasDocumento.append(LimpiarHtmlTagsRegla())
    self.reglasDocumento.append(LimpiadoBasicoRegla())
    self.reglasTokens.append(MinMaxCaracteresRegla())
    self.stemmer = SpanishStemmer()
def process_violence(lang, data_path, stopword_path, save_path):
    if lang == "English":
        stemmer = EnglishStemmer()
    elif lang == "Spanish":
        stemmer = SpanishStemmer()
    else:
        stemmer = None

    print("loading dataset")
    line_sentences = ProcessLineSentence(dataPath=data_path,
                                         label="violence",
                                         stopwordPath=stopword_path,
                                         stemmer=stemmer)

    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        for sentence, label in line_sentences:
            if label == "no":
                l = [0]
            elif label == "violence":
                l = [1]
            elif label == "malpractice":
                l = [2]
            else:
                raise Exception("Wrong label: {}".format(label))
            writer.writerow(l + sentence)
class Tokenizer(object):
    """
    This class is in charge of extracting the words from the documents
    retrieved by the `Crawler`.
    """

    def __init__(self, min_long=5):
        """
        To initialise a `Tokenizer` we need the minimum number of characters
        `min_long` that a valid word must have.

        :param min_long: an integer. Defaults to five (5).
        """
        self.stemmer = SpanishStemmer()
        self.min_long = min_long

    def obtener_palabras(self, contenido):
        """
        Returns a list of words recovered from `contenido`.

        :param contenido: a string with the text content of the document
        :return: a list of strings representing the words
        """
        # Stem the whole content, removing accents and upper case and keeping the roots.
        cont_stemed = self.stemmer.stem(contenido)
        # Split the text into words, dropping duplicates.
        conjunto_palabras = set(re.split(r'\W+', cont_stemed))
        # Remove stopwords and words shorter than min_long, and return the list.
        return [palabra for palabra in conjunto_palabras
                if palabra not in stopwords.words('spanish')
                and not len(palabra) < self.min_long]
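# Usage sketch (not from the original source); it assumes the imports the class
# relies on are available and that the NLTK 'stopwords' corpus has been downloaded.
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SpanishStemmer

tokenizer = Tokenizer(min_long=5)
palabras = tokenizer.obtener_palabras("Los crawlers recuperan documentos desde la web")
print(palabras)  # prints the extracted tokens (lowercased, stopwords and short words removed)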
class ConceptComparerSpanishStem(ConceptComparerBase):
    """
    Implementation of a concept comparer based on a stemmer for Spanish.

    Parameters
    ----------
    None.

    Notes
    -----
    This is a sub-class of
    :py:class:`~lingpy.meaning.concepts.ConceptComparerBase`. It uses a simple
    match of the stem of a given (Spanish) string against a given context
    (that is supposed to be a stemmed Spanish word stem).

    See also
    --------
    ConceptComparerBase
    ConceptGraph
    """

    def __init__(self):
        self.stemmer = SpanishStemmer(True)
        self.re_brackets = re.compile(" ?\([^)]\)")

    def compare_to_concept(self, element, concept):
        """Compares a given element to a concept.

        Parameters
        ----------
        element : str
            The string (for example a lexical item: head or translation) to
            compare to the concept.
        concept : str or object
            The concept to compare to.

        Returns
        -------
        match : bool
            True if element matches the given concept, False otherwise.

        Notes
        -----
        The `element` is supposed to be a Spanish word, the concept a stemmed
        entry of the Spanish Swadesh list.

        See also
        --------
        spanish_swadesh_list
        """
        element = self.re_brackets.sub("", element)
        element = element.strip()
        if not " " in element:
            stem = self.stemmer.stem(element)
            if stem == concept:
                return True
        return False
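# Usage sketch (not from the original source), assuming ConceptComparerBase is
# importable. The concept string is assumed to be a stemmed Swadesh entry; here
# it is produced with the same stemmer purely for illustration.
import re
from nltk.stem.snowball import SpanishStemmer

comparer = ConceptComparerSpanishStem()
concept = SpanishStemmer(True).stem("beber")            # stemmed Swadesh-style entry
print(comparer.compare_to_concept("bebemos", concept))  # True when the stems match
print(comparer.compare_to_concept("comer", concept))    # False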
def __init__(self, question, words, stem):
    self.question = question
    self.stem = stem
    self.stemmer = SpanishStemmer()
    self.words = words
    self.stemmed_words = self.stem_words(self.words)
    self.path_pfx = os.getcwd()
    self.inverted_index = self.load_doc_inverted_index()
    self.doc_names = self.init_doc_names()
    self.paragraph_indices = {}
    self.paragraph_inverted_indices = {}
    self.results = pd.DataFrame(columns=['text', 'law', 'score'])
    self.load_paragraph_indices()
    self.L = 23055.676666666666  # Manually obtained using bash
    self.scores = {'tf': {}, 'idf': {}, 'tfidf': {}, 'n_containing': {}, 'score': {}}
class OOVclassifier(object):

    def __init__(self, stem=False):
        dictionaries = dicts()
        path = '/home/alangb/TWPP'  # path to TreeTagger installation directory
        self.english_dict = enchant.Dict("en_EN")
        self.spanish_dict = enchant.Dict("es_ES")
        self.ND = dictionaries.norm
        self.SD = dictionaries.lemario
        self.PND = dictionaries.names
        self.stem = stem
        if stem:
            self.stemmer = SpanishStemmer()
        else:
            self.tagger = TreeTagger(TAGLANG='es', TAGDIR=path)

    def dictionary_lookup(self, word):
        result = (word in self.SD or word in self.PND or
                  word in self.ND.values())
        return result

    def affix_check(self, word):
        result = False
        if word.islower() or word.istitle():
            if self.stem:
                n = len(word)
                stem = self.stemmer.stem(word)
                # compare with first substring of length n of each word in SD
                for w in [x[:n] for x in self.SD if len(x) >= n]:
                    result = (word == w)
                    if result:
                        break
            else:
                lemma = make_tags(self.tagger.tag_text(word))[0].lemma
                result = self.dictionary_lookup(lemma)
        return result

    def check(self, word):
        result = self.spanish_dict.check(word)
        if not result:
            result = self.dictionary_lookup(word) or self.affix_check(word)
        return result

    def check_NoES(self, word):
        result = False
        if len(word) > 1:
            result = self.english_dict.check(word)
        return result

    def classify(self, word):
        if self.check(word):
            result = 1
        elif self.check_NoES(word):
            result = 2
        else:
            result = 0
        return result
def build_index_from_words(words, stem):
    '''
    Takes:
        - words, a list of strings
        - stem, a boolean; if True, each word is stemmed before counting
    Returns:
        - index, a dictionary with a count of times a word appears in the document
    '''
    index = {}
    stemmer = SpanishStemmer()
    for word in words:
        if word not in STOP_WORDS:
            if stem:
                word = stemmer.stem(word)
            if word not in index:
                index[word] = 0
            index[word] += 1
    return index
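# Usage sketch (not from the original source), assuming STOP_WORDS is a set of
# Spanish stopwords; here it is built from NLTK purely for illustration.
from nltk.corpus import stopwords
from nltk.stem.snowball import SpanishStemmer

STOP_WORDS = set(stopwords.words('spanish'))

words = "los gatos y las gatas corren por el jardín".split()
print(build_index_from_words(words, stem=True))
# counts of stemmed tokens, e.g. both forms of 'gato' collapse onto a single stem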
class StemmerProcessor(DocumentAtATimeCorpusProcessor):

    def __init__(self):
        super(StemmerProcessor, self).__init__()
        self.stemmer = SpanishStemmer()

    def process_document(self, document):
        processed_document = []
        for word in document:
            processed_document.append(self.stemmer.stem(word))
        return processed_document
class SpanishStemmer(Normalizer):

    def __init__(self, next_normalizer=None):
        super(SpanishStemmer, self).__init__(next_normalizer)
        self._stemmer = NLTKSpanishStemmer()

    def _apply_normalizer(self, data):
        stem_word = lambda x: self._stemmer.stem(x)
        stem_word_list = lambda xl: [stem_word(w) for w in xl]
        return stem_word(data) if not isinstance(data, (list, tuple)) else stem_word_list(data)
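# Usage sketch (not from the original source). It assumes the Normalizer base class
# from this snippet is importable and that NLTKSpanishStemmer is the alias used above
# for nltk.stem.snowball.SpanishStemmer. _apply_normalizer is called directly here
# only to illustrate the single-value versus list behaviour.
normalizer = SpanishStemmer()
print(normalizer._apply_normalizer("corriendo"))             # a single stemmed string
print(normalizer._apply_normalizer(["corriendo", "casas"]))  # a list of stemmed strings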
def generate_stopwords(stopname='stopSpanish.pkl'):
    """ Build the Spanish stopword list (including stemmed forms) and pickle it """
    stemmer = SpanishStemmer()
    stopwords_es = set(stopwords.words('spanish'))
    stopwords_es_sw = set(get_stop_words('spanish'))

    stopSpanishBeta = list(set(stopwords_es.union(stopwords_es_sw)))
    stopSpanish = set(stopwords_es.union(stopwords_es_sw))
    for stopWord in stopSpanishBeta:
        stopSpanish.add(stemmer.stem(stopWord))

    stopSpanish = list(stopSpanish)
    stopSpanish.extend(['tra', 'd', 'desc'])  # Adding stopwords not present in the standard stopwords
    stopSpanish.remove('no')  # Keep 'no' to help identify negative categories

    with open(f'{resource_path}/{stopname}', 'wb') as f:
        pickle.dump(stopSpanish, f)

    return stopSpanish
def getfeats(fields, o):
    """ This takes the word in question and the offset with respect to the instance word """
    word = fields[0]
    stemmer = SpanishStemmer()

    with_hyphen = 0
    if "-" in word:
        with_hyphen = 1

    with_apostrophe = 0
    if "'" in word:
        with_apostrophe = 1

    o = str(o)
    features = [
        (o + "word", word),
        (o + 'pos', fields[1]),
        #(o + 'prefix1', word[:1]),
        (o + 'prefix2', word[:2]),
        (o + 'prefix3', word[:3]),
        (o + 'prefix4', word[:4]),
        #(o + 'suffix1', word[-1:]),
        (o + 'suffix2', word[-2:]),
        (o + 'suffix3', word[-3:]),
        (o + 'suffix4', word[-4:]),
        (o + 'is_upper', word.isupper()),
        (o + 'is_title', word.istitle()),
        (o + 'is_digit', word.isdigit()),
        (o + 'with_hypen', with_hyphen),
        (o + 'with_apostrophe', with_apostrophe),
        (o + 'spanich_stem', stemmer.stem(word)),
        # (o + 'word_shape', word_shape(word))
    ]
    return features
def spanish_swadesh_list(stemmed=True):
    """
    Helper function that returns a list of strings with the stems of the
    Spanish Swadesh entries.
    """
    try:
        stemmer = SpanishStemmer(True)
    except:
        log.warn("Spanish stemmer could not be loaded!")
        return

    swadesh_entries = []
    for line in util.read_text_file(
            util.data_path('swadesh', 'swadesh_spa.txt'), lines=True):
        line = line.strip()
        for e in line.split(","):
            e = e.strip()
            if stemmed:
                stem = stemmer.stem(e)
                swadesh_entries.append(stem)
            else:
                swadesh_entries.append(e)
    return swadesh_entries
def run_BM25_collection(output_dir, documents, queries, qrels, train, validation, test, k, language):
    if language == 'en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    elif language == 'fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    elif language == 'es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
    elif language == 'it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()

    corpus = []
    doc_indexes = []
    for key, value in documents.items():
        doc_indexes.append(key)
        doc = [stemmer.stem(elem) for elem in value.split(" ") if elem not in stop_words]
        # index the stemmed, stopword-filtered tokens so documents match the stemmed queries
        corpus.append(doc)
    bm25 = BM25Okapi(corpus)

    print("Running BM25", flush=True)

    results = dict()
    for i, elem in enumerate(train):
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
        if i % 1000 == 0:
            print('Processing query', i, '/', len(train), flush=True)
    save_BM25_res(output_dir + '/training/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/training/BM25.qrels.csv', results, qrels, True)

    results = dict()
    for elem in validation:
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
    save_BM25_res(output_dir + '/validation/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/validation/BM25.qrels.csv', results, qrels, False)

    results = dict()
    for elem in test:
        results[elem] = run_BM25_query(queries[elem], bm25, doc_indexes, k, language)
    save_BM25_res(output_dir + '/test/BM25.res', results)
    save_BM25_qrels_dataframe(output_dir + '/test/BM25.qrels.csv', results, qrels, False)
def __init__(self):
    self.tweets = 0
    self.related_tweets = 0
    self.stopwords = {}
    self.stemmers = {}
    self.stemmers["es"] = SpanishStemmer()
    self.stemmers["en"] = PorterStemmer()
    self.stemmers["fr"] = FrenchStemmer()
    self.stemmers["de"] = GermanStemmer()
    self.stopwords["es"] = self.load_stopwords_file("spanish_stopwords.txt")
    self.stopwords["en"] = self.load_stopwords_file("english_stopwords.txt")
    self.stopwords["fr"] = self.load_stopwords_file("french_stopwords.txt")
    self.stopwords["ge"] = self.load_stopwords_file("german_stopwords.txt")
    self.output_file = open(sys.argv[2], 'a')
class GigawordParser(StreamParser):

    STEMMERS = {
        "eng": PorterStemmer(ignore_stopwords=False),
        "spa": SpanishStemmer(),
    }

    def __init__(self, language):
        self.next_id = 0
        self.language = language
        self.stemmer = self.STEMMERS.get(language)
        if self.stemmer is None:
            raise Exception("Unsupported language %s" % language)

    def init_id_counter(self, initial):
        self.next_id = initial

    def new_id(self):
        new_id = self.next_id
        self.next_id += 1
        return new_id

    def parse_raw(self, xml_str):
        xml = minidom.parseString(xml_str)
        if self.language == "es":
            try:
                url = "gigaword:" + xml.getElementsByTagName("DOC")[0].attributes["id"].value
                title = xml.getElementsByTagName("HEADLINE")[0].firstChild.nodeValue
            except:
                url = "<NONE>"
                title = "<NONE>"
        else:
            url = "<NONE>"
            title = "<NONE>"

        text = stringio.StringIO()
        for node in xml.getElementsByTagName("TEXT")[0].childNodes:
            if len(node.childNodes) > 0:
                text.write(node.firstChild.nodeValue)
        content = text.getvalue()
        terms = text_to_terms(content, self.language)
        return RuwacDocument(self.new_id(), url, title, content, terms)
def lemmatize(self, text, lang):
    # spacy.prefer_gpu()
    # nlp = spacy.load(lang)  # en fr "en_core_web_sm"
    if lang == "fr":
        stemmer = FrenchStemmer()
    elif lang == "es":
        stemmer = SpanishStemmer()
    else:
        stemmer = EnglishStemmer()
    stemmed = []
    for word in text.split(" "):
        stemmed.append(stemmer.stem(word))
    # doc = nlp(u"" + text)
    # lem_terms = []
    # for token in doc:
    #     lem_terms.append(token.lemma_)
    return " ".join(stemmed)
def run_BM25_query(query, bm25, doc_indexes, k, language):
    if language == 'en':
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
    elif language == 'fr':
        stop_words = set(stopwords.words('french'))
        stemmer = FrenchStemmer()
    elif language == 'es':
        stop_words = set(stopwords.words('spanish'))
        stemmer = SpanishStemmer()
    elif language == 'it':
        stop_words = set(stopwords.words('italian'))
        stemmer = ItalianStemmer()

    tokenized_query = [stemmer.stem(elem) for elem in query.split(" ") if elem not in stop_words]
    doc_scores = bm25.get_scores(tokenized_query)
    top_k = np.argsort(doc_scores)[::-1][:k]
    results = [[doc_indexes[key], doc_scores[key]] for key in top_k]
    return results
def process_election(lang, data_path, stopword_path, save_path):
    if lang == "English":
        stemmer = EnglishStemmer()
    elif lang == "Spanish":
        stemmer = SpanishStemmer()
    else:
        stemmer = None

    print("loading dataset")
    line_sentences = ProcessLineSentence(dataPath=data_path,
                                         label="election",
                                         stopwordPath=stopword_path,
                                         stemmer=stemmer)

    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        for sentence, label in line_sentences:
            if label == "yes":
                l = [1]
            else:
                l = [0]
            row = [w.encode('utf-8') for w in sentence]
            writer.writerow(l + row)
import pickle
import codecs
import collections
import fileinput
import unicodedata

import nltk
import regex
from nltk.stem.snowball import SpanishStemmer


def stopwords_from_file(stopwords_filepath="data/stopwords/spa.txt"):
    stopwords = codecs.open(stopwords_filepath, "r", "utf-8")
    ret = set()
    for line in stopwords:
        word = line.rstrip("\n")
        word = regex.sub(" *\|.*$", "", word)
        if regex.search("[^\s]", word):
            word = unicodedata.normalize("NFD", word)
            ret.add(word)
    return ret


tokenizer = nltk.load("tokenizers/punkt/spanish.pickle")
stopwords = stopwords_from_file("../../src/qlc/data/stopwords/spa.txt")
stemmer = SpanishStemmer()

doc = ""
doc_id = 0
sentence_id = 0

sentences_for_stem = collections.defaultdict(set)
docs_for_stem = collections.defaultdict(set)

for l in fileinput.input("/Users/ramon/qlc-github/data/eswiki/AA/wiki00"):
    l = l.strip()
    l = l.decode("utf-8")
    l = unicodedata.normalize("NFD", l)
    if l.startswith("</doc>"):
        sentences = tokenizer.tokenize(doc)
There are plenty of "stemmers"; I am going to pick the Spanish one.
'''
from nltk.stem.snowball import SnowballStemmer, SpanishStemmer

# As with punkt, a package has to be downloaded
download('stopwords')

# If we do not know the language beforehand:
# SnowballStemmer(language, ignore_stopwords=False)
spanish_stem = SnowballStemmer("spanish", True)

# If we know the language in advance, we can import it directly:
# SpanishStemmer(ignore_stopwords=False)
spanish_stem = SpanishStemmer(True)

print(spanish_stem.stem("Comiendo"),
      spanish_stem.stem("Bailando"),
      spanish_stem.stem("bailar"),
      spanish_stem.stem("estantería"))

'''################################
# Getting the original verb      #
################################
Known as lemmatization. NLTK does not include this for Spanish, only English.
'''
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

# As with punkt, a package has to be downloaded
download('wordnet')
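# A small illustration (not part of the original tutorial) of what the
# ignore_stopwords flag changes: Spanish stopwords are returned untouched
# when it is True, and stemmed like any other token when it is False.
from nltk.stem.snowball import SpanishStemmer

print(SpanishStemmer(ignore_stopwords=True).stem("estaba"))   # left as 'estaba'
print(SpanishStemmer(ignore_stopwords=False).stem("estaba"))  # reduced to a shorter stem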
import csv
import collections
import operator
import unicodedata
import os

cwd = os.getcwd()
root = os.path.dirname(cwd)
lematizador_dir = os.path.join(root, "data", "lematizador", "lematizador.csv")
stopwords_dir = os.path.join(root, "data", "stopwords")

from spellchecker import SpellChecker  # https://pypi.org/project/pyspellchecker/

# Stemmer
from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer()

# Object used for spelling correction
spell = SpellChecker(language="es")
metodo_desconocidas = spell.unknown
metodo_correccion = spell.correction

lista_blanca_regiones = [
    "Arica", "Parinacota", "Tarapacá", "Antofagasta", "Atacama", "Coquimbo",
    "Valparaíso", "Metropolitana", "Santiago", "Libertador", "General",
    "Bernardo", "O’Higgins", "Maule", "Ñuble", "Biobío", "Araucanía", "Ríos",
    "Lagos", "Aysén", "General", "Carlos", "Ibáñez", "Campo", "Magallanes",
    "Antártica"
]

lista_blanca_telecom = [
    'lte', 'whatsapp', 'instagram', 'telegram', 'youtube', 'facebook',
    'entel', 'bafi', 'resetea', 'samsung', 'huawei', 'iphone', 'kb', 'mb',
    'pixi',
def main(argv):
    log = logging.getLogger()
    logging.basicConfig(level=logging.INFO)

    conf = appconfig('config:development.ini', relative_to='.')
    config = None
    if not pylons.test.pylonsapp:
        config = load_environment(conf.global_conf, conf.local_conf)

    stemmer = SpanishStemmer(True)

    # load swadesh list
    swadesh_file = codecs.open(os.path.join(os.path.dirname(
        os.path.realpath(__file__)), "swadesh_spa.txt"), "r", "utf-8")

    swadesh_entries = []
    for line in swadesh_file:
        line = line.strip()
        for e in line.split(","):
            stem = stemmer.stem(e)
            swadesh_entries.append(stem)

    for b in quanthistling.dictdata.books.list:
        #if b['bibtex_key'] != "thiesen1998":
        #    continue
        book = model.meta.Session.query(model.Book).filter_by(bibtex_key=b['bibtex_key']).first()
        if book:
            print("Filtering entries in %s..." % b['bibtex_key'])
            for dictdata in book.dictdata:
                entries = model.meta.Session.query(model.Entry).filter(
                    model.Entry.dictdata_id == dictdata.id).order_by("startpage", "pos_on_page").all()
                annotations = model.meta.Session.query(model.Annotation).join(
                    model.Entry, model.Annotation.entry_id == model.Entry.id).filter(
                    model.Entry.dictdata_id == dictdata.id).all()
                dict_annotations = collections.defaultdict(list)
                for a in annotations:
                    dict_annotations[a.entry_id].append(a)

                for e in entries:
                    if b['bibtex_key'] == "thiesen1998":
                        e.filtered = False
                    else:
                        e.filtered = True
                        for a in dict_annotations[e.id]:
                            if a.value == "iso-639-3" and a.string == "spa":
                                for a2 in dict_annotations[e.id]:
                                    if (a2.value == "head" or a2.value == "translation") and a2.start == a.start:
                                        phrase = re.sub(" ?\([^)]\)", "", a2.string)
                                        phrase = phrase.strip()
                                        if not " " in phrase:
                                            stem = stemmer.stem(phrase)
                                            if stem in swadesh_entries:
                                                e.filtered = False
                                                # if e.is_subentry:
                                                #     e.mainentry().filtered = False
            Session.commit()
def stemmer_all(tweet):
    stm = SpanishStemmer()
    split_tweet = [word for word in tweet.lower().split(' ') if word.strip()]
    return ' '.join([stm.stem(word.strip()) for word in split_tweet])
def export_swadesh_entries(input_path, output_path=None):
    print("Input: {0}".format(input_path))
    print("Output: {0}".format(output_path))

    cr = CorpusReaderDict(input_path)
    print("Data loaded")

    files = ["book.csv", "component.csv", "corpusversion.csv", "dictdata.csv",
             "language_iso.csv", "language_bookname.csv", "language_src.csv",
             "language_tgt.csv", "nondictdata.csv", "wordlistdata.csv",
             "wordlistconcept.csv"]

    for f in files:
        shutil.copyfile(os.path.join(input_path, f), os.path.join(output_path, f))

    from nltk.stem.snowball import SpanishStemmer
    stemmer = SpanishStemmer()

    import qlc.utils

    # get stopwords
    stopwords = qlc.utils.stopwords_from_file(os.path.join(os.path.dirname(
        os.path.realpath(__file__)), "data", "stopwords", "spa.txt"))

    # load swadesh list
    swadesh_file = codecs.open(os.path.join(os.path.dirname(
        os.path.realpath(__file__)), "data", "swadesh", "spa.txt"), "r", "utf-8")

    swadesh_entries = []
    for line in swadesh_file:
        line = line.strip()
        for e in line.split(","):
            stem = stemmer.stem(e)
            swadesh_entries.append(stem)

    # find all entries that contain one of the swadesh words
    # save entry ids to list
    entry_ids = []

    dictdata_ids = cr.dictdata_string_ids
    for dictdata_id in dictdata_ids:
        src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id)

        # is there some spanish?
        if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']):
            continue

        for entry_id, head, translation in \
                cr.ids_with_heads_with_translations_for_dictdata_id(dictdata_id):
            if src_language_iso == ['spa']:
                (head, translation) = (translation, head)

            translation = re.sub(" ?\([^)]\)", "", translation)
            if translation in stopwords:
                entry_ids.append(entry_id)
            else:
                translation = qlc.utils.remove_stopwords(translation, stopwords)
                phrase_stems = qlc.utils.stem_phrase(translation, stemmer, True)
                for stem in phrase_stems:
                    if stem in swadesh_entries:
                        entry_ids.append(entry_id)

    #print(len(entry_ids))
    #return

    input_entry_csv = os.path.join(input_path, "entry.csv")
    output_entry_csv = os.path.join(output_path, "entry.csv")

    input_annotation_csv = os.path.join(input_path, "annotation.csv")
    output_annotation_csv = os.path.join(output_path, "annotation.csv")

    output_annotation = codecs.open(output_annotation_csv, "w", "utf-8")

    annotation_dict = collections.defaultdict(list)

    # cache annotations for lookup
    for i, line in enumerate(fileinput.input(
            input_annotation_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output_annotation.write(line)
            continue
        data = line.strip().split("\t")
        annotation_dict[data[_annotation_table_columns['entry_id'] + 1]].append(line)
    fileinput.nextfile()

    output = codecs.open(output_entry_csv, "w", "utf-8")

    count_entries = 0
    for i, line in enumerate(fileinput.input(
            input_entry_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output.write(line)
            continue
        data = line.strip().split("\t")
        if data[0] in entry_ids:
            output.write(line)
            for annotation_line in annotation_dict[data[0]]:
                output_annotation.write(annotation_line)
    fileinput.nextfile()

    output.close()
    output_annotation.close()

    # Wordlists
    cr = CorpusReaderWordlist(sys.argv[1])
    print("Data loaded")

    # find all entries that contain one of the swadesh words
    # save entry ids to list
    wordlistdata_ids = cr.wordlistdata_string_ids
    bibtex_keys = collections.defaultdict(list)
    for wid in wordlistdata_ids:
        wordlistdata_string = cr.wordlistdata_string_ids[wid]
        bibtex_key = wordlistdata_string.split("_")[0]
        bibtex_keys[bibtex_key].append(wid)

    wordlistentry_ids = []
    for bibtex_key in bibtex_keys:
        # first collect all concepts in this book where the spanish counterpart
        # has one of the swadesh words
        concepts = []
        for wordlistentry_id in wordlistentry_ids:
            language_iso = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)

            # is there some spanish?
            if language_iso != ['spa']:
                continue

            for entry_id, concept, counterpart in \
                    cr.ids_with_concepts_with_counterparts_for_dictdata_id(dictdata_id):
                counterpart = re.sub(" ?\([^)]\)", "", counterpart)
                if counterpart in stopwords:
                    entry_ids.append(entry_id)
                else:
                    counterpart = qlc.utils.remove_stopwords(counterpart, stopwords)
                    phrase_stems = qlc.utils.stem_phrase(counterpart, stemmer, True)
                    for stem in phrase_stems:
                        if stem in swadesh_entries:
                            concepts.append(concept)

        # now collect the entry ids for those concepts
        for wordlistentry_id in wordlistentry_ids:
            for entry_id, concept, counterpart in \
                    cr.ids_with_concepts_with_counterparts_for_dictdata_id(dictdata_id):
                if concept in concepts:
                    wordlistentry_ids.append(entry_id)

    input_entry_csv = os.path.join(input_path, "wordlistentry.csv")
    output_entry_csv = os.path.join(output_path, "wordlistentry.csv")

    input_annotation_csv = os.path.join(input_path, "wordlistannotation.csv")
    output_annotation_csv = os.path.join(output_path, "wordlistannotation.csv")

    output_annotation = codecs.open(output_annotation_csv, "w", "utf-8")

    annotation_dict = collections.defaultdict(list)
    for i, line in enumerate(fileinput.input(
            input_annotation_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output_annotation.write(line)
            continue
        data = line.strip().split("\t")
        annotation_dict[data[_wordlistannotation_table_columns['entry_id'] + 1]].append(line)
    fileinput.nextfile()

    output = codecs.open(output_entry_csv, "w", "utf-8")

    count_entries = 0
    for i, line in enumerate(fileinput.input(
            input_entry_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output.write(line)
            continue
        data = line.strip().split("\t")
        if data[0] in entry_ids:
            output.write(line)
            for annotation_line in annotation_dict[data[0]]:
                output_annotation.write(annotation_line)
    fileinput.nextfile()

    output.close()
    output_annotation.close()
# The next step is to connect Spanish translations that contain the same stem. For this we first remove certain stop words from the translation (list of stopwords from NLTK). There are two cases then: just one word remains, or more than one word remains.
#
# We have two options for what to do with the latter: either they are not connected with anything at all (default behaviour), or each word is stemmed and the translation is connected with every other translation that contains the same stems. Right now this results in many connections that do not look very useful. This should be done in a more intelligent way in the future (for example, find the heads of phrases in multiword expressions and only connect those; split the weight of the connections between all stems and work with weighted graphs from this step on; ...).
#
# To connect the Spanish translations the script adds additional "stem nodes" to the graph. The name of these nodes consists of a Spanish word stem plus a pipe symbol plus the string "stem". These nodes look like this in a dot file:
#
# > "tom|stem" [is_stem=True];
#
# The introduction of these nodes later facilitates the output of translation matrixes, as you can just search for stems within the graph and only output direct neighbours with Spanish translations. It would also be possible to directly connect the Spanish translations if they have a matching stem, but then the graph traversal to find matching translations and their heads is a bit more complex later.
#
# First we create a stemmer object from the SpanishStemmer in NLTK:

# <codecell>

from nltk.stem.snowball import SpanishStemmer
stemmer = SpanishStemmer(True)

# <markdowncell>

# We create the list of stopwords and encode them as unicode strings:

# <codecell>

combined_graph_stemmed = copy.deepcopy(combined_graph)
stopwords = nltk.corpus.stopwords.words("spanish")
stopwords = [w.decode("utf-8") for w in stopwords]

# <markdowncell>

# Then we loop through all the nodes of the merged graph and add the stem nodes to each Spanish node. If the node has only one word (after stopword removal) we will use the NLTK stemmer; otherwise we just leave the phrase as it is:
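# <markdowncell>

# A minimal sketch (not part of the original notebook) of how such stem nodes
# could be attached with networkx. It assumes `combined_graph_stemmed` is a
# networkx.Graph and that Spanish translation nodes can be recognised via a
# hypothetical `is_spanish` node attribute; the real notebook may use a
# different convention.

# <codecell>

import networkx as nx

def add_stem_nodes(graph, stemmer, stopwords):
    for node, data in list(graph.nodes(data=True)):
        if not data.get("is_spanish"):
            continue
        words = [w for w in node.split() if w not in stopwords]
        if len(words) != 1:
            continue  # multiword phrases are left unconnected (default behaviour)
        stem_node = u"{0}|stem".format(stemmer.stem(words[0]))
        graph.add_node(stem_node, is_stem=True)
        graph.add_edge(node, stem_node)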
def remove_stopwords(text, stopSpanish):
    stemmer = SpanishStemmer()
    textList = text.split()
    textList = [word for word in textList if word not in stopSpanish]
    return ' '.join([stemmer.stem(word) for word in textList])
import pickle
import codecs
import os

"""The application's model objects"""
from quanthistling.model.meta import Session, metadata

from sqlalchemy import schema, types
from sqlalchemy import orm, func
from sqlalchemy import and_
from webhelpers.html import literal

from operator import attrgetter
from pylons import config

from nltk.stem.snowball import SpanishStemmer

stemmer = SpanishStemmer(True)

# load swadesh list
swadesh_file = codecs.open(os.path.join(os.path.dirname(
    os.path.realpath(__file__)), "spa.txt"), "r", "utf-8")

swadesh_list = []
for line in swadesh_file:
    line = line.strip()
    for e in line.split(","):
        stem = stemmer.stem(e)
        swadesh_list.append(stem)


def init_model(engine):
    """Call me before using any of the tables or classes in the model"""
def get_vector_matrix(self, freq_floor=50, context_words=3):
    nlp = es_core_web_md.load()
    STOPWORDS = spacy.es.STOP_WORDS

    def _clean_sent(sent):
        clean_sent = []
        # remove stopwords
        for word in sent:
            word = word.lower()
            if not word in STOPWORDS:
                if not word.isdigit():
                    clean_sent.append(word)
        return clean_sent

    def _update_feature(word, feature_name, features):
        " dirty update of features "
        counts = 1
        if word in vectors:
            if feature_name in vectors[word]:
                counts = vectors[word][feature_name] + 1
        features[feature_name] = counts
        return features

    def _update_counts(feature_name, f_counts):
        counts = 1
        if feature_name in f_counts:
            counts = f_counts[feature_name] + 1
        f_counts[feature_name] = counts
        return f_counts

    sents = self.corpus.get_sents()
    stemmer = SpanishStemmer()

    # will use the words as keys and dict of features as values
    vectors = {}
    #freq_counts = {}

    for sent in sents:
        # TODO: PARALLELIZE!!
        #for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):

        # drop stopwords before picking the context_words
        cleaned_sent = _clean_sent(sent)

        doc = nlp(' '.join(sent))
        for word_idx in range(len(doc)):
            # get the word and the pos tag
            spacy_word = doc[word_idx]
            word = spacy_word.text.lower()
            pos_tag = spacy_word.pos_

            if len(word) <= 2:
                continue
            if word in STOPWORDS:
                continue
            if word.isdigit():
                continue

            # if not seen word
            if not word in vectors:
                features = {}
            else:
                features = vectors[word]

            # counts of frequency to normalize later
            #freq_counts = _update_counts(pos_tag, freq_counts)

            # context related (POS and words stemmed)
            features = _update_feature(word, pos_tag, features)
            if word_idx > 0:
                prev_tag = doc[word_idx - 1].pos_
                feature_name = prev_tag + '_pos_prev'
                features = _update_feature(word, feature_name, features)
            if word_idx < len(sent) - 1:
                post_tag = doc[word_idx + 1].pos_
                feature_name = post_tag + '_pos_post'
                features = _update_feature(word, feature_name, features)

            # dependency features. the target of the dependency is stemmed!
            dep_type = spacy_word.dep_
            if dep_type != 'ROOT':
                dep_obj = stemmer.stem(spacy_word.head.text.lower())
                feature_name = 'DEP:' + dep_type + '-' + dep_obj
                features = _update_feature(word, feature_name, features)

            # get n words from the context as features (stemmed...!)
            for i in range(context_words):
                ctxt_word = (random.choice(cleaned_sent))
                feature_word = stemmer.stem(ctxt_word)
                feature_name = ctxt_word + '_ctxt_word'
                features = _update_feature(word, feature_name, features)

            # add a synset (wordnet) feature :0

            features['word'] = word

            # frequency counting
            features = _update_feature(word, 'freq', features)

            vectors[word] = features

    # drop words whose 'freq' is below the floor
    words_to_pop = set()
    for word, f_dict in vectors.items():
        if f_dict['freq'] <= freq_floor:
            words_to_pop.add(word)
    for word in words_to_pop:
        vectors.pop(word)

    for word, f_dict in vectors.items():
        #print(word, f_dict)
        f_dict['freq'] = 0
        vectors[word] = f_dict  # delete an irrelevant dimension!

    # normalize the POS contexts
    #for word, f_dict in vectors.items():
    #    f_dict[]

    # add the context word... LEMMATIZED!
    # NORMALIZE ALL THE CONTEXTS! -> dictionary of frequencies of ALL the features that occurred

    self.words = list(vectors.keys())  # thankfully in the same order as vectors.values

    vectorizer = DictVectorizer(dtype=numpy.int32)
    vec_matrix = vectorizer.fit_transform(list(vectors.values()))
    vectors_shape = vec_matrix.get_shape()
    print(vectors_shape)

    """
    freqs_vector = vectorizer.transform(freq_counts)
    vec_matrix = vstack([freqs_vector, vec_matrix])
    print(s.get_shape)
    print(s)
    print(vectorizer.inverse_transform(s))
    """

    # normalization
    vec_matrix = normalize(vec_matrix, copy=False)

    ####### unsupervised dimensionality reduction

    # reduce dimensionality with a variance threshold
    #selector = VarianceThreshold(threshold=0.0)
    #vec_matrix = selector.fit_transform(vec_matrix)

    # SVD (PCA)
    Trunc_svd = TruncatedSVD(n_components=1500)
    vec_matrix = Trunc_svd.fit_transform(vec_matrix)

    # reduce dimensionality with a variance percentile
    #selected = SelectPercentile(chi2, percentile=10)
    #word_vecs_new = selected.fit_transform(new_word_vecs, target_vec)

    print(vectorizer.inverse_transform(vec_matrix))  # -> to see features!

    return self.words, vec_matrix
class MLAssistant(Assistant):

    def __init__(self, language='en', database_name='memory', memory_table='memory',
                 listen_log_table='listen_log', speak_log_table='speak_log'):
        super().__init__(language, database_name, memory_table, listen_log_table,
                         speak_log_table)
        try:
            json_file = open('modelo_gustos.json', 'r')
            loaded_model_json = json_file.read()
            json_file.close()
            self.model = model_from_json(loaded_model_json)
            self.model.load_weights("modelo_gustos.h5")
            self.model.compile(loss='mean_squared_error', optimizer='adam',
                               metrics=['binary_accuracy'])
        except Exception:
            print('****ERROR: Error cargando modelo...****')
        self.stemmer = SpanishStemmer()
        self.words = [
            '¿qu', '?', 'peli', 'pelis', 'color', 'favorit', 'leer', 'libr',
            'novel', 'ver', 'prefier', 'gust', 'pelicul', 'jug', '¿cual',
            'prefer', 'jueg', 'com', 'plat', 'animal', 'videojueg'
        ]
        self.classes = [
            'comida', 'color', 'animal', 'juego', 'libro', 'película'
        ]

    def main(self, initial_sentence='¿Qué deseas?'):
        self.speak(initial_sentence, remember=False)
        self.listen()
        self.process_orders(self.last_recognised)
        self.adjust_for_ambient_noise()

    def process_orders(self, sentence):
        _class = self.classify_sentence(sentence)
        if not _class:
            self.speak('no estoy segura de lo que me quieres preguntar')
        else:
            if _class == 'comida':
                self.speak('Sin lugar a dudas mi comida preferida son los nachos con queso')
            if _class == 'color':
                self.speak('Mi color preferido es el escarlata.')
            if _class == 'animal':
                self.speak('Me gustan mucho los grandes felinos, pero mi animal preferido es una perra que se llama'
                           ' Arale.')
            if _class == 'juego':
                self.speak('¡Me encanta Hollywood Monsters!')
            if _class == 'libro':
                self.speak('No queda muy bien decirlo, pero me han programado para decir siempre la verdad. No tengo'
                           ' tiempo para leer, y por tanto no tengo libro preferido.')
            if _class == 'película':
                self.speak('No tengo una película preferida, pero me gustan especialmente las películas de Disney y'
                           ' las del Studio Ghibli.')

    def classify_sentence(self, sentence, min_val=0.5):
        results = self._get_classification(sentence)
        if float(results[0][1]) < min_val:
            return None
        else:
            return results[0][0]

    def _clean_up_sentence(self, sentence):
        sentence_words = nltk.word_tokenize(sentence)
        sentence_words = [self.stemmer.stem(word.lower()) for word in sentence_words]
        return sentence_words

    def _bow(self, sentence, words):
        sentence_words = self._clean_up_sentence(sentence)
        bag = [0] * len(words)
        for s in sentence_words:
            for i, w in enumerate(words):
                if w == s:
                    bag[i] = 1
        return np.array(bag)

    def _get_classification(self, sentence):
        array = [self._bow(sentence, self.words)]
        np_array = np.array(array, "float32")
        prediction = self.model.predict(np_array).round(2)[0]
        result = dict(zip(self.classes, prediction))
        return sorted(result.items(), key=operator.itemgetter(1))[::-1]
class TextProcessor:
    lemmatizer = None
    stopEnglish = None
    stopSpanish = None
    spanishStemmer = None

    def __init__(self):
        self.lemmatizer = treetaggerwrapper.TreeTagger(TAGLANG='es')
        self.stopEnglish = stopwords.words('english')
        self.stopSpanish = stopwords.words('spanish')
        self.stopSpanish.append('y/o')
        self.spanishStemmer = SpanishStemmer()

    def _remove_numbers(self, text):
        "Removes digits from the text"
        return ''.join([letter for letter in text if not letter.isdigit()])

    def _remove_punctuation(self, text):
        "Removes punctuation marks from the text"
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        return regex.sub(' ', text)

    def preprocessText(self, text):
        text = text.lower()
        text = self._remove_punctuation(text)
        text = self._remove_numbers(text)
        return text

    def lematizeText(self, text):
        newText = ""
        firstElement = 0
        firstWord = True
        for word in text.split():
            if word not in self.stopEnglish and word not in self.stopSpanish:
                word = word.replace("\ufeff", "")
                lemmaResult = self.lemmatizer.tag_text(word)  # Returns [[word, type of word, lemma]]
                if (len(lemmaResult) != 0):
                    word = lemmaResult[firstElement].split()[2]
                    if firstWord:
                        newText += word
                        firstWord = False
                    else:
                        newText += " " + word
        return newText

    def stemText(self, text):
        newText = ""
        firstWord = True
        for word in text.split():
            if word not in self.stopEnglish and word not in self.stopSpanish:
                word = word.replace("\ufeff", "")
                wordStemmed = self.spanishStemmer.stem(word)
                if firstWord:
                    newText += wordStemmed
                    firstWord = False
                else:
                    newText += " " + wordStemmed
        return newText