def correct_word(self, token):
    """
    Correct a word using enchant, the NLTK cess_esp corpus and the
    Levenshtein distance.
    :param token: word to correct
    :return: the closest word, or None if nothing is within the threshold
    """
    if token in self._corrected_words:
        return self._corrected_words[token]
    suggested = (enchant.Dict('es')).suggest(token)
    if len(suggested) > 0:
        for similar_word in suggested:
            if SpanishCorpus.levenshtein(token, similar_word) <= SpanishCorpus.levenshtein_distance:
                self._corrected_words[token] = similar_word
                print(u'--> Palabra corregida: {} --> {}'.format(token, similar_word))
                return similar_word
    minimum = sys.maxsize  # sys.maxint no longer exists in Python 3
    similar_word = ''
    for word in cess_esp.words():
        lev_dist = SpanishCorpus.levenshtein(token, word)
        if (lev_dist < minimum) or (lev_dist == minimum and len(token) == len(word) and len(similar_word) != len(token)):
            minimum = lev_dist
            similar_word = word
        if lev_dist == 0:
            break
    if minimum <= SpanishCorpus.levenshtein_distance:
        self._corrected_words[token] = similar_word
        print(u'--> Palabra corregida: {} --> {}'.format(token, similar_word))
        return similar_word
    else:
        return None
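# The levenshtein helper and threshold used above are not shown in this
# excerpt; a minimal assumed version of the class scaffolding could look like
# this (the threshold value is illustrative).
class SpanishCorpus(object):
    levenshtein_distance = 2  # maximum edit distance accepted as a correction

    def __init__(self):
        self._corrected_words = {}  # cache of previous corrections

    @staticmethod
    def levenshtein(a, b):
        # classic two-row dynamic-programming edit distance
        if len(a) < len(b):
            a, b = b, a
        previous = list(range(len(b) + 1))
        for i, ca in enumerate(a, 1):
            current = [i]
            for j, cb in enumerate(b, 1):
                current.append(min(previous[j] + 1,      # deletion
                                   current[j - 1] + 1,   # insertion
                                   previous[j - 1] + (ca != cb)))  # substitution
            previous = current
        return previous[-1]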
def __init__(self, language):
    self.language = language
    # average word lengths from 'Multilingual and Cross-Lingual Complex Word
    # Identification' (Yimam et al., 2017)
    if language == 'english':
        self.avg_word_length = 5.3
        text = brown.words()
    else:  # spanish
        self.avg_word_length = 6.2
        text = cess_esp.words()
    if language == 'english':
        self.fdist = nltk.FreqDist(w for w in text)
    else:
        self.fdist = nltk.FreqDist(w.lower() for w in text)
    self.total = len(text)
    # models
    self.model1 = MLPClassifier(random_state=2)
    self.model2 = svm.SVC(random_state=2)
    self.model4 = RandomForestClassifier(random_state=2)
    self.model5 = LogisticRegression(random_state=2)
    # hard voting classifier
    if language == 'spanish':
        estimators = [('mlp', self.model1), ('rf', self.model4), ('lr', self.model5)]
    else:
        estimators = [('svc', self.model2), ('rf', self.model4), ('mlp', self.model1)]
    self.vote = VotingClassifier(estimators, voting='hard')
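# A minimal, self-contained sketch of the hard-voting setup above on toy data;
# the feature values here are invented purely for illustration.
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

estimators = [('mlp', MLPClassifier(random_state=2)),
              ('rf', RandomForestClassifier(random_state=2)),
              ('lr', LogisticRegression(random_state=2))]
vote = VotingClassifier(estimators, voting='hard')
X = np.array([[5, 0.01], [12, 0.0001], [4, 0.02], [14, 0.00005]])  # e.g. [word length, corpus frequency]
y = np.array([0, 1, 0, 1])  # 1 = complex word
vote.fit(X, y)
print(vote.predict([[13, 0.0002]]))  # majority vote of the three models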
import nltk
from pickle import dump
from nltk.corpus import cess_esp

def make_and_save_lookup_tagger(fname):
    # most likely tag for every word in the corpus
    fd_tagged_words = nltk.ConditionalFreqDist(cess_esp.tagged_words())
    likely_tags = dict((word, fd_tagged_words[word].max()) for word in cess_esp.words())
    lookup_tagger = nltk.UnigramTagger(model=likely_tags)
    output = open(fname, 'wb')
    dump(lookup_tagger, output, -1)
    output.close()
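# A minimal sketch of loading the pickled tagger back and tagging a sentence;
# the filename is illustrative.
from pickle import load

make_and_save_lookup_tagger('lookup_tagger.pkl')
with open('lookup_tagger.pkl', 'rb') as f:
    tagger = load(f)
print(tagger.tag(['El', 'grupo', 'estatal']))  # words outside the model get tag None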
def create_espBoWLexicon(self):
    # bag of words over the CESS-ESP corpus
    BoW = Counter()
    lexicon = {}
    for word in cess_esp.words():
        BoW[word] += 1.
    # keep only words whose count reaches the 10th percentile of all counts
    threshold = stats.scoreatpercentile(list(BoW.values()), 10)
    for word, count in BoW.items():
        if count >= threshold:
            lexicon[word] = count
    return BoW, lexicon
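# A self-contained sketch of the same percentile-threshold idea, runnable
# outside the surrounding class.
from collections import Counter
from scipy import stats
from nltk.corpus import cess_esp

bow = Counter(cess_esp.words())
threshold = stats.scoreatpercentile(list(bow.values()), 10)
# keep word types whose count is at or above the 10th percentile of counts
lexicon = {w: c for w, c in bow.items() if c >= threshold}
print(len(bow), len(lexicon), threshold)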
def make_and_save_most_common_words_lookup_tagger(fname, number):
    fd_words = nltk.FreqDist(cess_esp.words())
    fd_tagged_words = nltk.ConditionalFreqDist(cess_esp.tagged_words())
    most_common_words = fd_words.most_common(number)
    most_common_words = [item[0] for item in most_common_words]
    # most likely tag, restricted to the `number` most frequent words
    likely_tags = dict((word, fd_tagged_words[word].max()) for word in most_common_words)
    lookup_tagger = nltk.UnigramTagger(model=likely_tags)
    output = open(fname, 'wb')
    dump(lookup_tagger, output, -1)
    output.close()
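# A minimal sketch of checking the tagger's accuracy against the corpus's own
# tagged sentences; the filename and sample size are illustrative.
from pickle import load
from nltk.corpus import cess_esp

make_and_save_most_common_words_lookup_tagger('common_tagger.pkl', 500)
with open('common_tagger.pkl', 'rb') as f:
    tagger = load(f)
# evaluate() is named accuracy() in newer NLTK releases
print(tagger.evaluate(cess_esp.tagged_sents()[:100]))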
def __init__(self, language):
    self.language = language
    # average word lengths from 'Multilingual and Cross-Lingual Complex Word
    # Identification' (Yimam et al., 2017)
    if language == 'english':
        self.avg_word_length = 5.3
        text = brown.words()
    else:  # spanish
        self.avg_word_length = 6.2
        text = cess_esp.words()
    self.fdist = nltk.FreqDist(w.lower() for w in text)
    self.model = KNeighborsClassifier()
    self.total = len(text)
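# A sketch of the kind of features the fields above make available to the KNN
# model; this extract_features method is hypothetical, not part of the
# original class.
def extract_features(self, word):
    # relative corpus frequency and length relative to the language average
    freq = self.fdist[word.lower()] / self.total
    length_ratio = len(word) / self.avg_word_length
    return [freq, length_ratio]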
def descargarCorpus():
    print(" Fase 2/6 Descargando Corpus..................espere un momento \n")
    '''
    sino = "n"
    sino = raw_input(" ===> Descargar el DATASET del Corpus S/N ? ")
    sino = sino.lower()
    if (sino == "s"):
        print("\n iniciando la descarga...... espere un momento \n")
        #nltk.download("movie_reviews")
        nltk.download("spanish_grammars")
        nltk.download("cess_esp")
        print("\n dataset descargado \n")
    if (sino == "n"):
        print("\n procesando dataset en memoria \n")
        reviews = [(list(movie_reviews.words(fileid)), category)
                   for category in movie_reviews.categories()
                   for fileid in movie_reviews.fileids(category)]
        new_train, new_test = reviews[0:100], reviews[101:200]
        print(new_train[0])
        print("\n\n")
    '''
    print("\n procesando dataset en memoria.... espere un momento por favor \n")
    '''
    reviews = [(list(movie_reviews.words(fileid)), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)]
    new_train, new_test = reviews[0:100], reviews[101:200]
    print(new_train[0])
    print("\n\n")
    '''
    reviews = list(cess_esp.words())
    #reviews = [(list(cess_esp.words(fileid)), category)]
    new_train, new_test = reviews[0:100], reviews[101:200]
    print("\n procesando dataset en memoria del cess_esp \n\n")
    #print(str(reviews))
    print("\n\n")
    print(new_train)
    print("\n\n Test...... \n")
    print(new_test)
    print("\n\n")
    return
def test_esp(self):
    words = cess_esp.words()[:15]
    txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del"
    self.assertEqual(words, txt.split())
    self.assertEqual(cess_esp.words()[115], "años")
import nltk
from nltk.corpus import cess_esp

# Question 1.a
# long form
print("pregunta 1a")
etiquetado = cess_esp.tagged_words()
etiquetas = set(tag for (word, tag) in etiquetado)
print(etiquetas)
# simplified form: simplify_tags=True was removed in NLTK 3, so the universal
# tagset mapping is used here instead
etiquetado = cess_esp.tagged_words(tagset='universal')
etiquetas = set(tag for (word, tag) in etiquetado)
print(etiquetas)

# Question 1.b
print("pregunta 1b")
for field in cess_esp.fileids():
    vocabulario = set([w.lower() for w in cess_esp.words(field)])
    print(vocabulario)

# Question 1.c
print("pregunta 1c")
etiquetado = cess_esp.tagged_words()
for i in etiquetado:
    print(i[0], " ", i[1])

# Question 1.d
print("pregunta 1d")
t = cess_esp.parsed_sents()[0]
print(t)

# Question 2
print("pregunta 2")
from xml.dom import minidom
dom = minidom.parse("/home/javier/ALC/python/frase_ancora.xml")
nodes = dom.childNodes
def __init__(self, language, trainset):
    self.language = language
    pos_tags_nltk = [
        'cc', 'cd', 'dt', 'in', 'jj', 'jjr', 'jjs', 'nn', 'nns',
        'nnp', 'nnps', 'pdt', 'pos', 'prp', 'prp$', 'rb', 'rbr', 'rbs',
        'rp', 'sym', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz', 'wdt',
        'wp', 'wp$', 'wrb'
    ]
    self.vowels = ['a', 'e', 'i', 'o', 'u']
    self.vec = CountVectorizer(vocabulary=pos_tags_nltk)  # POS-tag vector
    self.cmudict = nltk.corpus.cmudict.dict()  # syllable dictionary
    self.model = xgb.XGBClassifier(  # XGBoost classifier
        learning_rate=0.1,
        eta=1,
        silent=1,
        nround=10,
        n_estimators=1000,
        max_depth=6,
        min_child_weight=1,
        gamma=0.1,
        reg_alpha=0.005,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=8,
        scale_pos_weight=1,
        seed=27)
    self.word_fre = {}
    self.bigram_fre = {}
    self.trigram_fre = {}
    self.tetgram_fre = {}
    for row in trainset:
        sen = row["sentence"]
        target_word = row["target_word"]
        for word in target_word.split(' '):
            if word in self.word_fre:
                self.word_fre[word] += 1
            else:
                self.word_fre[word] = 1
        for word in sen.split(' '):
            for i in range(len(word) - 1):
                if word[i:i + 2] in self.bigram_fre:
                    self.bigram_fre[word[i:i + 2]] += 1
                else:
                    self.bigram_fre[word[i:i + 2]] = 1  # start at 1 so the first occurrence is counted
            for i in range(len(word) - 2):
                if word[i:i + 3] in self.trigram_fre:
                    self.trigram_fre[word[i:i + 3]] += 1
                else:
                    self.trigram_fre[word[i:i + 3]] = 1  # start at 1 so the first occurrence is counted
    if self.language == "english":
        self.avg_word_length = 5.3
        brown_corpus = brown.categories()  # Brown corpus categories
        for i in range(len(brown_corpus)):
            file = brown.words(categories=brown_corpus[i])
            for word in file:
                if word not in self.word_fre:
                    self.word_fre[word] = 1
                else:
                    self.word_fre[word] += 1
    else:
        self.avg_word_length = 6.2
        for item in cess_esp.words():  # Spanish corpus
            if item in self.word_fre:
                self.word_fre[item] += 1
            else:
                self.word_fre[item] = 1
def test_esp():
    # plain function, so use a bare assert instead of unittest's self.assertEqual
    words = cess_esp.words()[:15]
    txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del"
    assert words == txt.split()
#! -*- encoding: utf8 -*-
# Access the Spanish corpus cess_esp
from nltk.corpus import cess_esp
from nltk.probability import *
import unittest
from operator import itemgetter

# Show the number of words in this corpus
print("\n\nEJERCICIO 1\n")
print("\n--------------------------------------------------------------------------------\n ")
palabras = len(cess_esp.words())
print("1.1) Cargando Corpus cess_esp...")
print("\n--------------------------------------------------------------------------------\n ")
print("\n\n1.2) Numero de palabras que contiene el corpus: \n" + str(palabras))

# Get the sentences of the corpus:
frases = cess_esp.sents()
numFrases = len(cess_esp.sents())
print("\n--------------------------------------------------------------------------------\n ")
print("\n\n1.3) Numero de frases que contiene el corpus: \n" + str(numFrases))

nomFichero = cess_esp.fileids()[0]
texto = cess_esp.words(nomFichero)
"English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"), "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"), "English: Brown Corpus (Science Fiction)": lambda: brown.words( categories="science_fiction" ), "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), "English: NPS Chat Corpus": lambda: nps_chat.words(), "English: Wall Street Journal Corpus": lambda: treebank.words(), "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), "Dutch: Alpino Corpus": lambda: alpino.words(), "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), } class CollocationsView: _BACKGROUND_COLOUR = "#FFF" # white def __init__(self): self.queue = q.Queue() self.model = CollocationsModel(self.queue) self.top = Tk() self._init_top(self.top) self._init_menubar() self._init_widgets(self.top) self.load_corpus(self.model.DEFAULT_CORPUS) self.after = self.top.after(POLL_INTERVAL, self._poll)
print("!! NOT FOUND IN ANY LIST") botorhuman=np.array(train_bt).flatten() botorhuman_dev=np.array(dev_bt).flatten() gender = np.array(train_g). flatten() gender_dev = np.array(dev_g). flatten() ### HELPER FUNCTIONS ### # a pos tageken kívül ez van még: # token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop nlp = es_core_news_sm.load() stop = stopwords.words('spanish') word_list = cess.words() word_set = set(word_list) senti_clf = SentimentClassifier() def PosTagger(text): doc = nlp(text) pos = [] for token in doc: pos.append(token.pos_) counter = collections.Counter(pos) try: noun = counter.get('NOUN')/len(doc) except: noun = 0 try:
# Ramon Ruiz Dolz
# Salvador Marti Roman
from nltk.corpus import cess_esp
from nltk.corpus import PlaintextCorpusReader
from nltk.probability import *
import os
import nltk

dir_path = os.path.dirname(os.path.realpath(__file__))
corpus_root = dir_path.replace(".idea", "")
nltk.data.path.append(dir_path + "\\NLTK")

# EXERCISE 1
print("#act2")
print(len(cess_esp.words()))
print("#act3")
print(len(cess_esp.sents()))
print("#act4")
text = cess_esp.words(cess_esp.fileids()[0])
fdist = FreqDist(text)
print(fdist.most_common(20))
print("#act5")
voc = [w for w, f in fdist.most_common()]
print(voc)
print("#act6")
print(list(w for w in voc if len(w) > 7 and fdist[w] > 2))
print("#act7")
import os
import re
# nltk is the Python library for Natural Language Processing (used here for
# cleaning non-English text from the data)
from nltk.corpus import brown
from nltk.corpus import words
from nltk.corpus import cess_esp as spanish
from nltk.corpus import reuters
from nltk.corpus import nps_chat

# These dictionaries reduce the time required to search for English words by
# giving "isEnglishWord" an O(1) hash lookup
englishBrownDict = dict.fromkeys(brown.words(), True)
englishWordsDict = dict.fromkeys(words.words(), True)
englishReutersDict = dict.fromkeys(reuters.words(), True)
englishChatDict = dict.fromkeys(nps_chat.words(), True)
spanishWordsDict = dict.fromkeys(spanish.words(), True)

malayText = open(os.path.join(os.getcwd(), "malayUpdated.txt"))
malayWordsDict = []
for line in malayText:
    malayWordsDict.append(line)
# print "Count of malay words: ", len(malayWords), "\n"
# malayWordsDict = dict.fromkeys(malayWords, True)

commonTweetWords = [
    "ur", "u", "youre", "gonna", "wanna", "wannabe", "shoulda", "should've",
    "coulda", "could've", "woulda", "would've", "thats", "that's", "whats",
    "what's", "hadnt", "hadn't", "couldnt", "couldn't", "wouldnt", "wouldn't",
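# A minimal sketch of the hash search the comment above refers to;
# isEnglishWord is referenced but not shown in this excerpt, so this
# implementation is assumed.
def isEnglishWord(word):
    # dict membership is an O(1) hash lookup per dictionary
    return any(word in d or word.lower() in d
               for d in (englishBrownDict, englishWordsDict,
                         englishReutersDict, englishChatDict))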
from nltk.corpus import cess_esp
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer

tokenizer = RegexpTokenizer(r'\w+')

def remove_stopwords(text, language="english"):
    stopwordsw = stopwords.words(language)
    result = [w for w in text if w.lower() not in stopwordsw]
    return result

# Exercise 1
# Show the number of words in this corpus
print("El numero de palabras en este corpus es %d" % len(cess_esp.words()))
# Show the number of sentences it contains
print("El número de oraciones en este corpus es %d" % len(cess_esp.sents()))

"""
Get the frequencies of occurrence of the items that make up the first file of
the corpus above. An item is a (key, value) pair where key is the word and
value is its frequency of occurrence. Display the 20 most frequent.
"""
text = cess_esp.words(cess_esp.fileids()[0])
fdist = FreqDist(text)
print(fdist.most_common(20))

# Get the vocabulary of the first file of the corpus (sorted by frequency).
print("vocabulario ordenado por frecuencia")
p = [w for w, f in fdist.most_common()]
print(p)
from nltk.corpus import cess_esp
from nltk.probability import *

firstfile = cess_esp.words(cess_esp.fileids()[0])
fdist = FreqDist(firstfile)
print(fdist.most_common(20))
# Exercise 1
# 1
from nltk.corpus import cess_esp
# 2
print("2.", len(cess_esp.words()))
# 3
print("3.", len(cess_esp.sents()))
# 4
from nltk.probability import FreqDist
first_file = cess_esp.fileids()[0]
cess_freq0 = FreqDist(cess_esp.words(first_file))
print("4.", cess_freq0.most_common(20))
# 5
print("5.", [w for w, k in cess_freq0.most_common()])
# 6
print("6.", [w for w, k in cess_freq0.items() if len(w) > 7 and k > 2])
# 7
print("7.", [k for w, k in cess_freq0.most_common()])
print("7b. Freq de aparición de la preposición a", cess_freq0.get("a", 0))
# 8
print("8. No de palabras que aparecen una sola vez:",
      len([w for w, k in cess_freq0.items() if k == 1]))
# 9
print("9. La palabra más frecuente es", cess_freq0.max())
# 10
from nltk.corpus import PlaintextCorpusReader
mycorpus = PlaintextCorpusReader("../res/", ".*")
# 11
    out = []
    i = len(s)
    while i > 0:
        c, k = best_match(i)
        assert c == cost[i]
        out.append(s[i - k:i])
        i -= k
    return " ".join(reversed(out))

link = set()
words = set(
    list(wd.words()) + list(brown.words()) + word_man + list(udhr.words()) +
    list(cess.words()))
some_variable = 0

def fcn(domain_data, pt, date):
    list_no = domain_data[1]
    forbids = [
        '[', '`', '\\', '-', '=', '~', '!', '@', '#', '$', '%', '^', '&', '*',
        '(', ')', '_', '+', '\\', '[', '\\', ']', '{', '}', ';', "'", '\\',
        ':', '"', '|', '<', ',', '.', '/', '<', '>', '?', ']'
    ]
    file = open('filtered_domains.txt', 'a')
    global words, link, some_variable, result_list, result_list_b, master_data
    domain = domain_data[0]
    inter = list(set(forbids) & set(domain.split(".")[0]))
    # FILTER 1
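# The cost table and best_match helper used in the reconstruction loop above
# are elided from this excerpt; a minimal assumed version charges each known
# word a unit cost and unknown spans an infinite one, which is enough to drive
# that loop.
def segment_costs(s, words, maxword=20):
    INF = float('inf')
    cost = [0]  # cost[i] = cheapest segmentation of s[:i]
    def best_match(i):
        # best (cost, word_length) over every candidate word ending at position i
        return min((cost[i - k] + (1 if s[i - k:i] in words else INF), k)
                   for k in range(1, min(i, maxword) + 1))
    for i in range(1, len(s) + 1):
        c, _ = best_match(i)
        cost.append(c)
    return cost, best_match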
stringl = " ".join(map(str, listaimp)) print(stringl) def printfreq(listaimp): freqtoprint = FreqDist(listaimp) print(freqtoprint.most_common(20)) #Ejercicio 1 #1 Acceder al corpus en castellano cess_esp from nltk.corpus import cess_esp as corpus #2 Mostrar el número de palabras que contiene este corpus print(len(corpus.words())) #3 Mostrar el número de frases que contiene print(len(corpus.sents())) #4 Obtener las frecuencias de aparición de los ítems que componen el primer fichero del corpus #anterior. Un ítem es un par (key, value) donde key es la palabra y value es la frecuencia de #aparición de la palabra. Visualizar los 20 más frecuentes. text1 = corpus.words(corpus.fileids()[0]) fdist = FreqDist(text1) print(fdist.most_common(20)) #5 Obtener el vocabulario del primer fichero del corpus (ordenado por frecuencia) #vocxfrec= sorted([(b,a) for a,b in sorted([(y,x) for x,y in fdist.keys()])]) #vocxfrec = sorted([key for key in sorted([(value, key) for key,value in fdist.most_common()])]) vocxfrec = [key for (key, value) in fdist.most_common()]
"English: Brown Corpus": lambda: brown.words(), "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]), "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"), "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"), "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"), "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), "English: NPS Chat Corpus": lambda: nps_chat.words(), "English: Wall Street Journal Corpus": lambda: treebank.words(), "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), "Dutch: Alpino Corpus": lambda: alpino.words(), "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), } class CollocationsView: _BACKGROUND_COLOUR = "#FFF" # white def __init__(self): self.queue = q.Queue() self.model = CollocationsModel(self.queue) self.top = Tk() self._init_top(self.top) self._init_menubar() self._init_widgets(self.top) self.load_corpus(self.model.DEFAULT_CORPUS) self.after = self.top.after(POLL_INTERVAL, self._poll)
from nltk.corpus import cess_esp
from nltk.probability import FreqDist

fdist = FreqDist(cess_esp.words(cess_esp.fileids()[0]))
print("La palabra mas frecuente es ", fdist.max())
import nltk
from nltk.corpus import cess_esp
import pylab

palabras = cess_esp.words()
palabras1 = palabras[:1000]
freqdist = nltk.FreqDist(palabras1)
freqdist.plot()
from collections import Counter
import pickle
from nltk.corpus import brown, cess_esp
from nltk.util import ngrams
from collections import defaultdict

# Map words and the trigrams contained in them to their absolute frequency in
# the Brown Corpus (English) or the CESS_ESP corpus (Spanish)
if __name__ == '__main__':
    brown_words = brown.words()
    english_freqs = Counter(brown_words)
    esp_words = cess_esp.words()
    spanish_freqs = Counter(esp_words)
    with open('spanish_freqs.pkl', 'wb') as esp_f:
        pickle.dump(spanish_freqs, esp_f)
    with open('english_freqs.pkl', 'wb') as en_f:
        pickle.dump(english_freqs, en_f)
    en_trigrams = defaultdict(int)
    for word in brown_words:
        for trigram in [
                '{}{}{}'.format(t[0], t[1], t[2]) for t in list(ngrams(word, 3))
        ]:
            en_trigrams[trigram] += 1
    with open('english_trigram_freqs.pkl', 'wb') as en_t_f:
        pickle.dump(en_trigrams, en_t_f)
    esp_trigrams = defaultdict(int)
    for word in esp_words:
        for trigram in [
                '{}{}{}'.format(t[0], t[1], t[2]) for t in list(ngrams(word, 3))
        ]:
            esp_trigrams[trigram] += 1
    # parallel to the English trigram dump above (output filename assumed)
    with open('spanish_trigram_freqs.pkl', 'wb') as esp_t_f:
        pickle.dump(esp_trigrams, esp_t_f)
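# A minimal sketch of reading the pickled counters back; relative frequencies
# follow directly from the Counter totals.
import pickle

with open('spanish_freqs.pkl', 'rb') as f:
    spanish_freqs = pickle.load(f)
total = sum(spanish_freqs.values())
print(spanish_freqs['el'] / total)  # relative frequency of 'el' in CESS-ESP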
def __init__(self, language):
    self.language = language
    # average word lengths from 'Multilingual and Cross-Lingual Complex Word
    # Identification' (Yimam et al., 2017)
    if language == 'english':
        self.avg_word_length = 5.3
        # from Beker, Henry; Piper, Fred. Cipher Systems: The Protection of
        # Communications.
        self.char_frequency = {
            'a': 8.167, 'b': 1.492, 'c': 2.782, 'd': 4.253, 'e': 12.702,
            'f': 2.228, 'g': 2.015, 'h': 6.094, 'i': 6.966, 'j': 0.153,
            'k': 0.772, 'l': 4.025, 'm': 2.406, 'n': 6.749, 'o': 7.507,
            'p': 1.929, 'q': 0.095, 'r': 5.987, 's': 6.327, 't': 9.056,
            'u': 2.758, 'v': 0.978, 'w': 2.360, 'x': 0.150, 'y': 1.974,
            'z': 0.074
        }
        self.dic = pyphen.Pyphen(lang='en')
        self.reuters = reuters.words()
        self.unigram_counts = Counter(self.reuters)
        bigrams = []
        for sent in reuters.sents():
            bigrams.extend(nltk.bigrams(sent, pad_left=True, pad_right=True))
        self.bigram_counts = Counter(bigrams)
    else:  # spanish
        self.avg_word_length = 6.2
        # self.char_frequency = {'a': 12.525, 'b': 2.215, 'c': 4.139, 'd': 5.860, 'e': 13.681,
        #                        'f': 0.692, 'g': 1.768, 'h': 0.703, 'i': 6.247, 'j': 0.443,
        #                        'k': 0.011, 'l': 4.967, 'm': 3.157, 'n': 6.71, 'o': 8.683,
        #                        'p': 2.510, 'q': 0.877, 'r': 6.871, 's': 7.977, 't': 4.632,
        #                        'u': 3.927, 'v': 1.138, 'w': 0.017, 'x': 0.215, 'y': 1.008,
        #                        'z': 0.517, 'á': 0.502, 'é': 0.433, 'í': 0.725, 'ñ': 0.311,
        #                        'ó': 0.827, 'ú': 0.168, 'ü': 0.012}
        # self.dic = pyphen.Pyphen(lang='es')
        self.cess = cess.words()
        self.unigram_counts = Counter(self.cess)
        bigrams = []
        for sent in cess.sents():
            bigrams.extend(nltk.bigrams(sent, pad_left=True, pad_right=True))
        self.bigram_counts = Counter(bigrams)
    # self.clf = svm.SVC()
    # self.model = LogisticRegression()
    self.model = svm.SVC(gamma=5)
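# A sketch of one way the counters above can be used: an add-one smoothed
# conditional bigram probability. bigram_prob is a hypothetical method, not
# part of the original class.
def bigram_prob(self, w1, w2):
    vocab_size = len(self.unigram_counts)
    # Counter returns 0 for unseen keys, so unseen pairs still get smoothed mass
    return (self.bigram_counts[(w1, w2)] + 1) / (self.unigram_counts[w1] + vocab_size)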
import sys

if len(sys.argv) == 1:
    print('Ingrese el archivo del corpus como argumento. Ej: python createDictFromCorpus.py corpus.txt')
    sys.exit(1)  # without a corpus argument there is nothing to dump below
elif sys.argv[1] == 'cess_esp':
    import json
    from nltk.corpus import cess_esp
    file = 'cess_esp'
    wordsToLower = map(lambda x: x.lower(), cess_esp.words())
    d = dict.fromkeys(set(wordsToLower), 1)
else:
    import os
    import json
    from nltk.corpus import PlaintextCorpusReader
    path, file = os.path.split(sys.argv[1])
    corpus = PlaintextCorpusReader(path, file)
    wordsToLower = map(lambda x: x.lower(), corpus.words())
    d = dict.fromkeys(set(wordsToLower), 1)

jsonF = json.dumps(d)
f = open(file + ".json", "w")
f.write(jsonF)
f.close()
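# A minimal sketch of loading the generated dictionary back for O(1)
# membership tests; the filename matches the cess_esp branch above.
import json

with open('cess_esp.json') as f:
    spanish_dict = json.load(f)
print('hola' in spanish_dict)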