def _build_backoff_tagger(tagged_sents):
    """Train a trigram tagger with a bigram -> unigram -> default backoff.

    :param tagged_sents: tagged training sentences; handed unchanged to each
        NLTK n-gram tagger constructor.
    :return: the ``nltk.TrigramTagger`` at the head of the backoff chain.
    """
    # Start from the constant 'NN' tagger and wrap it in successively
    # higher-order n-gram taggers, each backing off to the previous one.
    tagger = nltk.DefaultTagger('NN')
    for ngram_cls in (nltk.UnigramTagger, nltk.BigramTagger,
                      nltk.TrigramTagger):
        tagger = ngram_cls(tagged_sents, backoff=tagger)
    return tagger


def get_tagger(lang):
    """Return the POS tagger for *lang*, creating and caching it on first use.

    The tagger is cached in the module-level globals ``eng_tagger``,
    ``spa_tagger`` and ``cat_tagger``; a cached value is reused whenever it
    is truthy.
    NOTE(review): those globals are read before they are assigned here, so
    they are presumably initialised (e.g. to ``None``) elsewhere in this
    module -- confirm.

    :param lang: ``"English"`` loads the pickled treebank tagger,
        ``"Spanish"`` trains on ``cess_esp``; every other value is treated
        as Catalan and trains on ``cess_cat``.
    :return: the cached or newly built tagger object.
    """
    global eng_tagger, spa_tagger, cat_tagger

    if lang == "English":
        if not eng_tagger:
            eng_tagger = load(
                'taggers/maxent_treebank_pos_tagger/english.pickle')
        return eng_tagger

    if lang == "Spanish":
        if not spa_tagger:
            spa_tagger = _build_backoff_tagger(cess_esp.tagged_sents())
        return spa_tagger

    # Fall-through kept from the original: anything that is not English or
    # Spanish gets the Catalan tagger.
    if not cat_tagger:
        cat_tagger = _build_backoff_tagger(cess_cat.tagged_sents())
    return cat_tagger
def run(train, test, language, answer):
    """Classify every lexelt in *train*/*test* and print the results.

    For each lexelt the pipeline is: ``extract_features`` on the train and
    test instances, ``vectorize``, ``feature_selection``, then ``classify``.
    The per-lexelt outputs are collected in a dict and handed to
    ``A.print_results``.

    :param train: mapping iterated for its lexelt keys; ``train[lexelt]`` is
        passed to ``extract_features``.
    :param test: mapping indexed with the same lexelt keys as *train*.
    :param language: ``'English'``, ``'Spanish'`` or ``'Catalan'``; selects
        the POS tagger given to ``extract_features``.
    :param answer: passed through unchanged to ``A.print_results``.
    :raises ValueError: if *language* is not one of the supported values.
        Previously this left ``tagger`` unbound and failed with an
        ``UnboundLocalError`` on the first loop iteration.
    """
    if language == 'English':
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    elif language == 'Spanish':
        tagger = ut(cess_esp.tagged_sents())
    elif language == 'Catalan':
        tagger = ut(cess_cat.tagged_sents())
    else:
        raise ValueError('Unsupported language: %r' % (language,))

    # NOTE: an alternative "B1.c" pipeline (getBestWords / countFeature) used
    # to be kept here inside a bare string literal; it never executed and has
    # been dropped.
    results = {}
    for lexelt in train:
        train_features, y_train = extract_features(
            train[lexelt], language, tagger)
        # Test labels are not needed for prediction, so they are discarded.
        test_features, _ = extract_features(test[lexelt], language, tagger)
        X_train, X_test = vectorize(train_features, test_features)
        X_train_new, X_test_new = feature_selection(X_train, X_test, y_train)
        results[lexelt] = classify(X_train_new, X_test_new, y_train)

    A.print_results(results, answer)
def get_tagger(lang):
    """Return the POS tagger for *lang*, building it once and caching it.

    Cached taggers live in the module-level globals ``eng_tagger``,
    ``spa_tagger`` and ``cat_tagger`` and are reused while truthy.
    NOTE(review): the globals are read before being assigned in this
    function, so they are presumably initialised elsewhere in the module --
    confirm.

    The stray debug output (``print 111`` / ``print 555``) that surrounded
    the Spanish training step has been removed; nothing is written to stdout
    any more.

    :param lang: ``"English"`` loads the pickled treebank tagger,
        ``"Spanish"`` trains on ``cess_esp``; any other value is treated as
        Catalan and trains on ``cess_cat``.
    :return: the cached or newly built tagger object.
    """
    global eng_tagger, spa_tagger, cat_tagger

    def train_on(training):
        # Backoff chain: trigram -> bigram -> unigram -> constant 'NN'.
        default_tagger = nltk.DefaultTagger('NN')
        unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
        bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
        return nltk.TrigramTagger(training, backoff=bigram_tagger)

    if lang == "English":
        if eng_tagger:
            return eng_tagger
        eng_tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
        return eng_tagger

    if lang == "Spanish":
        if spa_tagger:
            return spa_tagger
        spa_tagger = train_on(cess_esp.tagged_sents())
        return spa_tagger

    # Every remaining language value is served by the Catalan tagger.
    if cat_tagger:
        return cat_tagger
    cat_tagger = train_on(cess_cat.tagged_sents())
    return cat_tagger
def Catalan_tagger():
    """Train and return a trigram tagger on the CESS-CAT tagged sentences.

    The returned ``nltk.TrigramTagger`` backs off to a ``BigramTagger``,
    which in turn backs off to a ``DefaultTagger`` that always answers
    ``"NOUN"``.
    """
    import nltk
    from nltk.corpus import cess_cat

    sentences = cess_cat.tagged_sents()

    # Grow the chain from the fallback outwards.
    tagger = nltk.DefaultTagger("NOUN")
    for ngram_cls in (nltk.BigramTagger, nltk.TrigramTagger):
        tagger = ngram_cls(sentences, backoff=tagger)
    return tagger
def __init__(self, train_percent_size=1):
    """Train the wrapped tagger on a subset of the Catalan tagged sentences.

    :param train_percent_size: value forwarded to ``subset_from_corpus``;
        the original note documents its range as 0-1.
    """
    training_subset = subset_from_corpus(
        cat_corpus.tagged_sents(), train_percent_size)
    self._tagger = trained_tagger_with_corpus(training_subset)
def set_tagger(language):
    """Build and return a POS tagger for *language*.

    :param language: ``'English'`` loads the pickled treebank tagger;
        ``'Catalan'`` and ``'Spanish'`` train a trigram tagger on
        ``cess_cat`` / ``cess_esp`` that backs off to bigram, unigram and
        finally a constant ``'NN'`` tagger.
    :return: the loaded or freshly trained tagger.
    :raises ValueError: if *language* is none of the three supported values.
        Previously this case reached ``return tagger`` with the name unbound
        and surfaced as an ``UnboundLocalError``.
    """
    if language == 'English':
        return load('taggers/maxent_treebank_pos_tagger/english.pickle')

    if language == 'Catalan':
        training = cess_cat.tagged_sents()
    elif language == 'Spanish':
        training = cess_esp.tagged_sents()
    else:
        raise ValueError('Unsupported language: %r' % (language,))

    # Same chain for both corpora: each n-gram tagger backs off to the
    # next lower order, ending at the constant 'NN' tagger.
    tagger = nltk.DefaultTagger('NN')
    for ngram_cls in (nltk.UnigramTagger, nltk.BigramTagger,
                      nltk.TrigramTagger):
        tagger = ngram_cls(training, backoff=tagger)
    return tagger
def test_catalan(self):
    """Check the first 15 CESS-CAT words and one token of the first sentence."""
    expected_words = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial".split()
    self.assertEqual(cess_cat.words()[:15], expected_words)

    # Word part of the token at index 34 in the first tagged sentence.
    first_sentence = cess_cat.tagged_sents()[0]
    self.assertEqual(first_sentence[34][0], "càrrecs")
WORD_OR_TAG = '[^/ ]+' BOUNDARY = r'\b' CORPUS_LOADED_EVENT = '<<CL_EVENT>>' SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>' SEARCH_ERROR_EVENT = '<<SE_EVENT>>' ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>' # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = 'English: Brown Corpus (Humor, simplified)' _CORPORA = { 'Catalan: CESS-CAT Corpus (simplified)': lambda: cess_cat.tagged_sents(simplify_tags=True), 'English: Brown Corpus': lambda: brown.tagged_sents(), 'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(simplify_tags=True), 'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], simplify_tags=True), 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', simplify_tags=True), 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', simplify_tags=True), 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', simplify_tags=True), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', simplify_tags=True), 'English: Brown Corpus (Humor, simplified)':
def test_catalan(self):
    """Verify the opening words of the Catalan corpus and a tagged token."""
    leading_words = cess_cat.words()[:15]
    reference_text = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial"
    self.assertEqual(leading_words, reference_text.split())

    # Index 0 of the token is the word that gets compared.
    token = cess_cat.tagged_sents()[0][34]
    self.assertEqual(token[0], "càrrecs")
import nltk
from nltk.corpus import cess_cat
from nltk.tokenize import word_tokenize

# Train a unigram tagger on the tagged sentences of the CESS-CAT corpus.
tagged_sents = cess_cat.tagged_sents()
unigram_tagger = nltk.UnigramTagger(train=tagged_sents)

# Tokenize a sample Catalan sentence, tag each token and print the result.
oracio = "avui fa sol però demà plourà"
tokens = word_tokenize(oracio)
analisi = unigram_tagger.tag(tokens)
print(analisi)
import A from sklearn.feature_extraction import DictVectorizer from sklearn import svm from nltk import word_tokenize from nltk.corpus import cess_esp from nltk.corpus import cess_cat from nltk.data import load from sklearn import svm import nltk from nltk import UnigramTagger as ut tagger_cat = ut(cess_cat.tagged_sents()) tagger_esp = ut(cess_esp.tagged_sents()) # You might change the window size window_size = 15 def b1_base(data): ''' :param data: list of instances for a given lexelt with the following structure: { [(instance_id, left_context, head, right_context, sense_id), ...] } :param s: list of words (features) for a given lexelt: [w1,w2,w3, ...] :return: vectors: A dictionary with the following structure { instance_id: [w_1 count, w_2 count, ...], ... } labels: A dictionary with the following structure { instance_id : sense_id } '''
BOUNDARY = r'\b' CORPUS_LOADED_EVENT = '<<CL_EVENT>>' SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>' SEARCH_ERROR_EVENT = '<<SE_EVENT>>' ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>' POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = 'English: Brown Corpus (Humor, simplified)' _CORPORA = { 'Catalan: CESS-CAT Corpus (simplified)': lambda: cess_cat.tagged_sents(tagset='universal'), 'English: Brown Corpus': lambda: brown.tagged_sents(), 'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(tagset='universal'), 'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='universal'), 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', tagset='universal'), 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', tagset='universal'), 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'), 'English: Brown Corpus (Romance, simplified)':
BOUNDARY = r"\b" CORPUS_LOADED_EVENT = "<<CL_EVENT>>" SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>" SEARCH_ERROR_EVENT = "<<SE_EVENT>>" ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>" POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = "English: Brown Corpus (Humor, simplified)" _CORPORA = { "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents( tagset="universal" ), "English: Brown Corpus": lambda: brown.tagged_sents(), "English: Brown Corpus (simplified)": lambda: brown.tagged_sents( tagset="universal" ), "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents( categories=["news", "editorial", "reviews"], tagset="universal" ), "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents( categories="religion", tagset="universal" ), "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents( categories="learned", tagset="universal" ), "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
WORD_OR_TAG = "[^/ ]+" BOUNDARY = r"\b" CORPUS_LOADED_EVENT = "<<CL_EVENT>>" SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>" SEARCH_ERROR_EVENT = "<<SE_EVENT>>" ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>" POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = "English: Brown Corpus (Humor, simplified)" _CORPORA = { "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(tagset="simple"), "English: Brown Corpus": lambda: brown.tagged_sents(), "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(tagset="simple"), "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents( categories=["news", "editorial", "reviews"], tagset="simple" ), "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="simple"), "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="simple"), "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents( categories="science_fiction", tagset="simple" ), "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"), "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
BOUNDARY = r'\b' CORPUS_LOADED_EVENT = '<<CL_EVENT>>' SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>' SEARCH_ERROR_EVENT = '<<SE_EVENT>>' ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>' POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = 'English: Brown Corpus (Humor, simplified)' _CORPORA = { 'Catalan: CESS-CAT Corpus (simplified)': lambda: cess_cat.tagged_sents(tagset='simple'), 'English: Brown Corpus': lambda: brown.tagged_sents(), 'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(tagset='simple'), 'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='simple'), 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', tagset='simple'), 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', tagset='simple'), 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', tagset='simple'), 'English: Brown Corpus (Humor, simplified)':
BOUNDARY = r'\b' CORPUS_LOADED_EVENT = '<<CL_EVENT>>' SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>' SEARCH_ERROR_EVENT = '<<SE_EVENT>>' ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>' POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = 'English: Brown Corpus (Humor, simplified)' _CORPORA = { 'Catalan: CESS-CAT Corpus (simplified)': lambda: cess_cat.tagged_sents(tagset='universal'), 'English: Brown Corpus': lambda: brown.tagged_sents(), 'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(tagset='universal'), 'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='universal'), 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', tagset='universal'), 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', tagset='universal'), 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', tagset='universal'), 'English: Brown Corpus (Humor, simplified)':
BOUNDARY = r'\b' CORPUS_LOADED_EVENT = '<<CL_EVENT>>' SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>' SEARCH_ERROR_EVENT = '<<SE_EVENT>>' ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>' POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = 'English: Brown Corpus (Humor, simplified)' _CORPORA = { 'Catalan: CESS-CAT Corpus (simplified)': lambda: cess_cat.tagged_sents(tagset='simple'), 'English: Brown Corpus': lambda: brown.tagged_sents(), 'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(tagset='simple'), 'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='simple'), 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', tagset='simple'), 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', tagset='simple'), 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', tagset='simple'),
BOUNDARY = r"\b" CORPUS_LOADED_EVENT = "<<CL_EVENT>>" SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>" SEARCH_ERROR_EVENT = "<<SE_EVENT>>" ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>" POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = "English: Brown Corpus (Humor, simplified)" _CORPORA = { "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(tagset="universal"), "English: Brown Corpus": lambda: brown.tagged_sents(), "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(tagset="universal"), "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(categories=["news", "editorial", "reviews"], tagset="universal"), "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="universal"), "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="universal"), "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(categories="science_fiction", tagset="universal"), "English: Brown Corpus (Romance, simplified)":
BOUNDARY = r'\b' CORPUS_LOADED_EVENT = '<<CL_EVENT>>' SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>' SEARCH_ERROR_EVENT = '<<SE_EVENT>>' ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>' POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = 'English: Brown Corpus (Humor, simplified)' _CORPORA = { 'Catalan: CESS-CAT Corpus (simplified)': lambda: cess_cat.tagged_sents(simplify_tags=True), 'English: Brown Corpus': lambda: brown.tagged_sents(), 'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(simplify_tags=True), 'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], simplify_tags=True), 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', simplify_tags=True), 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', simplify_tags=True), 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', simplify_tags=True), 'English: Brown Corpus (Romance, simplified)':