def data_preparation(self):
    """
    Loads one of the Brown, BNC News, or Indian corpora so it can be
    split into a train set and a test set.

    Returns:
    --------
    sentences (list): Sentences without POS tags
    tagged_sentences (list): Sentences with POS tags
    """
    if self.corpus == 'brown':
        tagged_sentences = brown.tagged_sents(categories='news')
        sentences = brown.sents(categories='news')
    elif self.corpus == 'bnc':
        root = find('corpora/bnc')
        bncnews = TaggedCorpusReader(root, 'bnc-news-wtp.txt', tagset='en-claws')
        if self.tagset is None:
            tagged_sentences = bncnews.tagged_sents()
        elif self.tagset == 'universal':
            tagged_sentences = bncnews.tagged_sents(tagset=self.tagset)
        sentences = bncnews.sents()
    elif self.corpus == 'indian':
        if self.lang in ['telugu', 'hindi', 'marathi', 'bangla']:
            tagged_sentences = indian.tagged_sents(f'{self.lang}.pos')
            sentences = indian.sents(f'{self.lang}.pos')
        else:
            # raising here avoids an UnboundLocalError at the return below
            raise ValueError('Language not part of the Indian corpus.')
    return sentences, tagged_sentences
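# Hedged follow-up: data_preparation() returns the full corpus, so the actual
# train/test split happens in the caller. A minimal sketch of that split; the
# 80/20 ratio is our assumption, not something fixed by the method above.
def split_train_test(tagged_sentences, train_frac=0.8):
    cutoff = int(len(tagged_sentences) * train_frac)
    return tagged_sentences[:cutoff], tagged_sentences[cutoff:]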
def get_objects(self):
    reader = TaggedCorpusReader('data/', r'.*\.pos')
    # fileids() is sorted alphabetically; for this data layout index 0 is
    # the negative file and index 1 the positive one
    pos_fileids = reader.fileids()[1]
    neg_fileids = reader.fileids()[0]
    postag_pos = reader.tagged_sents(pos_fileids)
    postag_neg = reader.tagged_sents(neg_fileids)
    return (postag_pos, postag_neg)
def treina(expressao_regular, etiquetador, destino, raiz=".",
           proporcoes=[100], razao=1.0, codificacao="utf-8"):
    regexp_tagger = abre_etiquetador(etiquetador)
    corpus = TaggedCorpusReader(raiz, expressao_regular, encoding=codificacao)
    print("Training files:\n%s\n" % " \n".join(corpus.fileids()))
    sents = corpus.tagged_sents()
    c = len(sents)
    # proporcoes: proportions of the development set relative to the full
    # corpus, e.g. proporcoes=[10, 30, 50, 70, 100]
    # razao: ratio of training sentences to total sentences, e.g. razao=0.75
    for n in proporcoes:
        proporcao = n / 100.0
        size = int(c * proporcao)
        dev = sents[:size]
        size = int(len(dev) * razao)
        train = dev[:size]
        print("\n\nSentence counts")
        print("Training set: %d" % len(train))
        print("Total of %d tokens" % len(sum(train, [])))
        test = dev[size:]
        print("Test set: %d sentences" % len(test))
        print("Total of %d tokens" % len(sum(test, [])))
        t1 = time.time()
        rubt = backoff_tagger(train,
                              [UnigramTagger, BigramTagger, TrigramTagger],
                              backoff=regexp_tagger)
        t2 = time.time()
        print("Training time in seconds: %f" % (t2 - t1))
        print('Tagging of the example sentence "%s"\n' % EXEMPLO,
              rubt.tag(SENTENCA))
        with open(destino, "wb") as f:
            pickle.dump(rubt, f, -1)
        if razao < 1.0:
            t1 = time.time()
            # TODO: evaluate via Avalia.testa_etiquetador instead
            print("\nTagging accuracy on the test set: %f"
                  % rubt.evaluate(test))
            t2 = time.time()
            print("Evaluation time in seconds: %f" % (t2 - t1))
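# backoff_tagger is not defined in this snippet. A minimal sketch of the usual
# helper (the NLTK cookbook pattern), under the assumption that it chains the
# n-gram taggers so each newly trained tagger backs off to the previous one:
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff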
def treina(expressao_regular, etiquetador=INICIAL, destino="BRUBT.pkl",
           raiz=".", codificacao="utf-8", max_rules=100, min_score=3):
    inicial = abre_etiquetador(etiquetador)
    corpus = TaggedCorpusReader(raiz, expressao_regular, encoding=codificacao)
    train_sents = corpus.tagged_sents()
    # FastBrillTaggerTrainer is the NLTK 2.x API; NLTK 3 renamed it to
    # nltk.tag.brill_trainer.BrillTaggerTrainer
    trainer = brill.FastBrillTaggerTrainer(inicial, TEMPLATES)
    brubt = trainer.train(train_sents, max_rules=max_rules, min_score=min_score)
    print('Tagging of the example sentence "%s"\n' % EXEMPLO, brubt.tag(SENTENCA))
    with open(destino, "wb") as f:
        dump(brubt, f, -1)
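# abre_etiquetador is defined elsewhere in this codebase. A plausible sketch,
# assuming (our assumption) that it simply unpickles a previously trained
# tagger from disk:
import pickle

def abre_etiquetador(caminho):
    with open(caminho, "rb") as f:
        return pickle.load(f)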
def read_sub_corpus(corpus, files_req, tag_length=2):
    """
    Read in the requested files from the requested corpus.

    Given a corpus and filenames, reads in and cleans the POS-tagged data,
    including truncating tags for INTERA.

    :param corpus: The name of the corpus.
    :type corpus: String (one of {'INTERA', 'UDGreek', 'tagged_texts'})
    :param files_req: The files to be read.
    :type files_req: List
    :param tag_length: Length of tag to include (INTERA only)
    :type tag_length: Integer
    :default tag_length: 2
    :rtype: A list of cleaned, tagged sentences.
    """
    # load the corpus as tagged sentences
    corp_sents = list()
    for file_name in files_req:
        # escape parentheses so the filename is a valid fileid regex
        file_name = file_name.replace('(', r'\(').replace(')', r'\)')
        corp_raw = TaggedCorpusReader(CORP_DIR + corpus, file_name)
        corp_sents.extend(corp_raw.tagged_sents())
    print('Files read     : ' + str(len(files_req)))
    print('Sentences read : ' + str(len(corp_sents)))
    print('Words read     : ' + str(sum(len(x) for x in corp_sents)))
    # clean the tags - replace missing tags with '' - and truncate to
    # tag_length characters
    corp_sents = [[(word, '') if tag is None else (word, tag)
                   for (word, tag) in sent]
                  for sent in corp_sents]
    corp_sents = [[(word, tag[:tag_length]) for (word, tag) in sent]
                  for sent in corp_sents]
    return corp_sents
def tag_text(str_trained_folder, str_fname_in, str_fname_out):
    # http://nltk.googlecode.com/svn/trunk/doc/howto/tag.html
    # build a trigram tagger from your tagged corpora
    tagged_corpora = TaggedCorpusReader(str_trained_folder, '.*')
    trigram_tagger = nltk.TrigramTagger(tagged_corpora.tagged_sents())
    with open(str_fname_in) as f_in, open(str_fname_out, "w+") as f_out:
        for line in f_in:
            tagged_result = trigram_tagger.tag(line.split())
            # keep only the tokens tagged 'C' (renamed from `str`, which
            # shadowed the builtin)
            kept = " ".join(t[0] for t in tagged_result if t[1] == 'C')
            if kept:
                f_out.write(kept + "\n")
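# Note: a TrigramTagger with no backoff tags every unseen context as None, so
# tag_text silently drops those tokens. A hedged variant with n-gram backoff
# (our suggestion, not part of the original code):
import nltk

def build_backoff_tagger(train_sents, default_tag='C'):
    # default_tag is an assumed fallback; adjust to the tagset in use
    t0 = nltk.DefaultTagger(default_tag)
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    return nltk.TrigramTagger(train_sents, backoff=t2)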
def analyse_ngram(tranche):
    corpus_entrainement_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats/corpus_entrainement' + str(tranche + 1) + '.txt')
    corpus_test_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats/corpus_test' + str(tranche + 1) + '.txt')
    train_sents = corpus_entrainement_tuple.tagged_sents()
    tagger = create_tagger(train_sents)
    sents_corrects = corpus_test_tuple.tagged_sents()
    sents_tagges = tagger.tag_sents(corpus_test_tuple.sents())
    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):
        # pair each gold (word, tag) with the tagger's (word, tag)
        phrase_combine = list(zip(sent_correct, sent_tagge))
        for couple in phrase_combine:
            for MI in scores_ngram:
                if MI['signifiant'] == couple[0][0]:
                    MI['total_signifiant'] += 1
                    if couple[0][1] == 'M':
                        MI['total_MI'] += 1
                    if couple[1][1] == 'M':
                        MI['MI_reperes'] += 1
                        if couple[1][1] == couple[0][1]:
                            MI['MI_corrects'] += 1
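# Hedged follow-up: the counters above are what per-signifier precision,
# recall and F-measure are computed from. A sketch of that derivation,
# assuming MI_corrects counts true positives (tagger and gold both 'M'):
def f_mesure(MI):
    precision = MI['MI_corrects'] / float(MI['MI_reperes'] or 1)
    rappel = MI['MI_corrects'] / float(MI['total_MI'] or 1)
    if precision + rappel == 0:
        return 0.0
    return 2 * precision * rappel / (precision + rappel)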
# T(he original version of t)his code was written by Ulrich Germann (11/2010)
######################################################################

import nltk
nltk.data.path[0:0] = ['/u/csc485h/include/a3/nltk']

# The following code provides access to the tagged NY Times corpus:
#   nyt_big  is the full corpus
#   nyt_mini is a small subset for development
from nltk.data import ZipFilePathPointer
from nltk.corpus import TaggedCorpusReader

nyt_zipped = ZipFilePathPointer('/u/csc485h/include/a3/nltk/corpora/nyt.zip', 'nyt/')
nyt_big = TaggedCorpusReader(nyt_zipped, ['2004-tagged.txt'], sep='/',
                             encoding='latin2')
nyt_mini = TaggedCorpusReader(nyt_zipped, ['nytimes-mini.txt'], sep='/',
                              encoding='latin2')

# Finally, let's set up a default pattern for NP chunking.
# Setting up the NP chunker itself is left to the main script, to encourage
# trying different variants of the pattern.

## Operator   Behavior
## .          Wildcard, matches any character
## ^abc       Matches some pattern abc at the start of a string
## abc$       Matches some pattern abc at the end of a string
## [abc]      Matches one of a set of characters
## [A-Z0-9]   Matches one of a range of characters
## ed|ing|s   Matches one of the specified strings (disjunction)
## *          Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene closure)
## +          One or more of previous item, e.g. a+, [a-z]+
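# A minimal sketch of the kind of NP pattern the main script might try; the
# grammar below is an illustration, not the assignment's reference pattern:
np_grammar = 'NP: {<DT>?<JJ.*>*<NN.*>+}'   # optional determiner, adjectives, nouns
chunker = nltk.RegexpParser(np_grammar)
# e.g. chunker.parse(nyt_mini.tagged_sents()[0]) yields a chunk tree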
def read_dir(self):
    # read entire directory as a single doc?
    corpus = TaggedCorpusReader('../../ANC', '.*', '_')
    # the original discarded this value; returning it is the evident intent
    return corpus.tagged_sents()
if __name__ == '__main__':
    input_dir = sys.argv[1]
    #coref_reader = CoreReader()
    wp_bootstrapper = WordPairBootstrapper()
    seedwords_filename = sys.argv[2]
    # TODO: make this a directory and output the word pairs, the anaphors,
    # the antecedents and the patterns
    outdir = sys.argv[3]

    # read in seedwords as specified
    wp_bootstrapper.read_seedwords(seedwords_filename)

    # read in the entire corpus
    corpus = TaggedCorpusReader('../../prepANC', '.*', '_', encoding='utf-8')
    print('corpus is loaded!')

    # extract candidate patterns from the corpus only if no cached set exists
    if not os.path.isfile('patterns_anc.pkl'):
        wp_bootstrapper.run_candidate_patterns(corpus)
    else:
        wp_bootstrapper.preprocess_seeds()
        # TODO: clean this up later, it's clunky looking
        wp_bootstrapper.perm_lex = wp_bootstrapper.seedwords
        # print('Read patterns from cache')
        # wp_bootstrapper.read_cache_candidate_eps()
        # wp_bootstrapper.string2pattern_set()

    # iterate through the corpus for as many iterations as needed and run
    # the extraction process
    wp_bootstrapper.run(corpus)
def analyse_SVM(tranche):
    global scores_SVM

    ### Preparation of the feature dicts ###
    # fetch the results
    corpus_entrainement_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats/corpus_entrainement' + str(tranche + 1) + '.txt')
    train_sents = corpus_entrainement_tuple.tagged_sents()
    tagger = create_tagger(train_sents)
    #joblib.dump(tagger, 'etiqueteur_ngrammes.pkl')

    liste_dictionnaires = []
    liste_y = []

    ### BUILDING THE TRAINING DICTIONARY ###
    # note: the training file is deliberately re-read and re-tagged here,
    # so the tag feature comes from the tagger's own output
    corpus_test_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats/corpus_entrainement' + str(tranche + 1) + '.txt')
    sents_corrects = corpus_test_tuple.tagged_sents()
    sents_tagges = tagger.tag_sents(corpus_test_tuple.sents())
    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):
        phrase_combine = list(zip(sent_correct, sent_tagge))
        indice = 0
        for couple in phrase_combine:
            for MI in scores_SVM:
                if couple[0][0] == MI['signifiant']:
                    liste_dictionnaires.append(
                        create_dict(phrase_combine, indice))
                    if couple[0][1] == 'M':
                        liste_y.append(1)
                    else:
                        liste_y.append(0)
            indice += 1

    ### BUILDING THE TEST DICTIONARY ###
    corpus_test_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats/corpus_test' + str(tranche + 1) + '.txt')
    # used to identify the tag feature
    sents_corrects = corpus_test_tuple.tagged_sents()
    sents_tagges = tagger.tag_sents(corpus_test_tuple.sents())
    liste_dictionnaires_test = []
    liste_y_test = []
    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):
        phrase_combine = list(zip(sent_correct, sent_tagge))
        indice = 0
        for couple in phrase_combine:
            for MI in scores_SVM:
                if couple[0][0] == MI['signifiant']:
                    liste_dictionnaires_test.append(
                        create_dict(phrase_combine, indice))
                    if couple[0][1] == 'M':
                        liste_y_test.append(1)
                    else:
                        liste_y_test.append(0)
            indice += 1

    ### vectorisation of the dicts ###
    # fit the vectorizer on train + test together so both share one feature space
    vec = DictVectorizer()
    listes_colles = liste_dictionnaires + liste_dictionnaires_test
    vecteur_x_ent_plus_test = vec.fit_transform(listes_colles).toarray()
    #joblib.dump(vec, 'vectoriseur.pkl')
    vecteur_x_entrainement = vecteur_x_ent_plus_test[:len(liste_dictionnaires)]
    vecteur_x_test = vecteur_x_ent_plus_test[len(liste_dictionnaires):]

    clf = svm.SVC(kernel='linear', C=18, class_weight={1: 3})
    # BEST balance: 0.9211 / 0.9574; 4185 signifiers total,
    # F-measure obtained: 0.9389
    print(clf.get_params())
    clf.fit(vecteur_x_entrainement, liste_y)
    #joblib.dump(clf, 'classifieur_SVM.pkl')

    prediction = clf.predict(vecteur_x_test)
    double_y = zip(liste_y_test, prediction)
    for unite, couple_reponse in zip(liste_dictionnaires_test, double_y):
        for M in scores_SVM:
            if M['signifiant'] == unite['signifiant']:
                M['total_signifiant'] += 1
                if couple_reponse[0] == 1:
                    M['total_MI'] += 1
                if couple_reponse[1] == 1:
                    M['MI_reperes'] += 1
                    if couple_reponse[0] == couple_reponse[1]:
                        M['MI_corrects'] += 1
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# NOTE: this script targets Python 2 and NLTK 2.x; nltk.tag.simplify was
# removed in NLTK 3 (use tagset='universal' there instead).

from nltk.probability import ConditionalFreqDist
from nltk.corpus import TaggedCorpusReader
from nltk.tag import simplify

FIRST = 0
END = 150
POS = "V"
#POS = "N"
#POS = "ADJ"

corpus_root = './data'
fileids = 'tagged_sent'
corpus = TaggedCorpusReader(corpus_root, fileids, encoding='utf-8')

# condition on the simplified tag, observe the lowercased word
processing = [(simplify.simplify_wsj_tag(tag), word.lower())
              for (word, tag) in corpus.tagged_words()]
cfd_corpus = ConditionalFreqDist(processing)

for term, freq in cfd_corpus[POS].items():
    print term.encode("utf-8"), freq
# NOTE: Python 2 script
from gmail_corpus.nltk_util.bigram_score import make_score_dict, save_score_dict
from nltk.corpus import TaggedCorpusReader
from glob import glob
import os, sys

if __name__ == '__main__':
    corpus_path = sys.argv[1]

    # remove empty files so the corpus reader does not choke on them
    files = glob('%s/*.txt' % corpus_path)
    for f in files:
        if os.path.getsize(f) == 0:
            os.remove(f)
            print 'Removed empty file %s' % f

    corpus = TaggedCorpusReader(corpus_path, r'.*\.txt')
    score_dict = make_score_dict(corpus.tagged_words())
    save_score_dict(score_dict, 'bigram_scores.pkl')
    print 'saved bigram_scores.pkl'
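# make_score_dict / save_score_dict come from gmail_corpus.nltk_util, which is
# not shown here. A plausible sketch, assuming (our assumption) that the dict
# maps tagged-word bigrams to an association score such as PMI:
import pickle
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def make_score_dict(tagged_words):
    finder = BigramCollocationFinder.from_words(tagged_words)
    return dict(finder.score_ngrams(BigramAssocMeasures.pmi))

def save_score_dict(score_dict, path):
    with open(path, 'wb') as f:
        pickle.dump(score_dict, f, -1)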