def data_preparation(self):
    """
    Splits one of Brown, BNC News, Indian corpora into train set and test set

    Returns:
    --------
    sentences (list): Sentences without POS-tags
    tagged_sentences (list): Sentences with POS-tags

    Raises:
    -------
    ValueError: if ``self.corpus``, ``self.tagset`` or ``self.lang`` is not
        supported. (Previously these paths fell through and crashed with
        an UnboundLocalError on the return statement.)
    """
    if self.corpus == 'brown':
        tagged_sentences = brown.tagged_sents(categories='news')
        sentences = brown.sents(categories='news')
    elif self.corpus == 'bnc':
        # BNC news is read from a local plain-text file with CLAWS tags.
        root = find('corpora/bnc')
        bncnews = TaggedCorpusReader(root, 'bnc-news-wtp.txt',
                                     tagset='en-claws')
        if self.tagset is None:
            tagged_sentences = bncnews.tagged_sents()
        elif self.tagset == 'universal':
            # Map the CLAWS tags onto the universal tagset.
            tagged_sentences = bncnews.tagged_sents(tagset=self.tagset)
        else:
            raise ValueError('Unsupported tagset: %r' % (self.tagset,))
        sentences = bncnews.sents()
    elif self.corpus == 'indian':
        if self.lang in ['telugu', 'hindi', 'marathi', 'bangla']:
            tagged_sentences = indian.tagged_sents(f'{self.lang}.pos')
            sentences = indian.sents(f'{self.lang}.pos')
        else:
            # Was: print + fall-through crash; raise a clear error instead.
            raise ValueError('Language not part of Indian Corpus.')
    else:
        raise ValueError('Unsupported corpus: %r' % (self.corpus,))
    return sentences, tagged_sentences
def get_objects(self):
    """Load POS-tagged sentences for the positive and negative corpora.

    Reads every ``*.pos`` file under ``data/`` and returns a pair
    ``(positive_tagged_sents, negative_tagged_sents)``.

    NOTE(review): relies on the ordering of ``reader.fileids()`` — index 0
    is assumed to be the negative file and index 1 the positive one;
    confirm against the actual files on disk.
    """
    reader = TaggedCorpusReader('data/', r'.*\.pos')
    fileids = reader.fileids()
    tagged_positive = reader.tagged_sents(fileids[1])
    tagged_negative = reader.tagged_sents(fileids[0])
    return (tagged_positive, tagged_negative)
def analyse_ngram(tranche):
    # Evaluate the n-gram tagger on cross-validation fold `tranche`:
    # train on resultats/corpus_entrainement<n>.txt, tag
    # resultats/corpus_test<n>.txt, and accumulate per-signifier counts
    # into the module-level `scores_ngram` list of dicts.
    # NOTE(review): '\/' in a plain string is an unrecognized escape kept
    # as-is; it works here because the argument is a fileid regex.
    corpus_entrainement_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats\/corpus_entrainement' + str(tranche + 1) + '.txt')
    corpus_test_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats\/corpus_test' + str(tranche + 1) + '.txt')
    train_sents = corpus_entrainement_tuple.tagged_sents()
    tagger = None
    tagger = create_tagger(train_sents)
    # Gold-standard sentences vs. the tagger's own output on the same text.
    sents_corrects = corpus_test_tuple.tagged_sents()
    sents_tagges = tagger.tag_sents(corpus_test_tuple.sents())
    #print(corpus_test_tuple.sents())
    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):
        # Pair each gold (word, tag) with the predicted (word, tag).
        phrase_combine = [
            (mot_correct, mot_tagge)
            for mot_correct, mot_tagge in zip(sent_correct, sent_tagge)
        ]
        for couple in phrase_combine:
            # couple[0] = gold (word, tag); couple[1] = predicted (word, tag)
            for MI in scores_ngram:
                if MI['signifiant'] == couple[0][0]:
                    # NOTE(review): reconstructed nesting — the three inner
                    # counters are treated as siblings under the word match
                    # (gold positives, predicted positives, agreements),
                    # which is what precision/recall needs; confirm against
                    # the original layout.
                    MI['total_signifiant'] += 1
                    if couple[0][1] == 'M':
                        MI['total_MI'] += 1
                    if couple[1][1] == 'M':
                        MI['MI_reperes'] += 1
                    if couple[1][1] == couple[0][1]:
                        MI['MI_corrects'] += 1
def treina(expressao_regular, etiquetador, destino, raiz=".",
           proporcoes=None, razao=1.0, codificacao="utf-8"):
    """Train a Unigram/Bigram/Trigram backoff tagger and pickle it.

    Parameters
    ----------
    expressao_regular : fileid regex selecting the training corpus files
    etiquetador : path/handle passed to ``abre_etiquetador`` for the
        regexp backoff tagger
    destino : output path for the pickled tagger
    raiz : corpus root directory
    proporcoes : list of percentages of the corpus used as the dev set
        (defaults to ``[100]``); one tagger is trained per proportion
    razao : fraction of the dev set used for training (rest is test)
    codificacao : corpus file encoding
    """
    # Fix: avoid a shared mutable default argument; [100] stays the
    # effective default.
    if proporcoes is None:
        proporcoes = [100]
    regexp_tagger = abre_etiquetador(etiquetador)
    corpus = TaggedCorpusReader(raiz, expressao_regular,
                                encoding=codificacao)
    print("Conjunto de treino:\n%s\n" % " \n".join(corpus.fileids()))
    sents = corpus.tagged_sents()
    c = len(sents)
    for n in proporcoes:
        # Dev set = first n% of the corpus; train/test split by `razao`.
        proporcao = n / 100.0
        size = int(c * proporcao)
        dev = sents[:size]
        size = int(len(dev) * razao)
        train = dev[:size]
        print("\n\nQuantidade de sentenças")
        print("Conjunto de treinamento: %d" % len(train))
        # Fix: sum(len(s) ...) replaces the quadratic len(sum(train, []))
        # list concatenation; same value, linear time.
        print("Total de %d tokens" % sum(len(s) for s in train))
        test = dev[size:]
        print("Conjunto de teste: %d sentenças" % len(test))
        print("Total de %d tokens" % sum(len(s) for s in test))
        t1 = time.time()
        rubt = backoff_tagger(train,
                              [UnigramTagger, BigramTagger, TrigramTagger],
                              backoff=regexp_tagger)
        t2 = time.time()
        print("Tempo de treinamento em segundos: %f" % (t2 - t1))
        print('Etiquetagem da sentença-exemplo "%s"\n' % EXEMPLO,
              rubt.tag(SENTENCA))
        # Fix: the pickle file was opened but never closed; 'with'
        # guarantees closure even if dump() raises.
        with open(destino, "wb") as f:
            pickle.dump(rubt, f, -1)
        if razao < 1.0:
            t1 = time.time()
            # TODO: introduce evaluation via Avalia.testa_etiquetador
            print("\nAcurácia na etiquetagem do conjunto de teste: %f"
                  % rubt.evaluate(test))
            t2 = time.time()
            print("Tempo de avaliação em segundos: %f" % (t2 - t1))
def treina(expressao_regular, etiquetador=INICIAL, destino="BRUBT.pkl",
           raiz=".", codificacao="utf-8", max_rules=100, min_score=3):
    """Train a Brill tagger on top of an initial tagger and pickle it.

    Parameters
    ----------
    expressao_regular : fileid regex selecting the training corpus files
    etiquetador : pickled initial tagger opened via ``abre_etiquetador``
    destino : output path for the pickled Brill tagger
    raiz : corpus root directory
    codificacao : corpus file encoding
    max_rules, min_score : forwarded to ``FastBrillTaggerTrainer.train``
    """
    inicial = abre_etiquetador(etiquetador)
    corpus = TaggedCorpusReader(raiz, expressao_regular,
                                encoding=codificacao)
    train_sents = corpus.tagged_sents()
    trainer = brill.FastBrillTaggerTrainer(inicial, TEMPLATES)
    brubt = trainer.train(train_sents, max_rules=max_rules,
                          min_score=min_score)
    print('Etiquetagem da sentença-exemplo "%s"\n' % EXEMPLO,
          brubt.tag(SENTENCA))
    # Fix: use 'with' so the file is closed even if dump() raises
    # (the open/close pair was unprotected).
    with open(destino, "wb") as f:
        dump(brubt, f, -1)
def read_sub_corpus(corpus, files_req, tag_length=2):
    """
    Read in the requested files from the requested corpus.

    Given a corpus and filenames, reads in and cleans the pos tagged data,
    including truncating tags for INTERA.

    :param corpus: The name of the corpus.
    :type corpus: String (one of {'INTERA', 'UDGreek', 'tagged_texts'})
    :param files_req: The files to be read.
    :type files_req: List
    :param tag_length: Length of tag to include (INTERA only)
    :type tag_length: Integer
    :default tag_length: 2
    :rtype : List of cleaned, tagged sentences.
    """
    # load the corpus as tagged sentences
    corp_sents = list()
    # for each file
    for file_name in files_req:
        # mask parentheses — they are regex metacharacters in NLTK fileid
        # patterns. Fix: raw strings; '\(' in a plain string is an invalid
        # escape sequence (SyntaxWarning on modern Python).
        file_name = file_name.replace('(', r'\(').replace(')', r'\)')
        corp_raw = TaggedCorpusReader(CORP_DIR + corpus, file_name)
        corp_sents.extend(corp_raw.tagged_sents())
    print('Files read : ' + str(len(files_req)))
    print('Sentences read: ' + str(len(corp_sents)))
    print('Words read : ' + str(sum(len(x) for x in corp_sents)))
    # clean the tags - replace missing with '' - and simplify.
    # Fix: 'is None' instead of '== None' (identity check for None).
    corp_sents = [
        [(word, '' if tag is None else tag) for (word, tag) in sent]
        for sent in corp_sents
    ]
    corp_sents = [[(word, tag[:tag_length]) for (word, tag) in sent]
                  for sent in corp_sents]
    # return the loaded files
    return corp_sents
def tag_text(str_trained_folder, str_fname_in, str_fname_out):
    """Tag each line of a text file and keep only the 'C'-tagged words.

    Trains a trigram tagger on the tagged corpus in *str_trained_folder*,
    tags every whitespace-split line of *str_fname_in*, and writes the
    words tagged 'C' (space-joined, one line per non-empty result) to
    *str_fname_out*.
    """
    # http://nltk.googlecode.com/svn/trunk/doc/howto/tag.html
    # build trigram tagger based on your tagged corpora
    tagged_corpora = TaggedCorpusReader(str_trained_folder, '.*')
    trigram_tagger = nltk.TrigramTagger(tagged_corpora.tagged_sents())
    # Merge the two nested 'with' blocks into one statement.
    with open(str_fname_in) as f_in, open(str_fname_out, "w+") as f_out:
        for line in f_in:
            tagged_result = trigram_tagger.tag(line.split())
            # Fix: the original local variable shadowed the builtin 'str'.
            kept_words = " ".join(
                t[0] for t in tagged_result if t[1] == 'C')
            if kept_words:
                f_out.write(kept_words + "\n")
def read_dir(self):
    """Read all ANC files as one tagged corpus (word/tag separator '_').

    NOTE(review): the result of ``tagged_sents()`` is computed and
    discarded — nothing is returned or stored. Presumably this was a
    smoke test or is unfinished; confirm the intended use.
    """
    # read entire directory as a single doc?
    corpus = TaggedCorpusReader('../../ANC', '.*', '_')
    corpus.tagged_sents()
def analyse_SVM(tranche):
    # Evaluate a linear SVM classifier on cross-validation fold `tranche`:
    # features come from an n-gram tagger trained on the fold's training
    # file; per-signifier counts are accumulated into the module-level
    # `scores_SVM` list of dicts.
    global scores_SVM
    ### Feature-dict preparation ###
    # Fetch the fold's training corpus.
    # NOTE(review): '\/' in a plain string is an unrecognized escape kept
    # as-is; it works because the argument is a fileid regex.
    corpus_entrainement_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats\/corpus_entrainement' + str(tranche + 1) + '.txt')
    train_sents = corpus_entrainement_tuple.tagged_sents()
    tagger = None
    tagger = create_tagger(train_sents)
    #joblib.dump(tagger, 'etiqueteur_ngrammes.pkl')
    liste_dictionnaires = []
    liste_y = []
    ### BUILD THE TRAINING FEATURE DICTS ###
    # Deliberately re-reads the *training* file here: the tagger's own
    # output on it supplies the predicted-tag feature.
    corpus_test_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats\/corpus_entrainement' + str(tranche + 1) + '.txt')
    # used to derive the tag feature
    sents_corrects = corpus_test_tuple.tagged_sents()
    sents_tagges = tagger.tag_sents(corpus_test_tuple.sents())
    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):
        # Pair each gold (word, tag) with the predicted (word, tag).
        phrase_combine = [
            (mot_correct, mot_tagge)
            for mot_correct, mot_tagge in zip(sent_correct, sent_tagge)
        ]
        #print(phrase_combine)
        indice = 0
        for couple in phrase_combine:
            #print("waaaa" + str(couple))
            for MI in scores_SVM:
                #print(MI)
                if couple[0][0] == MI['signifiant']:
                    # One feature dict per occurrence of a tracked word;
                    # label 1 iff the gold tag is 'M'.
                    liste_dictionnaires.append(
                        create_dict(phrase_combine, indice))
                    #print(couple[0][1])
                    if couple[0][1] == 'M':
                        liste_y.append(1)
                    else:
                        liste_y.append(0)
                    #print("Mot entr")
                    #print(dict_mot)
                    #print('\n')
            indice += 1
    ### BUILD THE TEST FEATURE DICTS ###
    #corpus_test_tuple = TaggedCorpusReader(dossier_racine, nom_tes)
    corpus_test_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats\/corpus_test' + str(tranche + 1) + '.txt')
    # used to derive the tag feature
    sents_corrects = corpus_test_tuple.tagged_sents()
    sents_tagges = tagger.tag_sents(corpus_test_tuple.sents())
    liste_dictionnaires_test = []
    liste_y_test = []
    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):
        phrase_combine = [
            (mot_correct, mot_tagge)
            for mot_correct, mot_tagge in zip(sent_correct, sent_tagge)
        ]
        #print(phrase_combine)
        indice = 0
        for couple in phrase_combine:
            for MI in scores_SVM:
                if couple[0][0] == MI['signifiant']:
                    liste_dictionnaires_test.append(
                        create_dict(phrase_combine, indice))
                    if couple[0][1] == 'M':
                        liste_y_test.append(1)
                    else:
                        liste_y_test.append(0)
                    #print(dict_mot)
                    #print('\n')
            indice += 1
    ### Vectorize the feature dicts ###
    # Fit the vectorizer on train+test together so both share one feature
    # space, then split the matrix back apart.
    vec = DictVectorizer()
    listes_colles = liste_dictionnaires + liste_dictionnaires_test
    vecteur_x_ent_plus_test = vec.fit_transform(listes_colles).toarray()
    #joblib.dump(vec, 'vectoriseur.pkl')
    #print(vec.get_feature_names())
    #print(vecteur_x_ent_plus_test)
    vecteur_x_entrainement = vecteur_x_ent_plus_test[:len(liste_dictionnaires)]
    vecteur_x_test = vecteur_x_ent_plus_test[len(liste_dictionnaires):]
    # C=18 / class_weight {1: 3}: hand-tuned values (see scores below).
    clf = svm.SVC(kernel='linear', C=18, class_weight={1: 3})
    #BEST equilibre
    #0,9211 0,9574
    #Total signifiants 4185, Fmesure obtenue 0,9389
    print(clf.get_params())
    clf.fit(vecteur_x_entrainement, liste_y)
    #joblib.dump(clf, 'classifieur_SVM.pkl')
    #print(vecteur_x_test)
    prediction = clf.predict(vecteur_x_test)
    #print(liste_y_test)
    #print(prediction)
    double_y = zip(liste_y_test, prediction)
    """#pour utiliser sans signifiant dans dict scores_total = {'signifiant': "toute", 'total_signifiant':0, 'total_MI':0, 'MI_reperes':0, 'MI_corrects':0 } """
    for unite, couple_reponse in zip(liste_dictionnaires_test, double_y):
        #print(unite)
        #print(couple_reponse)
        # couple_reponse = (gold label, predicted label), both 0/1.
        for M in scores_SVM:
            #print(MI)
            if M['signifiant'] == unite['signifiant']:
                # NOTE(review): reconstructed nesting — the three inner
                # counters are treated as siblings under the word match
                # (gold positives, predicted positives, agreements);
                # confirm against the original layout.
                M['total_signifiant'] += 1
                if couple_reponse[0] == 1:
                    M['total_MI'] += 1
                if couple_reponse[1] == 1:
                    M['MI_reperes'] += 1
                if couple_reponse[0] == couple_reponse[1]:
                    M['MI_corrects'] += 1