Example #1
    def data_preparation(self):
        """
        Loads one of the Brown, BNC News, or Indian corpora and returns its
        sentences both with and without POS tags

        Returns:
        --------
            sentences (list):
                Sentences without POS-tags
            tagged_sentences (list):
                Sentences with POS-tags
        """
        if self.corpus == 'brown':
            tagged_sentences = brown.tagged_sents(categories='news')
            sentences = brown.sents(categories='news')
        elif self.corpus == 'bnc':
            root = find('corpora/bnc')
            bncnews = TaggedCorpusReader(root,
                                         'bnc-news-wtp.txt',
                                         tagset='en-claws')
            if self.tagset is None:
                tagged_sentences = bncnews.tagged_sents()
            elif self.tagset == 'universal':
                tagged_sentences = bncnews.tagged_sents(tagset=self.tagset)
            sentences = bncnews.sents()
        elif self.corpus == 'indian':
            if self.lang in ['telugu', 'hindi', 'marathi', 'bangla']:
                tagged_sentences = indian.tagged_sents(f'{self.lang}.pos')
                sentences = indian.sents(f'{self.lang}.pos')
            else:
                raise ValueError('Language not part of the Indian corpus.')
        else:
            raise ValueError(f'Unsupported corpus: {self.corpus}')
        return sentences, tagged_sentences
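The method only loads the corpus; a minimal sketch of how a caller might split the two returned lists into train and test portions (the instance name "tagger" and the 80/20 ratio are assumptions for illustration, not part of the original class):

# Hypothetical caller-side split; 'tagger' stands for an instance of the class above.
sentences, tagged_sentences = tagger.data_preparation()
cutoff = int(len(tagged_sentences) * 0.8)   # assumed 80/20 split
train_tagged = tagged_sentences[:cutoff]
test_tagged = tagged_sentences[cutoff:]
test_untagged = sentences[cutoff:]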
Example #2
    def get_objects(self):
        reader = TaggedCorpusReader('data/', r'.*\.pos')

        # assumes exactly two .pos files: the negative file sorts first,
        # the positive file second
        pos_fileids = reader.fileids()[1]
        neg_fileids = reader.fileids()[0]

        postag_pos = reader.tagged_sents(pos_fileids)
        postag_neg = reader.tagged_sents(neg_fileids)

        return (postag_pos, postag_neg)
Example #3
def treina(expressao_regular,
           etiquetador,
           destino,
           raiz=".",
           proporcoes=[100],
           razao=1.0,
           codificacao="utf-8"):
    regexp_tagger = abre_etiquetador(etiquetador)
    corpus = TaggedCorpusReader(raiz,
                                expressao_regular,
                                encoding=codificacao)
    print("Conjunto de treino:\n%s\n" % " \n".join(corpus.fileids()))
    sents = corpus.tagged_sents()
    # print(sents[3])
    # print(type(sents[3][0][0]))
    c = len(sents)
    # proportion of the development set relative to the full corpus,
    # e.g. proporcoes = [10, 30, 50, 70, 100]
    # ratio of training sentences to total sentences, e.g. razao = 0.75

    for n in proporcoes:
        proporcao = n / 100.0
        size = int(c * proporcao)
        dev = sents[:size]
        size = int(len(dev) * razao)
        train = dev[:size]
        print("\n\nQuantidade de sentenças")
        print("Conjunto de treinamento: %d" % len(train))
        print("Total de %d tokens" % len(sum(train, [])))
        test = dev[size:]
        print("Conjunto de teste: %d sentenças" % len(test))
        print("Total de %d tokens" % len(sum(test, [])))
        t1 = time.time()
        rubt = backoff_tagger(train,
                              [UnigramTagger, BigramTagger, TrigramTagger],
                              backoff=regexp_tagger)
        t2 = time.time()
        print("Tempo de treinamento em segundos: %f" % (t2 - t1))
        print('Etiquetagem da sentença-exemplo "%s"\n' % EXEMPLO, rubt.tag(SENTENCA))
        with open(destino, "wb") as f:
            pickle.dump(rubt, f, -1)
        if razao < 1.0:
            t1 = time.time()
            # TODO: introduce evaluation via Avalia.testa_etiquetador
            print("\nAcurácia na etiquetagem do conjunto de teste: %f" % rubt.evaluate(test))
            t2 = time.time()
            print("Tempo de avaliação em segundos: %f" % (t2 - t1))
Example #4
def treina(expressao_regular,
           etiquetador=INICIAL,
           destino="BRUBT.pkl",
           raiz=".",
           codificacao="utf-8",
           max_rules=100,
           min_score=3):
    inicial = abre_etiquetador(etiquetador)
    corpus = TaggedCorpusReader(raiz, expressao_regular, encoding=codificacao)
    train_sents = corpus.tagged_sents()
    # FastBrillTaggerTrainer is the NLTK 2 API; see the NLTK 3 sketch after this example.
    trainer = brill.FastBrillTaggerTrainer(inicial, TEMPLATES)
    brubt = trainer.train(train_sents,
                          max_rules=max_rules,
                          min_score=min_score)
    print('Etiquetagem da sentença-exemplo "%s"\n' % EXEMPLO,
          brubt.tag(SENTENCA))
    f = open(destino, "wb")
    dump(brubt, f, -1)
    f.close()
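Under NLTK 3 the Brill API changed; a rough equivalent of the training step (a sketch, with the stock nltkdemo18 templates standing in for the undefined TEMPLATES constant):

from nltk.tag import brill, brill_trainer

def treina_nltk3(train_sents, inicial, max_rules=100, min_score=3):
    # nltkdemo18() ships with NLTK 3 and replaces the NLTK 2 TEMPLATES list here.
    trainer = brill_trainer.BrillTaggerTrainer(inicial, brill.nltkdemo18())
    return trainer.train(train_sents, max_rules=max_rules, min_score=min_score)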
Example #5
def read_sub_corpus(corpus, files_req, tag_length=2):
    """
    Read in the requested files from the requested corpus.
    
    Given a corpus and filenames, reads in and cleans the pos tagged data,
    including truncating tags for INTERA.
    
    :param        corpus: The name of the corpus.
    :type         corpus: String (one of {'INTERA', 'UDGreek', 'tagged_texts'})
    :param     files_req: The files to be read.
    :type      files_req: List
    :param    tag_length: Length of tag to include (INTERA only)
    :type     tag_length: Integer
    :default  tag_length: 2
    :rtype              : A list of cleaned, POS-tagged sentences.
    :raise exception    : If an invalid corpus is provided.
    """

    # load the corpus as tagged sentences
    corp_sents = list()
    # for each file
    for file_name in files_req:
        # escape parentheses so the file name works as a fileid regex
        file_name = file_name.replace('(', r'\(').replace(')', r'\)')
        corp_raw = TaggedCorpusReader(CORP_DIR + corpus, file_name)
        corp_sents.extend(corp_raw.tagged_sents())

    print('Files read    : ' + str(len(files_req)))
    print('Sentences read: ' + str(len(corp_sents)))
    print('Words read    : ' + str(sum([len(x) for x in corp_sents])))

    # clean the tags: replace missing tags with '' and truncate to tag_length
    corp_sents = [
        list(map(lambda x: (x[0], '') if x[1] is None else x, sent))
        for sent in corp_sents
    ]
    corp_sents = [[(word, tag[:tag_length]) for (word, tag) in sent]
                  for sent in corp_sents]

    # return the loaded sentences
    return corp_sents
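A hypothetical call, assuming CORP_DIR points at the corpus directories and the listed file exists (the file name is a placeholder):

sents = read_sub_corpus('INTERA', ['sample_file.pos'], tag_length=2)
print('%d sentences after cleaning' % len(sents))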
Example #6
File: tagger.py Project: baio/company-craw
def tag_text(str_trained_folder, str_fname_in, str_fname_out):

    #http://nltk.googlecode.com/svn/trunk/doc/howto/tag.html

    #build trigram tagger based on your tagged_corpora

    tagged_corpora = TaggedCorpusReader(str_trained_folder, '.*')

    #print tagged_corpora.tagged_sents()[50]

    # note: a TrigramTagger trained alone leaves unseen contexts untagged (None);
    # in practice it is usually chained with unigram/bigram backoff taggers
    trigram_tagger = nltk.TrigramTagger(tagged_corpora.tagged_sents())

    with open(str_fname_in) as f_in:

        with open(str_fname_out, "w+") as f_out:

            for line in f_in:

                tagged_result = trigram_tagger.tag(line.split())

                str = " ".join([t[0] for t in tagged_result if t[1] == 'C'])

                if str:
                    f_out.write(str + "\n")
Example #7
def analyse_ngram(tranche):

    corpus_entrainement_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats/corpus_entrainement' + str(tranche + 1) + '.txt')
    corpus_test_tuple = TaggedCorpusReader(
        dossier_racine, 'resultats/corpus_test' + str(tranche + 1) + '.txt')

    train_sents = corpus_entrainement_tuple.tagged_sents()

    tagger = create_tagger(train_sents)

    sents_corrects = corpus_test_tuple.tagged_sents()
    sents_tagges = tagger.tag_sents(corpus_test_tuple.sents())

    #print(corpus_test_tuple.sents())

    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):
        phrase_combine = [
            (mot_correct, mot_tagge)
            for mot_correct, mot_tagge in zip(sent_correct, sent_tagge)
        ]

        for couple in phrase_combine:

            for MI in scores_ngram:
                if MI['signifiant'] == couple[0][0]:
                    MI['total_signifiant'] += 1

                    if couple[0][1] == 'M':
                        MI['total_MI'] += 1

                    if couple[1][1] == 'M':
                        MI['MI_reperes'] += 1

                        if couple[1][1] == couple[0][1]:
                            MI['MI_corrects'] += 1
Example #8
File: Asst3.py Project: g3ngoric/undergrad
# T(he original version of t)his code was written by Ulrich Germann (11/2010)


######################################################################

import nltk
nltk.data.path[0:0] = ['/u/csc485h/include/a3/nltk']

# The following code provides access to the tagged NY Times corpus:
#   nyt_big  is the full corpus
#   nyt_mini is a small subset for development
from nltk.data         import ZipFilePathPointer
from nltk.corpus       import TaggedCorpusReader

nyt_zipped = ZipFilePathPointer('/u/csc485h/include/a3/nltk/corpora/nyt.zip','nyt/')
nyt_big    = TaggedCorpusReader(nyt_zipped,['2004-tagged.txt'],sep='/', encoding='latin2')
nyt_mini   = TaggedCorpusReader(nyt_zipped,['nytimes-mini.txt'],sep='/', encoding='latin2')

# Finally, let's set up a default pattern for NP chunking
# Setting up the NP chunker itself is left to the main script, to encourage
# trying different variants of the pattern

##  Operator    Behavior
##  .           Wildcard, matches any character
##  ^abc        Matches some pattern abc at the start of a string
##  abc$        Matches some pattern abc at the end of a string
##  [abc]       Matches one of a set of characters
##  [A-Z0-9]    Matches one of a range of characters
##  ed|ing|s    Matches one of the specified strings (disjunction)
##  *           Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene closure)
##  +           One or more of previous item, e.g. a+, [a-z]+
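A minimal sketch of how the NP chunker itself might be set up from the operators above; the pattern, and the assumption that the corpus carries Penn-Treebank-style tags, are illustrative only, not the variant the assignment expects:

import nltk

# Illustrative NP pattern: optional determiner/possessive, adjectives, then nouns.
np_pattern = r"NP: {<DT|PP\$>?<JJ.*>*<NN.*>+}"
chunker = nltk.RegexpParser(np_pattern)

# Try it on the first sentence of the development corpus defined above.
print(chunker.parse(nyt_mini.tagged_sents()[0]))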
Example #9
    def read_dir(self):
        # read the entire directory of ANC files, using '_' as the word/tag separator
        corpus = TaggedCorpusReader('../../ANC', '.*', '_')
        return corpus.tagged_sents()
Example #10
if __name__ == '__main__':
    input_dir = sys.argv[1]
    #coref_reader = CoreReader()

    wp_bootstrapper = WordPairBootstrapper()
    seedwords_filename = sys.argv[2]

    # TODO: make this a directory and output the word pairs, anaphors, antecedents, and patterns
    outdir = sys.argv[3]

    # read in seedwords as specified
    wp_bootstrapper.read_seedwords(seedwords_filename)

    # read in entire corpus
    corpus = TaggedCorpusReader('../../prepANC', '.*', '_', encoding='utf-8')
    print('corpus is loaded!')

    # iterate through corpus to extract candidate patterns if needed
    if not os.path.isfile('patterns_anc.pkl'):
        wp_bootstrapper.run_candidate_patterns(corpus)
    else:
        wp_bootstrapper.preprocess_seeds()
        wp_bootstrapper.perm_lex = wp_bootstrapper.seedwords  # TODO: clean this up later; it's clunky
        # print('Read patterns from cache')
        # wp_bootstrapper.read_cache_candidate_eps()
        # wp_bootstrapper.string2pattern_set()
    # iterate through corpus again for as many iterations as needed and do the extraction process
    # wp_bootstrapper.run_candidate_patterns(corpus)
    wp_bootstrapper.run(corpus)
Example #11
def analyse_SVM(tranche):

    global scores_SVM

    ### Prepare the feature dicts ###
    # fetch the results
    corpus_entrainement_tuple = TaggedCorpusReader(
        dossier_racine,
        'resultats/corpus_entrainement' + str(tranche + 1) + '.txt')

    train_sents = corpus_entrainement_tuple.tagged_sents()
    tagger = create_tagger(train_sents)

    #joblib.dump(tagger, 'etiqueteur_ngrammes.pkl')

    liste_dictionnaires = []
    liste_y = []

    ### BUILD THE TRAINING DICTIONARIES ###
    corpus_test_tuple = TaggedCorpusReader(
        dossier_racine, 'resultats/corpus_entrainement' + str(tranche + 1) +
        '.txt')  # used to identify the tag feature

    sents_corrects = corpus_test_tuple.tagged_sents()

    sents_tagges = tagger.tag_sents(corpus_test_tuple.sents())

    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):

        phrase_combine = [
            (mot_correct, mot_tagge)
            for mot_correct, mot_tagge in zip(sent_correct, sent_tagge)
        ]
        #print(phrase_combine)

        indice = 0

        for couple in phrase_combine:

            #print("waaaa" + str(couple))

            for MI in scores_SVM:

                #print(MI)

                if couple[0][0] == MI['signifiant']:

                    liste_dictionnaires.append(
                        create_dict(phrase_combine, indice))
                    #print(couple[0][1])
                    if couple[0][1] == 'M':
                        liste_y.append(1)
                    else:
                        liste_y.append(0)

                    #print("Mot entr")
                    #print(dict_mot)
                    #print('\n')

            indice += 1

    ### BUILD THE TEST DICTIONARIES ###

    #corpus_test_tuple = TaggedCorpusReader(dossier_racine, nom_tes)
    corpus_test_tuple = TaggedCorpusReader(
        dossier_racine, 'resultats/corpus_test' + str(tranche + 1) +
        '.txt')  # used to identify the tag feature
    sents_corrects = corpus_test_tuple.tagged_sents()
    sents_tagges = tagger.tag_sents(corpus_test_tuple.sents())

    liste_dictionnaires_test = []
    liste_y_test = []

    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):

        phrase_combine = [
            (mot_correct, mot_tagge)
            for mot_correct, mot_tagge in zip(sent_correct, sent_tagge)
        ]
        #print(phrase_combine)

        indice = 0

        for couple in phrase_combine:

            for MI in scores_SVM:

                if couple[0][0] == MI['signifiant']:

                    liste_dictionnaires_test.append(
                        create_dict(phrase_combine, indice))
                    if couple[0][1] == 'M':
                        liste_y_test.append(1)
                    else:
                        liste_y_test.append(0)

                    #print(dict_mot)
                    #print('\n')

            indice += 1

    ### vectorize the dictionaries ###
    vec = DictVectorizer()
    listes_colles = liste_dictionnaires + liste_dictionnaires_test

    vecteur_x_ent_plus_test = vec.fit_transform(listes_colles).toarray()

    #joblib.dump(vec, 'vectoriseur.pkl')

    #print(vec.get_feature_names())
    #print(vecteur_x_ent_plus_test)

    vecteur_x_entrainement = vecteur_x_ent_plus_test[:len(liste_dictionnaires)]
    vecteur_x_test = vecteur_x_ent_plus_test[len(liste_dictionnaires):]

    clf = svm.SVC(kernel='linear', C=18, class_weight={1: 3})  # best balanced configuration found
    # 0.9211  0.9574
    # 4185 signifiers in total, F-measure obtained: 0.9389

    print(clf.get_params())

    clf.fit(vecteur_x_entrainement, liste_y)

    #joblib.dump(clf, 'classifieur_SVM.pkl')

    #print(vecteur_x_test)

    prediction = clf.predict(vecteur_x_test)

    #print(liste_y_test)
    #print(prediction)

    double_y = zip(liste_y_test, prediction)
    """#pour utiliser sans signifiant dans dict
    scores_total = {'signifiant': "toute",
          'total_signifiant':0,
          'total_MI':0,
          'MI_reperes':0,
          'MI_corrects':0
         }
    """

    for unite, couple_reponse in zip(liste_dictionnaires_test, double_y):
        #print(unite)
        #print(couple_reponse)

        for M in scores_SVM:
            #print(MI)
            if M['signifiant'] == unite['signifiant']:
                M['total_signifiant'] += 1

                if couple_reponse[0] == 1:
                    M['total_MI'] += 1

                if couple_reponse[1] == 1:
                    M['MI_reperes'] += 1

                    if couple_reponse[0] == couple_reponse[1]:
                        M['MI_corrects'] += 1
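create_tagger and create_dict come from elsewhere in the project; a hedged sketch of a feature builder compatible with the DictVectorizer usage above (the window features are assumptions; only the 'signifiant' key is actually required by the bookkeeping at the end):

def create_dict(phrase_combine, indice):
    # phrase_combine[i] is ((word, gold_tag), (word, predicted_tag))
    (mot, _), (_, tag_predit) = phrase_combine[indice]
    features = {'signifiant': mot, 'tag_predit': str(tag_predit)}
    if indice > 0:
        features['mot_precedent'] = phrase_combine[indice - 1][0][0]
    if indice + 1 < len(phrase_combine):
        features['mot_suivant'] = phrase_combine[indice + 1][0][0]
    return features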
Example #12
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from nltk.probability import ConditionalFreqDist
from nltk.corpus import TaggedCorpusReader
from nltk.tag import simplify  # NLTK 2 only; removed in NLTK 3 (see the sketch after this example)

FIRST = 0
END = 150
POS = "V"
#POS = "N"
#POS = "ADJ"

corpus_root = './data'
fileids = 'tagged_sent'

corpus = TaggedCorpusReader(corpus_root,
    fileids,
    encoding='utf-8')

processing = [(simplify.simplify_wsj_tag(tag), word.lower()) for (word, tag) in corpus.tagged_words()]
cfd_corpus = ConditionalFreqDist(processing)

for term,freq in cfd_corpus[POS].items():
  print term.encode("utf-8"),freq
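This example targets Python 2 and the NLTK 2 simplify module, which NLTK 3 removed. A rough NLTK 3 / Python 3 equivalent, assuming the corpus carries Penn Treebank tags that can be mapped to the universal tagset:

from nltk.probability import ConditionalFreqDist
from nltk.tag import map_tag

pairs = [(map_tag('en-ptb', 'universal', tag), word.lower())
         for (word, tag) in corpus.tagged_words()]
cfd_corpus = ConditionalFreqDist(pairs)

for term, freq in cfd_corpus['VERB'].items():
    print(term, freq)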
Example #13
from gmail_corpus.nltk_util.bigram_score import make_score_dict, save_score_dict
from nltk.corpus import TaggedCorpusReader
import numpy as np
from glob import glob
import os, sys

if __name__ == '__main__':
    corpus_path = sys.argv[1]
    # remove empty files
    files = glob('%s/*.txt' % corpus_path)
    for f in files:
        if os.path.getsize(f) == 0:
            os.remove(f)
            print('Removed empty file %s' % f)

    corpus = TaggedCorpusReader(corpus_path, r'.*\.txt')
    score_dict = make_score_dict(corpus.tagged_words())
    save_score_dict(score_dict, 'bigram_scores.pkl')
    print('saved bigram_scores.pkl')
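make_score_dict and save_score_dict come from the project's own gmail_corpus package; a hedged sketch of what such a scorer might compute using NLTK's stock collocation tools (PMI over word bigrams is an assumption about what the real function does):

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def make_score_dict_sketch(tagged_words):
    # Score word bigrams by pointwise mutual information, ignoring the tags.
    words = [w for (w, _) in tagged_words]
    finder = BigramCollocationFinder.from_words(words)
    return dict(finder.score_ngrams(BigramAssocMeasures.pmi))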