Python ConllCorpusReader.tagged_sents示例，nltk.corpus.reader.ConllCorpusReader.tagged_sents Python示例

示例#1

0

显示文件

文件： build_data.py 项目： 4theKnowledge/pytorch-lexnorm

def main():
    """Build the index mappings and trimmed embeddings for the corpus.

    Reads the train and test CoNLL files named in ``cf``, derives the
    word/word-tag and char/char-tag index mappings from the tagged
    sentences, saves everything through ``save_data_to_files`` and, when
    the matching ``cf`` switches are on, exports trimmed pretrained word
    and character embeddings.

    Raises:
        Exception: if ``cf.EMBEDDING_MODEL`` is ``"Elmo"``.
    """
    if cf.EMBEDDING_MODEL == "Elmo":
        raise Exception("Please use build_data_elmo instead.")

    corpus_files = [cf.TRAIN_FILENAME, cf.TEST_FILENAME]
    column_types = ['words', 'pos']
    corpusReader = ConllCorpusReader(cf.DATA_FOLDER, corpus_files,
                                     column_types)
    tagged_sents = corpusReader.tagged_sents()

    # The returned tag sets are not consumed below; the call itself is kept.
    _test_wordtags, _test_chartags = get_unique_test_tag_set()

    sentence_count = len(tagged_sents)
    logger.info("%d sentences loaded." % sentence_count)

    word_maps = get_word_and_wordtag_ids(tagged_sents)
    word_to_ix, ix_to_word, wtag_to_ix, ix_to_wtag = word_maps
    char_maps = get_char_and_chartag_ids(tagged_sents)
    char_to_ix, ix_to_char, ctag_to_ix, ix_to_ctag = char_maps

    save_data_to_files(
        tagged_sents,
        word_to_ix, wtag_to_ix, ix_to_word, ix_to_wtag,
        char_to_ix, ctag_to_ix, ix_to_char, ix_to_ctag,
    )

    if cf.USE_PRETRAINED_WORD_EMBEDDINGS:
        # The embedding vocabulary is loaded but its value is not used here.
        _emb_vocab = get_emb_vocab(cf.EMB_VEC_FILENAME)

        # NOTE: no OOV embeddings are generated for words in this function;
        # cf.EMB_OOV_FILENAME is handed to the export step as-is.
        export_trimmed_embedding_vectors(
            word_to_ix,
            cf.WORD_EMBEDDING_DIM,
            cf.EMB_OOV_FILENAME,
            cf.EMB_VEC_FILENAME,
            cf.EMB_TRIMMED_FILENAME,
            cf.WORD_EMBEDDING_DIM,
        )

    if cf.USE_PRETRAINED_CHAR_EMBEDDINGS:
        char_emb_vocab = get_emb_vocab(cf.CHAR_EMB_VEC_FILENAME)
        # Characters do get OOV embeddings generated before the export.
        generate_oov_embeddings(
            ix_to_char,
            char_emb_vocab,
            cf.CHAR_EMB_BIN_FILENAME,
            cf.CHAR_OOV_TOKENS_FILENAME,
            cf.CHAR_EMB_OOV_FILENAME,
        )
        export_trimmed_embedding_vectors(
            char_to_ix,
            cf.CHAR_EMBEDDING_DIM,
            cf.CHAR_EMB_OOV_FILENAME,
            cf.CHAR_EMB_VEC_FILENAME,
            cf.CHAR_EMB_TRIMMED_FILENAME,
            cf.CHAR_EMBEDDING_DIM,
        )

    logger.info("Data building complete.")

示例#2

0

显示文件

 def read_turkish_corpus(self):
     """Read the Turkish POS corpus and return it with lower-cased words.

     The sentences are read with a ``ConllCorpusReader`` configured for
     ``('words', 'pos')`` columns and UTF-8 encoding.

     Returns:
         A list of sentences, each a list of ``(word, tag)`` tuples in
         which the word is lower-cased and the tag is left unchanged.
     """
     conll_reader = ConllCorpusReader('path/to/languages-corpora',
                                      'path/to/turkish-pos-conll-file',
                                      ('words', 'pos'),
                                      encoding='UTF-8')
     raw_sentences = conll_reader.tagged_sents(
         'path/to/turkish-pos-conll-file')
     # Iterate the reader's result directly; copying it into a temporary
     # list first added a full extra pass without changing the output.
     return [[(word.lower(), tag) for (word, tag) in sentence]
             for sentence in raw_sentences]

示例#3

0

显示文件

文件： build_data.py 项目： 4theKnowledge/pytorch-lexnorm

def get_unique_test_tag_set():
    """Collect the tags, and tag characters, that occur only in the test set.

    The ``"<PAD>"`` and ``"<SELF>"`` tags are ignored in both corpora.

    Returns:
        A ``(test_unique_wordtags, test_unique_chartags)`` tuple of sets:
        the test tags never seen in the training file, and the characters
        of those tags that occur in no training tag.
    """
    logger.info("Building set of testset-unique tags...")

    columns = ['words', 'pos']
    train_reader = ConllCorpusReader(cf.DATA_FOLDER, [cf.TRAIN_FILENAME],
                                     columns)
    test_reader = ConllCorpusReader(cf.DATA_FOLDER, [cf.TEST_FILENAME],
                                    columns)
    train_sents = train_reader.tagged_sents()
    test_sents = test_reader.tagged_sents()

    ignored_tags = ("<PAD>", "<SELF>")

    # Every real tag seen in training, and every character of those tags.
    train_wordtags = {
        tag
        for sent in train_sents
        for _word, tag in sent
        if tag not in ignored_tags
    }
    train_chartags = {char for tag in train_wordtags for char in tag}

    # Test tags missing from training, then their characters that are
    # missing from the training tag characters.
    test_unique_wordtags = {
        tag
        for sent in test_sents
        for _word, tag in sent
        if tag not in ignored_tags and tag not in train_wordtags
    }
    test_unique_chartags = {
        char
        for tag in test_unique_wordtags
        for char in tag
        if char not in train_chartags
    }

    counts = (len(test_unique_wordtags), len(test_unique_chartags))
    logger.info(
        "%d unique word tags and %d unique char tags found in the test dataset."
        % counts)
    return test_unique_wordtags, test_unique_chartags

示例#4

0

显示文件

文件： build_data_elmo.py 项目： 4theKnowledge/pytorch-lexnorm

def main():
    """Build and save the index mappings for the ELMo variant of the model.

    Reads the train and test CoNLL files named in ``cf``, obtains the word
    and word-tag mappings together with the ELMo embedding vectors, obtains
    the char and char-tag mappings, and saves the mappings through
    ``save_data_to_files``.
    """
    corpus_files = [cf.TRAIN_FILENAME, cf.TEST_FILENAME]
    corpusReader = ConllCorpusReader(cf.DATA_FOLDER, corpus_files,
                                     ['words', 'pos'])
    tagged_sents = corpusReader.tagged_sents()

    # The returned tag sets are not consumed below; the call itself is kept.
    _test_wordtags, _test_chartags = get_unique_test_tag_set()

    sentence_count = len(tagged_sents)
    logger.info("%d sentences loaded." % sentence_count)

    # The embedding vectors are returned here but not used in this function.
    elmo_result = get_ids_and_elmo_embeddings(tagged_sents)
    (ix_to_word, word_to_ix, wtag_to_ix, ix_to_wtag,
     _embedding_vectors) = elmo_result

    char_maps = get_char_and_chartag_ids(tagged_sents)
    char_to_ix, ix_to_char, ctag_to_ix, ix_to_ctag = char_maps

    save_data_to_files(
        tagged_sents,
        word_to_ix, wtag_to_ix, ix_to_word, ix_to_wtag,
        char_to_ix, ctag_to_ix, ix_to_char, ix_to_ctag,
    )

    logger.info("Data building complete.")

示例#5

0

显示文件

def load_datasets(word_to_ix, wtag_to_ix, char_to_ix, ctag_to_ix, ix_to_char, ix_to_word):
    """Build a ``DataLoader`` for the train corpus and for the test corpus.

    Each corpus file named in ``cf`` is read with a ``ConllCorpusReader``,
    converted by ``tagged_sents_to_numpy`` and wrapped in ``MyDataset``
    (or ``MyDatasetWithFlags`` when ``cf.WORD_LEVEL_WITH_FLAGGER`` is set).
    Counts of loaded batches and of rejected or filtered items are logged.

    Args:
        word_to_ix, wtag_to_ix, char_to_ix, ctag_to_ix, ix_to_char,
        ix_to_word: index mappings, forwarded unchanged to
            ``tagged_sents_to_numpy``.

    Returns:
        A dict of ``DataLoader`` objects stored under ``"train"`` and
        ``"test"``; the initial ``"dev"`` entry is left as ``None``.
    """
    # NOTE(review): the loop fills "train" and "test", so "dev" stays None
    # and "test" is added as a new key -- confirm callers expect that.
    data_iterators = {"train": None, "dev": None}
    word_index = 1  # Used for elmo models only

    corpus_files = (("train", cf.TRAIN_FILENAME), ("test", cf.TEST_FILENAME))
    for dataset, filename in corpus_files:
        corpusReader = ConllCorpusReader(cf.DATA_FOLDER, [filename],
                                         ['words', 'pos'])
        tagged_sents = corpusReader.tagged_sents()

        # word_index is threaded through so the second corpus continues
        # from the value returned for the first one.
        (data_w, data_x, data_y, data_f, rejected_sents, rejected_words,
         filtered_words, non_alphabetical_words, rejected_tags,
         word_index) = tagged_sents_to_numpy(
            tagged_sents, word_to_ix, wtag_to_ix, char_to_ix, ctag_to_ix,
            ix_to_char, ix_to_word, dataset, word_index)

        if cf.WORD_LEVEL_WITH_FLAGGER:
            myDataset = MyDatasetWithFlags(data_w, data_x, data_y, data_f)
        else:
            myDataset = MyDataset(data_w, data_x, data_y)

        data_iterator = DataLoader(myDataset, batch_size=cf.BATCH_SIZE,
                                   pin_memory=True)
        data_iterators[dataset] = data_iterator

        batch_count = len(data_iterator)
        if cf.GRANULARITY in [CHAR_LEVEL, CHAR_AND_WORD_LEVEL]:
            unit = "words"
        else:
            unit = "sentences"
        logger.info(
            "Loaded %d %s batches.\n" % (batch_count, dataset) +
            "      (%d x %d = ~%d %s total)" % (
                batch_count, cf.BATCH_SIZE, batch_count * cf.BATCH_SIZE, unit))

        # len(...) > 0 is kept on purpose: the types returned by
        # tagged_sents_to_numpy are not visible here, and plain truthiness
        # is ambiguous for array-like objects.
        if len(rejected_sents) > 0:
            # NOTE(review): the total is reported as kept + rejected; this
            # presumes tagged_sents excludes the rejected ones -- confirm.
            logger.warning("%d of %d sentences from the %s set were trimmed due to being too long or short." % (len(rejected_sents), len(tagged_sents) + len(rejected_sents), dataset))
        if len(rejected_words) > 0:
            logger.warning("%d words from the %s set were trimmed due to being too long." % (len(rejected_words), dataset))
        if len(rejected_tags) > 0:
            logger.warning("%d labels from the %s set were trimmed due to being too long." % (len(rejected_tags), dataset))
        if len(filtered_words) > 0:
            logger.info("%d words were filtered from the %s set due to beginning with undesirable character sequences." % (len(filtered_words), dataset))
        if len(non_alphabetical_words) > 0:
            logger.info("%d words were filtered from the %s set due to being entirely non-alphabetical." % (len(non_alphabetical_words), dataset))
    return data_iterators

示例#6

0

显示文件

 def tagged_sents(self, fileids=None, categories=None):
     """Return tagged sentences for the given file ids and/or categories.

     ``fileids`` and ``categories`` are passed to ``self._resolve`` and its
     result is forwarded to ``ConllCorpusReader.tagged_sents``.
     """
     selected_fileids = self._resolve(fileids, categories)
     return ConllCorpusReader.tagged_sents(self, selected_fileids)

示例#7

0

显示文件

文件： catchunked.py 项目： RomanZacharia/python_text_processing_w_nltk2_cookbook

	def tagged_sents(self, fileids=None, categories=None):
		"""Return tagged sentences, selecting files by id and/or category.

		The arguments go through ``self._resolve`` first; whatever it
		returns is handed to ``ConllCorpusReader.tagged_sents``.
		"""
		resolved = self._resolve(fileids, categories)
		base_tagged_sents = ConllCorpusReader.tagged_sents
		return base_tagged_sents(self, resolved)

示例#8

0

显示文件

文件： Text_Ana_Assigment2.py 项目： nreyesh/SentimentAnalysis

    lst.append(list_train[i].split())


# Split every token of every training sentence on '|', giving one list of
# fields per token and one list of tokens per sentence.
lst_train = [
    [token.split('|') for token in sentence_tokens]
    for sentence_tokens in lst
]

print('Train Set Readed')

## Reads the WikiGold Test Set

reader = ConllCorpusReader('/home/nicor/Documents','.conll',('words','pos'))
list_test = reader.tagged_sents('wikigold.txt')

# Each test sentence becomes a list of [word, nltk POS tag, tag from file]:
# the words are re-tagged with nltk.pos_tag and the tag read from the corpus
# file is kept as the third field.
lst_test = []
for test_sentence in list_test:
    test_words = [word for word, _file_tag in test_sentence]
    test_pos_tags = nltk.pos_tag(test_words)
    lst_test.append([
        [word, pos_tag, file_tag]
        for (word, pos_tag), (_word, file_tag) in zip(test_pos_tags,
                                                      test_sentence)
    ])

### Defines the Features to be obtained from every sentence

示例#9

0

显示文件

文件： Train.py 项目： gonchandrei/ANLP_viterbi

from __future__ import division
from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

# Estimate the HMM tables from the training corpus: initial-tag (A0j),
# tag-transition (Aij) and tag-word (Biw) distributions, each in four
# variants (relative frequency, Laplace, simple Good-Turing, MLE).
#
# The dict comprehensions and .items() below replace tuple-parameter lambdas
# and .iteritems(), which only exist in Python 2; this form runs on both
# Python 2.7 and Python 3. Division is true division via the file's
# `from __future__ import division`.

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # training corpus read from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')  # list of 12 POS tags
sentslen = len(conllreader.tagged_sents())  # number of sentences

tagfdist = FreqDist(tag for _word, tag in conllreader.tagged_words())  # frequency of each tag

# Frequency of the tag carried by the first token of each sentence.
firsttagfdist = FreqDist(sent[0][1] for sent in conllreader.tagged_sents())
A0j = DictionaryProbDist(
    {tag: count / sentslen for tag, count in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

# Consecutive (tag, next tag) pairs over the whole corpus.
# NOTE(review): tagged_words() is walked as one flat sequence, so a pair that
# spans two sentences is counted as well -- confirm this is intended.
words = conllreader.tagged_words()
tag_sequence = [tag for _word, tag in words]
TagPair = list(zip(tag_sequence, tag_sequence[1:]))

TagPairfdist = FreqDist(TagPair)
# Pair count divided by the count of the pair's first tag.
Aij = DictionaryProbDist(
    {pair: count / tagfdist.get(pair[0])
     for pair, count in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

TagWordfdist = FreqDist(conllreader.tagged_words())
# (word, tag) count divided by the count of that tag.
Biw = DictionaryProbDist(
    {word_tag: count / tagfdist.get(word_tag[1])
     for word_tag, count in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)