def tagged_words(self, fileids=None, categories=None):
    """Return the corpus's (word, tag) pairs for the requested selection.

    *fileids*/*categories* are first mapped to a concrete fileid list via
    ``self._resolve`` (presumably the categorized-reader helper — confirm
    against the enclosing class), then the base-class reader does the work.
    """
    resolved = self._resolve(fileids, categories)
    return ConllCorpusReader.tagged_words(self, resolved)
from __future__ import division

from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import (
    DictionaryProbDist,
    FreqDist,
    LaplaceProbDist,
    MLEProbDist,
    SimpleGoodTuringProbDist,
)

# Training corpus: CoNLL-style file with one (word, POS) pair per line.
conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))

# The 12 coarse POS tags used as HMM states.
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET',
          'NUM', 'PRT', 'X', '.')

sentslen = len(conllreader.tagged_sents())  # number of sentences
# Frequency of each tag over the whole corpus.
tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())
# Frequency of each sentence-initial tag.
firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())

# Initial-state distribution A0j = P(tag | sentence start).
# Dict comprehension + .items() replaces the Python-2-only
# `lambda (k, x)` tuple unpacking and `iteritems()`, which fail on Python 3.
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

# Transition counts: consecutive (tag_i, tag_{i+1}) pairs over the word
# stream.  NOTE(review): pairs also span sentence boundaries (last tag of
# one sentence -> first tag of the next) — confirm this is intended.
words = conllreader.tagged_words()
TagPair = [(words[i][1], words[i + 1][1]) for i in range(len(words) - 1)]
TagPairfdist = FreqDist(TagPair)

# Transition distribution Aij = P(tag_j | tag_i); FreqDist supports direct
# indexing, so tagfdist[k[0]] replaces tagfdist.get(k[0]).
Aij = DictionaryProbDist(
    {k: x / tagfdist[k[0]] for k, x in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

# Emission distribution Biw = P(word | tag).
TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist(
    {k: x / tagfdist[k[1]] for k, x in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
def tagged_words(self, fileids=None, categories=None):
    """Delegate to the CoNLL base reader for the tagged-word stream.

    The *fileids*/*categories* arguments are collapsed into an explicit
    fileid list by ``self._resolve`` before the base-class call.
    """
    return ConllCorpusReader.tagged_words(
        self,
        self._resolve(fileids, categories),
    )
## Function to add an adjective to a noun key
def add_adj(noun_param, adj_param):
    """Record *adj_param* in the list of adjectives kept for *noun_param*
    in the module-level dict ``a``."""
    # setdefault replaces the check-then-insert (LBYL) pattern: one lookup
    # instead of two, same resulting mapping.
    a.setdefault(noun_param, []).append(adj_param)


# NOTE(review): the global dict ``a`` is never initialised in this chunk —
# presumably `a = {}` appears earlier in the file; confirm before running.

filedir = '/Users/fnascime/Documents/Sicily_Project/texts/'
filename = 'ilgattopardo_prima'

mycorpus = ConllCorpusReader(
    filedir,
    filename + '.conll',
    ('ignore', 'words', 'ignore', 'pos',
     'ignore', 'ignore', 'ignore', 'ignore'),
)

words = mycorpus.tagged_words()
list_len = len(words)

## Loop through file and retrieve adjetives directly associated with nouns (adjunct words)
for i in range(list_len):
    # Per the comment above, 'S' marks nouns and 'A' adjectives in this
    # corpus's tagset — TODO confirm against the .conll annotation scheme.
    if words[i][1] == 'S':
        if i > 0 and words[i - 1][1] == 'A':
            add_adj(words[i][0], words[i - 1][0])
        # NOTE(review): because of the elif, a noun flanked by adjectives on
        # both sides only records the preceding one — confirm intended.
        elif i < list_len - 1 and words[i + 1][1] == 'A':
            add_adj(words[i][0], words[i + 1][0])

## Loop through the list of words and verify the ones with more adjective
nouns_counting = len(a)
adj_counting = 0