def parse_token_pos(essay_object):
    """
    通过nltk的语料库训练pos模型,然后拿文章进行token,然后得pos
    :param essay_object: 
    :return: 返回的是一篇文章的tokens和token对应的pos
    """
    # train pos by nltk's cropus
    from nltk.corpus import treebank
    train_sents = treebank.tagged_sents()[:3000]
    test_sents = treebank.tagged_sents()[3000:]

    train_brown = nltk.corpus.brown.tagged_sents()[0:5000]
    test_brown = nltk.corpus.brown.tagged_sents()[5000:]

    tnt_tagger = nltk.tag.tnt.TnT()
    tnt_tagger.train(train_sents)

    t_tagger_brown = nltk.tag.tnt.TnT()
    t_tagger_brown.train(train_brown)

    print("训练pos模型完成")
    print("当前文章为{}".format(essay_object.essay_str))
    tokenTags = tnt_tagger.tag(essay_object.tokens)  # pos of token
    bTags = t_tagger_brown.tag(essay_object.tokens)  # pos of token
    essay_token_attribute = []
    for i, tuple_token_pos in enumerate(tokenTags):  # fill in tags the treebank tagger marked as 'Unk'
        list_token_pos = list(tuple_token_pos)
        if list_token_pos[1] == 'Unk':
            # fall back to the Brown-trained tagger's tag for the same token
            list_token_pos[1] = bTags[i][1]
        if list_token_pos[1] == 'Unk':
            # simple heuristic: words ending in 'ed' are taken to be past-tense verbs
            if list_token_pos[0][-2:] == 'ed':
                list_token_pos[1] = 'VBD'
        essay_token_attribute.append(list_token_pos)

    return essay_token_attribute
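
A minimal usage sketch for the function above, with a hypothetical stand-in for the essay object (the real essay class is not part of the snippet):

import nltk

class _Essay:
    """Hypothetical stand-in exposing the two attributes parse_token_pos relies on."""
    def __init__(self, text):
        self.essay_str = text
        self.tokens = nltk.word_tokenize(text)

print(parse_token_pos(_Essay("The committee approved the plan .")))
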
    def load_data(self, percentage):
        print("Started Loading the Data")
        # Get the complete data
        data_set = treebank.fileids()
        # Partition the data into train and test data sets
        training_data_fileIds = [file for file in data_set if "wsj_00" in str(file)]
        testing_data_fileIds = [file for file in data_set if "wsj_01" in str(file)]

        # How much percentage of files consider for training?
        index = int(percentage*len(training_data_fileIds))
        training_data_fileIds = training_data_fileIds[:index]

        tagged_training_data = treebank.tagged_sents(fileids=training_data_fileIds)
        tagged_testing_data = treebank.tagged_sents(fileids=testing_data_fileIds)

        tagged_training_words = treebank.tagged_words(fileids=training_data_fileIds)
        tagged_testing_words = treebank.tagged_words(fileids=testing_data_fileIds)

        # print(len(tagged_training_data1), len(tagged_testing_data1))

        # UnTag the data for other uses
        untagged_training_data = [untag(item) for item in tagged_training_data]
        untagged_testing_data = [untag(item) for item in tagged_testing_data]

        print("Data Loaded Successfully. Stats are")
        print("Training Data Sentences: ", len(tagged_training_data))
        print("Testing Data  Sentences: ", len(tagged_testing_data))

        return tagged_training_data, tagged_testing_data, tagged_training_words, tagged_testing_words, untagged_training_data, untagged_testing_data
Example #3
def trainPOS_Tagger():
    train_data = treebank.tagged_sents()[:3000]
    test_data = treebank.tagged_sents()[3000:]
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    tnt_pos_tagger.evaluate(test_data)
    f = open('tnt_treebank_pos_tagger.pickle', 'wb')  # binary mode for pickle
    pickle.dump(tnt_pos_tagger, f)
    f.close()
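
A minimal sketch of loading the pickled tagger back (assuming the file name written above) and tagging a tokenized sentence:

import pickle
from nltk.tokenize import word_tokenize

# reload the tagger pickled by trainPOS_Tagger() above
with open('tnt_treebank_pos_tagger.pickle', 'rb') as f:
    tnt_pos_tagger = pickle.load(f)

# TnT.tag() expects a list of tokens for a single sentence
print(tnt_pos_tagger.tag(word_tokenize("This is a small test sentence .")))
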
Example #4
def main():
    from nltk.corpus import treebank
    from main import TAGS
    train_data = treebank.tagged_sents()[:3000]
    test_data = treebank.tagged_sents()[3000:]
    hmm = hmm_tagger(TAGS)
    print 'start train'
    hmm.train(train_data)
    print 'start test'
    word_accuracy, sentence_accuracy = hmm.evaluate(test_data)
    print "Word accuracy = {0}% | Sentence accuracy = {1}%".format(
        word_accuracy * 100, sentence_accuracy * 100)
Example #5
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains them, tests the tagger's accuracy and tags an unseen
    sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}.

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    else:
        print "Please load either the 'brown' or the 'treebank' corpus."

    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    print "tagger accuracy (test %i sentences, after training %i):" % \
        (size, (num_sents - size)), maxent_tagger.evaluate(test_sents)
    print "\n\n"
    print "classify unseen sentence: ", maxent_tagger.tag(["This", "is", "so",
        "slow", "!"])
    print "\n\n"
    print "show the 10 most informative features:"
    print maxent_tagger.classifier.show_most_informative_features(10)
Example #6
def demo3():
    from nltk.corpus import treebank, brown

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):

        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i * d10) : ((i + 1) * d10)]
        etest = e[(i * e10) : ((i + 1) * e10)]

        dtrain = d[: (i * d10)] + d[((i + 1) * d10) :]
        etrain = e[: (i * e10)] + e[((i + 1) * e10) :]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_un = t.unknown / (t.known + t.unknown)
        tp_kn = t.known / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_un = s.unknown / (s.known + s.unknown)
        sp_kn = s.known / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += tacc / tp_kn
        sknacc += sacc / tp_kn
        tallacc += tacc
        sallacc += sacc

        # print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc

    print("brown: acc over words known:", 10 * tknacc)
    print("     : overall accuracy:", 10 * tallacc)
    print("     : words known:", 10 * tknown)
    print("treebank: acc over words known:", 10 * sknacc)
    print("        : overall accuracy:", 10 * sallacc)
    print("        : words known:", 10 * sknown)
	def get_accuracy(self, sentences=[]):

		if sentences == []:
			test_sents = treebank.tagged_sents()[6000:]
		else:
			test_sents = sentences
		print self._tagger.evaluate(test_sents)
Example #8
def tag_matching(sequences):

    treebank_sentences = treebank.tagged_sents()
    #treebank_sentences = brown.tagged_sents()

    # Return best count/sequence
    best = (0, None)

    count = 0
    errors = 0

    resultset = []

    for seq in sequences:
        for sent in treebank_sentences:
            for i, word in enumerate(sent):
                if sent[i][1] == seq[0]:
                    try:
                        if sent[i+1][1] == seq[1]:
                            count += 1
                            #if sent[i+2][1] == seq[2]:
                            #   count += 1
                    except IndexError:
                        errors += 1
        if count > best[0]:
            best = (count, seq)
        resultset.append((seq, count, errors))
        count, errors = 0, 0
    return resultset
def getData(corpus="brown", categories=""):
    if corpus == "brown":
        if categories != "":
            return brown.tagged_sents(tagset='universal',
                                      categories=categories)

        return brown.tagged_sents(tagset='universal')
    elif corpus == "treebank":
        return treebank.tagged_sents(tagset='universal')
    elif corpus == "nps_chat":
        #Dialogue dataset
        data = []
        posts = nps_chat.posts()
        words = nps_chat.tagged_words(tagset='universal')

        index = 0
        for sent in posts:
            data.append(words[index:index + len(sent)])
            index += len(sent)
        return data

    elif corpus == "conll2000":
        return conll2000.tagged_sents(tagset='universal')

    return brown.tagged_sents(tagset='universal')
Example #10
def train_tagger():
    """
	This function trains the tagger
	"""
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py

    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    test_sents = tagged_sentences[size:]

    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    print(tagger.evaluate(test_sents))  # 0.9613641269156055

    # save model to pickle file as binary
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)

    print("model written to: " + file_name)
    print("")

    return tagger
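
A minimal sketch of reusing the pickled model (assuming MODEL_PATH matches the constant used above):

import pickle

# load the ClassifierBasedPOSTagger written by train_tagger() above
with open(MODEL_PATH + "tag_model.pkl", "rb") as fin:
    tagger = pickle.load(fin)

# a SequentialBackoffTagger's tag() takes a list of tokens
print(tagger.tag(["This", "is", "a", "quick", "check", "."]))
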
Example #11
    def train_pos_tagger(self, path):
        tagged_sents = treebank.tagged_sents()

        train_size = int(.75 * len(tagged_sents))
        training_sents = tagged_sents[:train_size]
        test_sents = tagged_sents[train_size:]

        X, y = self.transform_to_dataset(training_sents)

        clf = Pipeline([('vectorizer', DictVectorizer(sparse=False)),
                        ('classifier',
                         DecisionTreeClassifier(criterion="entropy"))])

        print('Training started')
        clf.fit(X, y)
        print('Training finished')

        X_test, y_test = self.transform_to_dataset(test_sents)
        print('Accuracy: {}'.format(clf.score(X_test, y_test)))

        # Save model to file
        model_pkl = open(path, 'wb')
        pickle.dump(clf, model_pkl)
        model_pkl.close()

        self.classifier = clf
Example #12
def sequence_matching(input):
    sents = treebank.tagged_sents()
    parses = treebank.parsed_sents()
    for s in range(len(sents)):  # look through every sentence in treebank to find a sequence match with input
        sent = sents[s]
        pars = parses[s]
        k = 0  # k will track how far into the sequence has been matched
        matches = []  # log position in sent that there was a match to help build tree later
        for i in range(len(input)):
            match = False  # flag to cut down on time if a word doesn't match anything in the sent
            for j in range(k, len(sent)):  # loop through every word in sentence starting from last match

                if sent[j][1] == input[i][1]:  # labels (pos) match
                    k = j
                    UpdateTree(pars, j, input[i][1])
                    match = True  # if this line is never reached, then don't waste more time on this sentence
                    if i == len(input) - 1:  # made it through the entire input, so sent was a match
                        return pars # pars will have words replaced where there is a match
                    break

            if match == False:
                print("Sentence does not match")
                break  # program has looked through whole sentence without matching a word so move onto the next sentence

    return None  # no sentence was found to match the input sequence, print error message
Example #13
 def traintest_bigram_trigram_tagger(self):
     from nltk.tag import DefaultTagger,UnigramTagger, BigramTagger, TrigramTagger 
     from nltk.corpus import treebank        
     test_sents  = treebank.tagged_sents()[3000:]          
     train_sents = treebank.tagged_sents()[:3000]
     
     print 'training bigram tagger'
     bitagger = BigramTagger(train_sents)
     print 'evaluating bigram tagger'
     print bitagger.evaluate(test_sents)

     print 'training trigram tagger'
     tritagger = TrigramTagger(train_sents)
     print 'evaluating trigram tagger'
     print tritagger.evaluate(test_sents)
     print 'tagging'
Example #14
def demo2():
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = TnT(N=1000, C=False)
    s = TnT(N=1000, C=True)
    t.train(d[(11)*100:])
    s.train(d[(11)*100:])

    for i in range(10):
        tacc = t.evaluate(d[i*100:((i+1)*100)])
        tp_un = float(t.unknown) / float(t.known +t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print('Capitalization off:')
        print('Accuracy:', tacc)
        print('Percentage known:', tp_kn)
        print('Percentage unknown:', tp_un)
        print('Accuracy over known words:', (tacc / tp_kn))

        sacc = s.evaluate(d[i*100:((i+1)*100)])
        sp_un = float(s.unknown) / float(s.known +s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print('Capitalization on:')
        print('Accuracy:', sacc)
        print('Percentage known:', sp_kn)
        print('Percentage unknown:', sp_un)
        print('Accuracy over known words:', (sacc / sp_kn))
    def __init__(self, do_markovify=True):
        print("tagging the datasets and markovifying them ... please wait!")
        # print(list(brown.tagged_sents()))
        # print(list(nps_chat.tagged_words()))
        # with open("reddit_apple_android.txt", "w") as text_file:
        #     self.tagged_sents = list(nltk.pos_tag(sent) for sent in (text_file.sents('reddit_apple_android.txt')))

        self.tagged_sents = list(brown.tagged_sents())
        # self.tagged_sents = list(treebank.tagged_sents())
        # self.tagged_sents = list(nltk.pos_tag(sent) for sent in (gutenberg.sents('austen-emma.txt')))
        # self.tagged_sents = list(nltk.pos_tag(sent) for sent in (gutenberg.sents('quora.txt')))
        # self.tagged_sents = list(nltk.pos_tag(sent) for sent in (gutenberg.sents('reddit_apple_android.txt')))
        # self.tagged_sents = list(nltk.pos_tag(sent) for sent in (gutenberg.sents('hackernews.txt')))
        self.tagged_sents.append(list(treebank.tagged_sents()))
        # self.tagged_sents.append(list(nps_chat.tagged_words()))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (gutenberg.sents('austen-emma.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (gutenberg.sents('chesterton-brown.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (gutenberg.sents('austen-persuasion.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (gutenberg.sents('austen-sense.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (gutenberg.sents('reddit_apple_android.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(sent) for sent in (genesis.sents('english-web.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(gutenberg.sents('austen-persuasion.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(gutenberg.sents('austen-sense.txt'))))
        # self.tagged_sents.append(list(nltk.pos_tag(genesis.sents('english-web.txt'))))
        # self.tagged_sents.append(list(genesis.tagged_words()))
        # self.tagged_sents.append(list(snowball_data.tagged_words()))

        # print(self.tagged_sents)
        if do_markovify:
            self.model = markovify.Chain(self.tagged_sents, 2)
Example #16
def create_input_dataset():
	print 'Loading input'
	input_data = []
	tags = []
	sents = wsj.sents()
	json_file  = open('data.json','w') 
	counter = 0
	for i,sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
		prev = None
		prev_prev = None
		for j,word in enumerate(sentence):
			datapoint = {}
			temp = []
			len_sentence = len(sentence)

			
			if(j > 0):
				temp.append(sents[i][j-1])
			else:
				temp.append('*')
			if(j > 1):
				temp.append(sents[i][j-2])
			else:
				temp.append('*')
			temp.append(sents[i][j])
			if(j < len_sentence-1):
				temp.append(sents[i][j+1])
			else:
				temp.append('*')
			if(j < len_sentence-2):
				temp.append(sents[i][j+2])
			else:
				temp.append('*')

			datapoint['wn'] = temp
			
			datapoint['index'] = j

			datapoint['i'] = counter
			counter += 1
			if(prev == None):
				datapoint['t_minus_one'] = '*'
			else:
				datapoint['t_minus_one'] = prev[1]
			if(prev_prev == None):
				datapoint['t_minus_two'] = '*'
			else:
				datapoint['t_minus_two'] = prev_prev[1]

			prev_prev = prev
			prev = word
			# print datapoint,word[1]
			datapoint['tag'] = word[1]
			json_file.write(json.dumps(datapoint))
			json_file.write('\n')
			input_data.append(datapoint)
			tags.append(word[1])
	print 'Done'
	json_file.close()
	return input_data, tags
def extractTransitions(tagged_sents=treebank.tagged_sents(tagset='universal')):
	for s in tagged_sents:
		lasttag = 0
		for token,tag in s:
			T[lasttag][tag]+=1
			L[tag][token]+=1
			lasttag = tag
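
The snippet above assumes transition and emission count tables T and L already exist. One possible setup, sketched here with nested defaultdicts (the names and the final print are assumptions, not part of the original):

from collections import defaultdict
from nltk.corpus import treebank

# T[prev_tag][tag] counts tag-to-tag transitions; L[tag][token] counts emissions
T = defaultdict(lambda: defaultdict(int))
L = defaultdict(lambda: defaultdict(int))

extractTransitions(treebank.tagged_sents(tagset='universal'))
print(T['DET']['NOUN'], L['NOUN']['company'])
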
Example #18
def demo(corpus, num_sents):

    if corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]

    elif corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]

    else:
        print "Please load either the 'brown' or the 'treebank' corpus."

    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    print "tagger accuracy (test %i sentences, after training %i):" % \
        (size, (num_sents - size)), maxent_tagger.evaluate(test_sents)
    print "\n\n"
    print "classify unseen sentence: ", maxent_tagger.tag(
        ["This", "is", "so", "slow", "!"])
    print "\n\n"
    print "show the 10 most informative features:"
    print maxent_tagger.classifier.show_most_informative_features(10)
Example #19
def demo2():
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = TnT(N=1000, C=False)
    s = TnT(N=1000, C=True)
    t.train(d[(11) * 100:])
    s.train(d[(11) * 100:])

    for i in range(10):
        tacc = t.accuracy(d[i * 100:((i + 1) * 100)])
        tp_un = t.unknown / (t.known + t.unknown)
        tp_kn = t.known / (t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print("Capitalization off:")
        print("Accuracy:", tacc)
        print("Percentage known:", tp_kn)
        print("Percentage unknown:", tp_un)
        print("Accuracy over known words:", (tacc / tp_kn))

        sacc = s.accuracy(d[i * 100:((i + 1) * 100)])
        sp_un = s.unknown / (s.known + s.unknown)
        sp_kn = s.known / (s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print("Capitalization on:")
        print("Accuracy:", sacc)
        print("Percentage known:", sp_kn)
        print("Percentage unknown:", sp_un)
        print("Accuracy over known words:", (sacc / sp_kn))
Example #20
def demo3():
    from nltk.corpus import brown, treebank

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):

        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]

        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.accuracy(dtest)
        tp_un = t.unknown / (t.known + t.unknown)
        tp_kn = t.known / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.accuracy(etest)
        sp_un = s.unknown / (s.known + s.unknown)
        sp_kn = s.known / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += tacc / tp_kn
        sknacc += sacc / tp_kn
        tallacc += tacc
        sallacc += sacc

        # print(i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc)

    print("brown: acc over words known:", 10 * tknacc)
    print("     : overall accuracy:", 10 * tallacc)
    print("     : words known:", 10 * tknown)
    print("treebank: acc over words known:", 10 * sknacc)
    print("        : overall accuracy:", 10 * sallacc)
    print("        : words known:", 10 * sknown)
Example #21
def make_sentences():
    dictionary = [k.strip() for k in open("./embeddings/words.lst")]
    ind_lookup = {word:(ind+1) for ind,word in enumerate(dictionary)}

    taglst = [k.strip() for k in open("data/tags.lst")]
    tag_lookup = {word:(ind+1) for ind,word in enumerate(taglst)}

    bracket_rep = { "-LRB-":"(",
                    "-RRB-":")",
                    "-RSB-":"[",
                    "-RSB-":"]",
                    "-LCB-":"{",
                    "-RCB-":"}"}

    sentences = list(treebank.tagged_sents())
    for i,sent in enumerate(sentences):
        sent = [(item.lower(),tag) for (item,tag) in sent if tag != '-NONE-']
        sent = [(bracket_rep.get(item, item), tag)                          for (item,tag) in sent]
        sent = [(u'0', tag) if item[0].isdigit() else (item,tag)            for (item,tag) in sent]
        sent = [(u"UNKNOWN", tag) if item not in ind_lookup else (item,tag) for (item,tag) in sent]
        # 1 indexed!!!
        sent = [(ind_lookup[item], tag_lookup[tag])                         for (item,tag) in sent]
        sentences[i] = sent

    sentences = [i for i in sentences if len(i) > 4]
    print(sum(map(len, sentences)) / float(len(sentences)))

    return sentences
Example #22
 def __init__(self,dname='treebank'):
     super().__init__()
     
     
     data = None
     #selecting the datset
     if dname =='treebank':
         if len(treebank.words()) == 0:    
             nltk.download('treebank')
         data = treebank.tagged_sents(tagset='universal')
         
     elif dname == 'brown':
         if len(brown.words()) == 0:    
             nltk.download('brown')
         data = brown.tagged_sents(tagset='universal')
         
     
     self.data=data
     #print(data[0:1])
     vocab,tags =self._build_vocab()
     max_sent_len = max(map(len, data))
     self.max_sent_len = max_sent_len
     self.word_to_idx = defaultdict(lambda:0, {word:idx for idx,word in enumerate(vocab)})
     self.idx_to_word = {idx:word for word,idx in self.word_to_idx.items()}
     self.tag_to_idx = {tag:idx for idx,tag in enumerate(tags)}
     self.idx_to_tag = {idx:tag for tag,idx in self.tag_to_idx.items()}
     self.sen_list,self.tag_list = self._convert_to_num()
Example #23
def _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data):
    # train is the proportion of data used in training; the rest is reserved
    # for testing.
    if tagged_data is None:
        print("Loading tagged data from treebank... ")
        tagged_data = treebank.tagged_sents()
    if num_sents is None or len(tagged_data) <= num_sents:
        num_sents = len(tagged_data)
    if randomize:
        random.seed(len(tagged_data))
        random.shuffle(tagged_data)
    cutoff = int(num_sents * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:num_sents]
    testing_data = [[t[0] for t in sent] for sent in gold_data]
    if not separate_baseline_data:
        baseline_data = training_data
    else:
        bl_cutoff = len(training_data) // 3
        (baseline_data, training_data) = (training_data[:bl_cutoff], training_data[bl_cutoff:])
    (trainseqs, traintokens) = corpus_size(training_data)
    (testseqs, testtokens) = corpus_size(testing_data)
    (bltrainseqs, bltraintokens) = corpus_size(baseline_data)
    print("Read testing data ({0:d} sents/{1:d} wds)".format(testseqs, testtokens))
    print("Read training data ({0:d} sents/{1:d} wds)".format(trainseqs, traintokens))
    print("Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format(
        bltrainseqs, bltraintokens, "" if separate_baseline_data else "[reused the training set]"))
    return (training_data, baseline_data, gold_data, testing_data)
Example #24
def benchmark_aptagger():
    '''
    Benchmark the aptagger vs the Penn Treebank sample in nltk
    '''
    from nltk.corpus import treebank

    # we want to remove "-NONE-" tags since these appear to be garbage
    text = []
    tags = []
    k = 0
    for sentence in treebank.tagged_sents():
        text.append([ele[0] for ele in sentence if ele[1] != '-NONE-'])
        tags.extend([ele[1] for ele in sentence if ele[1] != '-NONE-'])
        k += 1

    t1 = time.time()
    predicted = tagger.tag_sents(text)
    t2 = time.time()

    ncorrect = sum(
        bool(t == p[1]) for t, p in izip(tags, chain.from_iterable(predicted)))

    print("For Penn Treebank sample in NLTK:")
    print("Took %s seconds to POS tag %s tokens (%s tokens/sec)" %
          (t2 - t1, len(tags), int(len(tags) / (t2 - t1))))
    print("Accuracy: %s" % (float(ncorrect) / len(tags)))
Example #25
def _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data):
    # train is the proportion of data used in training; the rest is reserved
    # for testing.
    if tagged_data is None:
        print("Loading tagged data from treebank... ")
        tagged_data = treebank.tagged_sents()
    if num_sents is None or len(tagged_data) <= num_sents:
        num_sents = len(tagged_data)
    if randomize:
        random.seed(len(tagged_data))
        random.shuffle(tagged_data)
    cutoff = int(num_sents * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:num_sents]
    testing_data = [[t[0] for t in sent] for sent in gold_data]
    if not separate_baseline_data:
        baseline_data = training_data
    else:
        bl_cutoff = len(training_data) // 3
        (baseline_data, training_data) = (training_data[:bl_cutoff], training_data[bl_cutoff:])
    (trainseqs, traintokens) = corpus_size(training_data)
    (testseqs, testtokens) = corpus_size(testing_data)
    (bltrainseqs, bltraintokens) = corpus_size(baseline_data)
    print("Read testing data ({0:d} sents/{1:d} wds)".format(testseqs, testtokens))
    print("Read training data ({0:d} sents/{1:d} wds)".format(trainseqs, traintokens))
    print("Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format(
        bltrainseqs, bltraintokens, "" if separate_baseline_data else "[reused the training set]"))
    return (training_data, baseline_data, gold_data, testing_data)
Example #26
def benchmark_aptagger():
    '''
    Benchmark the aptagger vs the Penn Treebank sample in nltk
    '''
    from nltk.corpus import treebank

    # we want to remove "-NONE-" tags since these appear to be garbage
    text = []
    tags = []
    k = 0
    for sentence in treebank.tagged_sents():
        text.append([ele[0] for ele in sentence if ele[1] != '-NONE-'])
        tags.extend([ele[1] for ele in sentence if ele[1] != '-NONE-'])
        k += 1

    t1 = time.time()
    predicted = tagger.tag_sents(text)
    t2 = time.time()

    ncorrect = sum(bool(t == p[1])
        for t, p in izip(tags, chain.from_iterable(predicted)))

    print("For Penn Treebank sample in NLTK:")
    print("Took %s seconds to POS tag %s tokens (%s tokens/sec)" % (
        t2 - t1, len(tags), int(len(tags) / (t2 - t1))))
    print("Accuracy: %s" % (float(ncorrect) / len(tags)))
Example #27
def get_pos_tagger():
    train_sents = treebank.tagged_sents()
    tagger = nltk.TrigramTagger(train_sents, backoff=
        nltk.BigramTagger(train_sents, backoff=
        nltk.UnigramTagger(train_sents, backoff=
        nltk.DefaultTagger("NN"))))
    return tagger
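
A short usage sketch for the backoff chain above (training on the full treebank sample takes a moment):

pos_tagger = get_pos_tagger()
# unseen contexts fall back through bigram -> unigram -> the 'NN' default
print(pos_tagger.tag(["Economic", "news", "had", "little", "effect", "on", "markets", "."]))
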
Example #28
def main():
    ### Globals ###
    regexp_tagger = nltk.RegexpTagger(
           [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
           (r'(The|the|A|a|An|an)$', 'AT'),   # articles
           (r'.*able$', 'JJ'),                # adjectives
           (r'.*ness$', 'NN'),                # nouns formed from adjectives
           (r'.*ly$', 'RB'),                  # adverbs
           (r'.*s$', 'NNS'),                  # plural nouns
           (r'.*ing$', 'VBG'),                # gerunds
           (r'.*ed$', 'VBD'),                 # past tense verbs
           (r'.*', 'NN')                      # nouns (default)
    ])

    training_data = treebank.tagged_sents()
           
    unigram_tagger = nltk.UnigramTagger(training_data, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(training_data, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(training_data, backoff=bigram_tagger)

    unigram_pickler = pickle.Pickler(open("unigram_tagger.bin","wb"))
    bigram_pickler = pickle.Pickler(open("bigram_tagger.bin","wb"))
    trigram_pickler = pickle.Pickler(open("trigram_tagger.bin","wb"))

    unigram_pickler.dump(unigram_tagger)
    bigram_pickler.dump(bigram_tagger)
    trigram_pickler.dump(trigram_tagger)
Example #29
    def train_pos_tagger(self, path):
        # Just to make sure
        nltk.download('treebank')

        tagged_sentences = treebank.tagged_sents()

        train_size = int(.80 * len(tagged_sentences))
        training_sentences = tagged_sentences[:train_size]

        X_train, y_train = self.transform_to_dataset(training_sentences)

        model = CRF()

        print('Training started...')
        model.fit(X_train, y_train)
        print('Training finished.')

        # Save classifier to file
        model_pkl = open(path, 'wb')
        pickle.dump(model, model_pkl)
        model_pkl.close()

        print("POSTagger saved.")

        self.classifier = model
Example #30
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains them, tests the tagger's accuracy and tags an unseen
    sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}.

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    else:
        print "Please load either the 'brown' or the 'treebank' corpus."

    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    print "tagger accuracy (test %i sentences, after training %i):" % \
        (size, (num_sents - size)), maxent_tagger.evaluate(test_sents)
    print "\n\n"
    print "classify unseen sentence: ", maxent_tagger.tag(
        ["This", "is", "so", "slow", "!"])
    print "\n\n"
    print "show the 10 most informative features:"
    print maxent_tagger.classifier.show_most_informative_features(10)
Example #31
    def _train_tagger(self):
        training_sents = treebank.tagged_sents()
        patterns = [  # for regexp tagger
            (r'^[\.|\?|!]$', '.'), (r'^,$', ','), (r'^\'$', '\'\''),
            (r'^\"$', '\"'), (r'^\($', '('),
            (r'^\)$', ')'), (r'^[=|/]$', 'SYM'), (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'), (r'.*ould$', 'MD'),
            (r'.*\'s$', 'POS'), (r'.*s$', 'NNS'),
            (r'(The|the|A|a|An|an)$', 'AT'), (r'.*able$', 'JJ'),
            (r'.*ly$', 'RB'), (r'.*s$', 'NNS'), (r'^[0-9][0-9]*$', 'CD'),
            (r'^[0-9]([0-9]*[-|.|,|/][0-9]*)*$', 'CD'),
            (r'^([0-9]*\.[0-9]*)*$', 'CD'), (r'^[^a-zA-Z]*$', ':'),
            (r'[A-Z].*', 'NNP'), (r'.*', 'NN')
        ]

        default_tagger = nltk.DefaultTagger('NN')
        regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
        unigram_tagger = nltk.UnigramTagger(training_sents,
                                            backoff=regexp_tagger)
        bigram_tagger = nltk.BigramTagger(training_sents,
                                          backoff=unigram_tagger)
        trigram_tagger = nltk.TrigramTagger(training_sents,
                                            backoff=bigram_tagger)

        self.final_tagger = trigram_tagger
Example #32
def ie_preprocess(document):
    print document
    sentences = nltk.sent_tokenize(document)
    # print sentences
    trigram_tagger = nltk.TrigramTagger(brown_a, cutoff=0)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    print "\nDefault tagger"
    x = [t0.tag(sent) for sent in sentences]
    print x
    print "\nUnigram tagger"
    x = [t1.tag(sent) for sent in sentences]
    print x
    print "\nBigram tagger"
    x = [t2.tag(sent) for sent in sentences]
    print x
    print "\nTrigram tagger"
    x = [t3.tag(sent) for sent in sentences]
    print x
    print "\n"
    # sentences = [nltk.pos_tag(sent) for sent in sentences
    trainer = hmm.HiddenMarkovModelTrainer()
    train_data = treebank.tagged_sents()[:3000]
    tagger = trainer.train_supervised(train_data)
    print tagger
    print "\nHMM tagger"
    x = [tagger.tag(sent) for sent in sentences]
    print x
    print "\nPOS Tag"
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    print sentences
    return sentences
Example #33
 def split_sents(self, train=0.95, total=3500,
                 document_class=TaggedSentence):
     sents = tagged_corpus.tagged_sents()[:total]
     total = len(sents) if total is None else total
     i = int(round(train * total))
     j = i + int(round(total - train * total))
     return (map(document_class, sents[0:i]),
             map(document_class, sents[i:j]))
Example #34
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains them, tests the tagger's accuracy and tags an unseen
    sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load, either C{brown} or C{treebank}.

    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
    number, as training might take a while.
    """
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]

    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]

    elif corpus.lower() == "floresta":
        from nltk.corpus import floresta
        tagged_sents = floresta.tagged_sents()[:num_sents]

    elif corpus.lower() == "cintil":
        print "Loading CINTIL"
        #column_types = ['ignore','words','ignore','ignore','pos','ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/cintil/','cintil-fixed.conll',column_types)
        column_types = ['words','pos','ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed.conll',column_types)
        cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed-reduced.conll',column_types)
        tagged_sents = cintil.tagged_sents()[:num_sents]

    else:
        print "Please load either the 'brown' or the 'treebank' corpus."

    size = int(len(tagged_sents) * 0.1)

    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)

    maxent_tagger.evaluate(test_sents)

    """
    print "tagger accuracy (test %i sentences, after training %i):" % \
        (size, (num_sents - size)), maxent_tagger.evaluate(test_sents)
    print "\n\n"
    print "classify unseen sentence: ", maxent_tagger.tag(["Isto", "é", "bastante","rápido", "!"])
    print "\n\n"
    print "show the 40 most informative features:"
    print maxent_tagger.classifier.show_most_informative_features(40)
    """

    fModel = open('test.pkl',"wb")
    pickle.dump(maxent_tagger, fModel,1)
    fModel.close()
Example #35
def get_tagger():
    try:
        # pickle files should be opened in binary mode
        with open(tagger_fn, 'rb') as tagger_file:
            tagger = pickle.load(tagger_file)
    except:
        tagger = ClassifierBasedPOSTagger(train=treebank.tagged_sents())
        with open(tagger_fn, "wb") as tagger_file:
            pickle.dump(tagger, tagger_file)
    return tagger
Example #36
def HMM():
    train_data = treebank.tagged_sents()[:3000]
    print(train_data[0])

    s1 = "Today is a good day ."
    s2 = "Joe met Joanne in Delhi ."
    s3 = "Chicago is the birthplace of Ginny"

    ################################
    ToDo = "Use HMM"
	def tag_words(self, words, sents):
		train_sents = treebank.tagged_sents()
		tagger = UnigramTagger(train_sents)
		test_sents = tagger.tag(sents[0])
		# test_sents = treebank.tagged_sents()[3000:]
		# print treebank.tagged_sents()[1:]
		# print "accuracy: " + str(self._tagger.evaluate(test_sents))
		# print self._tagger.tag(words)
		# print test_sents
		print tagger.evaluate(test_sents)
Example #38
    def run(self):

        app = App.get_running_app()

        print 'start training TnT pos tagger'
        train_sents = treebank.tagged_sents()[:2000]
        unk = DefaultTagger('NN')
        app.root.tnt_tagger = tnt.TnT(unk=unk, Trained=True)
        app.root.tnt_tagger.train(train_sents)
        print 'end training TnT pos tagger'
Example #39
 def split_sents(self,
                 train=0.95,
                 total=3500,
                 document_class=TaggedSentence):
     sents = tagged_corpus.tagged_sents()[:total]
     total = len(sents) if total is None else total
     i = int(round(train * total))
     j = i + int(round(total - train * total))
     return (map(document_class, sents[0:i]), map(document_class,
                                                  sents[i:j]))
Example #40
def create_dataset():
	#print 'Loading dataset'
	dataset = []
	tags = []
	sents = wsj.sents()

	for i,sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
		prev = None
		prev_prev = None
		for j,word in enumerate(sentence):
			datapoint = {}
			temp = []
			
			len_sentence = len(sentence)
			
			if(j > 0):
				temp.append(sents[i][j-1])
			else:
				temp.append('*')
			if(j > 1):
				temp.append(sents[i][j-2])
			else:
				temp.append('*')
			
			temp.append(sents[i][j])

			if(j < len_sentence-1):
				temp.append(sents[i][j+1])
			else:
				temp.append('*')
			if(j < len_sentence-2):
				temp.append(sents[i][j+2])
			else:
				temp.append('*')

			#what is WN ?
			datapoint['wn'] = temp
			
			datapoint['index'] = j
			if(prev == None):
				datapoint['t_minus_one'] = '*'
			else:
				datapoint['t_minus_one'] = prev[1]
			if(prev_prev == None):
				datapoint['t_minus_two'] = '*'
			else:
				datapoint['t_minus_two'] = prev_prev[1]

			prev_prev = prev
			prev = word
			# print datapoint,word[1]
			dataset.append(datapoint)
			tags.append(word[1])
	#print 'Done'
	return dataset, tags
Example #41
File: brill.py  Project: 4li/nlp
    def train_parser(self):
        default_tagger = DefaultTagger("NN")
        train_sents = treebank.tagged_sents()[:3000]
        initial_tagger = self.backoff_tagger(
            train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger
        )
        initial_tagger.evaluate(train_sents)
        brill_tagger = self.train_brill_tagger(initial_tagger, train_sents)

        pickle.dump(brill_tagger, open(self.pickle_path, "wb"))
        return brill_tagger
Example #42
 def test_pos_template(self):
     train_sents = treebank.tagged_sents()[:1000]
     tagger = UnigramTagger(train_sents)
     trainer = brill_trainer.BrillTaggerTrainer(
         tagger, [brill.Template(brill.Pos([-1]))])
     brill_tagger = trainer.train(train_sents)
     # Example from https://github.com/nltk/nltk/issues/769
     result = brill_tagger.tag('This is a foo bar sentence'.split())
     expected = [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('foo', None),
                 ('bar', 'NN'), ('sentence', None)]
     self.assertEqual(result, expected)
Example #43
def make_backoff_tagger():
	""" Returns a backoff tagger that useses a UnigramTagger,
	BigramTagger, TrigramTagger, and a Default tagger that returns NN

	:returns: A backoff POS tagger.

	"""

	return backoff_tagger(treebank.tagged_sents(), 
		[UnigramTagger, BigramTagger, TrigramTagger],
		backoff=DefaultTagger('NN'))
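
A brief usage sketch of the helper above:

tagger = make_backoff_tagger()
# the backoff chain falls through trigram -> bigram -> unigram -> 'NN'
print(tagger.tag(["The", "board", "approved", "the", "plan", "."]))
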
Example #44
def train_pos_tagger():
  """
  Trains a POS tagger with sentences from Penn Treebank
  and returns it.
  """
  train_sents = treebank.tagged_sents(simplify_tags=True)
  tagger = nltk.TrigramTagger(train_sents, backoff=
    nltk.BigramTagger(train_sents, backoff=
    nltk.UnigramTagger(train_sents, backoff=
    nltk.DefaultTagger("NN"))))
  return tagger
Example #45
def train_tagger(tagger_name):
    train_sents = treebank.tagged_sents()[:5000]
    if tagger_name == "TnT" or tagger_name == 'tagger':
        trained_tagger = tnt.TnT()
        trained_tagger.train(train_sents)
    else:
        tagger1 = DefaultTagger('NN')
        tagger2 = TrigramTagger(train_sents, backoff=tagger1)
        tagger3 = BigramTagger(train_sents, backoff=tagger2)
        trained_tagger = UnigramTagger(train_sents, backoff=tagger3)
    return trained_tagger
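
A brief usage sketch for the helper above; either branch returns a trained tagger whose tag() method accepts a list of tokens:

tnt_based = train_tagger("TnT")
backoff_based = train_tagger("backoff")
sentence = "Pierre Vinken will join the board .".split()
print(tnt_based.tag(sentence))
print(backoff_based.tag(sentence))
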
Example #46
def main():

    # """
    # ++++++++++++++++++++++++++++++++++++++++++
    # DATA PREPROCESSING
    # """

    #########
    # EITHER
    sentences = treebank.tagged_sents()

    # OR
    # sentences = parsebrown() # have to dl brown corpus ("brown-universal.txt") and change path in parsebrown function
    #########

    # trnstc, tststc, valstc  = ttvsplit(sentences[0:50000], .6, .3, .1)
    trnstc, tststc, valstc = ttvsplit(sentences, .6, .3, .1)

    xtrn, ytrn = str2dct(trnstc)
    xtst, ytst = str2dct(tststc)
    xval, yval = str2dct(valstc)

    dict_encoder, xtrn, xtst, xval = dct2arr(xtrn, xtst, xval)

    label_encoder, ytrn, ytst, yval = catenc(ytrn, ytst, yval)

    ytrn, ytst, yval = ohenc(ytrn, ytst, yval)

    # # print(xtrn[0])   # treebank (61014, 44232)   # brown (860100, 188)
    # # print(ytrn[0])   # treebank (61014, 46)      # brown (860100, 9)

    # # """
    # # ++++++++++++++++++++++++++++++++++++++++++
    # # MODEL
    # # """
    model_params = {
        'build_fn': build_model,
        'input_dim': xtrn.shape[1],
        'hidden_neurons': 512,
        'output_dim': ytrn.shape[1],
        'epochs': 3,
        'batch_size': 1024,
        'verbose': 1,
        'validation_data': (xval, yval),
        'shuffle': True
    }

    m = KerasClassifier(**model_params)
    hist = m.fit(xtrn, ytrn)
    score = m.score(xtst, ytst)
    print("score")
    print(score)
    m.model.save('model')
Example #47
 def traintest_uni_bi_tri_tagger(self):
     from nltk.tag import DefaultTagger,UnigramTagger, BigramTagger, TrigramTagger
     from nltk.corpus import conll2000, treebank    
     test_sents  = conll2000.tagged_sents()[8000:]          
     train_sents = treebank.tagged_sents()[3000:]
     print 'training trigram tagger with backoff'
     backoff = DefaultTagger('NN')
     tagger = self.backoff_tagger(train_sents, [UnigramTagger, BigramTagger,TrigramTagger], backoff=backoff)
     print 'evaluating trigram tagger with backoff'
     print tagger.evaluate(test_sents)
     print 'tagging'
     print tagger.tag(word_tokenize("This is a test. This should be faster than nothing. How can I rent a car in the next twelve hours? "))
def process(data):
    processed_tweets = []
    t0 = AffixTagger(train=treebank.tagged_sents())
    t1 = UnigramTagger(train=treebank.tagged_sents(), backoff=t0)
    t2 = BigramTagger(train=treebank.tagged_sents(), backoff=t1)
    count = 0
    for tweet in data.get_tweets():
        count += 1
        print count
        tweet = remove_hashtags(tweet)
        tweet = remove_user_tags(tweet)
        tweet = remove_html_entities(tweet)
        tweet = remove_punctuation_deep(tweet)
        tweet = tokenize_and_remove_stopwords(tweet)
        tweet = remove_apostrophes(tweet)
        tweet = remove_multiple_spaces(tweet)
        tweet = translate_slang(tweet)
        tweet = pos_tag_filter(tweet, data, t2)
        if not is_empty(tweet):
            processed_tweets.append(tweet)
    data.set_tweets(processed_tweets)
Example #49
def create_dataset():
    print "Loading dataset"
    dataset = []
    tags = []
    sents = wsj.sents()

    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            temp = []
            len_sentence = len(sentence)

            if j > 0:
                temp.append(sents[i][j - 1])
            else:
                temp.append("*")
            if j > 1:
                temp.append(sents[i][j - 2])
            else:
                temp.append("*")

            temp.append(sents[i][j])

            if j < len_sentence - 1:
                temp.append(sents[i][j + 1])
            else:
                temp.append("*")
            if j < len_sentence - 2:
                temp.append(sents[i][j + 2])
            else:
                temp.append("*")

            datapoint["wn"] = temp

            datapoint["index"] = j
            if prev == None:
                datapoint["t_minus_one"] = "*"
            else:
                datapoint["t_minus_one"] = prev[1]
            if prev_prev == None:
                datapoint["t_minus_two"] = "*"
            else:
                datapoint["t_minus_two"] = prev_prev[1]

            prev_prev = prev
            prev = word
            # print datapoint,word[1]
            dataset.append(datapoint)
            tags.append(word[1])
    print "Done"
    return dataset, tags
Example #50
 def from_treebank(klass):
     from nltk.corpus import brown, treebank
     probdist = klass()
     for sent in treebank.tagged_sents():
         for word, tag in sent:
             probdist.inc(word.lower(), tag)
     for sent in brown.tagged_sents():
         for word, tag in sent:
             probdist.inc(word.lower(), tag)
     for word, tag in get_lexicon():
         probdist.inc(word, tag, closed_class=False)
     for i in range(10): probdist.inc('can', 'VB')
     return probdist
Example #51
def store_pos_tag_dicts():
    pos_tag_dict = defaultdict(tuple)
    tagged = treebank.tagged_sents()
    for sent in tagged:
        for tup in sent:
            if not tup[1] in pos_tag_dict[tup[0].lower()]:
                pos_tag_dict[tup[0].lower()] += (tup[1], )

    pos_tag_dict_univ = defaultdict(tuple)
    penn_tagged_univ = treebank.tagged_sents(tagset='universal')
    brown_tagged_univ = brown.tagged_sents(tagset='universal')
    for text in [penn_tagged_univ, brown_tagged_univ]:
        for sent in text:
            for tup in sent:
                if not tup[1] in pos_tag_dict_univ[tup[0].lower()]:
                    pos_tag_dict_univ[tup[0].lower()] += (tup[1], )
    for word in states.values():
        pos_tag_dict[word.lower()] += ('NNP', )
        pos_tag_dict_univ[word.lower()] += ('NOUN', )
    dicts = (pos_tag_dict, pos_tag_dict_univ)
    with open('{}/data/pos_dicts.pickle'.format(mod_path), 'wb') as file:
        pickle.dump(dicts, file, protocol=2)
Example #52
   def evaluate(self):
      '''run tests on conll2000 and treebank data'''

      test = treebank.tagged_sents()[:100]
      treebank_result = (100*self.classifier.evaluate(test))

      test = conll2000.tagged_sents()[:100]
      conll2000_result = (100*self.classifier.evaluate(test))

      test = brown.tagged_sents()[int(len(brown.tagged_sents())*0.8):]
      brown_result = (100*self.classifier.evaluate(test))

      return (treebank_result, conll2000_result, brown_result)
Example #53
def train_pos_tagger():
    """
  Trains a POS tagger with sentences from Penn Treebank
  and returns it.
  """
    train_sents = treebank.tagged_sents(simplify_tags=True)
    tagger = nltk.TrigramTagger(train_sents,
                                backoff=nltk.BigramTagger(
                                    train_sents,
                                    backoff=nltk.UnigramTagger(
                                        train_sents,
                                        backoff=nltk.DefaultTagger("NN"))))
    return tagger
Example #54
 def from_treebank(klass):
     from nltk.corpus import brown, treebank
     probdist = klass()
     for sent in treebank.tagged_sents():
         for word, tag in sent:
             probdist.inc(word.lower(), tag)
     for sent in brown.tagged_sents():
         for word, tag in sent:
             probdist.inc(word.lower(), tag)
     for word, tag in get_lexicon():
         probdist.inc(word, tag, closed_class=False)
     for i in range(10): probdist.inc('can', 'VB')
     return probdist
Example #55
def create_dataset():
    print 'Loading dataset'
    dataset = []
    tags = []
    sents = wsj.sents()

    for i, sentence in enumerate(wsj.tagged_sents()[:10]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            temp = []
            len_sentence = len(sentence)

            temp.append(sents[i][j])
            if (j > 0):
                temp.append(sents[i][j - 1])
            else:
                temp.append('*')
            if (j > 1):
                temp.append(sents[i][j - 2])
            else:
                temp.append('*')
            if (j < len_sentence - 1):
                temp.append(sents[i][j + 1])
            else:
                temp.append('*')
            if (j < len_sentence - 2):
                temp.append(sents[i][j + 2])
            else:
                temp.append('*')

            datapoint['wn'] = temp

            datapoint['index'] = j
            if (prev == None):
                datapoint['t_minus_one'] = '*'
            else:
                datapoint['t_minus_one'] = prev[1]
            if (prev_prev == None):
                datapoint['t_minus_two'] = '*'
            else:
                datapoint['t_minus_two'] = prev_prev[1]

            prev_prev = prev
            prev = word
            # print datapoint,word[1]
            dataset.append(datapoint)
            tags.append(word[1])
    print 'Done'
    return dataset, tags
 def LemmatizeSents(self,sents):
     tagger=tagging(treebank.tagged_sents(),[UnigramTagger,BigramTagger,TrigramTagger],backoff=None)
     newSents=[]
     for sent in sents:
         taggedSent=tagger.tag(word_tokenize(sent))
         words=[]
         for (wd,tg) in taggedSent:
             newTag=self.tagMap(tg)
             wd=WordNetLemmatizer().lemmatize(wd,newTag)
             words=words+[wd]
         newSent=' '.join(words)
         #print(newSent)
         newSents.append(newSent)
     return newSents
Example #57
    def __init__(self, train_set='treebank'):
        '''
        Constructor
        '''

        # Before building a new tagger check if one has already been pickled
        if (os.path.exists(os.getcwd() + '/' + _pickle_file)):
            input = open(_pickle_file, 'rb')
            self._tagger = load(input)
            input.close()
            input = open(_test_sents_pickle_file, 'rb')
            self._test_sents = load(input)
            input.close()
            
        # Primitives necessary for training the Brill tagger.
        # Taken from cookbook
        else:
            if train_set == 'treebank':
                tagged_sents = list(treebank.tagged_sents())
            else:
                tagged_sents = list(brown.tagged_sents())
            random.shuffle(tagged_sents)
            split_index = int(round(0.8 * len(tagged_sents)))
            train_sents = tagged_sents[:split_index]
            self._test_sents = tagged_sents[split_index:]
            default_tagger = DefaultTagger('NN')
            tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
            initial_tagger = backoff_tagger(train_sents, tagger_classes, backoff=default_tagger)
            sym_bounds = [(1,1), (2,2), (1,2), (1,3)]
            asym_bounds = [(-1, -1), (1,1)]
            templates = [
                brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, *sym_bounds),
                brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, *sym_bounds),
                brill.ProximateTokensTemplate(brill.ProximateTagsRule, *asym_bounds),
                brill.ProximateTokensTemplate(brill.ProximateWordsRule, *asym_bounds)]

            # Train the tagger
            trainer = brill.FastBrillTaggerTrainer(initial_tagger, templates, deterministic=True)
            self._tagger = trainer.train(train_sents)

            #Pickle the trained tagger
            if not os.path.exists(os.getcwd() + '/pickles/'):
                os.mkdir(os.getcwd() + '/pickles/')
            output = open(_pickle_file, 'wb')
            dump(self._tagger, output, -1)
            output.close()
            output = open(_test_sents_pickle_file, 'wb')
            dump(self._test_sents, output, -1)
            output.close()
Example #58
def test_Phrase():
    import nltk
    from nltk.corpus import treebank
    fileids = treebank.fileids()
    grammar = r"""
    ADVP: {<RB>(<CC>*<RB>*|<JJ>*)}
    """
    for fileid in fileids:
        sents = treebank.tagged_sents(fileid)
        for sent in sents:
            tree_Gram = nltk.RegexpParser(grammar).parse(sent)
            for subtree in tree_Gram.subtrees():
                if subtree.label() == "ADVP":
                    print subtree
Example #59
def _load_penntreebank():

    global penntree_tagged_words, penntree_tagged_sents
    if penntree_tagged_words is None or penntree_tagged_sents is None:
        nltk.download('treebank')
        from nltk.corpus import treebank

        # Organized in sentences and words
        penntree_tagged_sents = treebank.tagged_sents()
        penntree_tagged_words = [
            word for sent in penntree_tagged_sents for word in sent
        ]
    else:
        timestamp_msg('Using already loaded sequences ...')
    return penntree_tagged_sents, penntree_tagged_words
Example #60
def tag_penn(words):
    """
    Tokenizes text by using a Penn Treebank tagged sentence and word tokenizer.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    """

    pt_tagger = UnigramTagger(treebank.tagged_sents())
    tags = pt_tagger.tag(words)

    return tags
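
Example usage (a sketch; words absent from the treebank sample come back with tag None under a plain UnigramTagger):

print(tag_penn(["The", "stock", "market", "fell", "sharply", "."]))
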