def iob_sents(self, fileids=None, categories=None):
    """Return IOB-tagged sentences, resolving *categories* into fileids first.

    Delegates to ``ConllCorpusReader.iob_sents`` after mapping the
    (fileids, categories) pair onto a concrete fileid list.
    """
    resolved_fileids = self._resolve(fileids, categories)
    return ConllCorpusReader.iob_sents(self, resolved_fileids)
# Copyright
# Adapted from:
# https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
from itertools import chain

import pycrfsuite
from nltk.corpus.reader import ConllCorpusReader
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

# CoNLL-2003 English data.  Column layout as declared here: word, POS,
# an ignored column, then the NE/chunk column used for IOB tags.
train = ConllCorpusReader(
    "datasets/conll2003", "eng.train", ["words", "pos", "ignore", "chunk"]
)
test = ConllCorpusReader(
    "datasets/conll2003", "eng.testb", ["words", "pos", "ignore", "chunk"]
)

train_sents = list(train.iob_sents())
test_sents = list(test.iob_sents())


def word2features(sent, i):
    """Return the feature strings for token *i* of *sent*.

    POS-tag features were deliberately removed (left commented out below);
    only surface-form features of the word itself are emitted.
    """
    word = sent[i][0]
    # postag = sent[i][1]  # POS tag intentionally unused in this variant
    features = [
        "bias",
        "word.lower=" + word.lower(),
        "word[-3:]=" + word[-3:],
        "word[-2:]=" + word[-2:],
        "word.isupper=%s" % word.isupper(),
        "word.istitle=%s" % word.istitle(),
        "word.isdigit=%s" % word.isdigit(),
        # 'postag=' + postag,
    ]
    # NOTE(review): the source line was truncated at this point; the upstream
    # crfsuite example continues with previous/next-token context features
    # (BOS/EOS markers) before returning -- confirm against the full file.
    return features
# NOTE(review): this chunk begins mid-method -- the `iob_words` header below is
# reconstructed from the visible closure/LazyMap tail; confirm against the full
# file.  Both methods belong to a ConllCorpusReader subclass.
def iob_words(self, fileids=None, tagset=None):
    """Lazily return (word, pos, ne) tuples for the given fileids."""
    def get_iob_words(grid):
        return self._get_iob_words(grid, tagset)

    return LazyMap(get_iob_words, self._grids(fileids))


def _get_iob_words(self, grid, tagset=None):
    """Extract (word, pos, ne) triples from one sentence grid.

    POS tags are translated when the caller requests a tagset different
    from the corpus's native one.
    """
    pos_tags = self._get_column(grid, self._colmap['pos'])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    return list(
        zip(
            self._get_column(grid, self._colmap['words']),
            pos_tags,
            self._get_column(grid, self._colmap['ne']),
        )
    )


# Script section: build train/test readers over pre-tagged NER files.
bject = ConllCorpusReader(
    "/home/subham", 'train_ner.txt', ('words', 'pos', 'chunk'), ('NP_B', 'PP', 'VP')
)
train_sents = bject.iob_sents('train_ner.txt')

bject1 = ConllCorpusReader(
    "/home/subham", 'test_accuracy.txt', ('words', 'pos', 'chunk'), ('NP_B', 'PP', 'VP')
)
# train_sents=bject.iob_sents('conll.txt')
test_sents = bject1.iob_sents('test_accuracy.txt')

# train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
# test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
# print(test_sents[0])
# print(train_sents[0])


def word2features(sent, i):
    """Build the per-token feature dict for token *i* of *sent*."""
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        # NOTE(review): the source line was truncated immediately after this
        # brace; the upstream crfsuite example fills in bias/word/POS features
        # here and returns the dict -- restore from the original file.
    }
    return features
# NOTE(review): this chunk begins mid-docstring -- the `iob_words` header and
# the opening of its docstring are reconstructed; confirm against the full
# file.  Both methods belong to a ConllCorpusReader subclass.
def iob_words(self, fileids=None, tagset=None):
    """
    :rtype: list(list)
    :param fileids: the list of fileids that make up this corpus
    :type fileids: None or str or list
    """
    self._require(self.WORDS, self.POS)

    def get_iob_words(grid):
        return self._get_iob_words(grid, tagset)

    return LazyMap(get_iob_words, self._grids(fileids))


def _get_iob_words(self, grid, tagset=None):
    """Extract (word, pos) pairs from one sentence grid.

    Unlike the three-column variant, this version deliberately omits the
    NE column -- the reader it serves declares only ('words', 'pos').
    """
    pos_tags = self._get_column(grid, self._colmap['pos'])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    return list(zip(self._get_column(grid, self._colmap['words']), pos_tags))


# Script section: build train/test readers over pre-tagged NER files.
bject = ConllCorpusReader(
    "/home/subham", 'train_ner.txt', ('words', 'pos', 'chunk'), ('NP_B', 'PP', 'VP')
)
train_sents = bject.iob_sents('train_ner.txt')

# BUG FIX: original called undefined name `project(...)`; the parallel reader
# construction above shows ConllCorpusReader was intended.
bject1 = ConllCorpusReader(
    "/home/subham", 'test_file.txt', ('words', 'pos'), ('NP_B', 'PP')
)
# train_sents=bject.iob_sents('conll.txt')
# BUG FIX: original called nonexistent method `iob_sents1`; the reader's API
# method is `iob_sents`.
test_sents = bject1.iob_sents('test_file.txt')
# print(test_sents)

# train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
# test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
# print(test_sents[0])
# print(train_sents[0])


def word2features(sent, i):
    """Build the per-token feature dict for token *i* of *sent*."""
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        # NOTE(review): the source line was truncated immediately after this
        # brace; the upstream crfsuite example fills in bias/word/POS features
        # here and returns the dict -- restore from the original file.
    }
    return features