Example #1
0
 def iob_sents(self, fileids=None, categories=None):
     """Return IOB-tagged sentences for the requested fileids/categories.

     The categories are first mapped onto concrete fileids via
     ``self._resolve`` and the result is handed to the base
     ``ConllCorpusReader.iob_sents`` implementation.
     """
     resolved = self._resolve(fileids, categories)
     return ConllCorpusReader.iob_sents(self, resolved)
	def iob_sents(self, fileids=None, categories=None):
		"""Return IOB-tagged sentences; fileids/categories resolved first."""
		ids = self._resolve(fileids, categories)
		return ConllCorpusReader.iob_sents(self, ids)
Example #3
0
# Copyright
# https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
from itertools import chain

import pycrfsuite
from nltk.corpus.reader import ConllCorpusReader
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

# CoNLL-2003 readers: columns are word / POS / (ignored) / IOB chunk tag.
train = ConllCorpusReader(
    "datasets/conll2003",
    "eng.train",
    ["words", "pos", "ignore", "chunk"],
)
test = ConllCorpusReader(
    "datasets/conll2003",
    "eng.testb",
    ["words", "pos", "ignore", "chunk"],
)

# Materialize the lazy readers into plain lists of IOB-tagged sentences.
train_sents = list(train.iob_sents())
test_sents = list(test.iob_sents())


def word2features(sent, i):
    # remove postag
    word = sent[i][0]
    # postag = sent[i][1]
    features = [
        "bias",
        "word.lower=" + word.lower(),
        "word[-3:]=" + word[-3:],
        "word[-2:]=" + word[-2:],
        "word.isupper=%s" % word.isupper(),
        "word.istitle=%s" % word.istitle(),
        "word.isdigit=%s" % word.isdigit(),
        # 'postag=' + postag,
            return self._get_iob_words(grid, tagset)

        return LazyMap(get_iob_words, self._grids(fileids))

    def _get_iob_words(self, grid, tagset=None):
        """Return (word, pos, ne) triples for one sentence grid.

        If *tagset* is given and differs from the reader's native tagset,
        POS tags are converted with ``map_tag`` before zipping.
        """
        pos = self._get_column(grid, self._colmap['pos'])
        if tagset and tagset != self._tagset:
            pos = [map_tag(self._tagset, tagset, tag) for tag in pos]
        words = self._get_column(grid, self._colmap['words'])
        entities = self._get_column(grid, self._colmap['ne'])
        return list(zip(words, pos, entities))


# Reader over the hand-labelled NER training file.
bject = ConllCorpusReader(
    "/home/subham",
    'train_ner.txt',
    ('words', 'pos', 'chunk'),
    ('NP_B', 'PP', 'VP'),
)
train_sents = bject.iob_sents('train_ner.txt')

# Reader over the held-out accuracy-check file.
bject1 = ConllCorpusReader(
    "/home/subham",
    'test_accuracy.txt',
    ('words', 'pos', 'chunk'),
    ('NP_B', 'PP', 'VP'),
)
#train_sents=bject.iob_sents('conll.txt')
test_sents = bject1.iob_sents('test_accuracy.txt')
#train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
#test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
#print(test_sents[0])
#print(train_sents[0])


def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS)
        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)
        return LazyMap(get_iob_words, self._grids(fileids))
    
    def _get_iob_words(self, grid, tagset=None):
        """Return (word, pos) pairs for one sentence grid.

        POS tags are converted with ``map_tag`` when *tagset* differs from
        the reader's native tagset.

        NOTE(review): unlike the three-column ConllCorpusReader variant,
        this returns only (word, pos) — no 'ne' column. Confirm intended.
        """
        tags = self._get_column(grid, self._colmap['pos'])
        if tagset and tagset != self._tagset:
            tags = [map_tag(self._tagset, tagset, tag) for tag in tags]
        tokens = self._get_column(grid, self._colmap['words'])
        return list(zip(tokens, tags))
# NOTE(review): `project` and `iob_sents1` are not defined in this chunk —
# presumably a custom ConllCorpusReader subclass defined elsewhere in the
# file (the two-column `_get_iob_words` above matches it); confirm.
bject = ConllCorpusReader(
    "/home/subham",
    'train_ner.txt',
    ('words', 'pos', 'chunk'),
    ('NP_B', 'PP', 'VP'),
)
train_sents = bject.iob_sents('train_ner.txt')

bject1 = project(
    "/home/subham",
    'test_file.txt',
    ('words', 'pos'),
    ('NP_B', 'PP'),
)
#train_sents=bject.iob_sents('conll.txt')
test_sents = bject1.iob_sents1('test_file.txt')
#print(test_sents)
#train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
#test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
#print(test_sents[0])
#print(train_sents[0])


def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {