Example #1
0
 def initializeConll(self):
     from nltk.corpus import conll2000
     self.test_sents = conll2000.chunked_sents('test.txt',
                                               chunk_types=['NP'])
     self.train_sents = conll2000.chunked_sents('train.txt',
                                                chunk_types=['NP'])
     self.NPChunker = ChunkParser(self.train_sents)
def main():
    # Train with the CoNLL-2000 chunking corpus
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])

    chunker = ConsecutiveNPChunker(train_sents)
    print(chunker.evaluate(test_sents))
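The two snippets above construct ChunkParser and ConsecutiveNPChunker objects without defining them; a tagger-based ChunkParser is defined inline further down in this listing, and ConsecutiveNPChunker is normally the classifier-based chunker from chapter 7 of the NLTK book. The sketch below follows that book pattern but substitutes nltk.NaiveBayesClassifier and a single-POS feature set, so the class bodies and feature function here are assumptions, not the original projects' code.

import nltk

def npchunk_features(sentence, i, history):
    # Minimal feature set: just the POS tag of the current token.
    word, pos = sentence[i]
    return {"pos": pos}

class ConsecutiveNPChunkTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        # train_sents: sentences of ((word, pos), chunk_tag) pairs.
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # Convert each chunk tree into ((word, pos), iob_tag) pairs for training.
        tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged]
        return nltk.chunk.conlltags2tree(conlltags)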
Example #3
0
def train_ai():
    print("Training...")
    train_sents = conll2000.chunked_sents('train.txt',
                                          chunk_types=['NP', 'VP', 'PP'])
    test_sents = conll2000.chunked_sents('test.txt',
                                         chunk_types=['NP', 'VP', 'PP'])
    return BigramChunker(train_sents)
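train_ai() returns a BigramChunker that the snippet does not define. In the NLTK book this is a one-line subclass of a unigram-tagger-based chunker; the sketch below reproduces that book interface and is an assumption about what this project's class looks like.

import nltk

class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # Learn the most likely IOB chunk tag for each POS tag.
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

class BigramChunker(UnigramChunker):
    def __init__(self, train_sents):
        # Same idea, but the tagger also conditions on the previous POS tag.
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)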
Example #4
0
def exercise3():
    print("part a")
    test_sents = conll2000.chunked_sents('train.txt')[:99]
    grammar = r"""
                 NP: {<DT>?<JJ>*<NN>}
                     {<VBD>?<IN>?<JJ>*<NNS>}
                    """
    cp = nltk.RegexpParser(grammar)
    print(cp.evaluate(test_sents))
    print("part b")
    test_sents = "Many little dogs barked at cats"
    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print("Baseline with no chunks : ", cp.evaluate(test_sents))

    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    print("IOB tag evaluation: ", cp.evaluate(test_sents))

    print("part c")
    test_sents = conll2000.chunked_sents('train.txt')[:99]
    grammar = r"""
                 NP: {<DT>?<JJ>*<NN>}
                     {<VBD>?<IN>?<JJ>*<NNS>}
                     {<[CDJNP].*>+}
                    """
    cp = nltk.RegexpParser(grammar)
    print(cp.evaluate(test_sents))
def get_noun_phrases_and_named_entities_data(data):
    # print data
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    chunker = BigramChunker.BigramChunker(train_sents + test_sents)

    tagged_data = []
    for sent in data:
        tokens = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(tokens)
        tagged_data.append(tagged)

    noun_phrases = []
    for tagged_sent in tagged_data:
        tree = chunker.parse(tagged_sent)
        noun_phrases += nltk.chunk.tree2conlltags(tree)

    named_entities = []
    for tagged_sent in tagged_data:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    words = []
    cnt = 0
    for sent in data:
        cnt += 1
        tokens = nltk.word_tokenize(sent)
        for token in tokens:
            words.append((token, cnt))

    # print words
    # print noun_phrases
    # print named_entities

    return (words, noun_phrases, named_entities)
Example #6
0
def exercise3():
    #Carry out the following evaluation tasks for the chunker you have developed in question 2
    # set variables
    chunk_types = ['NP', 'NNS']  #'JJ', 'NNS', 'VBD', 'IN'
    test_sents = "Many little dogs barked at cats"
    #test_sents = conll2000.chunked_sents('test.txt', chunk_types=chunk_types)
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=chunk_types)

    # establishing a baseline for the trivial chunk parser cp that creates no chunks
    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=chunk_types)
    print("Baseline with no chunks", cp.evaluate(test_sents))

    grammar = r"NP: {<[CDJNP].*>+}"  #tags beginning with letters that are characteristic of noun phrase tags (e.g. CD, DT, and JJ)
    cp = nltk.RegexpParser(grammar)
    print("IOB tag evaluation", cp.evaluate(test_sents))

    # UnigramChunker
    unigram_chunker = UnigramChunker(train_sents)
    print("UnigramChunker", unigram_chunker.evaluate(test_sents))

    # BiGramChunker
    bigram_chunker = BigramChunker(train_sents)
    print("BigramChunker", bigram_chunker.evaluate(test_sents))

    # ConsecutiveNPChunker
    ngram_chunker = ConsecutiveNPChunker(train_sents)
    print("ConsecutiveNPChunker", ngram_chunker.evaluate(test_sents))
Example #7
0
    def __init__(self):

        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        ctagged_sents = [[((w,t),c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] for sent in train_sents]
        test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
        self._test_sents = [[((w,t), c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] for sent in test_sents]
        self._tagger = ClassifierBasedTagger(train=ctagged_sents, feature_detector=npchunk_features)
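This __init__ hands ClassifierBasedTagger a feature_detector called npchunk_features that is defined elsewhere in the project. NLTK invokes it as feature_detector(tokens, index, history), where tokens are the sentence's (word, pos) pairs and history holds the chunk tags already predicted. One plausible implementation, a richer variant of the minimal feature function sketched earlier (the exact features are an assumption):

def npchunk_features(tokens, index, history):
    word, pos = tokens[index]
    if index == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = tokens[index - 1]
    return {
        "word": word,
        "pos": pos,
        "prevpos": prevpos,
        "prevpos+pos": "%s+%s" % (prevpos, pos),
    }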
Example #8
0
def evaluate():

    text = '''
    he PRP B-NP
    accepted VBD B-VP
    the DT B-NP
    position NN I-NP
    of IN B-PP
    vice NN B-NP
    chairman NN I-NP
    of IN B-PP
    Carlyle NNP B-NP
    Group NNP I-NP
    , , O
    a DT B-NP
    merchant NN I-NP
    banking NN I-NP
    concern NN I-NP
    . . O
    '''

    nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()

    print(conll2000.chunked_sents('train.txt')[99])
    print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])
def main():
    train_sents = (nltk.chunk.tree2conlltags(s) for s in conll2000.chunked_sents('train.txt', chunk_types=['NP']))
    # test_sents = (nltk.chunk.tree2conlltags(s) for s in conll2000.chunked_sents('test.txt', chunk_types=['NP']))
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

    fd = np_tags_fd(train_sents)
    print_frequencies(fd, num_results=50)
    # pattern = regex_generator(fd)
    # print pattern
    # pattern = r"NP: {<NN>}"

    print(nltk.RegexpParser("").evaluate(test_sents))
    print('')

    pattern_book = r"NP: {<[CDJNP].*>+}"
    print(nltk.RegexpParser(pattern_book).evaluate(test_sents))
    print('')

    pattern_modified = r"NP: {<(\$)>?<[CDJNP].*>+}"
    print(nltk.RegexpParser(pattern_modified).evaluate(test_sents))
    print('')

    pattern_modified = r"""NP: {<(\$)>?<[CDJNP].*>+}
                               {<W(P|DT)>}"""
    print(nltk.RegexpParser(pattern_modified).evaluate(test_sents))
Example #10
0
def chunk_with_unigram_tagger():
  # use unigram tagger to find the IOB tag given its POS tag
  from nltk.corpus import conll2000
  test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
  train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
  unigram_chunker = UnigramChunker(train_sents)
  print(unigram_chunker.evaluate(test_sents))
  postags = sorted(set(pos for sent in train_sents
                           for (word, pos) in sent.leaves()))
  print(unigram_chunker.tagger.tag(postags))
Example #11
0
def chunker_sample7():
    """
    Classifier-based chunker example.
    :return:
    """
    train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
    test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    tagged_sents = [[((w, t), c)
                     for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                    for sent in train_sents]
    chunker = ClassifierChunker(tagged_sents)
    print(chunker.evaluate(test_sents))
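ClassifierChunker is not defined in this snippet. Judging by the ((word, pos), chunk_tag) training format, it is presumably a ChunkParserI wrapper around nltk's ClassifierBasedTagger, much like Example #7; the class name, feature set and defaults below are guesses in that spirit.

import nltk
from nltk.tag.sequential import ClassifierBasedTagger

class ClassifierChunker(nltk.ChunkParserI):
    def __init__(self, tagged_sents):
        # tagged_sents: sentences of ((word, pos), chunk_tag) pairs.
        self.tagger = ClassifierBasedTagger(
            train=tagged_sents,
            feature_detector=lambda tokens, index, history: {
                "word": tokens[index][0],
                "pos": tokens[index][1],
            })

    def parse(self, sentence):
        chunked = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in chunked]
        return nltk.chunk.conlltags2tree(conlltags)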
def chunked_sents():
    print(conll2000.chunked_sents('train.txt')[99])
    # (S
    #   (PP Over/IN)
    #   (NP a/DT cup/NN)
    #   (PP of/IN)
    #   (NP coffee/NN)
    #   ,/,
    #   (NP Mr./NNP Stone/NNP)
    #   (VP told/VBD)
    #   (NP his/PRP$ story/NN)
    #   ./.)
    print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])
Example #13
0
 def _load_data():
     try:
         train_set = conll2000.chunked_sents('train.txt')
         test_set = conll2000.chunked_sents('test.txt')
     except Exception:
         if license_prompt('CONLL2000 data set', 'http://www.nltk.org/nltk_data/') is False:
             sys.exit(0)
         nltk.download('conll2000')
         train_set = conll2000.chunked_sents('train.txt')
         test_set = conll2000.chunked_sents('test.txt')
     train_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in train_set]
     test_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in test_set]
     return train_data, test_data
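Each element of the returned train_data / test_data is a 3-tuple of aligned sequences for one sentence: words, POS tags and IOB chunk tags. A quick sanity check, assuming _load_data() is callable as shown (in its original project it is a static method); the printed values are only illustrative, taken from the first CoNLL-2000 training sentence:

train_data, test_data = _load_data()
words, pos_tags, chunk_tags = train_data[0]
print(words[:4])       # e.g. ('Confidence', 'in', 'the', 'pound')
print(pos_tags[:4])    # e.g. ('NN', 'IN', 'DT', 'NN')
print(chunk_tags[:4])  # e.g. ('B-NP', 'B-PP', 'B-NP', 'I-NP')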
def main(convert_func = None):        
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

    if convert_func:
        # transform the sentence
        print("convert leaf nodes")
        test_sents = [convert_leaf_node(sent, convert_func)
                      for sent in test_sents]
    print("train...")
    chunker = ConsecutiveNPChunker(train_sents)
    print("evaluate...")
    print(chunker.evaluate(test_sents))
Example #15
0
def main(convert_func=None):
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

    if convert_func:
        # transform the sentence
        print("convert leaf nodes")
        test_sents = [
            convert_leaf_node(sent, convert_func) for sent in test_sents
        ]
    print("train...")
    chunker = ConsecutiveNPChunker(train_sents)
    print("evaluate...")
    print(chunker.evaluate(test_sents))
def chunker(sent):

    #a = [("I","PRP"),("hear","VBP"),("Jerusalem","NNP"),("bells","NNS"),("ringing","VBG")]
    #input_sent = " Rockwell said the agreement calls for it to supply 200 addititonal so-called shipsets for the planes."
    input_sent = sent
    text = nltk.word_tokenize(input_sent)
    a = nltk.pos_tag(text)
    phrases = []

    tup = ()
    '''test_sents = conll2000.chunked_sents('test.txt', chunk_types=['VP'])
	train_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])
	test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])'''
    NP_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    VP_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])

    class ChunkParser(nltk.ChunkParserI):
        def __init__(self, train_sents):
            train_data = [[(t, c)
                           for w, t, c in nltk.chunk.tree2conlltags(sent)]
                          for sent in train_sents]
            self.tagger = nltk.TrigramTagger(train_data)

        def parse(self, sentence):
            pos_tags = [pos for (word, pos) in sentence]
            tagged_pos_tags = self.tagger.tag(pos_tags)
            chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
            conlltags = [(word, pos, chunktag)
                         for ((word, pos),
                              chunktag) in zip(sentence, chunktags)]
            return nltk.chunk.util.conlltags2tree(conlltags)

    NPChunker = ChunkParser(NP_sents)
    VPChunker = ChunkParser(VP_sents)
    #print (NPChunker.parse("I hear Jerusalem bells ringing"))
    parsed_sent = NPChunker.parse(a)
    for i in parsed_sent:
        if (type(i) != type(tup)):
            l = []
            for t in tuple(i):
                l.append(t[0])
            phrases.append({"NP": " ".join(l)})
    parsed_sent = VPChunker.parse(a)
    for i in parsed_sent:
        if (type(i) != type(tup)):
            l = []
            for t in tuple(i):
                l.append(t[0])
            phrases.append({"VP": " ".join(l)})
    return phrases
Example #17
0
def main():
    # Train with the CoNLL-2000 chunking corpus
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    # chunker = UnigramChunker(train_sents)
    chunker = BigramChunker(train_sents)

    print(chunker.evaluate(test_sents))
    # ChunkParse score:
    #     IOB Accuracy:  92.9%
    #     Precision:     79.9%
    #     Recall:        86.8%
    #     F-Measure:     83.2%
    postags = sorted(
        set(pos for sent in train_sents for (word, pos) in sent.leaves()))
    print(chunker.tagger.tag(postags))
Example #18
0
def exercise3():
    print("Exercise - 3")
    grammar1 = r"""
    NP: {<DT>?<JJ><NNS>}
        {<CD><NNS>}
    """
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])[:100]
    cp1 = nltk.RegexpParser(grammar1)
    res1 = cp1.evaluate(test_sents)
    print("Statistics data for custom chunker")
    print(res1)
    print()


    cp2 = nltk.RegexpParser("")
    res2 = cp2.evaluate(test_sents)
    print("Statistics data for baseline chunker")
    print(res2)
    print()

    grammar3 = r"""
    NP: {<DT>?<JJ><NNS>}
        {<CD><NNS>}
        {<DT><NN>}
    """
    cp3 = nltk.RegexpParser(grammar3)
    res3 = cp3.evaluate(test_sents)
    print("Statistics data for custom chunker with added regular expression: {<DT><NN>}")
    print(res3)
    print()
Example #19
0
 def __init__(self):
   try:
     tagger = pickle.load(open('nerdb_tagger.pkl', 'rb'))
   except IOError:
     print('failed to load nerdb_tagger, recreating...')
     train_sents = conll2000.tagged_sents() + brown.tagged_sents()
     tagger = nltk.DefaultTagger('NN')
     tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
     tagger = nltk.BigramTagger(train_sents, backoff=tagger)
     tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
     pickle.dump(tagger, open('nerdb_tagger.pkl', 'wb'))
     print('done')
   try:
     chunker = pickle.load(open('nerdb_chunker.pkl', 'rb'))
   except IOError:
     print('failed to load nerdb_chunker, recreating...')
     train_sents = conll2000.chunked_sents()
     chunker = ConsecutiveNPChunker(tagger, train_sents)
     pickle.dump(chunker, open('nerdb_chunker.pkl', 'wb'))
     print('done')
   self.chunker = chunker
   self.people = [line.strip().split(" ", 1) for line in open('actors_index.txt').readlines()]
   self.people += [line.strip().split(" ", 1) for line in open('actresses_index.txt').readlines()]
   self.movies = [line.strip().split(" ", 1) for line in open('title_index.txt').readlines()]
   self.entity_types = {'PERSON' : self.people, 'MOVIE' : self.movies}
Example #20
0
def regexp_parser_sample5():
    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)

    # Load the NP chunks from the training file
    test_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
    print(cp.evaluate(test_sents))
Example #21
0
def simple_np_bgram(documents):
	bgram = BigramChunker(conll2000.chunked_sents('train.txt'))
	for doc in documents:
		buf = []
		for sent in pos.preprocess(doc):
			buf.append(bgram.parse(sent))
		yield buf
Example #22
0
def train_parser():
    """
    Train the chunker.
    """

    # A simple chunker that extracts NNP (proper nouns)
    def mySimpleChunker():
        grammar = 'NP: {<NNP>+}'
        return nltk.RegexpParser(grammar)

    # Extracts nothing; only used to check that the pipeline runs
    def test_nothing(data):
        cp = nltk.RegexpParser("")
        print(cp.evaluate(data))

    # Test the mySimpleChunker() function
    def test_mySimpleChunker(data):
        schunker = mySimpleChunker()
        print(schunker.evaluate(data))

    datasets = [
        conll2000.chunked_sents('test.txt', chunk_types=['NP']),
        treebank_chunk.chunked_sents(),
    ]

    # Compute the chunkers' accuracy on the first 50 IOB-tagged sentences
    for dataset in datasets:
        test_nothing(dataset[:50])
        print('---------------------')
        test_mySimpleChunker(dataset[:50])
        print()
Example #23
0
 def __init__(self):
     super().__init__()
     nltk.download("conll2000")
     nltk.download("averaged_perceptron_tagger")
     data = conll2000.chunked_sents()
     train_data = data[:10900]
     self.model = ClassifierChunkParser(train_data)
Example #24
0
 def __init__(self):
     try:
         tagger = pickle.load(open("nerdb_tagger.pkl", "rb"))
     except IOError:
         print("failed to load nerdb_tagger, recreating...")
         train_sents = conll2000.tagged_sents() + brown.tagged_sents()
         tagger = nltk.DefaultTagger("NN")
         tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
         tagger = nltk.BigramTagger(train_sents, backoff=tagger)
         tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
         pickle.dump(tagger, open("nerdb_tagger.pkl", "wb"))
         print("done")
     try:
         chunker = pickle.load(open("nerdb_chunker.pkl", "rb"))
     except IOError:
         print("failed to load nerdb_chunker, recreating...")
         train_sents = conll2000.chunked_sents()
         chunker = ConsecutiveNPChunker(tagger, train_sents)
         pickle.dump(chunker, open("nerdb_chunker.pkl", "wb"))
         print("done")
     self.chunker = chunker
     self.people = [line.strip().split(" ", 1) for line in open("actors_index.txt").readlines()]
     self.people += [line.strip().split(" ", 1) for line in open("actresses_index.txt").readlines()]
     self.movies = [line.strip().split(" ", 1) for line in open("title_index.txt").readlines()]
     self.entity_types = {"PERSON": self.people, "MOVIE": self.movies}
     self.numbers = eval(open("numbers.txt").read())
Example #25
0
def classifier_based_parser(input_dict):
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    classifier_based_chunker = ClassifierBasedChunkParser(train_sents)
    return {'chunker':
                {'object': classifier_based_chunker,
                 'function': 'parse', }
    }
def get_noun_phrases_and_named_entities(file_name, start_index, end_index):

    sentences = conll2000.sents(file_name)
    noun_phrase_sentences = conll2000.chunked_sents(file_name, chunk_types=['NP'])
    pos_tagged_sentences = conll2000.tagged_sents(file_name)

    sentences = sentences[start_index:end_index]
    pos_tagged_sentences = pos_tagged_sentences[start_index:end_index]
    noun_phrase_sentences = noun_phrase_sentences[start_index:end_index]

    # Extracting mentions.
    words = []
    cnt = 0
    for sent in sentences:
        cnt += 1
        for word in sent:
            words.append((word, cnt))

    noun_phrases = []
    for sent in noun_phrase_sentences:
        noun_phrases += nltk.chunk.tree2conlltags(sent)

    named_entities = []
    for tagged_sent in pos_tagged_sentences:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    return (words, noun_phrases, named_entities)
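Both helpers above flatten trees with nltk.chunk.tree2conlltags, which yields one (word, pos, iob) triple per token; for the named-entity pass the IOB labels carry the entity type. A small illustration (ne_chunk needs the pretrained NLTK NE model, and the labels in the comment are only indicative):

import nltk

tagged = [("Mr.", "NNP"), ("Stone", "NNP"), ("told", "VBD"),
          ("his", "PRP$"), ("story", "NN"), (".", ".")]
tree = nltk.chunk.ne_chunk(tagged)
print(nltk.chunk.tree2conlltags(tree))
# e.g. [('Mr.', 'NNP', 'O'), ('Stone', 'NNP', 'B-PERSON'), ('told', 'VBD', 'O'),
#       ('his', 'PRP$', 'O'), ('story', 'NN', 'O'), ('.', '.', 'O')]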
Example #27
0
def evaluate_chunker():
  from nltk.corpus import conll2000
  cp = nltk.RegexpParser("") # baseline
  test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
  print(cp.evaluate(test_sents))
  grammar = r"NP: {<[CDJNP].*>+}"
  cp1 = nltk.RegexpParser(grammar) # naive tagger, look for all tags in NP chunk
  print(cp1.evaluate(test_sents))
Example #28
0
 def __init__(self):
     try:
         self.unigram_chunker = pickle.load(open('chunker.pkl', 'rb'))
     except (EOFError, IOError):
         train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
         self.unigram_chunker = ConsecutiveNPChunker(train_sents)
         with open('chunker.pkl', 'wb') as f:
             pickle.dump(self.unigram_chunker, f, -1)
Example #29
0
def train_chunker(filesDir):
    # Create chunked sentences in the CoNLL format.
    train_sents = conll2000.chunked_sents('train_locations.txt', chunk_types=['Loc'])

    # Train the chunker with the NaiveBayesClassifier
    chunker = ConsecutiveNPChunker(train_sents, combine_features, nltk.NaiveBayesClassifier)

    return chunker
Example #30
0
 def _build_training_sents(self ):
     # This method randomly selects a corpus from the provided list and then
     # builds and returns the training sentences that the chunkers will use
     corpuses = [(conll2000,'train.txt'),(conll2002,'esp.train')]
     #trainer = random.choice(corpuses)
     #train_sents = trainer[0].chunked_sents(trainer[1],chunk_types=['NP'])
     train_sents = conll2000.chunked_sents('train.txt',chunk_types=['NP'])
     return train_sents
Example #31
0
def drawParse(text):
    sentences = posTagging(text)

    # test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
    chunker = ChunkParser(train_sents)

    for s in sentences:
        chunker.parse(s).draw()
Example #32
0
def simple_np_ugram(documents):
	ugram = UnigramChunker(conll2000.chunked_sents('train.txt'))

	"""String sentences get split up into a datastructure"""
	for doc in documents:
		buf = []
		for sent in pos.preprocess(doc):
			buf.append(ugram.parse(sent))
		yield buf
Example #33
0
def chunking():
    train_sents = conll2000.chunked_sents('train.txt')
    train_data = [[w for w, t, c in nltk.chunk.tree2conlltags(sent)]
                  for sent in train_sents]
    train_label = [[c for w, t, c in nltk.chunk.tree2conlltags(sent)]
                   for sent in train_sents]

    # now append chunking to the front of each group/string
    return train_data, train_label
Example #34
0
    def __init__(self, stopWordPath=False, megamPath=False):
        '''
        Initialise the class.
        The method initialises
        a. self.texts = a list of texts contained in the corpus
        b. self.IDS : the id of each text
        c. self.stopWords : a list of any additional stopwords
        d. self.Lemmatizer : a defaultdict to stem words. It returns 'n' for each token, except for verbs, for which it returns 'v'
        ---------------------------
        KeyWord arguments:
        i. stopWordPath : a file path to load extra stop words
        ii. megamPath : a path to the megam binary to train the chunker

        '''
        ## a container of the texts in the corpus
        self.texts = []
        ## input files
        self.in_files = []
        ## ids of the texts
        self.IDS = []
        ## stopwords
        self.stopWords = []
        if stopWordPath:
            ## a path to a text file containing a list of
            ## stop words
            self.stopWordPath = stopWordPath
            self.stopWords = [
                l.strip() for l in open(self.stopWordPath).readlines()
            ]
        ## initialise the dict for stemming
        self.Lemmatizer = collections.defaultdict(nounDict)
        self.Lemmatizer['v'] = 'v'

        ## try to load a trained chunker
        trainPath = pkg_resources.resource_filename('MyLanguageCorpus', '')
        if os.path.exists('%s/trainedChunker.pkl' % trainPath):

            fin = open('%s/trainedChunker.pkl' % trainPath, 'rb')
            self.chunker = pickle.load(fin)
            fin.close()

        else:
            train_sents = conll2000.chunked_sents('train.txt',
                                                  chunk_types=['NP'])

            print("Training chunker...")
            if 'MEGAM' in os.environ:
                MEGAM = os.environ['MEGAM']
            elif megamPath:
                MEGAM = megamPath
                os.environ['MEGAM'] = megamPath

            chunker = ConsecutiveNPChunker(train_sents)
            fout = open('%s/trainedChunker.pkl' % trainPath, 'wb')
            pickle.dump(chunker, fout)
            fout.close()
            self.chunker = chunker
Example #35
0
def simpleEvaluation():

    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print(cp.evaluate(test_sents))

    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    print(cp.evaluate(test_sents))
Example #36
0
 def _load_data():
     try:
         train_set = conll2000.chunked_sents('train.txt')
         test_set = conll2000.chunked_sents('test.txt')
     except Exception:
         if license_prompt('CONLL2000 data set',
                           'http://www.nltk.org/nltk_data/') is False:
             sys.exit(0)
         nltk.download('conll2000')
         train_set = conll2000.chunked_sents('train.txt')
         test_set = conll2000.chunked_sents('test.txt')
     train_data = [
         list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in train_set
     ]
     test_data = [
         list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in test_set
     ]
     return train_data, test_data
Example #37
0
def train_unigram(fichero):
    corpus_comida = conll2000.chunked_sents(fichero,
                                            chunk_types=['COMIDA', 'CANTIDAD'])
    print(corpus_comida)
    train_data = [[(w, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                  for sent in corpus_comida]
    print(train_data)
    tagger = nltk.UnigramTagger(train_data)
    return tagger
Example #38
0
def drawParse(text):
    sentences = posTagging(text)

    #test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    chunker = ChunkParser(train_sents)

    for s in sentences:
        chunker.parse(s).draw()
Example #39
0
 def __init__(self, POS):
     '''      
     @param POS: the POS tagger is passed through 
     '''
     train_sents = conll2000.chunked_sents()
     train_data = [[(t, c) for w, t, c in tree2conlltags(sent)]
                   for sent in train_sents]
     self.T = nltk.TrigramTagger(train_data)
     self.Tagger = POS
     self.tmp = []
Example #40
0
 def __init__(self, POS):
     '''      
     @param POS: the POS tagger is passed through 
     '''
     train_sents = conll2000.chunked_sents()
     train_data = [[(t, c) for w, t, c in tree2conlltags(sent)]
                   for sent in train_sents]
     self.T = nltk.TrigramTagger(train_data)
     self.Tagger = POS
     self.tmp = []
Example #41
0
def ch07_07_chunker_eval():
  from nltk.corpus import conll2000
  grammar = r"""
    NP: {<NN.*>}
       {<DT> <NN> <JJ> <NN>}
       {<DT> <JJ>* <NN.*>}
       {<POS> <JJ>* <NN>}
       {<NNP> <CC> <NNP>}
  """
  cp = nltk.RegexpParser(grammar)
  test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
  print(cp.evaluate(test_sents))
    def load_nltk_chunked_sentences(cls):
        """
        Load CONLL2000 chunktagged sentences and convert from nltk.tree format
        Returns:
            List of lists where inner list is [(pos, chunk_tag), ... ] representing one sentence
        """
        train_sents = [
            [(pos_tag, chunk_tag) for word, pos_tag, chunk_tag in nltk.chunk.tree2conlltags(sentence)]
            for sentence in conll2000.chunked_sents()
                ]

        return cls(train_sents)
Example #43
0
def ch07_13c_better_chunker():
  # can be improved with more patterns from the top from previous method
  from nltk.corpus import conll2000
  grammar = r"""
  NP : {<DT> <JJ> <NN.*>}
       {<DT> <NN.*>}
       {<JJ> <NN.*>}
       {<NN.*>+}
  """
  cp = nltk.RegexpParser(grammar)
  test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
  print(cp.evaluate(test_sents))
Example #44
0
def ch07_07_chunker_eval():
    from nltk.corpus import conll2000
    grammar = r"""
    NP: {<NN.*>}
       {<DT> <NN> <JJ> <NN>}
       {<DT> <JJ>* <NN.*>}
       {<POS> <JJ>* <NN>}
       {<NNP> <CC> <NNP>}
  """
    cp = nltk.RegexpParser(grammar)
    test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    print(cp.evaluate(test_sents))
Example #45
0
def ch07_13c_better_chunker():
    # can be improved with more patterns from the top from previous method
    from nltk.corpus import conll2000
    grammar = r"""
  NP : {<DT> <JJ> <NN.*>}
       {<DT> <NN.*>}
       {<JJ> <NN.*>}
       {<NN.*>+}
  """
    cp = nltk.RegexpParser(grammar)
    test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    print(cp.evaluate(test_sents))
Example #46
0
    def run_on_corpus(self, corpus):
        """Write sentences to a temporary file as strings of words, run
        TagChunk on the file and retrieve the tagged results, then delete the
        file.
        """
        # Check if the corpus consists of Sentences or MultiSentences, and
        # get a single list of Sentences either way
        sentences = []
        if corpus[0].__class__ == text.sentence.MultiSentence:
            for multisentence in corpus:
                # Collect the Sentence objects from each MultiSentence
                sentences.extend(multisentence.sentences)
        else:
            sentences = corpus

        train_sents = conll2000.chunked_sents('train.txt',
                                              chunk_types=['NP', 'VP', 'PP'])
        unigram_chunker = UnigramChunker(train_sents)

        strings_BIO = []

        for sentence in sentences:
            sentence_text = ' '.join(sentence.tokens)
            tags = [t[1] for t in nltk.pos_tag(sentence_text.split())]

            sentence_arr = sentence_text.split(" ")
            bio = unigram_chunker.tagger.tag(tags)

            temp = ""
            for i in range(0, len(sentence_arr)):
                if str(bio[i][1]) == "O":
                    temp += sentence_arr[i] + "_" + bio[i][0] + "_B-" + str(
                        bio[i][1]) + " "
                else:
                    temp += sentence_arr[i] + "_" + bio[i][0] + "_" + str(
                        bio[i][1]) + " "

            temp = temp[:-1]
            strings_BIO.append(temp)

        #print(strings_BIO)

        # Process sentence
        for sentence, string_BIO in zip(sentences, strings_BIO):
            #print("debugging")
            pos_tags, chunks = self.process_BIO_string(string_BIO)

            sentence.add_token_tags(pos_tags,
                                    name='pos_tags',
                                    annotator='chunker')

            sentence.add_span_tags(chunks, name='chunks', annotator='chunker')
Example #47
0
def ch07_03_develop_grammar_with_chunkparser():
  # nltk.app.chunkparser()
  from nltk.corpus import conll2000
  grammar = r"""
    NP: {<NN.*>}
       {<DT> <NN> <JJ> <NN>}
       {<DT> <JJ>* <NN.*>}
       {<POS> <JJ>* <NN>}
       {<NNP> <CC> <NNP>}
  """
  cp = nltk.RegexpParser(grammar)
  for sentence in conll2000.chunked_sents("train.txt", chunk_types=["NP"]):
    print(cp.parse(sentence))
Example #48
0
def process_file(input_file):
	file_text=''
	with open(input_file, 'r', encoding='utf-8') as content_file:
	  file_text = content_file.read()

	#sentences= nltk.sent_tokenize(file_text)
	sentences=file_text.split('\n')
	noun_phrases=[] 
	train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
	test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP']) 
	chunker = BigramChunker(train_sents)
	print (chunker.evaluate(test_sents))
	for sent in sentences:
	  if not sent:
	    continue
	  tokens = nltk.word_tokenize(sent)
	  if len(tokens)>0:
	    tagged = nltk.pos_tag(tokens)
	    chunked = chunk_np(tagged)
	    #chunked = chunker.parse(tagged)
	    #chunked.draw()
	    utils.traverse(chunked)
	  
	  """
Example #49
0
def ch07_13a_tag_seqs_for_np():
  from nltk.corpus import conll2000
  train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
  fdist = nltk.FreqDist()
  tagseq = []
  for sent in train_sents:
    for word, postag, iobtag in nltk.chunk.tree2conlltags(sent):
      if iobtag == "B-NP":
        fdist[" ".join(tagseq)] += 1
        tagseq = []
        tagseq.append(postag)
      elif iobtag == "O":
        continue
      else:
        tagseq.append(postag)
  for tagseq in fdist.keys():
    print(tagseq, fdist[tagseq])
Example #50
0
    def __init__(self, **kwargs):
        super(StoryApp, self).__init__(**kwargs)

        self.model = models.Model()
        self.parser = strategy.Parser(
            stemmer=nltk.PorterStemmer(),
            sentence_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),

            # todo: analyze more chunk types
            # possible chunk types:
            #       NP (noun phrase)
            #       VP (such as 'has already delivered')
            #       PP (such as 'because of')
            chunker=chunkers.UnigramChunker(conll2000.chunked_sents('train.txt', chunk_types=['NP']))
        )
        self.entity_resolver = strategy.EntityResolutionStrategy()

        self.window = None
Example #51
0
def test_chunker(filesDir, classifier):
    # Create chunked sentences in the CoNLL format.
    test_sents = conll2000.chunked_sents('test_locations.txt', chunk_types=['Loc'])
    print(classifier.evaluate(test_sents))

    text = conll2000.raw('test_data_normal.txt')

    location_list = []
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    for sent in sent_tokenizer.tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                if chunk.label() == "GPE":
                    location = ' '.join(c[0] for c in chunk.leaves())
                    location_list.append(location)

    print(location_list)
Example #52
0
    def __init__(self,
                 binary=False,
                 extract_noun_phrases=False,
                 first_sentence_weight=1):

        self.columns = FEATURE_COLUMNS
        self.binary = binary
        self.extract_noun_phrases = extract_noun_phrases
        self.first_sentence_weight = first_sentence_weight
        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        self.chunker = ChunkParser(train_sents)
        opts = ''
        if binary:
            opts = opts + 'binary '
        if extract_noun_phrases:
            opts = opts + 'extract_noun_phrases '
        if first_sentence_weight > 1:
            opts = opts + 'upweight_first_sentence '
        self.name = opts
Example #53
0
def run(q_id):
    train_sents = conll2000.chunked_sents('train.txt')
    unigram_chunker = UnigramChunker(train_sents)

    import init
    #get document here and tag; put into this format:
    #tagged = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN"),(".", ".")]
    topdoc = init.get_corpus(q_id)
    doc_nums = topdoc.keys()
    answers= [];
    for key in doc_nums:
        doc_text = topdoc[key]
        docnum= key
        #print docnum
        doc_text = clean_punctuation(doc_text)
        #print doc_text
        doc_text= doc_text.split()
        tagged=pos_tag(doc_text)

    
        chunked=unigram_chunker.parse2(tagged)
        flatten= chunked.pos()
        #print flatten
        numbered= enumerate(flatten)
        currentTag=''
        words=[]
        for i,v in numbered:
            #print i,v
            ((word,tag),phrasetag)=v
            if currentTag=='':
                currentTag=phrasetag
            if currentTag==phrasetag:
                words.append(word)
            else:
                answers.append((' '.join(words),docnum,i-len(words),currentTag,q_id))
                currentTag= phrasetag
                words= [word]
        answers.append((' '.join(words),docnum,i-len(words),currentTag,q_id))
        #print answers
          
    return answers
Example #54
0
def demo():
    """
    A demonstration for the C{RegexpChunkParser} class.  A single text is
    parsed with four different chunk parsers, using a variety of rules
    and strategies.
    """

    from nltk import chunk, Tree

    text = """\
    [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
    [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
    [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
    """

    print('*' * 75)
    print('Evaluation text:')
    print(text)
    print('*' * 75)
    print()

    grammar = r"""
    NP:                   # NP stage
      {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
      {<NNP>+}            # chunk proper nouns
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    grammar = r"""
    NP: {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>}       # VP = verb words
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

# Evaluation

    from nltk.corpus import conll2000

    print()
    print("Demonstration of empty grammar:")

    cp = chunk.RegexpParser("")
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt',
                                                     chunk_types=('NP',))))

    print()
    print("Demonstration of accuracy evaluation using CoNLL tags:")

    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]))

    print()
    print("Demonstration of tagged token input")
    
    grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    print(cp.parse([("the", "DT"), ("little", "JJ"), ("cat", "NN"),
                    ("sat", "VBD"), ("on", "IN"), ("the", "DT"),
                    ("mat", "NN"), (".", ".")]))