Example #1
	def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
		if not feature_detector:
			# passing feature_detector=None defers to a feature_detector
			# method defined on a subclass
			feature_detector = self.feature_detector
		
		train_chunks = chunk_trees2train_chunks(train_sents)
		self.tagger = ClassifierBasedTagger(train=train_chunks,
			feature_detector=feature_detector, **kwargs)
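
The chunk_trees2train_chunks helper used above is not part of NLTK itself; a minimal sketch of what it has to do, converting chunked Trees into the [((word, tag), iob), ...] sentences that ClassifierBasedTagger trains on:

from nltk.chunk import tree2conlltags

def chunk_trees2train_chunks(chunk_sents):
    # flatten each Tree to (word, tag, iob) triples, then regroup them as
    # ((word, tag), iob) pairs so the IOB tag becomes the label to learn
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]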
Example #2
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=features,
                                            **kwargs)
Example #3
class Chunker(nltk.chunk.ChunkParserI):
    '''
    Chunker for SCLE. Only chunks NP for now.
    '''
    def __init__(self):

        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        ctagged_sents = [[((w,t),c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] for sent in train_sents]
        test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
        self._test_sents = [[((w,t), c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] for sent in test_sents]
        self._tagger = ClassifierBasedTagger(train=ctagged_sents, feature_detector=npchunk_features)

    def chunk(self, sentences):
        '''
        Chunk a list of POS-tagged sentences, returning one nltk.Tree per
        sentence.
        '''
        chunked_sents = []
        for sent in sentences:
            c_sent = self._tagger.tag(sent)
            conlltags = [(w, t, c) for ((w, t), c) in c_sent]
            chunked_sents.append(nltk.chunk.conlltags2tree(conlltags))
        return chunked_sents
   
    def evaluate(self):
        '''
        Evaluate the chunker.
        '''
        print(self._tagger.evaluate(self._test_sents))
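
npchunk_features is not shown in this excerpt; a minimal stand-in in the style of the NLTK book's NP-chunking detectors (the exact feature set is an assumption):

def npchunk_features(sentence, i, history):
    # sentence is a POS-tagged sentence [(word, tag), ...] and history holds
    # the IOB tags predicted so far
    word, pos = sentence[i]
    prevword, prevpos = ('<START>', '<START>') if i == 0 else sentence[i - 1]
    return {'word': word, 'pos': pos, 'prevpos': prevpos}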
Example #4
 def __init__(self, feature_detector, labeled_sequence, classifier_builder):
     """
     @param feature_detector: the function or dictionary used to featurize
         the training data.
     @type feature_detector: C{dict} or C{function}
     @param labeled_sequence: the list of training tokens
     @type labeled_sequence: C{list} of C{list} of C{tuple}
     @param classifier_builder: the function used to initialize the
         classifier
     @type classifier_builder: C{function}
     """
     if self.__class__ == AbstractClassifierBasedTagger:
         raise AssertionError("Interfaces can't be instantiated")
     ClassifierBasedTagger.__init__(self, feature_detector, 
                                    labeled_sequence, classifier_builder)
Example #5
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=features,
                                            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

    def parse_tweets(self, tweets):
        regex = re.compile(
            '[,#@`:)(\[\]\'%^~=&*+/;<>{}|!?._]|http[,#@`\-:)(\[\]\'%^=&_*+/;<>{}|.!?a-z]*'
        )
        named_entities_tree = ''
        for tweet in tweets:
            text = str(tweet.processed_text).lower()
            text = regex.sub('', text)
            current_tree = self.parse(pos_tag(word_tokenize(text)))
            named_entities_tree += str(current_tree)
        return named_entities_tree
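
A hypothetical usage sketch for this chunker: training_samples stands in for whatever [((word, pos), iob), ...] sentences the project trains on, and features is assumed to be a detector along the lines of the one shown in Examples #14 and #21:

from nltk import pos_tag, word_tokenize

chunker = NamedEntityChunker(training_samples)
tree = chunker.parse(pos_tag(word_tokenize("I'm going to Germany this Monday.")))
print(tree)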
Example #6
    def __init__(self):

        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        ctagged_sents = [[((w,t),c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] for sent in train_sents]
        test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
        self._test_sents = [[((w,t), c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] for sent in test_sents]
        self._tagger = ClassifierBasedTagger(train=ctagged_sents, feature_detector=npchunk_features)
Example #7
    def __init__(self,
                 train_sents=None,
                 tagger="ClassifierBasedTagger",
                 model=None,
                 model_name="../results/modelCRF_featured",
                 entities=None,
                 language="english",
                 **kwargs):

        self.all_entities = []
        self.acronyms = []
        self.language = language

        if not model:
            assert isinstance(train_sents, Iterable)

        if tagger == "ClassifierBasedTagger":
            self.feature_detector = iob_features
            self.tagger = ClassifierBasedTagger(train=train_sents,
                                                feature_detector=iob_features,
                                                **kwargs)

        elif tagger == "CRFTagger":
            self.set_entities(entities)
            if not model:

                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.train(
                    train_data=train_sents,
                    model_file="../results/{}".format(model_name))
            else:
                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.set_model_file(model)
        else:
            raise Exception('Unknown tagger')
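
self.crf_features is not included in the excerpt. nltk.tag.CRFTagger calls its feature_func with the word list of one sentence and a token index, and expects a list of feature strings back, so a minimal sketch (the concrete features are an assumption) looks like:

def crf_features(self, tokens, idx):
    word = tokens[idx]
    feats = ['WORD_' + word, 'SUF3_' + word[-3:]]
    if word[0].isupper():
        feats.append('CAPITALIZED')
    if idx > 0:
        feats.append('PREV_' + tokens[idx - 1])
    return feats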
Example #8
class NamedEntityChunker(ChunkParserI):
    """Class with overridden parser and init. This class is equipped to learn and predict given
    training data. The data is [[()]]

    """
    def __init__(self, train_sents, feat_detector, **kwargs):

        assert isinstance(train_sents, Iterable)

        self.feature_detector = feat_detector
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=feat_detector,
                                            **kwargs)

    def parse(self, tagged_sent):
        """This function is used by evaluate to make guesses and format the guesses
        """
        # make a guess (tag)
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, p, t) for ((w, p), t) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
Example #9
class ClassifierChunker(ChunkParserI):
	def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
		if not feature_detector:
			feature_detector = self.feature_detector
		
		train_chunks = chunk_trees2train_chunks(train_sents)
		self.tagger = ClassifierBasedTagger(train=train_chunks,
			feature_detector=feature_detector, **kwargs)
	
	def parse(self, tagged_sent):
		if not tagged_sent: return None
		chunks = self.tagger.tag(tagged_sent)
		return conlltags2tree([(w,t,c) for ((w,t),c) in chunks])
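
prev_next_pos_iob comes from the same helper module as chunk_trees2train_chunks (see Example #1); a plausible implementation that looks one token back and one token ahead:

def prev_next_pos_iob(tokens, index, history):
    word, pos = tokens[index]
    if index == 0:
        prevword, prevpos, previob = '<START>', '<START>', '<START>'
    else:
        prevword, prevpos = tokens[index - 1]
        previob = history[index - 1]
    if index == len(tokens) - 1:
        nextword, nextpos = '<END>', '<END>'
    else:
        nextword, nextpos = tokens[index + 1]
    return {
        'word': word, 'pos': pos,
        'prevword': prevword, 'prevpos': prevpos, 'previob': previob,
        'nextword': nextword, 'nextpos': nextpos,
    }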
Example #10
class ClassifierChunker(ChunkParserI):  # pylint: disable = W0223
    """
        Classifier-based chunker class implementation
    """
    def __init__(self, train_sents, feature_detector, **kwargs):
        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector,
                                            **kwargs)

    def parse(self, tokens):
        """
            Parse sentence into chunks
        """
        if not tokens:
            return None
        chunked = self.tagger.tag(tokens)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunked])
Example #11
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=features,
                                            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
Example #12
class NEChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs
        )
    
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the normalized format of triplets [(w1, t1, iob1), ...]
        iob_triplets = [(word, pos, chunk) for ((word, pos), chunk) in chunks]

        # Transform the list of triplets to NLTK tree format
        return conlltags2tree(iob_triplets)
Example #13
class NamedEntityChunker(ChunkParserI):
    '''
    Named Entity Chunker using ClassifierBasedTagger() and features generated
    by features()
    '''
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(feature_detector=features,
                                            train=train_sents,
                                            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform [((w1, t1), iob1), ...] to triplets [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        return iob_triplets
Example #14
class address_chunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=self.features,
                                            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        return conlltags2tree(iob_triplets)

    def features(self, tokens, index, history):
        # for more details see: http://nlpforhackers.io/named-entity-extraction/
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """

        # init the stemmer
        stemmer = SnowballStemmer('english')

        # Pad the sequence with placeholders
        tokens = [
            ('[START2]', '[START2]'), ('[START1]', '[START1]')
        ] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # shift the index with 2, to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        # true only when every character is an ASCII letter
        allascii = all(c in string.ascii_letters for c in word)

        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase

        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase

        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        return {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,
            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,
            'next-next-word': nextnextword,
            'next-next-pos': nextnextpos,
            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,
            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,
            'prev-iob': previob,
            'contains-dash': contains_dash,
            'contains-dot': contains_dot,
            'all-caps': allcaps,
            'capitalized': capitalized,
            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,
            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }

    def save_to_file(self, file_name):
        with open(file_name, "wb") as save_classifier:
            pickle.dump(self, save_classifier)

    def chunk(self, sentence):

        tagged_tree = self.parse(pos_tag(word_tokenize(sentence)))

        chunks = []
        for subtree in tagged_tree.subtrees(filter=tree_filter):
            chunks.append(untag(subtree.leaves()))

        # keep only the longest chunk
        chunk = max(chunks, key=len) if chunks else []

        # join its tokens, gluing punctuation (and the first token) without a
        # leading space
        output = ''
        for i, token in enumerate(chunk):
            if token not in ('.', ',') and i != 0:
                output = output + ' ' + token
            else:
                output = output + token

        return output
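
tree_filter is not part of this excerpt; a plausible stand-in that keeps every labelled chunk subtree, followed by a hypothetical call (train_samples is assumed to be sentences in [((word, pos), iob), ...] form):

def tree_filter(tree):
    # keep chunk subtrees, skip the sentence root produced by conlltags2tree
    return tree.label() != 'S'

chunker = address_chunker(train_samples)
print(chunker.chunk('Send it to 221B Baker Street, London.'))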
Example #15
 def __init__(self, train):
     ClassifierBasedTagger.__init__(
         self, train=train,
         classifier_builder=self._classifier_builder)
Example #16
 def __init__(self, train_sents, feature_detector, **kwargs):
     train_chunks = chunk_trees2train_chunks(train_sents)
     self.tagger = ClassifierBasedTagger(train=train_chunks,
                                         feature_detector=feature_detector,
                                         **kwargs)
Example #17
    trainFeats = None
    testFeats = None

    for category in categories:
        instancesOfEntityTrain = getInstancesOfEntity(
            category, completeTaggedSentencesTrain)
        instancesOfEntityTest = getInstancesOfEntity(
            category, completeTaggedSentencesTest)

        entityFeatsTrain = train_feats(category, instancesOfEntityTrain)
        entityFeatsTest = train_feats(category, instancesOfEntityTest)

        if trainFeats is None:
            trainFeats = entityFeatsTrain
            testFeats = entityFeatsTest
        else:
            trainFeats += entityFeatsTrain
            testFeats += entityFeatsTest

    features = prev_next_pos_iob

    # Naive Bayes: classifier_builder must be a training callable, so pass
    # NaiveBayesClassifier.train itself rather than a trained classifier
    naiveBayers = NaiveBayesClassifier.train(trainFeats)
    naiveBayersTagger = ClassifierBasedTagger(
        train=completeTaggedSentencesTrain,
        feature_detector=features,
        classifier_builder=NaiveBayesClassifier.train)
    nerChunkerNaiveBayers = ClassifierChunker(completeTaggedSentencesTrain,
                                              naiveBayersTagger)
    evalNaiveBayers = nerChunkerNaiveBayers.evaluate2(testFeats)
    print(evalNaiveBayers)
Example #18
import time

from nltk.tag import ClassifierBasedTagger
from utils import read_ud_pos_data
from tag import pos_features
if __name__ == "__main__":
    print("Loading data ...")
    train_data = list(
        read_ud_pos_data(r'C:\UD_English-EWT-master\en_ewt-ud-train.conllu'))
    test_data = list(
        read_ud_pos_data(r'C:\UD_English-EWT-master\en_ewt-ud-dev.conllu'))
    print("train_data", train_data)
    print("Data loaded .")
    start_time = time.time()
    print("Starting training ...")
    tagger = ClassifierBasedTagger(
        feature_detector=pos_features,
        train=train_data[:100],
        classifier_builder=train_scikit_classifier,
    )
    end_time = time.time()
    print("Training complete. Time={0:.2f}s".format(end_time - start_time))
    print("Computing test set accuracy ...")
    print(tagger.evaluate(test_data))  # 0.8949021790997296

import time
import itertools
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import Perceptron


def incremental_train_scikit_classifier(sentences, feature_detector,
                                        batch_size, max_iterations):
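    # NOTE: the original body is not shown in this listing. What follows is a
    # hedged reconstruction (an assumption, not the source): hash each token's
    # feature dict with FeatureHasher and fit a Perceptron incrementally with
    # partial_fit, one batch of sentences at a time.
    hasher = FeatureHasher(input_type='dict')
    classifier = Perceptron()
    classes = list({tag for sent in sentences for _, tag in sent})
    for _ in range(max_iterations):
        sent_iter = iter(sentences)
        while True:
            batch = list(itertools.islice(sent_iter, batch_size))
            if not batch:
                break
            featuresets, labels = [], []
            for sent in batch:
                words = [w for w, _ in sent]
                history = []
                for i, (_, tag) in enumerate(sent):
                    featuresets.append(feature_detector(words, i, history))
                    labels.append(tag)
                    history.append(tag)
            classifier.partial_fit(hasher.transform(featuresets), labels,
                                   classes=classes)
    return classifier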
Example #19
# test
tic()
tag1_eval['test_accuracy'] = tag1b_tagger.evaluate(val_sents)
tag1_eval['test_time'] = toc()
# display results
display_training_metrics(tag1_eval)
"""
# =============================================================================
# finalise a classification-based tagger
# =============================================================================
"""
""" 1. Naive Bayes classifier tagger with features and Brill """
nb_eval = dict()
# train
tic()
nb_tagger = ClassifierBasedTagger(train=train_sents,
                                  feature_detector=add_features)
nb_eval['train_time'] = toc()
# test
tic()
nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents)
nb_eval['test_time'] = toc()
# display results
display_training_metrics(nb_eval)
"""
# =============================================================================
# finalise a deep learning tagger
# =============================================================================
"""
""" 1. prepare the data """
# for train, test and validation
train_X, train_y = create_observation(train_sents)
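
create_observation is project-specific and not shown; a plausible sketch that splits tagged sentences into parallel word and tag sequences for the deep learning tagger:

def create_observation(tagged_sents):
    # split [(word, tag), ...] sentences into X (word sequences) and
    # y (tag sequences)
    X = [[word for word, tag in sent] for sent in tagged_sents]
    y = [[tag for word, tag in sent] for sent in tagged_sents]
    return X, y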
Example #20
 def __init__(self, train):
     self.stemmer = PorterStemmer()
     ClassifierBasedTagger.__init__(
         self,
         train=train,
         classifier_builder=self._classifier_builder)
Example #21
class AddressChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=self.features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
    
    def features(self, tokens, index, history):
        # for more details see: http://nlpforhackers.io/named-entity-extraction/ 
        
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """

        # init the stemmer
        stemmer = SnowballStemmer('english')

        # Pad the sequence with placeholders
        tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # shift the index with 2, to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        # true only when every character is an ASCII letter
        allascii = all(c in string.ascii_letters for c in word)

        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase

        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase

        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        f = {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,

            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,

            'next-next-word': nextnextword,
            'next-next-pos': nextnextpos,

            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,

            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,

            'prev-iob': previob,

            'contains-dash': contains_dash,
            'contains-dot': contains_dot,

            'all-caps': allcaps,
            'capitalized': capitalized,

            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,

            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }

        return f
Example #22
 def __init__(self, train):
     ClassifierBasedTagger.__init__(
         self, train=train, classifier_builder=self._classifier_builder)
Example #23
 def __init__(self, train_sents, **kwargs):
     self.tagger = ClassifierBasedTagger(train=train_sents,
                                         feature_detector=self.features,
                                         **kwargs)