def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
    if not feature_detector:
        feature_detector = self.feature_detector

    train_chunks = chunk_trees2train_chunks(train_sents)
    self.tagger = ClassifierBasedTagger(train=train_chunks,
                                        feature_detector=feature_detector, **kwargs)
def __init__(self, train_sents, **kwargs):
    assert isinstance(train_sents, Iterable)

    self.feature_detector = features
    self.tagger = ClassifierBasedTagger(train=train_sents,
                                        feature_detector=features, **kwargs)
class Chunker(nltk.chunk.ChunkParserI):
    '''
    Chunker for SCLE. Only chunks NP for now.
    '''
    def __init__(self):
        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        ctagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                         for sent in train_sents]
        test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
        self._test_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                            for sent in test_sents]
        self._tagger = ClassifierBasedTagger(train=ctagged_sents,
                                             feature_detector=npchunk_features)

    def chunk(self, sentences):
        '''Chunk a list of POS-tagged sentences into NP trees.'''
        chunked_sents = []
        for sent in sentences:
            c_sent = self._tagger.tag(sent)
            conlltags = [(w, t, c) for ((w, t), c) in c_sent]
            chunked_sents.append(nltk.chunk.conlltags2tree(conlltags))
        return chunked_sents

    def evaluate(self):
        '''Evaluate the chunker.'''
        print(self._tagger.evaluate(self._test_sents))
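The snippet above references an npchunk_features detector that is not shown. A minimal sketch of what such a detector could look like, assuming the (tokens, index, history) signature that ClassifierBasedTagger calls feature detectors with; the name matches the reference but the exact features are illustrative, not the original author's:

def npchunk_features(tokens, index, history):
    # tokens : the sentence as (word, pos) pairs
    # index  : position of the token to featurize
    # history: chunk tags predicted so far for this sentence
    word, pos = tokens[index]
    if index == 0:
        prevword, prevpos = '<START>', '<START>'
    else:
        prevword, prevpos = tokens[index - 1]
    return {'word': word, 'pos': pos, 'prevword': prevword, 'prevpos': prevpos}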
def __init__(self, feature_detector, labeled_sequence, classifier_builder):
    """
    @param feature_detector: the function or dictionary used to featurize
        the training data.
    @type feature_detector: C{dict} or C{function}
    @param labeled_sequence: the list of training tokens
    @type labeled_sequence: C{list} of C{list} of C{tuple}
    @param classifier_builder: the function used to initialize the classifier
    @type classifier_builder: C{function}
    """
    if self.__class__ == AbstractClassifierBasedTagger:
        raise AssertionError("Interfaces can't be instantiated")

    ClassifierBasedTagger.__init__(self, feature_detector, labeled_sequence,
                                   classifier_builder)
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=features, **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

    def parse_tweets(self, tweets):
        regex = re.compile(
            r'[,#@`:)(\[\]\'%^~=&*+/;<>{}|!?._]|http[,#@`\-:)(\[\]\'%^=&_*+/;<>{}|.!?a-z]*'
        )
        named_entities_tree = ''
        for tweet in tweets:
            text = str.lower(str(tweet.processed_text))
            text = regex.sub('', text)
            current_tree = self.parse(pos_tag(word_tokenize(text)))
            named_entities_tree += str(current_tree)
        return named_entities_tree
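A short usage sketch for a chunker like the one above. The training-data shape follows what the class expects; the sample sentence and tiny corpus are illustrative, and the `features` detector is assumed to be defined elsewhere in the same module:

from nltk import pos_tag, word_tokenize

# training data: one list per sentence, each item shaped ((word, pos), iob)
train_sents = [
    [(('Paris', 'NNP'), 'B-geo'), (('is', 'VBZ'), 'O'), (('lovely', 'JJ'), 'O')],
]
chunker = NamedEntityChunker(train_sents)
tree = chunker.parse(pos_tag(word_tokenize("We landed in Paris at noon")))
print(tree)  # an nltk.Tree whose named entities appear as subtrees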
def __init__(self):
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    ctagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                     for sent in train_sents]
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    self._test_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in test_sents]
    self._tagger = ClassifierBasedTagger(train=ctagged_sents,
                                         feature_detector=npchunk_features)
def __init__(self, train_sents=None, tagger="ClassifierBasedTagger", model=None,
             model_name="../results/modelCRF_featured", entities=None,
             language="english", **kwargs):
    self.all_entities = []
    self.acronyms = []
    self.language = language

    if not model:
        assert isinstance(train_sents, Iterable)

    if tagger == "ClassifierBasedTagger":
        self.feature_detector = iob_features
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=iob_features, **kwargs)
    elif tagger == "CRFTagger":
        self.set_entities(entities)
        if not model:
            self.tagger = CRFTagger(feature_func=self.crf_features)
            self.tagger.train(train_data=train_sents,
                              model_file="../results/{}".format(model_name))
        else:
            self.tagger = CRFTagger(feature_func=self.crf_features)
            self.tagger.set_model_file(model)
    else:
        raise Exception('Unknown tagger')
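Usage sketch for the dual-backend constructor above. The class name NERChunker is hypothetical (the snippet shows only the __init__), and train_sents/entities are placeholders:

# in-memory classifier backend
chunker = NERChunker(train_sents=train_sents, tagger="ClassifierBasedTagger")

# CRF backend, training and writing a new model file
chunker = NERChunker(train_sents=train_sents, tagger="CRFTagger",
                     model_name="modelCRF_featured", entities=entities)

# CRF backend, loading a previously trained model file
chunker = NERChunker(tagger="CRFTagger", model="../results/modelCRF_featured",
                     entities=entities)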
class NamedEntityChunker(ChunkParserI): """Class with overridden parser and init. This class is equipped to learn and predict given training data. The data is [[()]] """ def __init__(self, train_sents, feat_detector, **kwargs): assert isinstance(train_sents, Iterable) self.feature_detector = feat_detector self.tagger = ClassifierBasedTagger(train=train_sents, feature_detector=feat_detector, **kwargs) def parse(self, tagged_sent): """This function is used by evaluate to make guesses and format the guesses """ #make gueess (tag) chunks = self.tagger.tag(tagged_sent) # Transform the result from [((w1, t1), iob1), ...] # to the preferred list of triplets format [(w1, t1, iob1), ...] iob_triplets = [(w, p, t) for ((w, p), t) in chunks] # Transform the list of triplets to nltk.Tree format return conlltags2tree(iob_triplets)
class ClassifierChunker(ChunkParserI):
    def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
        if not feature_detector:
            feature_detector = self.feature_detector

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector, **kwargs)

    def parse(self, tagged_sent):
        if not tagged_sent:
            return None
        chunks = self.tagger.tag(tagged_sent)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])
class ClassifierChunker(ChunkParserI):  # pylint: disable=W0223
    """
    Classifier-based chunker class implementation
    """
    def __init__(self, train_sents, feature_detector, **kwargs):
        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector, **kwargs)

    def parse(self, tokens):
        """
        Parse sentence into chunks
        """
        if not tokens:
            return None
        chunked = self.tagger.tag(tokens)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunked])
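Several of these snippets call a chunk_trees2train_chunks helper without showing it. Judging from how it is used, it converts chunked trees into the [((word, pos), iob), ...] sentence lists that ClassifierBasedTagger trains on; a minimal sketch built on NLTK's tree2conlltags, consistent with that usage but an assumption nonetheless:

from nltk.chunk import tree2conlltags

def chunk_trees2train_chunks(chunk_sents):
    # flatten each chunk tree to (word, pos, iob) triplets, then regroup
    # each triplet as a ((word, pos), iob) training pair
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]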
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=features, **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
class NEChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs
        )

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the normalized format of triplets [(w1, t1, iob1), ...]
        iob_triplets = [(word, token, chunk) for ((word, token), chunk) in chunks]

        # Transform the list of triplets to NLTK tree format
        return conlltags2tree(iob_triplets)
class NamedEntityChunker(ChunkParserI):
    '''
    Named Entity Chunker using ClassifierBasedTagger() and features
    generated by features()
    '''
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(feature_detector=features,
                                            train=train_sents, **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        # Transform [((w1, t1), iob1), ...] to triplets [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        return iob_triplets
class address_chunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        self.tagger = ClassifierBasedTagger(train=train_sents,
                                            feature_detector=self.features, **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        return conlltags2tree(iob_triplets)

    def features(self, tokens, index, history):
        # for more details see: http://nlpforhackers.io/named-entity-extraction/
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """
        # init the stemmer
        stemmer = SnowballStemmer('english')

        # Pad the sequence with placeholders
        tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + \
            list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # shift the index by 2 to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        # True only when every character is a lowercase ASCII letter
        allascii = all(c in string.ascii_lowercase for c in word)
        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase
        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase
        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        return {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,
            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,
            'next-next-word': nextnextword,
            'next-next-pos': nextnextpos,
            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,
            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,
            'prev-iob': previob,
            'contains-dash': contains_dash,
            'contains-dot': contains_dot,
            'all-caps': allcaps,
            'capitalized': capitalized,
            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,
            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }

    def save_to_file(self, file_name):
        with open(file_name, "wb") as save_classifier:
            pickle.dump(self, save_classifier)

    def chunk(self, sentence):
        tagged_tree = self.parse(pos_tag(word_tokenize(sentence)))
        chunks = []
        for subtree in tagged_tree.subtrees(filter=tree_filter):
            chunks.append(untag(subtree.leaves()))

        # keep the longest chunk found
        max_length = 0
        chunk = []
        for i in range(len(chunks)):
            if len(chunks[i]) > max_length:
                chunk = chunks[i]
                max_length = len(chunks[i])

        output = ''
        if len(chunks) > 0:
            for i in range(len(chunk)):
                if not chunk[i] == '.' and not chunk[i] == ',' and not i == 0:
                    output = output + ' ' + chunk[i]
                else:
                    output = output + chunk[i]
        return output
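A usage sketch for the address chunker above. It assumes training data shaped [((word, pos), iob), ...] per sentence and a tree_filter that selects the address subtrees; the sample sentence and file name are illustrative:

chunker = address_chunker(train_sents)
print(chunker.chunk('Ship it to 221B Baker Street, London.'))
# -> the longest address-like chunk found, e.g. '221B Baker Street'
chunker.save_to_file('address_chunker.pickle')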
def __init__(self, train):
    ClassifierBasedTagger.__init__(
        self, train=train, classifier_builder=self._classifier_builder)
def __init__(self, train_sents, feature_detector, **kwargs):
    train_chunks = chunk_trees2train_chunks(train_sents)
    self.tagger = ClassifierBasedTagger(train=train_chunks,
                                        feature_detector=feature_detector, **kwargs)
trainFeats = None
testFeats = None
for category in categories:
    instancesOfEntityTrain = getInstancesOfEntity(
        category, completeTaggedSentencesTrain)
    instancesOfEntityTest = getInstancesOfEntity(
        category, completeTaggedSentencesTest)
    entityFeatsTrain = train_feats(category, instancesOfEntityTrain)
    entityFeatsTest = train_feats(category, instancesOfEntityTest)
    if trainFeats is None:
        trainFeats = entityFeatsTrain
        testFeats = entityFeatsTest
    else:
        trainFeats += entityFeatsTrain
        testFeats += entityFeatsTest

features = prev_next_pos_iob

# Naive Bayes
# classifier_builder must be a training function, not a trained classifier;
# the tagger calls it on the featuresets it extracts from the training data
naiveBayersTagger = ClassifierBasedTagger(
    train=completeTaggedSentencesTrain,
    feature_detector=features,
    classifier_builder=NaiveBayesClassifier.train)
nerChunkerNaiveBayers = ClassifierChunker(completeTaggedSentencesTrain,
                                          naiveBayersTagger)
evalNaiveBayers = nerChunkerNaiveBayers.evaluate2(testFeats)
print(evalNaiveBayers)
import itertools
import time

from nltk.tag import ClassifierBasedTagger
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import Perceptron

from utils import read_ud_pos_data
from tag import pos_features

if __name__ == "__main__":
    print("Loading data ...")
    train_data = list(
        read_ud_pos_data(r'C:\UD_English-EWT-master\en_ewt-ud-train.conllu'))
    test_data = list(
        read_ud_pos_data(r'C:\UD_English-EWT-master\en_ewt-ud-dev.conllu'))
    print("Data loaded.")

    start_time = time.time()
    print("Starting training ...")
    tagger = ClassifierBasedTagger(
        feature_detector=pos_features,
        train=train_data[:100],
        classifier_builder=train_scikit_classifier,
    )
    end_time = time.time()
    print("Training complete. Time={0:.2f}s".format(end_time - start_time))

    print("Computing test set accuracy ...")
    print(tagger.evaluate(test_data))  # 0.8949021790997296

def incremental_train_scikit_classifier(sentences, feature_detector,
                                        batch_size, max_iterations):
    ...
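The snippet ends at the signature of incremental_train_scikit_classifier. Given the FeatureHasher and Perceptron imports, a minimal sketch of what such an incremental trainer could look like; this body is an assumption, not the original code:

def incremental_train_scikit_classifier(sentences, feature_detector,
                                        batch_size, max_iterations):
    # collect the tagset once: partial_fit needs all classes up front
    classes = sorted({tag for sent in sentences for _, tag in sent})
    hasher = FeatureHasher()  # hashes feature dicts to sparse vectors
    classifier = Perceptron()

    for _ in range(max_iterations):
        for start in range(0, len(sentences), batch_size):
            feats, labels = [], []
            for sent in sentences[start:start + batch_size]:
                words = [w for w, _ in sent]
                history = []
                for i, (_, tag) in enumerate(sent):
                    feats.append(feature_detector(words, i, history))
                    labels.append(tag)
                    history.append(tag)
            classifier.partial_fit(hasher.transform(feats), labels,
                                   classes=classes)
    return classifier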
# test
tic()
tag1_eval['test_accuracy'] = tag1b_tagger.evaluate(val_sents)
tag1_eval['test_time'] = toc()

# display results
display_training_metrics(tag1_eval)

"""
# =============================================================================
# finalise a classification-based tagger
# =============================================================================
"""

""" 1. Naive Bayes classifier tagger with features and Brill """
nb_eval = dict()

# train
tic()
nb_tagger = ClassifierBasedTagger(train=train_sents,
                                  feature_detector=add_features)
nb_eval['train_time'] = toc()

# test
tic()
nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents)
nb_eval['test_time'] = toc()

# display results
display_training_metrics(nb_eval)

"""
# =============================================================================
# finalise a deep learning tagger
# =============================================================================
"""

""" 1. prepare the data """
# for train, test and validation
train_X, train_y = create_observation(train_sents)
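The tic()/toc() timing helpers used throughout this snippet are not shown. One plausible minimal implementation, MATLAB-style; an assumption, not the original:

import time

_tic_start = None

def tic():
    # start (or restart) the timer
    global _tic_start
    _tic_start = time.time()

def toc():
    # return seconds elapsed since the last tic()
    return time.time() - _tic_start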
def __init__(self, train):
    self.stemmer = PorterStemmer()
    ClassifierBasedTagger.__init__(
        self, train=train, classifier_builder=self._classifier_builder)
class AddressChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        self.tagger = ClassifierBasedTagger(
            train=train_sents, feature_detector=self.features, **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

    def features(self, tokens, index, history):
        # for more details see: http://nlpforhackers.io/named-entity-extraction/
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """
        # init the stemmer
        stemmer = SnowballStemmer('english')

        # Pad the sequence with placeholders
        tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + \
            list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # shift the index by 2 to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        # True only when every character is a lowercase ASCII letter
        allascii = all(c in string.ascii_lowercase for c in word)
        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase
        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase
        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        f = {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-ascii': allascii,
            'next-word': nextword,
            'next-lemma': stemmer.stem(nextword),
            'next-pos': nextpos,
            'next-next-word': nextnextword,
            'next-next-pos': nextnextpos,
            'prev-word': prevword,
            'prev-lemma': stemmer.stem(prevword),
            'prev-pos': prevpos,
            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,
            'prev-iob': previob,
            'contains-dash': contains_dash,
            'contains-dot': contains_dot,
            'all-caps': allcaps,
            'capitalized': capitalized,
            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,
            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }
        return f
def __init__(self, train_sents, **kwargs):
    self.tagger = ClassifierBasedTagger(train=train_sents,
                                        feature_detector=self.features, **kwargs)