def load(self, dirpath):
    """ Loads a trained model from local disk, given the dirpath

        Parameters
        ----------
        dirpath : str
            a directory where model artifacts are saved.

        Returns
        -------
        self
    """
    if not os.path.exists(dirpath):
        raise ValueError("Model directory not found: {:s}".format(dirpath))

    weights_file = os.path.join(dirpath, "weights.h5")
    params_file = os.path.join(dirpath, "params.json")
    preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

    # all three artifacts must be present for the model to be loadable
    if not (os.path.exists(weights_file) and
            os.path.exists(params_file) and
            os.path.exists(preprocessor_file)):
        raise ValueError("Model files may be corrupted, exiting")

    self.model_ = load_model(weights_file, params_file)
    self.preprocessor_ = IndexTransformer.load(preprocessor_file)
    self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

    return self
def __init__(self, process_proper_nouns=False):
    super().__init__(process_proper_nouns)
    model = load_model(os.path.join(ELMO_TAGGER_PATH, 'weights.h5'),
                       os.path.join(ELMO_TAGGER_PATH, 'params.json'))
    it = IndexTransformer.load(
        os.path.join(ELMO_TAGGER_PATH, 'preprocessor.pkl'))
    self.pos_tagger = Tagger(model,
                             preprocessor=it,
                             tokenizer=wordpunct_tokenize)
def main(args):
    print('Loading objects...')
    model = BiLSTMCRF.load(args.weights_file, args.params_file)
    it = IndexTransformer.load(args.preprocessor_file)
    tagger = Tagger(model, preprocessor=it)

    print('Tagging a sentence...')
    res = tagger.analyze(args.sent)
    pprint(res)
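A minimal sketch of the CLI wiring that could drive main() above. The flag names simply mirror the attributes main() reads (weights_file, params_file, preprocessor_file, sent) and are assumptions, not the project's actual entry point.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tag a sentence with a trained model.')
    # hypothetical flags; main() only requires these four attributes on args
    parser.add_argument('--weights_file', required=True, help='path to weights.h5')
    parser.add_argument('--params_file', required=True, help='path to params.json')
    parser.add_argument('--preprocessor_file', required=True, help='path to preprocessor.pkl')
    parser.add_argument('--sent', required=True, help='sentence to analyze')
    main(parser.parse_args())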
def analyze(self, text):
    """Analyze text and return pretty format.

    Args:
        text: string, the input text.

    Returns:
        res: dict.
    """
    if not self.tagger:
        self.tagger = Tagger(self.model,
                             preprocessor=self.p,
                             tokenizer=self.tokenizer)

    return self.tagger.analyze(text)
def analyze(self, text, tokenizer=str.split):
    """Analyze text and return pretty format.

    Args:
        text: string, the input text.
        tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.

    Returns:
        res: dict.
    """
    if not self.tagger:
        self.tagger = Tagger(self.model,
                             preprocessor=self.p,
                             tokenizer=tokenizer)

    return self.tagger.analyze(text)
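For reference, a hedged sketch of the dict that analyze() returns. The 'entities', 'text' and 'type' keys are confirmed by the snippets below that read the response; the remaining keys reflect how anago's Tagger._build_response is commonly implemented and may vary between versions, so treat them as assumptions.

# assumed shape of res = tagger.analyze("President Obama is speaking")
# {
#     'words': ['President', 'Obama', 'is', 'speaking'],
#     'entities': [
#         {'text': 'Obama', 'type': 'PER', 'score': 0.99,
#          'beginOffset': 1, 'endOffset': 2}
#     ]
# }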
class ElmoBiLSTM_CRFProcessor(CustomProcessor):

    def __init__(self, process_proper_nouns=False):
        super().__init__(process_proper_nouns)
        model = load_model(os.path.join(ELMO_TAGGER_PATH, 'weights.h5'),
                           os.path.join(ELMO_TAGGER_PATH, 'params.json'))
        it = IndexTransformer.load(
            os.path.join(ELMO_TAGGER_PATH, 'preprocessor.pkl'))
        self.pos_tagger = Tagger(model,
                                 preprocessor=it,
                                 tokenizer=wordpunct_tokenize)

    def extract_phrase_by_type(self, token, type):
        return self._extract_phrase(
            list(
                zip(self.pos_tagger.tokenizer(token),
                    self.pos_tagger.predict(token))),
            type)
def fit(self, X, y):
    """ Trains the NER model. Input is list of list of tokens and tags.

        Parameters
        ----------
        X : list(list(str))
            list of list of tokens
        y : list(list(str))
            list of list of BIO tags

        Returns
        -------
        self
    """
    log.info("Preprocessing dataset...")
    self.preprocessor_ = IndexTransformer(use_char=self.use_char)
    self.preprocessor_.fit(X, y)

    log.info("Building model...")
    self.model_ = BiLSTMCRF(
        char_embedding_dim=self.char_embedding_dim,
        word_embedding_dim=self.word_embedding_dim,
        char_lstm_size=self.char_lstm_size,
        word_lstm_size=self.word_lstm_size,
        char_vocab_size=self.preprocessor_.char_vocab_size,
        word_vocab_size=self.preprocessor_.word_vocab_size,
        num_labels=self.preprocessor_.label_size,
        dropout=self.dropout,
        use_char=self.use_char,
        use_crf=self.use_crf)
    self.model_, loss = self.model_.build()
    optimizer = Adam(lr=self.learning_rate)
    self.model_.compile(loss=loss, optimizer=optimizer)
    self.model_.summary()

    log.info('Training the model...')
    self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.1,
                                                          random_state=42)
    self.trainer_.train(x_train, y_train,
                        x_valid=x_valid, y_valid=y_valid,
                        batch_size=self.batch_size,
                        epochs=self.max_iter)

    self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

    return self
def test_model(sentence: str, model_dir: str) -> None:
    """Performs NER analysis on sentence (defaults to using base model which
    can be trained with train_base_model())

    Args:
        sentence (str): text to perform analysis on
        model_dir (str): path to model to use for analysis
    """
    model = anago.Sequence.load(model_dir)
    tagger = Tagger(model.model, preprocessor=model.p)

    data = sentence.strip().split()
    pred = tagger.predict(data)
    tags = tagger._get_tags(pred)
    probs = tagger._get_prob(pred)
    res = tagger._build_response(data, tags, probs)

    print()
    print(list(zip(data, tags, probs)))
    print()

    if not res['entities']:
        print("No entities found.")
    else:
        print("Entities Found: ")
        for entity in res['entities']:
            print(f"\t{entity['text']} = {entity['type']}")
def run_model(text: str, model_dir: str = BASE_MODEL_PATH) -> List:
    """Performs NER analysis on text (defaults to using base model which
    can be trained with train_base_model())

    Args:
        text (str): text to perform analysis on
        model_dir (str): path to model to use for analysis
    """
    model = anago.Sequence.load(model_dir)
    tagger = Tagger(model.model, preprocessor=model.p)

    data = text.strip().split()
    pred = tagger.predict(data)
    tags = tagger._get_tags(pred)
    probs = tagger._get_prob(pred)
    res = tagger._build_response(data, tags, probs)

    return res['entities']
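A hedged usage sketch for run_model(); the sentence and the model directory are placeholders and assume a model previously saved in the anago Sequence format.

# hypothetical saved model directory; any anago Sequence save location works
entities = run_model("Barack Obama visited Paris last week .", model_dir="./models/base")
for entity in entities:
    print(entity['text'], entity['type'])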
class Sequence(object):

    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 use_char=True,
                 use_crf=True,
                 initial_vocab=None,
                 optimizer='adam'):

        self.model = None
        self.p = None
        self.tagger = None

        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.initial_vocab = initial_vocab
        self.optimizer = optimizer

    def fit(self, x_train, y_train, x_valid=None, y_valid=None,
            epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        """Fit the model for a fixed number of epochs.

        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            batch_size: Integer. Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            epochs: Integer. Number of epochs to train the model.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """
        p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
        p.fit(x_train, y_train)
        embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab,
                                       self.word_embedding_dim)

        model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                          word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          word_embedding_dim=self.word_embedding_dim,
                          char_embedding_dim=self.char_embedding_dim,
                          word_lstm_size=self.word_lstm_size,
                          char_lstm_size=self.char_lstm_size,
                          fc_dim=self.fc_dim,
                          dropout=self.dropout,
                          embeddings=embeddings,
                          use_char=self.use_char,
                          use_crf=self.use_crf)
        model, loss = model.build()
        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train, y_train, x_valid, y_valid,
                      epochs=epochs, batch_size=batch_size,
                      verbose=verbose, callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model

    def score(self, x_test, y_test):
        """Returns the f1-micro score on the given test data and labels.

        Args:
            x_test : array-like, shape = (n_samples, sent_length)
                Test samples.
            y_test : array-like, shape = (n_samples, sent_length)
                True labels for x.

        Returns:
            score : float, f1-micro score.
        """
        if self.model:
            x_test = self.p.transform(x_test)
            lengths = map(len, y_test)
            y_pred = self.model.predict(x_test)
            y_pred = self.p.inverse_transform(y_pred, lengths)
            score = f1_score(y_test, y_pred)
            return score
        else:
            raise OSError('Could not find a model. Call load(dir_path).')

    def analyze(self, text, tokenizer=str.split):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.
            tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.

        Returns:
            res: dict.
        """
        if not self.tagger:
            self.tagger = Tagger(self.model,
                                 preprocessor=self.p,
                                 tokenizer=tokenizer)

        return self.tagger.analyze(text)

    def save(self, weights_file, params_file, preprocessor_file):
        self.p.save(preprocessor_file)
        save_model(self.model, weights_file, params_file)

    @classmethod
    def load(cls, weights_file, params_file, preprocessor_file):
        self = cls()
        self.p = IndexTransformer.load(preprocessor_file)
        self.model = load_model(weights_file, params_file)
        return self
class BiLstmCrfNER(NERModel):

    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 use_char=True,
                 use_crf=True,
                 batch_size=16,
                 learning_rate=0.001,
                 max_iter=10):
        """ Construct a BiLSTM-CRF NER model. Model is augmented with character
            level embeddings as well as word embeddings by default. Implementation
            is provided by the Anago project.

            Parameters
            ----------
            word_embedding_dim : int, optional, default 100
                word embedding dimensions.
            char_embedding_dim : int, optional, default 25
                character embedding dimensions.
            word_lstm_size : int, optional, default 100
                word tagger LSTM output dimensions.
            char_lstm_size : int, optional, default 25
                character LSTM feature extractor output dimensions.
            fc_dim : int, optional, default 100
                output fully-connected layer size.
            dropout : float, optional, default 0.5
                dropout rate.
            embeddings : numpy array
                word embedding matrix.
            use_char : bool, optional, default True
                add char feature.
            use_crf : bool, optional, default True
                use crf as last layer.
            batch_size : int, optional, default 16
                training batch size.
            learning_rate : float, optional, default 0.001
                learning rate for Adam optimizer
            max_iter : int
                number of epochs of training

            Attributes
            ----------
            preprocessor_ : reference to preprocessor
            model_ : reference to generated model
            trainer_ : internal reference to Anago Trainer (model)
            tagger_ : internal reference to Anago Tagger (predictor)
        """
        super().__init__()
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        # store the constructor arguments (not hard-coded values) so that
        # get_params() reflects how the model was actually configured
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # populated by fit() and load(), expected by save() and transform()
        self.preprocessor_ = None
        self.model_ = None
        self.trainer_ = None
        self.tagger_ = None

    def fit(self, X, y):
        """ Trains the NER model. Input is list of list of tokens and tags.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        log.info("Preprocessing dataset...")
        self.preprocessor_ = IndexTransformer(use_char=self.use_char)
        self.preprocessor_.fit(X, y)

        log.info("Building model...")
        self.model_ = BiLSTMCRF(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            dropout=self.dropout,
            use_char=self.use_char,
            use_crf=self.use_crf)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
        x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                              test_size=0.1,
                                                              random_state=42)
        self.trainer_.train(x_train, y_train,
                            x_valid=x_valid, y_valid=y_valid,
                            batch_size=self.batch_size,
                            epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self

    def predict(self, X):
        """ Predicts using the NER model.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens.

            Returns
            -------
            y : list(list(str))
                list of list of predicted BIO tags.
        """
        if self.tagger_ is None:
            raise ValueError("No tagger found, either run fit() to train or load() a trained model")

        log.info("Predicting from model...")
        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
        return ypreds

    def save(self, dirpath):
        """ Saves model to local disk, given a dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts will be saved. Model saves a
                weights.h5 weights file, a params.json parameter file, and a
                preprocessor.pkl preprocessor file.

            Returns
            -------
            None
        """
        if self.model_ is None or self.preprocessor_ is None:
            raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model")

        if not os.path.exists(dirpath):
            os.makedirs(dirpath)

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        save_model(self.model_, weights_file, params_file)
        self.preprocessor_.save(preprocessor_file)
        write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml"))

    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        # all three artifacts must be present for the model to be loadable
        if not (os.path.exists(weights_file) and
                os.path.exists(params_file) and
                os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")

        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = IndexTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
from CRF.anago.data import prepare_preprocessor

DATA_ROOT = 'data/phenebank/'
train_path = os.path.join(DATA_ROOT, 'train.txt')
x_train, y_train = load_data_and_labels(train_path)
p = prepare_preprocessor(x_train, y_train)
model_config = ModelConfig()

SAVE_ROOT = './models'  # trained model
weights = 'model_weights.h5'

tagger = anago.Tagger(model_config, weights, save_path=SAVE_ROOT, preprocessor=p)

test_path = "data/phenebank/test.txt"
with open(test_path) as ifile:
    this_sentence = []
    all_sentences = []
    this_output = []
    all_outputs = []
    for line in ifile:
        line = line.strip()
        if len(line) == 0:
            # blank line marks the end of a sentence: tag it and start a new one
            this_output = tagger.predict(this_sentence)
            print(this_sentence, this_output)
            all_sentences.append(this_sentence)
            all_outputs.append(this_output)
            this_sentence = []
        else:
            this_sentence.append(line.split("\t")[0])
def fit(self, X, y):
    """ Trains the NER model. Input is list of list of tokens and tags.

        Parameters
        ----------
        X : list(list(str))
            list of list of tokens
        y : list(list(str))
            list of list of BIO tags

        Returns
        -------
        self
    """
    if self.embeddings is None and self.embeddings_file is None:
        raise ValueError(
            "Either embeddings or embeddings_file should be provided, exiting."
        )

    log.info("Preprocessing dataset...")
    self.preprocessor_ = ELMoTransformer()
    self.preprocessor_.fit(X, y)

    if self.embeddings is None:
        self.embeddings = load_glove(self.embeddings_file)
        embeddings_dim = self.embeddings[list(
            self.embeddings.keys())[0]].shape[0]
        self.embeddings = filter_embeddings(
            self.embeddings,
            self.preprocessor_._word_vocab.vocab,
            embeddings_dim)

    log.info("Building model...")
    self.model_ = ELModel(
        char_embedding_dim=self.char_embedding_dim,
        word_embedding_dim=self.word_embedding_dim,
        char_lstm_size=self.char_lstm_size,
        word_lstm_size=self.word_lstm_size,
        char_vocab_size=self.preprocessor_.char_vocab_size,
        word_vocab_size=self.preprocessor_.word_vocab_size,
        num_labels=self.preprocessor_.label_size,
        embeddings=self.embeddings,
        dropout=self.dropout)
    self.model_, loss = self.model_.build()
    optimizer = Adam(lr=self.learning_rate)
    self.model_.compile(loss=loss, optimizer=optimizer)
    self.model_.summary()

    log.info('Training the model...')
    self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.1,
                                                          random_state=42)
    self.trainer_.train(x_train, y_train,
                        x_valid=x_valid, y_valid=y_valid,
                        batch_size=self.batch_size,
                        epochs=self.max_iter)

    self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

    return self
class ElmoNER(NERModel):

    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 embeddings_file="glove.6B.100d.txt",
                 batch_size=16,
                 learning_rate=0.001,
                 max_iter=2):
        """ Construct an ELMo based NER model. Model is similar to the BiLSTM-CRF
            model except that the word embeddings are contextual, since they are
            returned by a trained ELMo model. ELMo model requires an additional
            embedding, which is Glove-100 by default. ELMo model is provided by
            the (dev) Anago project.

            Parameters
            ----------
            word_embedding_dim : int, optional, default 100
                word embedding dimensions.
            char_embedding_dim : int, optional, default 25
                character embedding dimensions.
            word_lstm_size : int, optional, default 100
                word tagger LSTM output dimensions.
            char_lstm_size : int, optional, default 25
                character LSTM feature extractor output dimensions.
            fc_dim : int, optional, default 100
                output fully-connected layer size.
            dropout : float, optional, default 0.5
                dropout rate.
            embeddings : numpy array
                word embedding matrix.
            embeddings_file : str
                path to embedding file.
            batch_size : int, optional, default 16
                training batch size.
            learning_rate : float, optional, default 0.001
                learning rate for Adam optimizer.
            max_iter : int, optional, default 2
                number of epochs of training.

            Attributes
            ----------
            preprocessor_ : reference to Anago preprocessor.
            model_ : reference to the internal Anago ELModel
            trainer_ : reference to the internal Anago Trainer object.
            tagger_ : reference to the internal Anago Tagger object.
        """
        super().__init__()
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.embeddings_file = embeddings_file
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # populated by fit() and load(), expected by save() and transform()
        self.preprocessor_ = None
        self.model_ = None
        self.trainer_ = None
        self.tagger_ = None

    def fit(self, X, y):
        """ Trains the NER model. Input is list of list of tokens and tags.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        if self.embeddings is None and self.embeddings_file is None:
            raise ValueError(
                "Either embeddings or embeddings_file should be provided, exiting."
            )

        log.info("Preprocessing dataset...")
        self.preprocessor_ = ELMoTransformer()
        self.preprocessor_.fit(X, y)

        if self.embeddings is None:
            self.embeddings = load_glove(self.embeddings_file)
            embeddings_dim = self.embeddings[list(
                self.embeddings.keys())[0]].shape[0]
            self.embeddings = filter_embeddings(
                self.embeddings,
                self.preprocessor_._word_vocab.vocab,
                embeddings_dim)

        log.info("Building model...")
        self.model_ = ELModel(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            embeddings=self.embeddings,
            dropout=self.dropout)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
        x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                              test_size=0.1,
                                                              random_state=42)
        self.trainer_.train(x_train, y_train,
                            x_valid=x_valid, y_valid=y_valid,
                            batch_size=self.batch_size,
                            epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self

    def predict(self, X):
        """ Predicts using the NER model.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens.

            Returns
            -------
            y : list(list(str))
                list of list of predicted BIO tags.
        """
        if self.tagger_ is None:
            raise ValueError(
                "No tagger found, either run fit() to train or load() a trained model"
            )

        log.info("Predicting from model...")
        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
        return ypreds

    def save(self, dirpath):
        """ Saves model to local disk, given a dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts will be saved. Model saves a
                weights.h5 weights file, a params.json parameter file, and a
                preprocessor.pkl preprocessor file.

            Returns
            -------
            None
        """
        if self.model_ is None or self.preprocessor_ is None:
            raise ValueError(
                "No model artifacts to save, either run fit() to train or load() a trained model"
            )

        if not os.path.exists(dirpath):
            os.makedirs(dirpath)

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        save_model(self.model_, weights_file, params_file)
        self.preprocessor_.save(preprocessor_file)
        write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml"))

    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        # all three artifacts must be present for the model to be loadable
        if not (os.path.exists(weights_file) and
                os.path.exists(params_file) and
                os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")

        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = ELMoTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
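The ELMo variant follows the same fit/predict/save/load surface as BiLstmCrfNER; the practical differences are the external GloVe embedding requirement and the ELMoTransformer preprocessor. A brief construction sketch, assuming a local glove.6B.100d.txt:

ner = ElmoNER(embeddings_file="glove.6B.100d.txt", max_iter=2)
# fit(X, y), predict(X), save(dirpath) and load(dirpath) mirror BiLstmCrfNER above;
# load() restores an ELMoTransformer rather than an IndexTransformer.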
class Sequence(object):

    def __init__(
            self,
            word_embedding_dim=100,
            char_embedding_dim=25,
            word_lstm_size=100,
            char_lstm_size=25,
            fc_dim=100,
            dropout=0.5,
            embeddings=None,
            use_char=True,
            use_crf=True,
            initial_vocab=None,
            optimizer='adam',
            layer2Flag=False,
            layerdropout=0,
            # fastArFlag=False,
            # fastModelAr="",
            # fastEnFlag=False,
            # fastModelEn="", ArTwitterFlag=False, ArTwitterModel="", fileToWrite="Invalid.txt",
            bretFlag=False,
            bretMaxLen=100,
            bert_path="https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"):

        self.model = None
        self.p = None
        self.tagger = None

        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.initial_vocab = initial_vocab
        self.optimizer = optimizer
        self._layer2Flag = layer2Flag
        self._layerdropout = layerdropout
        # self._fastArFlag = fastArFlag
        # self._fastEnFlag = fastEnFlag
        # self._fastModelAr = fastModelAr
        # self._fastModelEn = fastModelEn
        # self._ArTwitterFlag = ArTwitterFlag
        # self._ArTwitterModel = ArTwitterModel
        # self._fileToWrite = fileToWrite
        self._bretFlag = bretFlag
        self._bretMaxLen = bretMaxLen
        self._bert_path = bert_path

    def bertFit(self, x_train, y_train, x_valid=None, y_valid=None,
                epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        sess = tf.Session()
        bert_path = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
        max_seq_length = self._bretMaxLen
        tokenizer = create_tokenizer_from_hub_module()
        print("tokenizer done")
        train_examples = convert_text_to_examples(x_train, y_train)
        (train_input_ids, train_input_masks,
         train_segment_ids, train_labels) = convert_examples_to_features(
            tokenizer, train_examples, max_seq_length=max_seq_length)

        model = ABM.BertBiLSTMCRF(num_labels=9,
                                  char_embedding_dim=self.char_embedding_dim,
                                  word_lstm_size=self.word_lstm_size,
                                  char_lstm_size=self.char_lstm_size,
                                  fc_dim=self.fc_dim,
                                  use_char=self.use_char,
                                  char_vocab_size=None,
                                  use_crf=self.use_crf,
                                  layer2Flag=self._layer2Flag,
                                  layerdropout=self._layerdropout,
                                  bretFlag=self._bretFlag,
                                  bretMaxLen=self._bretMaxLen,
                                  bert_path=self._bert_path)
        model, loss = model.build()

        # Instantiate variables
        ABM.initialize_vars(sess)

        model.fit([train_input_ids, train_input_masks, train_segment_ids],
                  train_labels,
                  epochs=epochs,
                  batch_size=batch_size)

    def bertFitV2(self, x_train, y_train, x_valid=None, y_valid=None,
                  epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        sess = tf.Session()
        bert_path = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
        max_seq_length = self._bretMaxLen
        p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
        p.fit(x_train, y_train)
        embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab,
                                       self.word_embedding_dim)
        # tokenizer = create_tokenizer_from_hub_module()
        # print("tokenizer done")
        # train_examples = convert_text_to_examples(x_train, y_train)
        # (train_input_ids, train_input_masks, train_segment_ids, train_labels) = convert_examples_to_features(
        #     tokenizer, train_examples, max_seq_length=max_seq_length)

        model = ABM.BertBiLSTMCRF(num_labels=p.label_size,
                                  char_embedding_dim=self.char_embedding_dim,
                                  word_lstm_size=self.word_lstm_size,
                                  char_lstm_size=self.char_lstm_size,
                                  fc_dim=self.fc_dim,
                                  use_char=self.use_char,
                                  char_vocab_size=None,
                                  use_crf=self.use_crf,
                                  layer2Flag=self._layer2Flag,
                                  layerdropout=self._layerdropout,
                                  bretFlag=self._bretFlag,
                                  bretMaxLen=self._bretMaxLen,
                                  bert_path=self._bert_path)
        model, loss = model.build()

        # Instantiate variables
        ABM.initialize_vars(sess)

        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train, y_train, x_valid, y_valid,
                      epochs=epochs, batch_size=batch_size,
                      verbose=verbose, callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model

    def fit(self, x_train, y_train, x_valid=None, y_valid=None,
            epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        """Fit the model for a fixed number of epochs.

        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            batch_size: Integer. Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            epochs: Integer. Number of epochs to train the model.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """
        p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
        p.fit(x_train, y_train, bretFlag=self._bretFlag, max_len=self._bretMaxLen)
        embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab,
                                       self.word_embedding_dim)

        model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                          word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          word_embedding_dim=self.word_embedding_dim,
                          char_embedding_dim=self.char_embedding_dim,
                          word_lstm_size=self.word_lstm_size,
                          char_lstm_size=self.char_lstm_size,
                          fc_dim=self.fc_dim,
                          dropout=self.dropout,
                          embeddings=embeddings,
                          use_char=self.use_char,
                          use_crf=self.use_crf,
                          layer2Flag=self._layer2Flag,
                          layerdropout=self._layerdropout,
                          bretFlag=self._bretFlag,
                          bretMaxLen=self._bretMaxLen,
                          bert_path=self._bert_path)
        model, loss = model.build()
        # if self.optimizer.lower() == "adam":
        #     self.optimizer = keras.optimizers.Adamax(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.1)
        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train, y_train, x_valid, y_valid,
                      epochs=epochs, batch_size=batch_size,
                      verbose=verbose, callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model

    def score(self, x_test, y_test, fileToWrite):
        """Returns the f1-micro score on the given test data and labels.

        Args:
            x_test : array-like, shape = (n_samples, sent_length)
                Test samples.
            y_test : array-like, shape = (n_samples, sent_length)
                True labels for x.

        Returns:
            score : float, f1-micro score.
        """
        if self.model:
            # if self._fastArFlag:
            #     ArText = KeyedVectors.load_word2vec_format(self._fastModelAr)
            # if self._fastEnFlag:
            #     EnText = KeyedVectors.load_word2vec_format(self._fastModelEn)
            # if self._ArTwitterFlag:
            #     ArTwitter = gensim.models.Word2Vec.load(self._ArTwitterModel)
            x_test_org = x_test
            x_test = self.p.transform(x_test)
            lengths = map(len, y_test)
            y_pred = self.model.predict(x_test)
            y_pred = self.p.inverse_transform(y_pred, lengths)
            # adjust here
            # vector similarity approach
            # if self._ArTwitterFlag and self._fastEnFlag:
            #     print("here")
            #     AdjustPredTag(t_model=ArTwitter, t_en_model=EnText, x_test_org=x_test_org,
            #                   y_pred=y_pred, ratioSimilarity=0.6, topn=30)
            writeTupleArray(x_test_org, y_pred, fileToWrite)
            # checkerLen(x_test_org, y_pred)
            # print(y_pred)
            print(classification_report(y_test, y_pred))
            score = f1_score(y_test, y_pred)
            print("F-score is")
            return score
        else:
            raise OSError('Could not find a model. Call load(dir_path).')

    def analyze(self, text, tokenizer=str.split):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.
            tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.

        Returns:
            res: dict.
        """
        if not self.tagger:
            self.tagger = Tagger(self.model,
                                 preprocessor=self.p,
                                 tokenizer=tokenizer)

        return self.tagger.analyze(text)

    def save(self, weights_file, params_file, preprocessor_file):
        self.p.save(preprocessor_file)
        save_model(self.model, weights_file, params_file)

    @classmethod
    def load(cls, weights_file, params_file, preprocessor_file):
        self = cls()
        self.p = IndexTransformer.load(preprocessor_file)
        self.model = load_model(weights_file, params_file)
        return self
def analyze(self, words):
    if self.model:
        tagger = Tagger(self.model, preprocessor=self.p)
        return tagger.analyze(words)
    else:
        raise OSError('Could not find a model. Call load(dir_path).')