def load_model_to_app():
    # app.predictor = load_model('./static/POS_BiLSTM_CRF_WSJ_new.h5')
    # POS tagger: load the word and tag tokenizers saved at training time.
    with open('static/POS/tokenizer.json') as f1:
        data1 = json.load(f1)
    tokenizer = tokenizer_from_json(data1)
    app.tokenizer = tokenizer

    with open('static/POS/tag_tokenizer.json') as f2:
        data2 = json.load(f2)
    tag_tokenizer = tokenizer_from_json(data2)
    app.tag_tokenizer = tag_tokenizer

    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1
    tag_index = tag_tokenizer.word_index
    app.index_tag = {i: t for t, i in tag_index.items()}
    tag_size = len(tag_index) + 1

    # Rebuild the BiLSTM-CRF architecture, then restore the trained weights.
    model = create_model(vocab_size, max_length, embedding_dim, word_index, tag_index)
    model.load_weights('static/POS/POS_BiLSTM_CRF_WSJ_new.h5')
    app.pos_tagger = model

    # Sentiment analysis model.
    with open('static/SA/tokenizer.json') as f3:
        data3 = json.load(f3)
    tokenizer3 = tokenizer_from_json(data3)
    app.sa_tokenizer = tokenizer3
    model3 = load_model('static/SA/model.h5')
    app.sa_model = model3
def predict_sentiment(tweets):
    # Load the tokenizer fitted during training.
    with open(config.DATA / 'sentiment' / 'tokenizer_200k.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)

    # Rebuild and compile the Keras model, then restore the trained weights.
    embedding_dim = 100
    max_words = 200000
    max_length = 50

    lstm_model4 = Sequential()
    lstm_model4.add(Embedding(max_words, embedding_dim, input_length=max_length))
    lstm_model4.add(LSTM(64, return_sequences=True))
    lstm_model4.add(LSTM(32))
    lstm_model4.add(Dense(32, activation='relu'))
    # Output layer.
    lstm_model4.add(Dense(1, activation='sigmoid'))
    lstm_model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    lstm_model4.load_weights(config.MODELS / 'sentiment' / 'LSTM_model5_nostop.h5')

    # Pre-process tweets to remove mentions and hashtags.
    political_tweets_proc = list(map(preprocess_tweet, tweets))
    # Transform the tweets to sequences of numbers.
    pol_seqs = tokenizer.texts_to_sequences(political_tweets_proc)
    # Pad with zeros.
    pol_seqs_padded = pad_sequences(pol_seqs, maxlen=max_length)

    return lstm_model4.predict(pol_seqs_padded)
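# Hedged usage sketch (not from the original source): assumes predict_sentiment
# and its config paths are available in this module. The model ends in a single
# sigmoid unit, so each row of the returned array is one probability; 0.5 is
# used below purely as an illustrative decision threshold.
sample_tweets = [
    "The new policy announcement was a great step forward.",
    "This debate was a complete disaster.",
]
probabilities = predict_sentiment(sample_tweets)
for tweet, prob in zip(sample_tweets, probabilities):
    label = 'positive' if float(prob[0]) >= 0.5 else 'negative'
    print('{} ({:.3f}): {}'.format(label, float(prob[0]), tweet))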
def load(self, cache):
    """Load trained model."""
    self.model = load_model(self.model_cache)
    with open(self.tokenizer_cache) as f:
        self.tokenizer = tokenizer_from_json(json.load(f))
    super().load(cache)
def Vectorize2(news_list, json_string, max_words=100):
    from keras.preprocessing.text import tokenizer_from_json
    from keras.preprocessing.sequence import pad_sequences

    maxlen = 100
    training_samples = 200
    validation_samples = 10000

    # rebuild the tokenizer from its saved JSON configuration
    tokenizer = tokenizer_from_json(json_string)

    # convert the strings into lists of token indices
    sequences = tokenizer.texts_to_sequences(news_list)
    # print(len(sequences[0]))

    # dictionary mapping tokens to their indices
    word_index = tokenizer.word_index
    # print(len(word_index.keys()))
    # print('Found unique tokens')

    # pad the list of sequences into a 2D matrix
    data = pad_sequences(sequences, maxlen=1494)
    # print('Shape of data tensor:', data.shape)
    # print('Shape of label:', labels.shape)

    # random permutation of the features (disabled)
    # indices = np.arange(data.shape[0])
    # np.random.shuffle(indices)
    # data = data[indices]
    # labels = labels[indices]
    # print(labels)

    return data
def test_tokenizer_serde_fitting(self):
    sample_texts = [
        "There was a time that the pieces fit, but I watched them fall away",
        "Mildewed and smoldering, strangled by our coveting",
        "I've done the math enough to know the dangers of our second guessing",
    ]
    tokenizer = text.Tokenizer(num_words=100)
    tokenizer.fit_on_texts(sample_texts)

    seq_generator = tokenizer.texts_to_sequences_generator(sample_texts)
    sequences = [seq for seq in seq_generator]
    tokenizer.fit_on_sequences(sequences)

    tokenizer_json = tokenizer.to_json()
    recovered = text.tokenizer_from_json(tokenizer_json)

    self.assertEqual(tokenizer.char_level, recovered.char_level)
    self.assertEqual(tokenizer.document_count, recovered.document_count)
    self.assertEqual(tokenizer.filters, recovered.filters)
    self.assertEqual(tokenizer.lower, recovered.lower)
    self.assertEqual(tokenizer.num_words, recovered.num_words)
    self.assertEqual(tokenizer.oov_token, recovered.oov_token)

    self.assertEqual(tokenizer.word_docs, recovered.word_docs)
    self.assertEqual(tokenizer.word_counts, recovered.word_counts)
    self.assertEqual(tokenizer.word_index, recovered.word_index)
    self.assertEqual(tokenizer.index_word, recovered.index_word)
    self.assertEqual(tokenizer.index_docs, recovered.index_docs)
def get_sequence_of_tokens(corpus, refresh=True):
    """
    Convert a corpus into integer token sequences.

    :param corpus: iterable of text documents to encode
    :param refresh: if True, fit a new Tokenizer on the corpus;
                    otherwise reuse the tokenizer saved in tokenizer.json
    :return: (total_words, seq) — vocabulary size and the encoded sequences
    """
    # tokenization
    if refresh:
        tokenizer = Tokenizer()
        # fit the tokenizer on the text
        tokenizer.fit_on_texts(corpus)
    else:
        with open("tokenizer.json", 'r') as tj:
            tokenizer = tokenizer_from_json(json.load(tj))

    # persist the tokenizer for later runs
    tokenizer_json = tokenizer.to_json()
    with open('tokenizer.json', 'w') as fobj:
        json.dump(tokenizer_json, fobj)

    index_dict = tokenizer.word_index
    seq = tokenizer.texts_to_sequences(corpus)

    # calculate the vocab size
    total_words = len(tokenizer.word_index) + 1
    print(total_words)
    return total_words, seq
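# Hedged usage sketch (not from the original source): illustrates the refresh flag.
# The sample corpus is an illustrative assumption. The first call fits a fresh
# Tokenizer on the corpus and writes tokenizer.json; later calls can pass
# refresh=False to re-encode new text with that saved vocabulary instead of refitting.
corpus = ["the cat sat on the mat", "the dog sat on the log"]
vocab_size, sequences = get_sequence_of_tokens(corpus, refresh=True)
_, new_sequences = get_sequence_of_tokens(["the cat and the dog"], refresh=False)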
def open_txt_tokeznizer(path):
    # open the tokenizer (note: `path` is concatenated directly, so it is
    # expected to already end with a path separator)
    with open(str(path) + 'tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    return tokenizer
def tokenize(df):
    with open('tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    text_sequences = tokenizer.texts_to_sequences(df)
    text_sequences = pad_sequences(text_sequences, 200)
    return text_sequences
def load_pretrained():
    model = load_model('./model/model.h5')
    with open('./model/tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    print('loaded')
    sys.stdout.flush()
    return model, tokenizer
def preprocess(input):
    input = [input]
    with open('tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    tokenized = tokenizer.texts_to_sequences(input)
    padded = sequence.pad_sequences(tokenized, maxlen=MAX_SEQUENCE_LENGTH)
    return padded
def train(self, df, verbose=False, cache=None):
    """Train the neural network."""
    labels = df['label'].to_numpy()
    self.model = load_model(self.model_cache)
    with open(self.tokenizer_cache) as f:
        self.tokenizer = tokenizer_from_json(json.load(f))
    super().train(df, labels, verbose, cache)
def loadSentenceTokenizer(filepath):
    '''Load the sentence tokenizer.'''
    with open(filepath) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    return tokenizer
def loadLabelTokenizer(filepath):
    '''Load the label tokenizer.'''
    with open(filepath) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    return tokenizer
def vqa():
    read = request.get_json()
    if type(read) == str:
        read = json.loads(read)

    # Decode the base64 image into a temporary file.
    img = read['image']
    img_name = 'image' + str(random.randint(1, 1001)) + '.jpg'
    with open(img_name, "wb") as fh:
        fh.write(base64.b64decode(img))

    # Load the question tokenizer, the VQA model, and the image feature extractor.
    with open('vqa/tokenizer2.json') as f:
        data = json.load(f)
        vqa_ques_tokenizer = tokenizer_from_json(data)
    vqa_model = tf.keras.models.load_model('vqa/vqa_model.h5')
    vqa_image_model = tf.keras.applications.xception.Xception(weights='imagenet', include_top=False)
    topAnsIndexWord = pickle.load(open('vqa/topAnsIndexWord2.pkl', 'rb'))

    img = image.load_img(img_name, target_size=(299, 299))
    os.remove(img_name)

    # Obtain features from the image using the same Xception model used in training.
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = tf.keras.applications.xception.preprocess_input(x)
    features = vqa_image_model.predict(x)
    X1 = features.reshape((1, 10 * 10, -1))

    # Clean the input question the same way it was cleaned for the model.
    ques = read['question']
    ques = clean_str(ques)
    X2 = vqa_ques_tokenizer.texts_to_sequences([ques])
    X2 = tf.keras.preprocessing.sequence.pad_sequences(X2, padding='post', truncating='post', maxlen=15)

    data = {}
    data['question'] = read['question']

    # Obtain the model prediction, then convert it with the index-to-word mapper
    # that was built with the model.
    pred = vqa_model.predict([X1, X2])
    pred2 = pred[0].argsort()[-5:][::-1]
    data['answers'] = {}
    txt = ""
    for i in pred2:
        if pred[0][i] > 0.01:
            txt += topAnsIndexWord[i] + " بنسبة " + str(pred[0][i])[:4] + ". \n"
            data['answers'][topAnsIndexWord[i]] = str(pred[0][i])
            break
        else:
            txt = "عفوا، لا يمكنني الإجابة على هذا السؤال"

    data['text'] = txt
    data['sound'] = read_text(txt)
    return jsonify(data)
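# Hedged client-side sketch (not from the original source): shows the JSON payload
# this endpoint expects, i.e. a base64-encoded JPEG under 'image' and a question
# under 'question'. The URL and route are placeholder assumptions.
import base64
import requests

def ask_vqa(image_path, question, url="http://localhost:5000/vqa"):
    with open(image_path, "rb") as img_file:
        encoded = base64.b64encode(img_file.read()).decode("utf-8")
    response = requests.post(url, json={"image": encoded, "question": question})
    # The response carries 'question', 'answers', 'text', and 'sound'.
    return response.json()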
def __init__(self, seq_len):
    # Restore the text and label tokenizers saved during preprocessing.
    with open('./pre-trained/tokenizer.json', 'r', encoding='utf-8') as f1:
        tokenizer_config = json.load(f1)
    with open('./pre-trained/label_tokenizer_json.json', 'r', encoding='utf-8') as f2:
        label_tokenizer_config = json.load(f2)
    self.tokenizer = tokenizer_from_json(tokenizer_config)
    self.label_tokenizer = tokenizer_from_json(label_tokenizer_config)

    # Pre-computed sequences, labels, and embedding matrix.
    self.train_sequences = np.load('./pre-trained/train_sequences.npy')
    self.train_label = np.load('./pre-trained/train_label.npy')
    self.test_sequences = np.load('./pre-trained/test_sequences.npy')
    self.test_label = np.load('./pre-trained/test_label.npy')
    self.embeddings_matrix = np.load('./pre-trained/embeddings_matrix.npy')

    self.embedding_dim = 100
    self.word_index = self.tokenizer.word_index
    self.vocab_size = len(self.word_index)
    self.max_len = seq_len
    self.rnn_units = self.embedding_dim
    self.category_num = len(set(self.test_label[:, 0]))
def _load_jsons(self):
    print("Loading jsons...")
    loaded = read_json(rootpath + "yval_tokens.json")
    self.y_tokenizer = tokenizer_from_json(loaded)

    raw_word2int = read_json(rootpath + "xval_man_tokens.json")
    self.word2int = ast.literal_eval(raw_word2int)
    # print(self.word2int["_NA"], self.word2int["社保"])

    self.reverse_word_map = dict(
        map(reversed, self.y_tokenizer.word_index.items()))
    print("Done with jsons")
    return
def test_tokenizer_serde_no_fitting(self):
    tokenizer = text.Tokenizer(num_words=100)

    tokenizer_json = tokenizer.to_json()
    recovered = text.tokenizer_from_json(tokenizer_json)

    self.assertEqual(tokenizer.get_config(), recovered.get_config())

    self.assertEqual(tokenizer.word_docs, recovered.word_docs)
    self.assertEqual(tokenizer.word_counts, recovered.word_counts)
    self.assertEqual(tokenizer.word_index, recovered.word_index)
    self.assertEqual(tokenizer.index_word, recovered.index_word)
    self.assertEqual(tokenizer.index_docs, recovered.index_docs)
def load_utils(tokenizer_path, labels_path, index_path):
    with open(tokenizer_path, 'r', encoding='utf-8') as jsonfile:
        tokenizer_data = json.load(jsonfile)
        tokenizer = tokenizer_from_json(tokenizer_data)
    with open(labels_path, 'r', encoding='utf-8') as jsonfile:
        reverse_labels = json.load(jsonfile)
    with open(index_path, 'r') as jsonfile:
        index = json.load(jsonfile)
    return tokenizer, reverse_labels, index
def preprocess(self, tokenizer_string=None):
    """
    Preprocess the textual data.

    Returns
    -------
    x_train: The sequenced and padded training data.
    y_train: The processed training labels.
    x_val: The sequenced and padded validation data.
    y_val: The processed validation labels.
    word_index: A dictionary mapping word tokens to the indices used
        for the sequencing.
    """
    if tokenizer_string is None:
        tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS)
        tokenizer.fit_on_texts(self.texts)
        self.tokenizer_string = tokenizer.to_json()
    else:
        self.tokenizer_string = tokenizer_string
        from keras.preprocessing.text import tokenizer_from_json
        tokenizer = tokenizer_from_json(tokenizer_string)

    sequences = tokenizer.texts_to_sequences(self.texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(self.labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # split the data into a training set and a validation set
    if self.VALIDATION_SPLIT:
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        num_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])

        x_train = data[:-num_validation_samples]
        y_train = labels[:-num_validation_samples]
        x_val = data[-num_validation_samples:]
        y_val = labels[-num_validation_samples:]
    else:
        x_train = data
        y_train = labels
        x_val = None
        y_val = None

    return x_train, y_train, x_val, y_val, word_index
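# Hedged usage sketch (not from the original source): shows how the to_json()
# string exposed through self.tokenizer_string could be persisted after a training
# run and fed back in so a later run sequences text with the exact same vocabulary.
# The file name and the `prep` / `inference_prep` objects are illustrative assumptions.
x_train, y_train, x_val, y_val, word_index = prep.preprocess()
with open('tokenizer.json', 'w') as f:
    f.write(prep.tokenizer_string)

with open('tokenizer.json') as f:
    saved_tokenizer_string = f.read()
x_new, y_new, _, _, _ = inference_prep.preprocess(tokenizer_string=saved_tokenizer_string)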
def load_tokenizer():
    # Reuse the cached tokenizer if it has already been fitted and saved.
    if os.path.exists(config.TOKENIZER_PATH):
        with open(config.TOKENIZER_PATH) as f:
            return tokenizer_from_json(f.read())

    train_sentences, val_sentences, test_sentences = load_sentences()
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_sentences)
    tokenizer.fit_on_texts(val_sentences)
    tokenizer.fit_on_texts(test_sentences)

    # Persist the fitted tokenizer to file.
    with open(config.TOKENIZER_PATH, 'w') as f:
        f.write(tokenizer.to_json(ensure_ascii=False))
    return tokenizer
def predict(self, model):
    text = self.statement_text.toPlainText()
    if text.isspace():
        return
    text = " ".join(self.preprocess().split("\n"))

    model_path = Path(__file__).parent.absolute() / "model"
    with open(model_path / "tokenizer.json", "r") as f:
        tokenizer_json = f.read()
        if not tokenizer_json:
            raise IOError("Cannot read tokenizer")
    tokenizer = tokenizer_from_json(tokenizer_json)

    x = tokenizer.texts_to_matrix([text], mode="binary")
    p = model.predict(x)
    y = argmax(p, axis=-1)
    pred = " ".join([word.capitalize() for word in types[y[0].item()].split("-")])
    prob = f"{round(amax(p, axis=-1)[0].item() * 100, 2)}%"
    self.label_3.setText(f"Prediction: {pred}\nProbability: {prob}")
def load_tokenizer(texts=None, num_words=MAX_WORDS):
    file = os.path.join(DATA_HOME, SAVE_DIR, __TOKENIZER_FILE.format(num_words))

    # tokenizer config file exists: load it and return the tokenizer
    if os.path.exists(file):
        print('loading tokenizer')
        with open(file, 'r') as f:
            return tokenizer_from_json(f.readline())

    if texts is None:
        texts, _ = load_raw_text()  # load the review data

    # use the requested vocabulary size (the original hard-coded MAX_WORDS here)
    tokenizer = Tokenizer(num_words=num_words)
    print('fitting tokenizer')
    tokenizer.fit_on_texts(texts)

    # avoid shadowing the json module when saving the config
    tokenizer_json = tokenizer.to_json()
    print('saving tokenizer')
    with open(file, 'w') as f:
        f.write(tokenizer_json)

    return tokenizer
def __init__(self, epochs=5, batch_size=36, max_seq_len=25, fit_verbose=2,
             print_summary=True, load_model_path=None, tokenizer_path=None):
    self.epochs = epochs
    self.batch_size = batch_size
    self.max_seq_len = max_seq_len
    self.fit_verbose = fit_verbose
    self.print_summary = print_summary
    self.encoder = LabelEncoder()

    if load_model_path:
        # Resume from a saved model and its matching tokenizer.
        self.model = load_model(load_model_path)
        with open(tokenizer_path) as f:
            data = json.load(f)
            self.tokenizer = tokenizer_from_json(data)
    else:
        self.model = self.model_1b
        self.tokenizer = Tokenizer()
def textPreproc(self, text_in):
    # Delete special characters.
    print('[model.py] Deleting Special Character..')
    temp_text = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z ]", "", str(text_in))
    print('[model.py] >>> ', temp_text)

    # Tokenize with Okt (stemming enabled).
    print('[model.py] Tokenizing..')
    okt = Okt()
    tocken_text = okt.morphs(temp_text, stem=True)
    print('[model.py] >>> ', tocken_text)

    # Delete stopwords.
    print('[model.py] Deleting Stopword..')
    stopwords = [
        '의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과',
        '도', '를', '으로', '자', '에', '와', '한', '하다'
    ]  # stopword list
    tocken_text = [word for word in tocken_text if not word in stopwords]
    tocken_text = [tocken_text]
    print('[model.py] >>> ', tocken_text)

    # Load the saved tokenizer and convert the tokens to index sequences.
    print('[model.py] Loading tokenizer..')
    with open('./../model/tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    tocken_text = tokenizer.texts_to_sequences(tocken_text)
    print('[model.py] >>> ', tocken_text)

    # Pad to a fixed array size.
    print('[model.py] Syncing array size..')
    max_array_len = 30
    preprocessed_data = pad_sequences(tocken_text, maxlen=max_array_len)
    print('[model.py] >>> ', preprocessed_data)

    print('[model.py] Preprocessing Done!')
    return preprocessed_data
def WordtoInt(arr: List[str]) -> List[List[int]]:
    with open('/app/ml_controller/tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    # `arr` is already a list of tokens, so it is passed as a single
    # pre-tokenized text to texts_to_sequences.
    return tokenizer.texts_to_sequences([arr])
aa_list = aa_list_pre[30001:38001]

# Encrypt DNA and AA sequences into separate 'words' by adding spaces
# every 3 (DNA) or 1 (AA) characters.
aa_spaces = []
for aa_seq in aa_list:
    aa_current = encrypt(aa_seq, 1)
    aa_spaces.append(aa_current)

dna_spaces = []
for dna_seq in dna_list:
    dna_current = encrypt(dna_seq, 3)
    dna_spaces.append(dna_current)

# Import tokenizers as JSON (must be the same tokenizers used in training).
with open('aa_tokenizer.json') as f:
    aa_json = json.load(f)
    aa_tokenizer = tokenizer_from_json(aa_json)

with open('dna_tokenizer.json') as f:
    dna_json = json.load(f)
    dna_tokenizer = tokenizer_from_json(dna_json)

# Preprocess DNA and AA sequences (tokenize and pad).
preproc_aa, preproc_dna = preprocess(aa_spaces, dna_spaces)

# Ensure correct dimensionality.
tmp_x = pad(preproc_aa, preproc_dna.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_dna.shape[-2]))

# Evaluate the test sequences on the trained model.
results = model.evaluate(preproc_aa, preproc_dna, batch_size=16)
import pickle
import re
import json

import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import tokenizer_from_json

# Load the label set, the fitted tokenizer, and the trained model.
with open('label_set.pkl', 'rb') as f:
    label_set = pickle.load(f)

with open('tokenizer.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

with open('finalized_model.sav', 'rb') as handle:
    model = pickle.load(handle)


def cleanPunctuation(sentence):
    # Clean the sentence of punctuation and special characters.
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r' ', cleaned)
    cleaned = cleaned.replace("\n", " ")
    return cleaned


def keepAlpha(sentence):
    # Keep only alphabetic characters and spaces.
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent
def load_tokenizer():
    with open('Data/tokenized-chars.json') as json_file:
        tokenizer_conf = json.load(json_file)
    tokenizer = tokenizer_from_json(tokenizer_conf)
    return tokenizer
def load_bow(path):
    with open(path) as f:
        _bow = json.load(f)
    bow = tokenizer_from_json(_bow)
    return bow
x_anomalous_len = len(x_anomalous)
x = x_normal + x_anomalous

# Label normal requests as 0 and anomalous requests as 1.
no_yy = [0 for i in range(x_normal_len)]
an_yy = [1 for i in range(x_anomalous_len)]
y = no_yy + an_yy

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21)
print('len x_test: {}, len y_test: {}'.format(len(x_test), len(y_test)))

# Restore the character-level tokenizer saved during training.
with open('data/tokenized-chars.json') as json_file:
    tokenizer_conf = json.load(json_file)
    tokenizer = tokenizer_from_json(tokenizer_conf)

char_index = tokenizer.word_index
to_predict = x_test

# Create the numerical sequences by mapping the characters to their indices.
sequences = tokenizer.texts_to_sequences(to_predict)

maxlen = 1000  # length of the longest sequence = input_length
xx = pad_sequences(sequences, maxlen=maxlen)

model = models.load_model('model/lstm-model.h5')
model.load_weights('model/lstm-weights.h5')
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])