def predict(data, maxlen, model_dir, output_file_name):
    cleaned_predict = preprocess(data['document'].values)
    if selected_layer == 'bert':  # selected_layer and vocab_file are module-level settings
        FullTokenizer = bert.bert_tokenization.FullTokenizer
        tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
        predict_tokens = [["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
                          for sentence in cleaned_predict]
        predict_data = [tokenizer.convert_tokens_to_ids(token) for token in predict_tokens]
        predict_data = pad_sequences(predict_data, maxlen=maxlen, dtype="long",
                                     truncating="post", padding="post")
    else:
        with open('tokenizer.json') as f:
            json_data = json.load(f)
            tokenizer = tokenizer_from_json(json_data)
        predict_data = tokenizer.texts_to_sequences(cleaned_predict)
        predict_data = pad_sequences(predict_data, padding='post', maxlen=maxlen)
    model = load_model(model_dir)

    # Predict a probability and a label for each example and write
    # text, probability, and label to the output file.
    result = model.predict(predict_data)
    label = np.around(result)  # reuse the predictions instead of calling predict() twice
    with open(output_file_name, 'w', encoding='utf-8') as fw:
        for i in range(len(data)):
            fw.write('{}\t{}\t{}\n'.format(data['document'].iloc[i], result[i], label[i]))

def convert_text_to_index_array(text):
    # We're still going to use a Tokenizer here, but we don't need to fit it.
    # Read in our saved dictionary (a serialized Keras tokenizer).
    # Previously loaded from:
    # with open('D:\\app\\DL_models\\Aspect\\vocab.json', 'r') as dictionary_file:
    #     dictionary = json.load(dictionary_file)
    with open(
            'D:\\Final IT\\GRADUATE THESIS\\Projects\\CNN\\word2vec\\vocab.json'
    ) as f:
        data = json.load(f)
        dictionary = tokenizer_from_json(data)
    words = kpt.text_to_word_sequence(
        text, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n\'')
    wordIndices = []
    for word in words:
        if word in dictionary.word_index:
            # word_index maps each word to its integer index; the original
            # lookup used word_docs, which only stores document counts and
            # does not yield valid indices.
            wordIndices.append(dictionary.word_index[word])
        else:
            print("'%s' not in training corpus; ignoring." % word)
    return wordIndices

def setUp(self):
    self.model = load_model(filename)
    with open(vocabualry_file, "r") as json_file:
        json_content = json_file.read()
        self.tokenizer = tokenizer_from_json(json_content)

def evaluation(text):
    clean_text = treatment(text)

    # Convert the text into sequences of indexes
    with open('tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    # Replace each word with the numeric identifier that represents its index
    # in the tokenizer's dictionary
    X = tokenizer.texts_to_sequences([clean_text])

    # Pad the sequences so they all have the same length
    X = pad_sequences(X, 40)

    # Reshape the input for the model
    X_ = X.reshape(X.shape[0], X.shape[1], 1)

    # Load the model and predict
    model = keras.models.load_model('best_model.hdf5')
    predictions = model.predict(X_).reshape(1, -1)[0]
    predictions = ["Positivo" if x < 0.5 else "Negativo" for x in predictions]
    return (clean_text, predictions[0])

def test_tokenizer_serde_fitting():
    sample_texts = [
        'There was a time that the pieces fit, but I watched them fall away',
        'Mildewed and smoldering, strangled by our coveting',
        'I\'ve done the math enough to know the dangers of our second guessing'
    ]
    tokenizer = text.Tokenizer(num_words=100)
    tokenizer.fit_on_texts(sample_texts)

    seq_generator = tokenizer.texts_to_sequences_generator(sample_texts)
    sequences = [seq for seq in seq_generator]
    tokenizer.fit_on_sequences(sequences)

    tokenizer_json = tokenizer.to_json()
    recovered = text.tokenizer_from_json(tokenizer_json)

    assert tokenizer.char_level == recovered.char_level
    assert tokenizer.document_count == recovered.document_count
    assert tokenizer.filters == recovered.filters
    assert tokenizer.lower == recovered.lower
    assert tokenizer.num_words == recovered.num_words
    assert tokenizer.oov_token == recovered.oov_token

    assert tokenizer.word_docs == recovered.word_docs
    assert tokenizer.word_counts == recovered.word_counts
    assert tokenizer.word_index == recovered.word_index
    assert tokenizer.index_word == recovered.index_word
    assert tokenizer.index_docs == recovered.index_docs

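# A minimal sketch (not from the original sources) of the round-trip exercised
# by the serde test above: serialize a fitted Tokenizer with to_json(), write
# it to disk, and restore it later with tokenizer_from_json(). The file name
# 'tokenizer.json' and the tiny sample corpus are assumptions; the import path
# assumes TensorFlow's bundled Keras.
def _tokenizer_roundtrip_sketch():
    from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json

    tokenizer = Tokenizer(num_words=100, oov_token='<unk>')
    tokenizer.fit_on_texts(['a small sample corpus', 'just for illustration'])

    # Save: to_json() returns a JSON string, so it can be written out as-is.
    with open('tokenizer.json', 'w', encoding='utf-8') as f:
        f.write(tokenizer.to_json())

    # Load: read the string back and rebuild an equivalent tokenizer.
    with open('tokenizer.json', encoding='utf-8') as f:
        restored = tokenizer_from_json(f.read())

    assert tokenizer.word_index == restored.word_index
    return restored
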
def get_tokenizer(self):
    '''
    :return: tokenizer fitted on the stop-word-filtered corpus; the vocabulary
        size is limited to self.embedded_matrix_size (the embedding matrix size)
    '''
    if not os.path.exists(self.tokenizer_path):
        self.remove_stop_word_list = self.get_remove_stop_word()
        tokenizer = text.Tokenizer(num_words=self.embedded_matrix_size,
                                   lower=False,
                                   char_level=False)
        tokenizer.fit_on_texts(self.remove_stop_word_list)
        tokenizer_json = tokenizer.to_json()
        with open(self.tokenizer_path, "w") as f:
            f.write(tokenizer_json)
        print("saved tokenizer_json to '{}'".format(self.tokenizer_path))
        return tokenizer
    else:
        print("To switch datasets, delete '{}' manually; rerunning the code "
              "will regenerate the tokenizer automatically.".format(
                  self.tokenizer_path))
        with open(self.tokenizer_path, "r") as f:
            tokenizer_json = f.read()
            tokenizer = text.tokenizer_from_json(tokenizer_json)
        print("loaded tokenizer_json from '{}'".format(self.tokenizer_path))
        return tokenizer

def eval(ctx, evaluation_data, model_dir, batch_size):
    MODEL = ctx.obj['model']
    _, x_test, y_test = load_data(evaluation_data, sample_ratio=1, shuffle=False)

    with open(join(model_dir, 'tokenizer.json')) as f:
        json_string = f.read()
        tokenizer = tokenizer_from_json(json_string)
    with open(join(model_dir, 'model.config'), 'rb') as f:
        config = pickle.load(f)

    makedirs(model_dir, exist_ok=True)
    x_test, _, tokenizer = \
        data_preprocessing(x_test, max_len=config['max_len'], tokenizer=tokenizer)

    config['batch_size'] = batch_size
    config = Bunch(config)

    sess = tf.compat.v1.Session()
    model = MODEL(config)
    logger = Logger(sess, config)
    trainer = BaseTrain(sess,
                        model,
                        None,
                        config,
                        logger,
                        val_data=(x_test, y_test),
                        restore=True)
    summaries_dict, _, _ = trainer.eval()
    print(summaries_dict)
    with open(
            join(model_dir,
                 f'evaluation-on-"{basename(evaluation_data)}".json'),
            'w') as fp:
        json.dump(summaries_dict, fp, cls=CustomJSONEncoder)

def _load_tokenizer(config):
    with open(
            os.path.join(config.get('PATHS', 'model_path'), 'saved_models',
                         config.get('INFERENCE', 'inference_model'),
                         'tokenizer.json'), 'rb') as f:
        tok = json.load(f)
    return tokenizer_from_json(tok)

def test(dataset, style, test_file):
    obj = json.load(open(test_file, 'r'))
    # obj = json.load(open('/media/wentian/sdb2/work/caption_ma/save/2019-10-04_21-09-37_2agent_neg/annotation.json', 'r'))['annotations']
    sents = [i['caption'] for i in obj]

    d = pickle.load(
        open(r'../data/clf_nn/model_rnn_info_{}_{}.pkl'.format(dataset, style),
             'rb'))
    w, e, tokenizer_config = d['word_index'], d['embeddings_index'], d[
        'tokenizer_config']
    MAX_SEQUENCE_LENGTH, nclasses, EMBEDDING_DIM = d['MAX_SEQUENCE_LENGTH'], d[
        'nclasses'], d['EMBEDDING_DIM']
    tokenizer = tokenizer_from_json(tokenizer_config)

    model_RNN = Build_Model_RNN_Text(w,
                                     e,
                                     nclasses=nclasses,
                                     MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH,
                                     EMBEDDING_DIM=EMBEDDING_DIM)
    model_RNN.load_weights('../data/clf_nn/model_rnn_{}_{}.h5'.format(
        dataset, style))

    X_train_Glove = test_tokenize(tokenizer, sents, MAX_SEQUENCE_LENGTH)
    predicted = model_RNN.predict_classes(X_train_Glove, verbose=0)
    for i in range(len(predicted)):
        if predicted[i] == 0:
            print(sents[i], predicted[i])
    print(sum(predicted) / len(predicted))

def __init__(self):
    self.__assessment_model = tf.keras.models.load_model(
        'release/assessment_model.h5', custom_objects={'soft_acc': soft_acc})
    self.__simplification_model = tf.keras.models.load_model(
        'release/simplification_model.h5')
    with open('release/assessment_tokenizer.json') as f:
        data = json.load(f)
        self.__assessment_tokenizer = tokenizer_from_json(data)
    with open('release/simplification_tokenizer.json') as f:
        data = json.load(f)
        self.__simplification_tokenizer = tokenizer_from_json(data)
    self.levels = ['Beginner', 'Intermediate', 'Advanced']
    self.__id_to_word = {
        i: word
        for word, i in self.__simplification_tokenizer.word_index.items()
    }

def get_tokenizer(folder_path):
    folder_path = os.path.join('tmp/', folder_path)
    tokenizer_path = os.path.join(folder_path, 'tokenizer.json')
    with open(tokenizer_path) as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
    return tokenizer

def loadModel(name='qasystem'):
    with open(name + '.tok', 'rb') as fin:
        tokJSON = pickle.load(fin)
    return load_model(
        name + '.model',
        custom_objects={'SeqSelfAttention': SeqSelfAttention
                        }), tokenizer_from_json(tokJSON)

def __init__(self):
    self.logger = logging.getLogger('predictor.Predictor')
    self.nn = NeuralNetwork()
    with open(self.BRANDS, 'r') as f:
        self.brands = np.array(json.load(f))
    with open(self.CATEGORIES, 'r') as f:
        self.categories = np.array(json.load(f))
    with open(self.TOKENIZER, 'r') as f:
        self.tokenizer = tokenizer_from_json(f.read())
    self.nn.load([self.MODEL_ARCH, self.MODEL_WEIGHTS])

def __init__(self):
    self.preprocessor = textPreprocessor()
    self.model = load_model(
        os.path.join(CURRENT_PATH, "../../artifacts/c_lstm_reduced.h5"))
    self.model._make_predict_function()
    with open(
            os.path.join(CURRENT_PATH,
                         "../../artifacts/keras_tokenizer.json")) as f:
        data = json.load(f)
        self.tokenizer = tokenizer_from_json(data)

def get_tokenizer(self):
    self.tokenizer_path = config.data_preprocessing_config().tokenizer_path
    if not os.path.exists(self.tokenizer_path):
        print("please run data_preprocessing to generate '{}'".format(
            self.tokenizer_path))
        exit()
    else:
        with open(self.tokenizer_path, "r") as f:
            tokenizer_json = f.read()
            tokenizer = text.tokenizer_from_json(tokenizer_json)
        return tokenizer

def preload_model():
    '''
    Returns nothing. Creates two global variables.

    keras_model: the Keras model used to perform sentiment analysis.
    tokenizer: the Keras tokenizer used to tokenize inputs for prediction.
    '''
    global tokenizer
    global keras_model
    with open(os.path.join(dirname, 'tokenizer.json')) as f:
        json_data = json.load(f)
        tokenizer = tokenizer_from_json(json_data)
    keras_model = tf.keras.models.load_model(os.path.join(dirname, 'model.h5'))

def load_tokenizer(self, file_name=None):
    if file_name is None:
        file_name = tokenizer_path + "/" + self.source + '_tokenizer.json'
    if os.path.isfile(file_name):
        with open(file_name) as f:
            data = json.load(f)
            tokenizer = tokenizer_from_json(data)
        return tokenizer
    else:
        return Tokenizer(num_words=10000)

def inferEmotion(self, input_string):
    with open('tokenizer.json') as f:
        data = json.load(f)
    newdata = json.dumps(data)
    tokenizer = text.tokenizer_from_json(newdata)

    tokenized_inputs = tokenizer.texts_to_sequences([input_string])
    padded_inputs = sequence.pad_sequences(tokenized_inputs, maxlen=1000)
    formatted_input = padded_inputs.tolist()

    data = json.dumps({"instances": formatted_input})
    json_response = requests.post(
        "http://diadist.herokuapp.com/v1/models/diarydistiller/versions/1:predict",
        data=data)
    response = json.loads(json_response.text)
    return response["predictions"][0]

def test_tokenizer_serde_no_fitting():
    tokenizer = text.Tokenizer(num_words=100)

    tokenizer_json = tokenizer.to_json()
    recovered = text.tokenizer_from_json(tokenizer_json)

    assert tokenizer.get_config() == recovered.get_config()

    assert tokenizer.word_docs == recovered.word_docs
    assert tokenizer.word_counts == recovered.word_counts
    assert tokenizer.word_index == recovered.word_index
    assert tokenizer.index_word == recovered.index_word
    assert tokenizer.index_docs == recovered.index_docs

def preprocess_data(data,
                    use_loaded=True,
                    file_emb="./data/glove.840B.300d.txt",
                    max_num_words=50000,
                    max_len_seq=35,
                    emb_dim=300):
    # preprocess data
    file_processed_data = dir_processed + "data_processed.pkl"
    file_tokenizer = dir_processed + "tokenizer.pkl"
    file_label_index = dir_processed + "label_index.npy"

    if use_loaded:
        # Reload previously processed data, tokenizer and label encoder
        X, y, emb = pickle.load(open(file_processed_data, "rb"))
        tokenizer = tokenizer_from_json(
            open(file_tokenizer, "r", encoding="utf-8").read())
        label_encoder = LabelEncoder()
        label_encoder.classes_ = np.load(file_label_index)
        return X, y, emb, tokenizer, label_encoder

    cleaned_text = data["text"].apply(clean_text).values
    tokenizer = Tokenizer(num_words=max_num_words,
                          oov_token='oov_token_placeholder')
    tokenizer.fit_on_texts(list(cleaned_text))
    tokenizer_json = tokenizer.to_json(ensure_ascii=False)
    with open(file_tokenizer, 'w', encoding='utf-8') as fout:
        fout.write(tokenizer_json)

    sequences = tokenizer.texts_to_sequences(cleaned_text)
    X = pad_sequences(sequences, maxlen=max_len_seq)
    word_index = tokenizer.word_index
    num_words = len(word_index)
    print('Found %s Words' % num_words)

    print(set(data["label"].values))
    label_encoder = LabelEncoder().fit(data["label"].values)
    np.save(file_label_index, label_encoder.classes_)
    print('Found %s Classes' % len(label_encoder.classes_))
    y = label_encoder.transform(data["label"].values)

    print('Loading Word Embeddings...')
    emb = (np.random.rand(min(num_words + 1, max_num_words), emb_dim) -
           0.5) * 0.1  # +1 because idx 0 is not used
    with open(file_emb, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.rstrip().split(' ')
            if tokens[0] in word_index.keys(
            ) and word_index[tokens[0]] < max_num_words:
                emb[word_index[tokens[0]]] = np.asarray(tokens[1:],
                                                        dtype='float32')

    pickle.dump((X, y, emb), open(file_processed_data, "wb"))
    return X, y, emb, tokenizer, label_encoder

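# A hedged usage sketch for preprocess_data() above. The CSV path is an
# assumption for illustration; the function itself expects a DataFrame with
# "text" and "label" columns and relies on module-level dir_processed /
# clean_text definitions.
def _preprocess_data_usage_sketch():
    import pandas as pd

    data = pd.read_csv("./data/train.csv")  # hypothetical file with text/label columns
    X, y, emb, tokenizer, label_encoder = preprocess_data(data, use_loaded=False)
    print(X.shape, y.shape, emb.shape,
          len(tokenizer.word_index), len(label_encoder.classes_))
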
def __fit_tokenizer(self):
    if os.path.isfile(config.vocab_path):
        with open(config.vocab_path, 'r') as f:
            json_content = f.read()
            self.tokenizer = tokenizer_from_json(json_content)
    else:
        tmp_doc = (self.beg_token + ' ' + self.end_token + ' ') * 100
        docs = [tmp_doc, self.__read_raw_formulas('train')]
        num_tokens = config.vocab_size - 3  # reserve slots for the beg, end and unk tokens
        self.tokenizer = Tokenizer(num_words=num_tokens,
                                   filters='\t\n',
                                   lower=False,
                                   oov_token=self.unk_token)
        self.tokenizer.fit_on_texts(docs)
        with open(config.vocab_path, 'w+') as f:
            f.write(self.tokenizer.to_json())

def from_numpy(
    self,
    train_data_file: str,
    test_data_file: str = None,
    val_data_file: str = None,
    ds_type="TensorDataset",
):
    logging.info("Starting Data Preparation...")
    start_time = time.time()

    # NOTE: tokenizer_from_json() requires the serialized tokenizer JSON
    # string as an argument; the source of that string is not shown here.
    self.tokenizer = text.tokenizer_from_json()

    train_npz = np.load(train_data_file, allow_pickle=True)
    self.X_train = train_npz["X"].item()
    self.y_train = train_npz["y"]
    self.num_classes = len(np.unique(self.y_train))
    self.vocab_size = np.shape(self.X_train)[1]
    train_ds = CSRDataset(self.X_train, self.y_train)

    if test_data_file is not None:
        test_npz = np.load(test_data_file, allow_pickle=True)
        self.X_test = test_npz["X"].item()
        self.y_test = test_npz["y"]
        test_ds = CSRDataset(self.X_test, self.y_test)

    if val_data_file is not None:
        val_npz = np.load(val_data_file, allow_pickle=True)
        self.X_val = val_npz["X"].item()
        self.y_val = val_npz["y"]
        val_ds = CSRDataset(self.X_val, self.y_val)

    logging.info("Data Import Completed - Time elapsed: " +
                 get_elapsed_time(start_time))

    if val_data_file is not None:
        if test_data_file is not None:
            return train_ds, val_ds, test_ds
        else:
            return train_ds, val_ds
    else:
        return train_ds

def load_vocabulary_file_from(directory_path_or_file, split_name=None, flat=True):
    """
    @param split_name: when given, looks for the sub-directory or file in the flat directory
    @param flat: when True, looks for a file in the given directory; otherwise looks into the sub-directory
    """
    lookup_filename = DEFAULT_VOCABULARY_FILE_NAME
    if split_name and not flat:
        directory_path_or_file = "/".join([directory_path_or_file, split_name])
    if split_name and flat:
        lookup_filename = "mscoco_vocabulary_{}.json".format(split_name)
        # print("No support for split specific vocabulary loading. Please just name the file to use to " + lookup_filename)
    tokenizer_config = load_json_from(directory_path_or_file, lookup_filename)
    tokenizer = tokenizer_from_json(json.dumps(tokenizer_config))
    return tokenizer

def review_process(request):
    if request.method == 'POST':
        review_stmt = request.POST.get('review')
        english_stops = set(stopwords.words('english'))
        max_length = 130  # from the training data

        # Load the stored model
        loaded_model = load_model(
            '/home/dsdroid/Desktop/6th Sem/SWLAB/Assignment 4/Sentiment-Analysis/review_classifier/review/sentiment.h5'
        )

        # Pre-process the input
        regex = re.compile(r'[^a-zA-Z\s]')
        review_stmt = regex.sub('', review_stmt)
        print('Cleaned: ', review_stmt)
        words = review_stmt.split(' ')
        filtered = [w for w in words if w not in english_stops]
        filtered = ' '.join(filtered)
        filtered = [filtered.lower()]
        print('Filtered: ', filtered)

        # Restore the tokenizer used during training
        with open(
                '/home/dsdroid/Desktop/6th Sem/SWLAB/Assignment 4/Sentiment-Analysis/review_classifier/review/data.txt'
        ) as json_file:
            token = json.load(json_file)
            token = tokenizer_from_json(token)
        print("TOKEN:", token)

        tokenize_words = token.texts_to_sequences(filtered)
        tokenize_words = pad_sequences(tokenize_words,
                                       maxlen=max_length,
                                       padding='post',
                                       truncating='post')
        print('tokenize words', tokenize_words)

        result = loaded_model.predict(tokenize_words)
        print(result)
        if result >= 0.5:
            print('positive')
            return HttpResponse('Positive review')
        else:
            print('negative')
            return HttpResponse('Negative review')
    else:
        return render(request, 'form.html')

def input_Text_proc(text_Data):
    text_Data = text_Data.strip()
    # Split on common separators; the original pattern ended with an empty
    # alternative ('| |'), which makes re.split() match at every position,
    # so the trailing '|' is dropped here.
    text_Data = re.split('; |, |\*|\n| ', text_Data)
    print(text_Data)

    global tokenizer
    if tokenizer is None:
        print('Loading tokenizer...')
        with open('./Tokenizer/tokenizer2.json') as f:
            token_json = json.load(f)
            tokenizer = text.tokenizer_from_json(token_json)

    # Map each word to its index, using 0 for out-of-vocabulary words
    res = [[]]
    for w in text_Data:
        w = w.lower()
        if w not in tokenizer.word_index:
            res[0].append(0)
        else:
            res[0].append(tokenizer.word_index[w])
    print(res)
    return res

def predict(ctx, prediction_data, model_dir):
    MODEL = ctx.obj['model']
    titles_test, x_test, y_test = load_data(prediction_data, shuffle=False)
    labels_available = True

    with open(join(model_dir, 'tokenizer.json')) as f:
        json_string = f.read()
        tokenizer = tokenizer_from_json(json_string)
    with open(join(model_dir, 'model.config'), 'rb') as f:
        config = pickle.load(f)

    makedirs(model_dir, exist_ok=True)
    x_test, _, tokenizer = \
        data_preprocessing(x_test, max_len=config['max_len'], tokenizer=tokenizer)

    config['batch_size'] = 1
    config = Bunch(config)

    sess = tf.compat.v1.Session()
    model = MODEL(config)
    if y_test is None:
        y_test = np.zeros((x_test.shape[0], model.y.shape[1]))
        labels_available = False

    logger = Logger(sess, config)
    trainer = BaseTrain(sess,
                        model,
                        None,
                        config,
                        logger,
                        val_data=(x_test, y_test),
                        restore=True,
                        no_labels=True)
    _, predictions, probabilities = trainer.eval()

    frame_dict = {'title': titles_test, 'prediction': predictions}
    for i in range(probabilities.shape[1]):
        frame_dict[f'probability_{i}'] = probabilities[:, i]
    if labels_available:
        frame_dict['true'] = np.argmax(y_test, axis=1)

    df = pd.DataFrame.from_dict(frame_dict)
    print(df)
    df.to_csv(join(model_dir,
                   f'predictions-for-"{basename(prediction_data)}".csv'),
              index=False)

def get_predict(text):
    tokenizer = Tokenizer(num_words=50000)
    # Class labels (Vietnamese): 'undetermined', 'hypotension', 'inflammatory bowel disease'
    labels = ['chưa xác định', 'bệnh hạ huyết áp', 'bệnh viêm đường ruột']

    with open('C:\\Users\\HONGANH\\OneDrive\\Resources\\vocab.json') as f:
        data = json.load(f)
        dictionary = tokenizer_from_json(data)

    # Load the model architecture and weights
    json_file = open(
        'C:\\Users\\HONGANH\\OneDrive\\Resources\\CNN_train_3c_relu.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    model.load_weights(
        'C:\\Users\\HONGANH\\OneDrive\\Resources\\CNN_train_3c-035-0.0476-0.9940.h5'
    )

    tok_sam, seq_sam, sample = load_full_data()
    sentence = []
    evalSentence = text
    if evalSentence:
        evalSentence = evalSentence.lower()
    sentence.append(evalSentence)

    eval_text = tok_sam.texts_to_sequences(sentence)
    text_test = pad_sequences(eval_text, maxlen=sample.shape[1], padding=pad[0])

    pred = model.predict(text_test)
    # "độ tin cậy" = "confidence"
    print("%s; độ tin cậy %f%%" %
          (labels[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))
    del evalSentence
    sentence = []
    return labels[np.argmax(pred)]

def classify_review(review_stmt):
    # Returns True for a positive review, False for a negative one
    english_stops = set(stopwords.words('english'))
    max_length = 130  # from the training data

    # Load the stored model
    loaded_model = load_model(
        '/home/dsdroid/Desktop/6th Sem/SWLAB/Assignment 4/Sentiment-Analysis/review_classifier/review/sentiment.h5'
    )

    # Pre-process the input
    regex = re.compile(r'[^a-zA-Z\s]')
    review_stmt = regex.sub('', review_stmt)
    print('Cleaned: ', review_stmt)
    words = review_stmt.split(' ')
    filtered = [w for w in words if w not in english_stops]
    filtered = ' '.join(filtered)
    filtered = [filtered.lower()]

    # Restore the tokenizer used during training
    with open(
            '/home/dsdroid/Desktop/6th Sem/SWLAB/Assignment 4/Sentiment-Analysis/review_classifier/review/data.txt'
    ) as json_file:
        token = json.load(json_file)
        token = tokenizer_from_json(token)

    tokenize_words = token.texts_to_sequences(filtered)
    tokenize_words = pad_sequences(tokenize_words,
                                   maxlen=max_length,
                                   padding='post',
                                   truncating='post')
    print(tokenize_words)

    result = loaded_model.predict(tokenize_words)
    if result >= 0.5:
        return True   # positive
    else:
        return False  # negative

def eval(data, batch, maxlen, selected_layer, model_dir):
    cleaned_test = preprocess(data['document'].values)
    if selected_layer == 'bert':
        FullTokenizer = bert.bert_tokenization.FullTokenizer
        tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
        eval_tokens = [["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
                       for sentence in cleaned_test]
        eval_data = [tokenizer.convert_tokens_to_ids(token) for token in eval_tokens]
        eval_data = pad_sequences(eval_data, maxlen=maxlen, dtype="long",
                                  truncating="post", padding="post")
    else:
        with open('tokenizer.json') as f:
            json_data = json.load(f)
            tokenizer = tokenizer_from_json(json_data)
        eval_data = tokenizer.texts_to_sequences(cleaned_test)
        eval_data = pad_sequences(eval_data, padding='post', maxlen=maxlen)

    labels = np.array(data['label'])
    model = load_model(model_dir)
    test_loss, test_acc = model.evaluate(x=eval_data,
                                         y=labels,
                                         verbose=1,
                                         batch_size=batch)
    print("Test Loss: {}\nTest Accuracy: {}".format(test_loss, test_acc))

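# A hedged usage sketch for the eval()/predict() pair above that reads a
# 'document' column (and, for eval(), a 'label' column). The TSV path, column
# layout and hyperparameters are assumptions; both helpers also expect a saved
# tokenizer.json (or BERT vocab) and a Keras model on disk.
def _document_model_usage_sketch():
    import pandas as pd

    test_df = pd.read_csv('test.tsv', sep='\t')  # hypothetical file with document/label columns
    eval(test_df, batch=32, maxlen=50, selected_layer='lstm', model_dir='model.h5')
    predict(test_df, maxlen=50, model_dir='model.h5',
            output_file_name='predictions.tsv')
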
def load_dictionary(dict_path, encoding="utf-8"):
    with open(dict_path, mode="r", encoding=encoding) as file:
        return tokenizer_from_json(json.load(file))

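# A minimal sketch (not from the original sources): several loaders above pass
# the result of json.load() straight to tokenizer_from_json(). That works when
# the file stores the tokenizer JSON as a JSON-encoded string (i.e. it was
# written with json.dump(tokenizer.to_json(), f)), because json.load() then
# returns a str. If the file stores the configuration object directly,
# re-serializing it with json.dumps() first, as inferEmotion() above does,
# keeps the call valid. The helper below handles both layouts; its name is an
# assumption, and the import path assumes TensorFlow's bundled Keras.
def load_tokenizer_any_layout(path, encoding="utf-8"):
    import json
    from tensorflow.keras.preprocessing.text import tokenizer_from_json

    with open(path, encoding=encoding) as f:
        content = json.load(f)
    if isinstance(content, str):
        # The file held a JSON-encoded string of the tokenizer config.
        return tokenizer_from_json(content)
    # The file held the config object itself; re-serialize to a JSON string.
    return tokenizer_from_json(json.dumps(content))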