def load_dataset(params):
    """Load the request-interpretation dataset as tokenized, padded samples.

    Reads ``req_interpretation_dataset.csv`` (tab-separated, UTF-8) from
    ``data_folder``, wraps every row in a ``Sample`` built from its ``text``
    and ``label`` columns, tokenizes each phrase and pads every word
    sequence to the length of the longest one.

    Args:
        params: Mapping with a ``'padding'`` key. ``'left'`` selects
            ``lpad_wordseq``; any other value selects ``rpad_wordseq``.

    Returns:
        A ``(samples, computed_params)`` tuple. ``computed_params`` is a dict
        with the keys ``'max_wordseq_len'``, ``'nb_0'`` and ``'nb_1'``.

    Raises:
        ValueError: If the dataset file contains no rows.
    """
    tokenizer = Tokenizer()
    tokenizer.load()

    # The dataset must be generated beforehand by the script
    # ./preparation/prepare_req_interpretation_classif.py
    dataset_path = os.path.join(data_folder, 'req_interpretation_dataset.csv')
    df = pd.read_csv(dataset_path, sep='\t', encoding='utf-8')

    # Walk the two columns in lockstep; iterrows() would allocate a Series
    # per row only to read two fields from it.
    samples = [
        Sample(text, int(label))
        for text, label in zip(df['text'], df['label'])
    ]
    if not samples:
        # Fail with a message that names the file instead of letting max()
        # below raise an opaque "empty sequence" ValueError.
        raise ValueError('Dataset "{}" is empty'.format(dataset_path))

    # Tokenize the samples
    for sample in samples:
        sample.words = tokenizer.tokenize(sample.phrase)

    # Count the samples carrying each of the two labels
    nb_0 = sum(sample.y == 0 for sample in samples)
    nb_1 = sum(sample.y == 1 for sample in samples)
    logging.info('nb_0=%s nb_1=%s', nb_0, nb_1)

    max_wordseq_len = max(len(sample.words) for sample in samples)
    logging.info('max_wordseq_len=%s', max_wordseq_len)

    # Pick the padding routine once, then apply it to every sample
    pad = lpad_wordseq if params['padding'] == 'left' else rpad_wordseq
    for sample in samples:
        sample.words = pad(sample.words, max_wordseq_len)

    computed_params = {
        'max_wordseq_len': max_wordseq_len,
        'nb_0': nb_0,
        'nb_1': nb_1,
    }
    return samples, computed_params
def extract_entity(self, entity_name, phrase, text_utils, embeddings):
    """Return the words of ``phrase`` that the entity's model labels as 1.

    Args:
        entity_name: Key into ``self.entity2index`` selecting which of
            ``self.models`` is used.
        phrase: Text to analyse; tokenized with ``text_utils.tokenize``.
        text_utils: Object providing ``tokenize(phrase)``.
        embeddings: Object providing ``vectorize_words``, used to fill
            ``self.X_probe`` from the padded word list.

    Returns:
        The selected words joined with single spaces and stripped; an empty
        string when no word is labelled 1.
    """
    entity_model = self.models[self.entity2index[entity_name]]

    # Clear the reusable input buffer before writing this phrase into it
    self.X_probe.fill(0)

    tokens = text_utils.tokenize(phrase)
    # Only 'right' selects right-padding; every other setting pads on the left
    pad = rpad_wordseq if self.padding == 'right' else lpad_wordseq
    tokens = pad(tokens, self.max_inputseq_len)
    embeddings.vectorize_words(self.w2v_filename, tokens, self.X_probe, 0)

    # Take the first item of the prediction and reduce its last axis to
    # label indices
    prediction = entity_model.predict(x={'input': self.X_probe})[0]
    labels = np.argmax(prediction, axis=-1)

    picked = []
    for token, label in zip(tokens, labels):
        if label == 1:
            picked.append(token)

    return u' '.join(picked).strip()
def pad_wordseq(self, words, n):
    """Pad ``words`` according to ``self.padding``.

    Delegates to ``lpad_wordseq(words, n)`` when ``self.padding`` equals
    ``'left'`` and to ``rpad_wordseq(words, n)`` for any other value, and
    returns whatever the chosen helper returns.
    """
    pad = lpad_wordseq if self.padding == 'left' else rpad_wordseq
    return pad(words, n)
# Make the embeddings and their vector size available to generate_rows()
computed_params['embeddings'] = embeddings
computed_params['word_dims'] = embeddings.vector_size

# Rebuild the network from its JSON description, then load its weights
logging.info('Restoring model architecture from "%s"', arch_filepath)
with open(arch_filepath, 'r') as arch_file:
    arch_json = arch_file.read()
model = model_from_json(arch_json)

logging.info('Loading model weights from "%s"', weights_path)
model.load_weights(weights_path)

tokenizer = Tokenizer()
tokenizer.load()

# Interactive loop: read a phrase from the console, print both class scores.
# There is no exit condition in this loop itself.
while True:
    phrase = ruchatbot.utils.console_helpers.input_kbd(':> ').strip()

    # The label passed to Sample is a placeholder; only the prediction is used
    probe = Sample(phrase, 0)
    pad = lpad_wordseq if padding == 'left' else rpad_wordseq
    probe.words = pad(tokenizer.tokenize(phrase), max_wordseq_len)

    # Only the first item produced by generate_rows() is consumed
    for batch in generate_rows([probe], 2, computed_params, 1):
        y_pred = model.predict(x=batch[0], verbose=0)[0]
        print('y==0 --> {}'.format(y_pred[0]))
        print('y==1 --> {}'.format(y_pred[1]))
        break
def pad_wordseq(words, n, padding):
    """Pad ``words`` on the side named by ``padding``.

    Delegates to ``rpad_wordseq(words, n)`` when ``padding`` equals
    ``'right'`` and to ``lpad_wordseq(words, n)`` for any other value, and
    returns whatever the chosen helper returns.
    """
    pad = rpad_wordseq if padding == 'right' else lpad_wordseq
    return pad(words, n)