Example #1
import os
import logging

import pandas as pd

# Tokenizer, Sample, lpad_wordseq, rpad_wordseq and data_folder are
# project-local names defined elsewhere in the module.
def load_dataset(params):
    tokenizer = Tokenizer()
    tokenizer.load()

    # The dataset must be prepared in advance by the script ./preparation/prepare_req_interpretation_classif.py
    df = pd.read_csv(os.path.join(data_folder,
                                  'req_interpretation_dataset.csv'),
                     sep='\t',
                     encoding='utf-8')
    samples = [
        Sample(row['text'], int(row['label'])) for i, row in df.iterrows()
    ]

    # Tokenize the samples
    for sample in samples:
        sample.words = tokenizer.tokenize(sample.phrase)

    # Log the class balance of the binary dataset.
    nb_0 = sum(sample.y == 0 for sample in samples)
    nb_1 = sum(sample.y == 1 for sample in samples)
    logging.info('nb_0={} nb_1={}'.format(nb_0, nb_1))

    max_wordseq_len = max(len(sample.words) for sample in samples)
    logging.info('max_wordseq_len={}'.format(max_wordseq_len))

    # Pad every sample to the longest sequence, on the side set by params['padding'].
    if params['padding'] == 'left':
        for sample in samples:
            sample.words = lpad_wordseq(sample.words, max_wordseq_len)
    else:
        for sample in samples:
            sample.words = rpad_wordseq(sample.words, max_wordseq_len)

    computed_params = {
        'max_wordseq_len': max_wordseq_len,
        'nb_0': nb_0,
        'nb_1': nb_1
    }

    return samples, computed_params
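
The snippet relies on a Sample container defined elsewhere in the project. A minimal hypothetical sketch, using only the attributes (phrase, y, words) that the code above touches:

# Hypothetical sketch of the Sample container used above; the real class
# lives elsewhere in the project and may carry more fields.
class Sample:
    def __init__(self, phrase, y):
        self.phrase = phrase  # raw text of the sample
        self.y = y            # binary class label (0 or 1)
        self.words = None     # token list, filled in by the tokenizer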
Example #2
    def extract_entity(self, entity_name, phrase, text_utils, embeddings):
        # Select the model trained for this particular entity type.
        model = self.models[self.entity2index[entity_name]]

        # Reset the reusable input tensor before filling in new word vectors.
        self.X_probe.fill(0)

        # Tokenize the phrase and pad it to the model's fixed input length.
        words = text_utils.tokenize(phrase)
        if self.padding == 'right':
            words = rpad_wordseq(words, self.max_inputseq_len)
        else:
            words = lpad_wordseq(words, self.max_inputseq_len)

        # Write the word embeddings into row 0 of X_probe.
        embeddings.vectorize_words(self.w2v_filename, words, self.X_probe, 0)

        inputs = dict()
        inputs['input'] = self.X_probe

        # The model emits a label distribution for each token; argmax turns it
        # into a 0/1 tag per position (np is numpy, imported at module level).
        y = model.predict(x=inputs)[0]
        predicted_labels = np.argmax(y, axis=-1)

        # The extracted entity is the concatenation of the words tagged 1.
        selected_words = [
            word for word, label in zip(words, predicted_labels) if label == 1
        ]
        entity_text = u' '.join(selected_words).strip()
        return entity_text
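
To make the selection step concrete, here is a tiny self-contained illustration of the per-token argmax turning scores into an entity string; the words and the score matrix are made up:

import numpy as np

# Fake per-token scores for a 3-word phrase: columns are P(label=0), P(label=1).
words = ['deliver', 'the', 'pizza']
y = np.array([[0.9, 0.1],
              [0.8, 0.2],
              [0.3, 0.7]])
predicted_labels = np.argmax(y, axis=-1)  # -> array([0, 0, 1])
entity_text = ' '.join(w for w, l in zip(words, predicted_labels) if l == 1)
print(entity_text)  # pizza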
Example #3
    def pad_wordseq(self, words, n):
        if self.padding == 'left':
            return lpad_wordseq(words, n)
        else:
            return rpad_wordseq(words, n)
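
The helpers lpad_wordseq and rpad_wordseq are not shown in any of these examples. A minimal sketch of what they plausibly do, assuming the empty string serves as the pad token (the real implementations may use a dedicated PAD symbol):

def lpad_wordseq(words, n):
    """Left-pad the token list with empty strings up to length n."""
    return [''] * max(0, n - len(words)) + list(words)

def rpad_wordseq(words, n):
    """Right-pad the token list with empty strings up to length n."""
    return list(words) + [''] * max(0, n - len(words))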
Example #4
    # Store the embeddings and their dimensionality for the batch generator.
    computed_params['embeddings'] = embeddings
    computed_params['word_dims'] = embeddings.vector_size

    logging.info('Restoring model architecture from "%s"', arch_filepath)
    with open(arch_filepath, 'r') as f:
        model = model_from_json(f.read())

    logging.info('Loading model weights from "%s"', weights_path)
    model.load_weights(weights_path)

    tokenizer = Tokenizer()
    tokenizer.load()

    # Interactive probe loop: read a phrase from the console, wrap it in a
    # Sample with a dummy label, and tokenize it.
    while True:
        phrase = ruchatbot.utils.console_helpers.input_kbd(':> ').strip()
        sample1 = Sample(phrase, 0)
        sample1.words = tokenizer.tokenize(phrase)

        if padding == 'left':
            sample1.words = lpad_wordseq(sample1.words, max_wordseq_len)
        else:
            sample1.words = rpad_wordseq(sample1.words, max_wordseq_len)

        # generate_rows yields feature batches; only the first batch is needed
        # for a single probe sample, so break after one prediction.
        for istep, xy in enumerate(
                generate_rows([sample1], 2, computed_params, 1)):
            x = xy[0]
            y_pred = model.predict(x=x, verbose=0)[0]
            print('y==0 --> {}'.format(y_pred[0]))
            print('y==1 --> {}'.format(y_pred[1]))
            break
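
For reference, the same interpretation step in isolation, with a made-up two-class prediction standing in for model.predict (illustrative only):

import numpy as np

y_pred = np.array([0.12, 0.88])  # stand-in for model.predict(...)[0]
label = int(np.argmax(y_pred))   # winning class: 1 here
print('y==0 --> {}'.format(y_pred[0]))
print('y==1 --> {}'.format(y_pred[1]))
print('predicted label: {}'.format(label))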
Example #5
def pad_wordseq(words, n, padding):
    """Pad the token list `words` to length n on the side given by `padding`."""
    if padding == 'right':
        return rpad_wordseq(words, n)
    else:
        return lpad_wordseq(words, n)
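
Assuming the empty-string padding sketched under Example #3, the wrapper behaves like this:

words = ['hello', 'world']
print(pad_wordseq(words, 4, 'right'))  # ['hello', 'world', '', '']
print(pad_wordseq(words, 4, 'left'))   # ['', '', 'hello', 'world']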