Example #1
    def run(self):
        clause_df = pd.DataFrame(self.clauses)
        print("after:::", clause_df.head(5))

        # Rename the columns so the Siamese-LSTM pipeline can process them
        clause_df.columns = ['no', 'question1', 'question2']
        for q in ['question1', 'question2']:
            clause_df[q + '_n'] = clause_df[q]

        # Make word2vec embeddings
        embedding_dim = 300
        max_seq_length = 20
        clause_df, embeddings = make_w2v_embeddings(
            clause_df, embedding_dim=embedding_dim, empty_w2v=False)

        # Split to dicts and append zero padding.
        X_test = split_and_zero_padding(clause_df, max_seq_length)

        # Make sure everything is ok
        assert X_test['left'].shape == X_test['right'].shape

        model = tf.keras.models.load_model('./data/keras_model/SiameseLSTM.h5',
                                           custom_objects={'ManDist': ManDist})
        model.summary()

        prediction = model.predict([X_test['left'], X_test['right']],
                                   verbose=1)
        print(prediction)

        # Zip each section header with its model prediction, e.g. 'Section 1 : 0.54'
        result = list(zip([x[0] for x in self.clauses], prediction.tolist()))

        return result
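Loading the .h5 file above requires the ManDist custom layer to be in scope. A minimal sketch of that layer, as commonly defined in Siamese-LSTM implementations (this may differ from the author's own util module):

import tensorflow as tf
import tensorflow.keras.backend as K

class ManDist(tf.keras.layers.Layer):
    """Similarity head: exp(-L1 distance) of the two encoder outputs, in (0, 1]."""

    def __init__(self, **kwargs):
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    def call(self, x, **kwargs):
        # x is the pair [left_vector, right_vector].
        self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
        return self.result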
Example #2
    def compare_si(self, senti, input_sentence):
        if senti == 0:
            datafile = self.data_directory / 'yelp_0.txt'
        elif senti == 1:
            datafile = self.data_directory / 'yelp_1.txt'
        else:
            # Without this branch, `datafile` would be unbound below.
            raise ValueError('senti must be 0 or 1, got {!r}'.format(senti))

        with open(datafile) as f:
            data = f.readlines()

        for i in range(len(data)):
            data[i] = data[i].replace('.', '').replace('\n',
                                                       '').replace('!', '')

        test_sentence_pairs = []
        for i in range(len(data)):
            test_sentence = (input_sentence, data[i])
            test_sentence_pairs.append(test_sentence)

        embedding_dict = {}

        test_df = pd.DataFrame(test_sentence_pairs,
                               columns=['question1', 'question2'])
        for q in ['question1', 'question2']:
            test_df[q + '_n'] = test_df[q]

        test_df, embeddings = make_w2v_embeddings(embedding_dict,
                                                  test_df,
                                                  embedding_dim=300)

        X_test = split_and_zero_padding(test_df, 10)

        assert X_test['left'].shape == X_test['right'].shape

        preds = list(
            self.model_similarity.predict([X_test['left'], X_test['right']]))

        results = [(x, y, z) for (x, y), z in zip(test_sentence_pairs, preds)]
        results.sort(key=itemgetter(2), reverse=True)

        return results[0:3]
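A hedged usage sketch for compare_si (the `searcher` object is hypothetical; whatever class owns the method must already have `data_directory` and `model_similarity` set up):

# Hypothetical caller for compare_si.
top3 = searcher.compare_si(1, "the staff was friendly and the food was great")
for query, candidate, score in top3:
    # score is the Siamese similarity for (query, candidate); higher is closer.
    print("{:.3f}  {}".format(float(score), candidate))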
Example #3
借 呗 还款 信息   借 呗 还款 日期    0

becomes:
question1   question2   is_duplicate    question1_n question2_n
借 呗 还款 信息   借 呗 还款 日期   0   借 呗 还款 信息   借 呗 还款 日期

after converting words to ids:
question1   question2   is_duplicate    question1_n question2_n
借 呗 还款 信息   借 呗 还款 日期   0   [31, 639]   [31, 255]
'''
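# Aside (a sketch, not this project's code): the word-to-id step shown in the
# docstring above is essentially a growing vocabulary lookup.
vocabulary_sketch = {}

def to_ids_sketch(tokens):
    # Unseen tokens get the next free index; ids start at 1 so 0 can be padding.
    return [vocabulary_sketch.setdefault(tok, len(vocabulary_sketch) + 1)
            for tok in tokens]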

# Split the training set
X = train_df[['question1_n', 'question2_n']]
Y = train_df['is_duplicate']
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.1)
X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)

# Convert the labels to numeric values
Y_train = Y_train.values
Y_validation = Y_validation.values

# Confirm the data is ready and correct
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)


# ----------------- Basic functions ----------------- #

def shared_model(_input):
    # Word embedding lookup
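The snippet is cut off inside shared_model. For orientation, a minimal sketch of how such a shared encoder is typically assembled in Siamese-LSTM projects (the layer size and variable names here are assumptions, not the author's code):

from tensorflow.keras.layers import Embedding, LSTM

def shared_model_sketch(_input):
    # Map word ids to frozen pre-trained word2vec vectors.
    embedded = Embedding(len(embeddings), embedding_dim,
                         weights=[embeddings],
                         input_length=max_seq_length,
                         trainable=False)(_input)
    # Encode the sequence; the last hidden state represents the sentence.
    return LSTM(50)(embedded)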
Example #4
positive_indexes = []
for i, _is_true in enumerate(is_true):
    if _is_true:
        positive_indexes.append(i)
positive_indexes.append(len(is_true))
print("original dataframe size", demo_df.shape)

for q in ['question1', 'question2']:
    demo_df[q + '_n'] = demo_df[q]

# Make word2vec embeddings
max_seq_length = 20
test_df, _ = load_embedding_and_vectorize(demo_df)

# Split to dicts and append zero padding.
X_test = split_and_zero_padding(test_df, max_seq_length)

# Make sure everything is ok
assert X_test['left'].shape == X_test['right'].shape

# --
print(np.array(test_df["is_duplicate"]).shape)
print(X_test['left'].shape, X_test['right'].shape)
with tf.device('/device:GPU:{}'.format(params.gpu)):
    model = tf.keras.models.load_model('./models/SiameseLSTM.h5',
                                       custom_objects={'ManDist': ManDist})
    model.summary()

    print(
        model.evaluate([X_test['left'], X_test['right']],
                       batch_size=512,
                       # The excerpt is cut off here; the label argument below
                       # is restored from the test_df printed above.
                       y=np.array(test_df['is_duplicate'])))
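For per-pair scores rather than the aggregate metrics that evaluate prints, the same inputs can be pushed through predict; a minimal sketch (the 0.5 cut-off is an assumption):

scores = model.predict([X_test['left'], X_test['right']]).ravel()
# ManDist outputs exp(-L1) similarities in (0, 1]; threshold to get labels.
pred_labels = (scores > 0.5).astype(int)
print('manual accuracy:',
      np.mean(pred_labels == np.array(test_df['is_duplicate'])))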
Example #5
    # Tail of a helper (presumably make_w2v_embeddings); the excerpt starts mid-function.
    return df, embeddings


df_ = pd.DataFrame([[
    "What are the best career growth technologies for automation engineers apart from automation tools?",
    "Himalayan or Duke KTM 200 for touring?"
]],
                   columns=["question1", "question2"])
for q in ['question1', 'question2']:
    df_[q + '_n'] = df_[q]
df_.head()

train_df, embeddings = make_w2v_embeddings(word2vec=embeddings,
                                           df=df_,
                                           embedding_dim=embedding_dim)
split_df = split_and_zero_padding(train_df, max_seq_length)
print(split_df)

# In[15]:

assert split_df['left'].shape == split_df['right'].shape

# In[16]:


def find_similar_sentence(user_input):
    # Note: user_input is unused here; the scores come from the pair(s)
    # already vectorized into split_df above.
    is_duplicate = model.predict([split_df['left'], split_df['right']])
    return is_duplicate


# In[17]:
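A hedged usage sketch for the cell above (assumes `model` is the trained Siamese network from earlier notebook cells):

score = find_similar_sentence(None)  # the argument is currently ignored
# One pair in, one score out: shape (1, 1); higher means more similar.
print('similarity: {:.3f}'.format(float(score[0][0])))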