Example #1
    def run(self):
        clause_df = pd.DataFrame(self.clauses)
        print("after:::", clause_df.head(5))

        # change column headers, for processing by Siamese-LSTM
        clause_df.columns = ['no', 'question1', 'question2']
        for q in ['question1', 'question2']:
            clause_df[q + '_n'] = clause_df[q]

        # Make word2vec embeddings
        embedding_dim = 300
        max_seq_length = 20
        clause_df, embeddings = make_w2v_embeddings(
            clause_df, embedding_dim=embedding_dim, empty_w2v=False)

        # Split to dicts and append zero padding.
        X_test = split_and_zero_padding(clause_df, max_seq_length)

        # Make sure everything is ok
        assert X_test['left'].shape == X_test['right'].shape

        model = tf.keras.models.load_model('./data/keras_model/SiameseLSTM.h5',
                                           custom_objects={'ManDist': ManDist})
        model.summary()

        # prediction = model.predict([X_test['left'], X_test['right']])
        prediction = model.predict([X_test['left'], X_test['right']],
                                   verbose=1)
        print(prediction)

        # zip section header w/ model-prediction, Ex: 'Section 1 : 0.54'
        result = zip([x[0] for x in self.clauses], prediction.tolist())

        return result
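
These examples load the saved model with custom_objects={'ManDist': ManDist}. For context, a minimal sketch of such a Manhattan-distance similarity layer (an assumption about the util implementation, based on the common MaLSTM formulation) could look like:

import tensorflow as tf
import tensorflow.keras.backend as K


class ManDist(tf.keras.layers.Layer):
    """Outputs exp(-L1 distance) between the two LSTM sentence encodings."""

    def call(self, inputs):
        left, right = inputs
        # Similarity in (0, 1]: identical encodings give 1.0
        return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))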
Example #2
    def compare_si(self, senti, input_sentence):
        if senti == 0:
            datafile = self.data_directory / 'yelp_0.txt'
        elif senti == 1:
            datafile = self.data_directory / 'yelp_1.txt'
        else:
            raise ValueError("senti must be 0 or 1")

        with open(datafile) as f:
            data = f.readlines()

        for i in range(len(data)):
            data[i] = data[i].replace('.', '').replace('\n',
                                                       '').replace('!', '')

        result_index = []
        test_sentence_pairs = []
        for i in range(len(data)):
            test_sentence = (input_sentence, data[i])
            test_sentence_pairs.append(test_sentence)

        embedding_dict = {}

        test_df = pd.DataFrame(test_sentence_pairs,
                               columns=['question1', 'question2'])
        for q in ['question1', 'question2']:
            test_df[q + '_n'] = test_df[q]

        test_df, embeddings = make_w2v_embeddings(embedding_dict,
                                                  test_df,
                                                  embedding_dim=300)

        X_test = split_and_zero_padding(test_df, 10)

        assert X_test['left'].shape == X_test['right'].shape

        preds = list(
            self.model_similarity.predict([X_test['left'], X_test['right']]))

        results = [(x, y, z) for (x, y), z in zip(test_sentence_pairs, preds)]
        results.sort(key=itemgetter(2), reverse=True)

        return results[0:3]
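
A hypothetical call site for compare_si (illustrative only; the instance name and input sentence are made up) might look like:

# Find the three sentences in yelp_1.txt most similar to the input
top3 = comparer.compare_si(1, "the staff was friendly and the food was great")
for left, right, score in top3:
    print(f"{score[0]:.3f}  {right}")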
Example #3
# Whether to use pre-trained word vectors; by default, randomly initialized vectors are used
o = input("type yes or no for choosing pre-trained w2v or not:")
if o == 'yes':
    # Load the word vectors
    print("Loading word2vec model (it may take 2-3 mins) ...")
    embedding_dict = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
else:
    embedding_dict = {}

# Read and load the training set
train_df = pd.read_csv(TRAIN_CSV)
for q in ['question1', 'question2']:
    train_df[q + '_n'] = train_df[q]

# Vectorize the training set with word embeddings
train_df, embeddings = make_w2v_embeddings(flag, embedding_dict, train_df, embedding_dim=embedding_dim)
'''
Transforms the training data from:
question1   question2   is_duplicate
借 呗 还款 信息   借 呗 还款 日期    0

into:
question1   question2   is_duplicate    question1_n question2_n
借 呗 还款 信息   借 呗 还款 日期   0   借 呗 还款 信息   借 呗 还款 日期

and, after converting the words to IDs:
question1   question2   is_duplicate    question1_n question2_n
借 呗 还款 信息   借 呗 还款 日期   0   [31, 639]   [31, 255]
'''
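
# To make the transformation above concrete, the word-to-ID conversion works roughly
# like this sketch (an assumption about make_w2v_embeddings' internals, not the exact code):
#
#   vocab = {}                      # word -> id, 0 reserved for padding
#   def to_ids(sentence):
#       ids = []
#       for word in sentence.split():
#           if word not in vocab:
#               vocab[word] = len(vocab) + 1
#           ids.append(vocab[word])
#       return ids
#
#   train_df['question1_n'] = train_df['question1'].apply(to_ids)
#   train_df['question2_n'] = train_df['question2'].apply(to_ids)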

# Split the training set
Example #4
import pandas as pd
import tensorflow as tf

from util import make_w2v_embeddings
from util import split_and_zero_padding
from util import ManDist

# File paths
TEST_CSV = './data/test.csv'

# Load test set
test_df = pd.read_csv(TEST_CSV)
for q in ['question1', 'question2']:
    test_df[q + '_n'] = test_df[q]

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
test_df, embeddings = make_w2v_embeddings(test_df, embedding_dim=embedding_dim, empty_w2v=False)

# Split to dicts and append zero padding.
X_test = split_and_zero_padding(test_df, max_seq_length)

# Make sure everything is ok
assert X_test['left'].shape == X_test['right'].shape

# --

model = tf.keras.models.load_model('./data/SiameseLSTM.h5', custom_objects={'ManDist': ManDist})
model.summary()

prediction = model.predict([X_test['left'], X_test['right']])
print(prediction)
Example #5
import pandas as pd
from sklearn.model_selection import train_test_split

from util import make_w2v_embeddings
from util import split_and_zero_padding

# File paths
TRAIN_CSV = './data/train.csv'

# Load training set
train_df = pd.read_csv(TRAIN_CSV)
for q in ['question1', 'question2']:
    train_df[q + '_n'] = train_df[q]

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
use_w2v = True

train_df, embeddings = make_w2v_embeddings(train_df,
                                           embedding_dim=embedding_dim,
                                           empty_w2v=not use_w2v)

# Split to train validation
validation_size = int(len(train_df) * 0.1)
training_size = len(train_df) - validation_size

X = train_df[['question1_n', 'question2_n']]
Y = train_df['is_duplicate']

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size)

X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)
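
For reference, split_and_zero_padding returns a dict whose 'left' and 'right' matrices are padded to max_seq_length. A minimal sketch of such a helper (assuming it wraps keras pad_sequences; the actual util code may differ) is:

from tensorflow.keras.preprocessing.sequence import pad_sequences


def split_and_zero_padding(df, max_seq_length):
    # Split the ID sequences into the two branches of the Siamese network
    X = {'left': df['question1_n'], 'right': df['question2_n']}
    # Zero-pad every sequence to the same length so each side stacks into one matrix
    for side in ['left', 'right']:
        X[side] = pad_sequences(X[side], padding='pre', truncating='post',
                                maxlen=max_seq_length)
    return X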
Example #6
import pandas as pd
import tensorflow as tf

from util import make_w2v_embeddings
from util import split_and_zero_padding
from util import ManDist

# File paths
TEST_CSV = './data/test-20.csv'
EMBEDDING_FILE = './data/GoogleNews-vectors-negative300.bin.gz'

# Load test set
test_df = pd.read_csv(TEST_CSV)
for q in ['question1', 'question2']:
    test_df[q + '_n'] = test_df[q]

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
test_df, embeddings = make_w2v_embeddings(test_df,
                                          file=EMBEDDING_FILE,
                                          embedding_dim=embedding_dim,
                                          empty_w2v=False)

# Split to dicts and append zero padding.
X_test = split_and_zero_padding(test_df, max_seq_length)

# Make sure everything is ok
assert X_test['left'].shape == X_test['right'].shape

# --

model = tf.keras.models.load_model('./data/malstm.h5',
                                   custom_objects={'ManDist': ManDist})
model.summary()

prediction = model.predict([X_test['left'], X_test['right']])
Example #7
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# make_w2v_embeddings is assumed to come from this project's util module
from util import make_w2v_embeddings

# Parameters
max_features = 5000
maxlen = 50
gpus = 1
batch_size = 1024 * gpus
embedding_dims = 300
epochs = 10

print('Loading data...')
DATA_FILE = "~/.kaggle/datasets/uciml/news-aggregator-dataset/uci-news-aggregator.csv"
# DATA_FILE = "../uci-news-aggregator.csv"
df = pd.read_csv(DATA_FILE)

df['TITLE_n'] = df['TITLE']
df, embeddings = make_w2v_embeddings(df,
                                     embedding_dim=embedding_dims,
                                     empty_w2v=True)

y = OneHotEncoder().fit_transform(LabelEncoder().fit_transform(
    df['CATEGORY']).reshape(-1, 1)).toarray()

x_train, x_test, y_train, y_test = train_test_split(df['TITLE_n'],
                                                    y,
                                                    test_size=0.1)
# (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print(len(embeddings), 'embeddings input_dim')

print('Pad sequences (samples x time)')
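
The snippet is cut off here; presumably the next step pads the TITLE_n ID sequences to maxlen before training. A sketch of that step, assuming keras pad_sequences:

from tensorflow.keras.preprocessing.sequence import pad_sequences

x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)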
Example #8
                'question1': ["".join(sen1)],
                'question2': ["".join(sen2)]
            })

        dataframe.to_csv("./data/test.csv",
                         index=False,
                         sep=',',
                         encoding='utf-8')
        TEST_CSV = './data/test.csv'

        # Read and load the test set
        test_df = pd.read_csv(TEST_CSV)
        for q in ['question1', 'question2']:
            test_df[q + '_n'] = test_df[q]

        # Vectorize the test set with word embeddings
        test_df, embeddings = make_w2v_embeddings(flag,
                                                  embedding_dict,
                                                  test_df,
                                                  embedding_dim=embedding_dim)

        # Preprocessing
        X_test = split_and_zero_padding(test_df, max_seq_length)

        # Make sure the data is ready and correct
        assert X_test['left'].shape == X_test['right'].shape

        # Predict and evaluate accuracy
        prediction = model.predict([X_test['left'], X_test['right']])
        print(prediction)
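        # The comment above promises an accuracy check; one way to do it would be the
        # sketch below (it assumes test_df carries an is_duplicate column, which this
        # snippet does not show, so it is left commented out):
        # y_pred = (prediction.ravel() > 0.5).astype(int)
        # accuracy = (y_pred == test_df['is_duplicate'].values).mean()
        # print('accuracy:', accuracy)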
Example #9
for q in ['question1', 'question2']:
    train_df[q + '_n'] = train_df[q]
    test_df[q + '_n'] = test_df[q]

test_df = test_df[train_df.columns]

use_w2v = True
print('-------------')
# print(train_df.head())
# print(test_df.head())
train_size = train_df.shape[0]
print('train size: {}'.format(train_size))
print('-------------')
if BUILD_EMBED:
    # pandas >= 2.0 removed DataFrame.append; pd.concat gives the same result here
    full_df = pd.concat([train_df, test_df], ignore_index=True)
    full_df, embeddings = make_w2v_embeddings(full_df, embedding_dim=embedding_dim, empty_w2v=not use_w2v)
    print("sentences embedded")
else:
    # NOTE: full_df and embeddings must be loaded here for the slicing below to work
    # full_df = pd.read_csv('./data/full_embeddings_A1.csv')
    # embeddings = np.load('./data/embeddings/embedding_matrix_A1.npy')
    print('embeddings loaded')
train_df = full_df.iloc[:train_size].copy()
test_df = full_df.iloc[train_size:].copy()
print('--------------------------')
# print(train_df.head())
# print(test_df.head())
print('--------------------------')
# test_df, embeddingsx = make_w2v_embeddings(test_df, embedding_dim=embedding_dim, empty_w2v=not use_w2v)
# print("sentences embedded")
# test_df.to_csv('./data/test_embeddings.csv', index= False)
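
The commented-out else branch above implies the embeddings are persisted between runs. A sketch of the corresponding save step after building them (the paths are taken from those comments; the numpy/pandas calls are an assumption):

import numpy as np

full_df.to_csv('./data/full_embeddings_A1.csv', index=False)
np.save('./data/embeddings/embedding_matrix_A1.npy', embeddings)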
Example #10
    del word2vec

    return df, embeddings


df_ = pd.DataFrame([[
    "What are the best career growth technologies for automation engineers apart from automation tools?",
    "Himalayan or Duke KTM 200 for touring?"
]],
                   columns=["question1", "question2"])
for q in ['question1', 'question2']:
    df_[q + '_n'] = df_[q]
df_.head()

train_df, embeddings = make_w2v_embeddings(word2vec=embeddings,
                                           df=df_,
                                           embedding_dim=embedding_dim)
split_df = split_and_zero_padding(train_df, max_seq_length)
print(split_df)


assert split_df['left'].shape == split_df['right'].shape



def find_similar_sentence(user_input):
    # NOTE: user_input is not used here; split_df was built above from the fixed df_ pair
    is_duplicate = model.predict([split_df['left'], split_df['right']])
    return is_duplicate
Example #11
import pandas as pd
from sklearn.model_selection import train_test_split

from util import make_w2v_embeddings
from util import split_and_zero_padding

# File paths
# TRAIN_CSV = './data/train.csv'
TRAIN_CSV = './data/quora.csv'

# Load training set
train_df = pd.read_csv(TRAIN_CSV)
for q in ['question1', 'question2']:
    train_df[q + '_n'] = train_df[q]

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
use_w2v = True

train_df, embeddings = make_w2v_embeddings(train_df,
                                           embedding_dim=embedding_dim)

# Split to train validation
validation_size = int(len(train_df) * 0.1)
training_size = len(train_df) - validation_size

X = train_df[['question1_n', 'question2_n']]
Y = train_df['is_duplicate']

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size)

X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)

# Convert labels to their numpy representations
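The snippet ends here; the conversion the last comment refers to is presumably just taking the underlying numpy arrays, e.g.:

Y_train = Y_train.values
Y_validation = Y_validation.values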