        X2.append(iclasses[label])
        labels.append(0)
    categoricals = {"匹配": 1, "不匹配": 0}  # label names: "match": 1, "no match": 0
    return X1, X2, labels, categoricals

X1, X2, y, classes = convert_to_pairs(X, y, classes)
X1_train = X1[:-1000]
X2_train = X2[:-1000]
y_train = y[:-1000]
X1_test = X1[-1000:]
X2_test = X2[-1000:]
y_test = y[-1000:]

num_classes = len(classes)
tokenizer = SimpleTokenizer()
tokenizer.fit(X1 + X2)
X1_train = tokenizer.transform(X1_train)
X2_train = tokenizer.transform(X2_train)

maxlen = 48
hdims = 128
epochs = 2

X1_train = sequence.pad_sequences(
    X1_train, maxlen=maxlen, dtype="int32",
    padding="post", truncating="post", value=0
)
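# For illustration only (not necessarily this repo's matching model): with the second
# side padded the same way, a shared encoder can score match/no-match pairs. The
# encoder layers, pooling choice, and training settings below are assumptions.
import numpy as np

X2_train = sequence.pad_sequences(
    X2_train, maxlen=maxlen, dtype="int32",
    padding="post", truncating="post", value=0
)
num_words = len(tokenizer)  # follows the repo's num_words = len(tokenizer) convention

encoder = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_words, hdims, mask_zero=True),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(hdims, activation="relu"),
])
in1 = tf.keras.Input(shape=(maxlen,), dtype="int32")
in2 = tf.keras.Input(shape=(maxlen,), dtype="int32")
features = tf.keras.layers.Concatenate()([encoder(in1), encoder(in2)])
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(features)
model = tf.keras.Model([in1, in2], outputs)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit([X1_train, X2_train], np.array(y_train), epochs=epochs, batch_size=32)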
        delta = self.epsilon * grads / (tf.norm(grads) + 1e-6)  # compute the perturbation
        embeddings.assign_add(delta)  # add the perturbation to the embedding matrix
        results = super(AdversarialTrainer, self).train_step(data)  # run the ordinary train_step
        embeddings.assign_sub(delta)  # remove the perturbation from the embedding matrix
        return results

X, y, classes = load_hotel_comment()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, random_state=7384672
)
num_classes = len(classes)

tokenizer = SimpleTokenizer()
tokenizer.fit(X_train)

maxlen = find_best_maxlen(X_train)
# maxlen = 256

def create_dataset(X, y, maxlen=maxlen):
    X = tokenizer.transform(X)
    X = sequence.pad_sequences(
        X, maxlen=maxlen, dtype="int32",
        padding="post", truncating="post", value=0.0
    )
    y = tf.keras.utils.to_categorical(y)
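# For context, a minimal sketch of how an FGM-style adversarial trainer like the one
# whose tail is shown above can be assembled (the repo's actual class may differ;
# the "embedding" layer name and epsilon=0.5 are assumptions for illustration):
class AdversarialTrainer(tf.keras.Model):
    def __init__(self, *args, epsilon=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.epsilon = epsilon  # size of the embedding perturbation

    def train_step(self, data):
        x, y = data
        embeddings = self.get_layer("embedding").embeddings  # assumed layer name
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.compiled_loss(y, y_pred)
        grads = tape.gradient(loss, embeddings)  # gradient w.r.t. the embedding matrix
        grads = tf.convert_to_tensor(grads)      # densify possible IndexedSlices
        delta = self.epsilon * grads / (tf.norm(grads) + 1e-6)
        embeddings.assign_add(delta)             # perturb
        results = super().train_step(data)       # ordinary step on the perturbed embeddings
        embeddings.assign_sub(delta)             # restore
        return results

# Usage sketch: build the network functionally, then wrap it with the trainer, e.g.
# model = AdversarialTrainer(inputs=inputs, outputs=outputs)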
from pooling import MaskGlobalMaxPooling1D
from pooling import MaskGlobalAveragePooling1D
from dataset import SimpleTokenizer, find_best_maxlen
from dataset import load_THUCNews_title_label
from dataset import load_weibo_senti_100k
from dataset import load_simplifyweibo_4_moods
from dataset import load_hotel_comment

X, y, classes = load_hotel_comment()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=7384672
)
num_classes = len(classes)

tokenizer = SimpleTokenizer()
tokenizer.fit(X_train)
X_train = tokenizer.transform(X_train)

maxlen = 48
maxlen = find_best_maxlen(X_train)  # overrides the fixed fallback above

X_train = sequence.pad_sequences(
    X_train, maxlen=maxlen, dtype="int32",
    padding="post", truncating="post", value=0.0
)
y_train = tf.keras.utils.to_categorical(y_train)
num_words = len(tokenizer)
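# A sketch (illustrative, not necessarily the repo's exact model) of how the
# mask-aware pooling imported above can be used: embed with mask_zero=True so the
# padded positions are ignored, pool, then classify. hdims is an assumed size, and
# MaskGlobalAveragePooling1D is assumed to consume the Keras mask automatically.
hdims = 128
inputs = tf.keras.Input(shape=(maxlen,), dtype="int32")
x = tf.keras.layers.Embedding(num_words, hdims, mask_zero=True)(inputs)
x = MaskGlobalAveragePooling1D()(x)  # average over unmasked timesteps
x = tf.keras.layers.Dense(hdims, activation="relu")(x)
outputs = tf.keras.layers.Dense(num_classes, activation="softmax")(x)
model = tf.keras.Model(inputs, outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(X_train, y_train, batch_size=32, epochs=2, validation_split=0.1)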
from dataset import load_THUCNews_title_label
from dataset import load_weibo_senti_100k
from dataset import load_simplifyweibo_4_moods
from dataset import load_simplifyweibo_3_moods
from dataset import load_hotel_comment

# Activation function from the Transformer; gives a slight improvement
def gelu(x):
    return 0.5 * x * (1.0 + tf.math.erf(x / tf.sqrt(2.0)))

X, y, classes = load_hotel_comment()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=73672
)
class_weight = balance_class_weight(y_train)
num_classes = len(classes)

tokenizer = SimpleTokenizer(min_freq=32)
tokenizer.fit(X_train)
X_train = tokenizer.transform(X_train)

maxlen = 48
maxlen = find_best_maxlen(X_train)

X_train = sequence.pad_sequences(
    X_train, maxlen=maxlen, dtype="int32",
    padding="post", truncating="post", value=0
)
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
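# gelu defined above can be dropped in wherever Keras accepts an activation; a quick
# illustration (newer TensorFlow releases also ship an equivalent tf.nn.gelu):
print(gelu(tf.constant([-1.0, 0.0, 1.0])))            # smooth gating around zero
hidden = tf.keras.layers.Dense(128, activation=gelu)  # use it as a layer activation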
i, j = split_index(size=len(files))
files_train = files[:i]
files_val = files[i:j]
files_test = files[j:]

# train tokenizer
def Xiter(files):
    for content, label in gen(files):
        yield content

tokenizer = SimpleTokenizer()
tokenizer.fit(Xiter(files))

class DataGenerator:
    def __init__(self, files, loop):
        self.files = files
        self.loop = loop

    def __call__(self):
        for _ in range(self.loop):
            random.shuffle(self.files)
            for content, label in gen(self.files):
                content = content[:maxlen]
                content = tokenizer.transform([content])[0]
                label = tf.keras.utils.to_categorical(label, num_classes)
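# A sketch of wrapping the generator with tf.data (assuming __call__ goes on to
# `yield content, label`); the batch size and output signature are illustrative.
dataset_train = tf.data.Dataset.from_generator(
    DataGenerator(files_train, loop=1),
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.int32),           # token ids, variable length
        tf.TensorSpec(shape=(num_classes,), dtype=tf.float32),  # one-hot label
    ),
).padded_batch(32, padded_shapes=([maxlen], [num_classes]))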
        loss = tf.math.maximum(ploss - nloss + self.margin, 0.0)
        self.add_loss(tf.reduce_mean(loss))
        return loss

Xa, Xp, Xn, classes = convert_to_triplet(load_lcqmc)
Xa_train = Xa[:-1000]
Xp_train = Xp[:-1000]
Xn_train = Xn[:-1000]
Xa_test = Xa[-1000:]  # hold out the last 1000 triplets for testing
Xp_test = Xp[-1000:]
Xn_test = Xn[-1000:]

num_classes = len(classes)
tokenizer = SimpleTokenizer()
tokenizer.fit(Xa)

def pad(X, maxlen):
    X = sequence.pad_sequences(
        X, maxlen=maxlen, dtype="int32",
        padding="post", truncating="post", value=0
    )
    return X

def create_dataset(Xa, Xp, Xn, maxlen):
    Xa = tokenizer.transform(Xa)
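# For context, a minimal sketch of a triplet-margin layer consistent with the loss
# fragment shown at the top of this file (the squared-distance metric and the
# margin default are assumptions for illustration):
class TripletMarginLoss(tf.keras.layers.Layer):
    def __init__(self, margin=0.5, **kwargs):
        super().__init__(**kwargs)
        self.margin = margin

    def call(self, inputs):
        anchor, positive, negative = inputs
        ploss = tf.reduce_sum(tf.square(anchor - positive), axis=-1)  # anchor-positive distance
        nloss = tf.reduce_sum(tf.square(anchor - negative), axis=-1)  # anchor-negative distance
        loss = tf.math.maximum(ploss - nloss + self.margin, 0.0)
        self.add_loss(tf.reduce_mean(loss))
        return loss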