def run_training(model: str) -> None:
    """Train our machine learning model and serialize it to disk."""
    # read train and test data
    df_train = pd.read_csv(config.ORIGINAL_TRAIN)
    df_test = pd.read_csv(config.TEST_DATA)
    # relabel mislabeled samples
    df_train = data_clean.relabel_target(df_train)
    # shuffle data
    df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
    # clean the text
    df_train[config.CLEANED_TEXT] = df_train[config.TEXT].apply(pp.clean_tweet)
    df_test[config.CLEANED_TEXT] = df_test[config.TEXT].apply(pp.clean_tweet)
    # save the modified train and test data
    df_train.to_csv(config.MODIFIED_TRAIN, index=False)
    df_test.to_csv(config.MODIFIED_TEST, index=False)
    del df_test
    # convert text to a numerical representation
    tokenizer = Tokenizer(oov_token="<unk>")
    tokenizer.fit_on_texts(df_train[config.CLEANED_TEXT])
    # path to save the model
    model_path = f"{config.MODEL_DIR}/PRETRAIN_WORD2VEC_{model}/"
    # create the folder if it does not exist
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    # save the tokenizer
    with open(f'{model_path}tokenizer.pkl', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # pad the sequences
    X_padded = pad_sequences(tokenizer.texts_to_sequences(df_train[config.CLEANED_TEXT].values),
                             maxlen=config.MAXLEN)
    # get the pretrained word embeddings and prepare the embedding layer
    embedding_matrix = f.get_word2vec_enc(tokenizer.word_index.items(),
                                          config.PRETRAINED_WORD2VEC)
    embedding_layer = Embedding(input_dim=config.VOCAB_SIZE,
                                output_dim=config.EMBED_SIZE,
                                weights=[embedding_matrix],
                                input_length=config.MAXLEN,
                                trainable=False)
    # target values
    y = df_train[config.RELABELED_TARGET].values
    # train a single model
    clf = my_LSTM(embedding_layer)
    clf.fit(X_padded, y, epochs=config.N_EPOCHS, verbose=1)
    # persist the model (model_path already ends with a slash)
    clf.save(f"{model_path}{model}_Word2Vec.h5")
def main(args):
    print(args)
    start_time = time.time()
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

    # Set hyper-parameters.
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/model_{}.h5'
    num_words = 40000
    num_label = 2

    # Data loading.
    print(return_time(start_time), "1. Loading data ...")
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Pre-processing.
    print(return_time(start_time), "2. Preprocessing dataset ...")
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    # Preparing word embedding.
    if args.loadwv:
        print(return_time(start_time), "3. Loading word embedding ...")
        wv_path = 'data/wv_{0}_{1}.npy'.format(maxlen, num_words)
        if os.path.exists(wv_path):
            wv = np.load(wv_path)
            print(return_time(start_time), "Loaded word embedding successfully!")
        else:
            print(return_time(start_time), "Word embedding file doesn't exist")
            exit()
    else:
        print(return_time(start_time), "3. Preparing word embedding ...")
        wv = load_fasttext('data/cc.ja.300.vec.gz')
        wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Saving word embedding.
    if args.savewv:
        wv_path = 'data/wv_{0}_{1}.npy'.format(maxlen, num_words)
        np.save(wv_path, wv)
        print(return_time(start_time), "Saved word embedding successfully!", wv_path)

    # Build models.
    models = [
        RNNModel(num_words, num_label, embeddings=None).build(),
        LSTMModel(num_words, num_label, embeddings=None).build(),
        CNNModel(num_words, num_label, embeddings=None).build(),
        RNNModel(num_words, num_label, embeddings=wv).build(),
        LSTMModel(num_words, num_label, embeddings=wv).build(),
        CNNModel(num_words, num_label, embeddings=wv).build(),
        CNNModel(num_words, num_label, embeddings=wv, trainable=False).build()
    ]
    model_names = ["RNN-None", "LSTM-None", "CNN-None",
                   "RNN-wv", "LSTM-wv", "CNN-wv", "CNN-wv-notrain"]

    print(return_time(start_time), "4. Start training ...")
    for i, model in enumerate(models):
        print("***********************************")
        print(return_time(start_time), "Model:", model_names[i])
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(model_names[i]), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs,
                  validation_split=0.2, callbacks=callbacks, shuffle=True)

        # Inference.
        model = load_model(model_path.format(model_names[i]))
        api = InferenceAPI(model, vocab, preprocess_dataset)
        y_pred = api.predict_from_sequences(x_test)
        print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
        print('recall   : {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
        print('f1       : {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
def main(args):
    training_data = pd.read_csv(
        os.path.join(args.data_dir, "Keras_latest_training_data.csv"))

    # Version control
    model_trained_time = time.strftime("%Y%m%d-%H%M")
    model_version = 'model_' + model_trained_time

    # Intent training begins
    traindata = training_data[['text', 'intent']]
    traindata = traindata[pd.notna(traindata['text'].values)]

    # Randomly sample 5% of the data for validation
    validation_splitrate = 0.05
    traindata_validation = traindata.sample(frac=validation_splitrate)
    traindata_train = traindata.loc[~traindata.index.isin(traindata_validation.index)]
    # reorder the training data: first 95% train, last 5% validation
    traindata = pd.concat([traindata_train, traindata_validation])

    # tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(traindata['text'].values)
    X = tokenizer.texts_to_sequences(traindata['text'].values)
    X = pad_sequences(X, maxlen=50)

    # Y labels
    Ylabel = np.array(sorted(set(traindata['intent'])))
    Y = pd.get_dummies(traindata['intent'], prefix='', prefix_sep='')
    Y = Y.T.reindex(Ylabel).T.fillna(0).values

    intent_tokens = {"X": X, "Y": Y, "Ylabel": Ylabel, "tokenizer": tokenizer}
    with open('{}/{}_intent_tokens.pickle'.format(args.model_dir, model_version), 'wb') as handle:
        pickle.dump(intent_tokens, handle, protocol=pickle.HIGHEST_PROTOCOL)

    ## parameters
    max_features = np.max(X)
    Ndense = len(set(traindata['intent']))

    # LSTM training
    model = Sequential()
    model.add(Embedding(max_features + 1, args.embed_dim, input_length=X.shape[1]))
    model.add(SpatialDropout1D(args.sdropoutrate))
    model.add(LSTM(args.lstm_out, dropout=args.dropoutrate, recurrent_dropout=args.rdropoutrate))
    model.add(Dense(Ndense, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    intent_history = model.fit(X, Y,
                               epochs=5,
                               validation_split=validation_splitrate,
                               batch_size=args.batch_size,
                               verbose=2)

    # Save the intent model
    intent_history.model.save('{}/LSTM_history.h5'.format(args.model_dir), save_format='h5')
# convert from a pandas Series to a list
text = df['twitts'].tolist()
y = df['sentiment']

token = Tokenizer()
token.fit_on_texts(text)
vocab_size = len(token.word_index) + 1
encoded_text = token.texts_to_sequences(text)

# Pad the sequences
max_len = max([len(s.split()) for s in text])
X = pad_sequences(encoded_text, maxlen=max_len, padding='post')

# How to work with GloVe vectors, using the 200-dimensional ones.
# The embedding layer will represent each word in 200 dimensions.
glove_vectors = dict()
file = open('../../../../Data/glove.twitter.27B.200d.txt', encoding='utf-8')

# Read the word embeddings
for line in file:
    value = line.split()
    word = value[0]
    vector = np.asarray(value[1:], dtype='float32')
    glove_vectors[word] = vector
file.close()
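# Sketch (not part of the original snippet): build the embedding matrix that the
# 200-dimensional embedding layer mentioned above would consume. Words missing from
# GloVe keep a zero row; EMBED_DIM and embedding_matrix are illustrative names.
EMBED_DIM = 200
embedding_matrix = np.zeros((vocab_size, EMBED_DIM))
for word, idx in token.word_index.items():
    vec = glove_vectors.get(word)
    if vec is not None:
        embedding_matrix[idx] = vec

# The matrix can then initialize a frozen Keras Embedding layer, e.g.:
# Embedding(vocab_size, EMBED_DIM, weights=[embedding_matrix],
#           input_length=max_len, trainable=False)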
def sent_anly_prediction():
    if request.method == 'POST':
        text = request.form['text']
        model = keras.models.load_model('sarcasm.h5')

        with open("sarcasm.json", 'r') as f:
            datastore = json.load(f)

        sentences = []
        labels = []
        for item in datastore:
            sentences.append(item['headline'])
            labels.append(item['is_sarcastic'])

        vocab_size = 10000
        embedding_dim = 16
        max_length = 100
        trunc_type = 'post'
        padding_type = 'post'
        oov_tok = "<OOV>"
        training_size = 20000

        training_sentences = sentences[0:training_size]
        testing_sentences = sentences[training_size:]
        training_labels = labels[0:training_size]
        testing_labels = labels[training_size:]

        tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
        tokenizer.fit_on_texts(training_sentences)
        word_index = tokenizer.word_index

        training_sequences = tokenizer.texts_to_sequences(training_sentences)
        training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                        padding=padding_type, truncating=trunc_type)
        testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
        testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
                                       padding=padding_type, truncating=trunc_type)

        sentence = [text]
        sequences = tokenizer.texts_to_sequences(sentence)
        padded = pad_sequences(sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)
        predict = model.predict(padded)
        classes = model.predict_classes(padded)
        if classes[0] == 0:
            senti = "not sarcastic"
        else:
            senti = "sarcastic"
        return render_template('setiment.html', text=text, sentiment=senti, probability=predict[0])
testing_labels_final = np.array(testing_labels)

vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = "<OOV>"

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
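# Sketch (not in the original snippet): fitting the compiled model on the padded data.
# Assumes training_labels_final was built from training_labels the same way
# testing_labels_final was built above; num_epochs is an illustrative value.
num_epochs = 10
history = model.fit(padded, training_labels_final,
                    epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels_final),
                    verbose=2)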
def attack(self, seq, target, l, max_change=0.5):
    seq = seq.cpu().detach().numpy().squeeze()  # label of change; convert
    seq_orig, seq_orig_string, l_orig = self.orig_sentence(seq)
    l = l.cpu()

    # To calculate the sampling probability
    tmp = [
        glove_utils.pick_most_similar_words(self.compute_dist(seq_orig[i]), 50, 0.5)
        for i in range(l_orig)
    ]
    neighbour_list = [t[0] for t in tmp]
    neighbour_dist = [t[1] for t in tmp]
    neighbour_len = [len(i) for i in neighbour_list]
    for i in range(l_orig):
        if seq_orig[i] < 27:
            # To prevent replacement of words like 'the', 'a', 'of', etc.
            neighbour_len[i] = 0
    prob_select = neighbour_len / np.sum(neighbour_len)

    # synonym candidates for each position
    tmp = [
        glove_utils.pick_most_similar_words(self.compute_dist(seq_orig[i]), self.top_n1, 0.5)
        for i in range(l_orig)
    ]
    neighbour_list = [t[0] for t in tmp]
    neighbour_dist = [t[1] for t in tmp]

    seq_adv = seq_orig_string.copy()
    pop = [
        self.perturb(seq_adv, seq_orig_string, l_orig, neighbour_list,
                     neighbour_dist, prob_select, target, l)
        for _ in range(self.pop_size)
    ]

    # tokenize the population with the BERT tokenizer and pad to a common length
    l_tensor = torch.ones([len(pop)]).type(torch.LongTensor)
    pop_np = [[self.tokenizer.cls_token_id] +
              self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(' '.join(pop[0]).strip())) +
              [self.tokenizer.sep_token_id]]
    l_tensor[0] = len(pop_np[0])
    for p in range(1, len(pop)):
        token_ids = [self.tokenizer.cls_token_id] + \
            self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(' '.join(pop[p]).strip())) + \
            [self.tokenizer.sep_token_id]
        pop_np.append(token_ids)
        l_tensor[p] = len(token_ids)
    l_max = torch.max(l_tensor)
    pop_np = pad_sequences(pop_np, maxlen=l_max.item(), padding='post')
    pop_tensor = torch.tensor(pop_np)
    sort = torch.sort(l_tensor, descending=True)[1]
    pop_tensor = pop_tensor[sort]
    l_tensor = l_tensor[sort]
    pop = np.array(pop)[sort].tolist()

    for i in range(self.max_iters):
        pop_tensor = pop_tensor.type(torch.LongTensor).to(self.device)
        l_tensor = l_tensor.to(self.device)
        self.batch_model.eval()
        with torch.no_grad():
            pop_preds = self.batch_model.pred(pop_tensor, l_tensor, False)[1].cpu().detach().numpy()
        pop_scores = pop_preds[:, target]
        print('\t\t', i, ' -- ', np.max(pop_scores))
        pop_ranks = np.argsort(pop_scores)[::-1]
        top_attack = pop_ranks[0]

        # temperature-scaled scores -> sampling probabilities
        ampl = pop_scores / self.temp
        covariance = np.cov(ampl)
        print(covariance)
        if covariance > 10e-6:
            mean = np.mean(ampl)
            ampl_update = (ampl - mean) / np.sqrt(covariance + 0.001)
            logits = np.exp(ampl_update)
        else:
            if np.max(ampl) > 100:
                ampl = ampl / (np.max(ampl) / 5)
            logits = np.exp(ampl)
        select_probs = logits / np.sum(logits)

        if np.argmax(pop_preds[top_attack, :]) == target:
            print('Success and score: {:.4f}'.format(pop_scores[top_attack]))
            print(seq_orig_string)
            print(pop[top_attack])
            return pop[top_attack], seq_orig_string

        # elitism + crossover + mutation
        elite = [pop[top_attack]]
        parent1_idx = np.random.choice(self.pop_size, size=self.pop_size - 1, p=select_probs)
        parent2_idx = np.random.choice(self.pop_size, size=self.pop_size - 1, p=select_probs)
        childs = [
            self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
            for i in range(self.pop_size - 1)
        ]
        childs = [
            self.perturb(x, seq_orig_string, l_orig, neighbour_list,
                         neighbour_dist, prob_select, target, l)
            for x in childs
        ]
        pop = elite + childs

        # re-tokenize and re-pad the new population
        l_tensor = torch.ones([len(pop)]).type(torch.LongTensor)
        pop_np = [[self.tokenizer.cls_token_id] +
                  self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(' '.join(pop[0]).strip())) +
                  [self.tokenizer.sep_token_id]]
        l_tensor[0] = len(pop_np[0])
        for p in range(1, len(pop)):
            token_ids = [self.tokenizer.cls_token_id] + \
                self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(' '.join(pop[p]).strip())) + \
                [self.tokenizer.sep_token_id]
            pop_np.append(token_ids)
            l_tensor[p] = len(token_ids)
        l_max = torch.max(l_tensor)
        pop_np = pad_sequences(pop_np, maxlen=l_max.item(), padding='post')
        pop_tensor = torch.tensor(pop_np)
        sort = torch.sort(l_tensor, descending=True)[1]
        pop_tensor = pop_tensor[sort]
        l_tensor = l_tensor[sort]
        pop = np.array(pop)[sort].tolist()

    return None, seq_orig
def create_ml_1m_dataset(file, trans_score=2, embed_dim=8, maxlen=40, test_neg_num=100):
    """
    :param file: A string. dataset path.
    :param trans_score: A scalar. Ratings at or above it become positives; lower ratings are dropped.
    :param embed_dim: A scalar. latent factor.
    :param maxlen: A scalar. maxlen.
    :param test_neg_num: A scalar. The number of test negative samples.
    :return: item_feat_col, train, val, test
    """
    print('==========Data Preprocess Start=============')
    data_df = pd.read_csv(file, sep="::", engine='python',
                          names=['user_id', 'item_id', 'label', 'Timestamp'])
    # filtering
    data_df['item_count'] = data_df.groupby('item_id')['item_id'].transform('count')
    data_df = data_df[data_df.item_count >= 5]
    # trans score
    data_df = data_df[data_df.label >= trans_score]
    # sort
    data_df = data_df.sort_values(by=['user_id', 'Timestamp'])

    # split dataset and negative sampling
    print('============Negative Sampling===============')
    train_data, val_data, test_data = defaultdict(list), defaultdict(list), defaultdict(list)
    item_id_max = data_df['item_id'].max()
    for user_id, df in tqdm(data_df[['user_id', 'item_id']].groupby('user_id')):
        pos_list = df['item_id'].tolist()

        def gen_neg():
            neg = pos_list[0]
            while neg in set(pos_list):
                neg = random.randint(1, item_id_max)
            return neg

        neg_list = [gen_neg() for i in range(len(pos_list) + test_neg_num)]
        for i in range(1, len(pos_list)):
            hist_i = pos_list[:i]
            if i == len(pos_list) - 1:
                test_data['hist'].append(hist_i)
                test_data['pos_id'].append(pos_list[i])
                test_data['neg_id'].append(neg_list[i:])
            elif i == len(pos_list) - 2:
                val_data['hist'].append(hist_i)
                val_data['pos_id'].append(pos_list[i])
                val_data['neg_id'].append(neg_list[i])
            else:
                train_data['hist'].append(hist_i)
                train_data['pos_id'].append(pos_list[i])
                train_data['neg_id'].append(neg_list[i])

    # item feature columns
    user_num, item_num = data_df['user_id'].max() + 1, data_df['item_id'].max() + 1
    item_feat_col = sparseFeature('item_id', item_num, embed_dim)

    # shuffle: permute all fields of a sample dict together
    # (random.shuffle cannot shuffle a dict directly)
    def shuffle_fields(data):
        idx = list(range(len(data['pos_id'])))
        random.shuffle(idx)
        for key in data:
            data[key] = [data[key][i] for i in idx]

    shuffle_fields(train_data)
    shuffle_fields(val_data)

    # padding
    print('==================Padding===================')
    train = [pad_sequences(train_data['hist'], maxlen=maxlen),
             np.array(train_data['pos_id']), np.array(train_data['neg_id'])]
    val = [pad_sequences(val_data['hist'], maxlen=maxlen),
           np.array(val_data['pos_id']), np.array(val_data['neg_id'])]
    test = [pad_sequences(test_data['hist'], maxlen=maxlen),
            np.array(test_data['pos_id']), np.array(test_data['neg_id'])]
    print('============Data Preprocess End=============')
    return item_feat_col, train, val, test


# create_ml_1m_dataset('../dataset/ml-1m/ratings.dat')
data, label = read_files(args.data_dir)
data = list(zip(data, label))
random.shuffle(data)

train_data, test_data = train_test_split(data)

data_train = encode_sentences([content[0] for content in train_data], word_to_id)
label_train = to_categorical(encode_cate([content[1] for content in train_data], cat_to_id))
data_test = encode_sentences([content[0] for content in test_data], word_to_id)
label_test = to_categorical(encode_cate([content[1] for content in test_data], cat_to_id))

data_train = sequence.pad_sequences(data_train, maxlen=args.max_len)
data_test = sequence.pad_sequences(data_test, maxlen=args.max_len)

model = TextRNN(args.max_len, args.max_features, args.embedding_size).build_model()
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

logger.info('Start training...')
callbacks = [
    ModelCheckpoint('./model.h5', verbose=1),
    EarlyStopping(monitor='val_accuracy', patience=2, mode='max')
]
history = model.fit(data_train, label_train, batch_size=args.batch_size,
for element in tokens_content_english:
    tmp_counter += 1
    if max_size_english < len(element):
        max_size_english = len(element)

tmp_counter = 0
max_size_german = 0
for element in tokens_content_german:
    tmp_counter += 1
    if max_size_german < len(element):
        max_size_german = len(element)

print(max_size_english)
print(max_size_german)

pad_english_sentence = pad_sequences(tokens_content_english, max_size_english)
pad_german_sentence = pad_sequences(tokens_content_german, max_size_german)

print("Padded Encoded English Sentences: ")
print(pad_english_sentence[:10])
print("Padded Encoded German Sentences: ")
print(pad_german_sentence[:10])
print("Padded English Sentence Shape: ")
print(pad_english_sentence.shape)
print("Padded German Sentence Shape: ")
print(pad_german_sentence.shape)


# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """
    Defines and creates the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    return model


# 3.2 Split the dataset
# 3.2.1 Split into train and test sets
# X_padded = pad_sequences(X_train, maxlen=300)
# Y = to_categorical(Y_train, len(class_index))
# x_train, x_test, y_train, y_test = train_test_split(X_padded, Y, test_size=0.2)
Y = to_categorical(Y_train, len(class_index))
x_train, x_test, y_train, y_test = train_test_split(X_train, Y, test_size=0.2)
x_train_raw = pad_sequences(x_train, maxlen=model_max_len)
x_test_raw = pad_sequences(x_test, maxlen=model_max_len)


# 3.3 Training
def model_fit(model, x, y):
    return model.fit(x, y, batch_size=10, epochs=5, validation_split=0.1)


model = get_lstm_model()
model_train = model_fit(model, x_train_raw, y_train)

# 3.4 Testing
print(model.evaluate(x_test_raw, y_test))

# Clustering
args = parser.parse_args()

data_path = config.data_path
training = get_data(data_path + 'snli_1.0_train.jsonl')
validation = get_data(data_path + 'snli_1.0_dev.jsonl')
test = get_data(data_path + 'snli_1.0_test.jsonl')

tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(training[0] + training[1])
tokenizer.fit_on_texts(validation[0] + validation[1])

VOCAB = len(tokenizer.word_counts) + 1
LABELS = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

to_seq = lambda X: pad_sequences(tokenizer.texts_to_sequences(X), maxlen=config.max_len)
prepare_data = lambda data: (to_seq(data[0]), to_seq(data[1]), data[2])

training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

print('Build model...')
print('Vocab size =', VOCAB)

config.vocab_size = VOCAB
config.out_dim = len(LABELS)

"""
Load GloVe Embedding
"""
GLOVE_STORE = data_path + 'precomputed_glove.weights'
if config.use_glove:
    if not os.path.exists(GLOVE_STORE + '.npy'):
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = [
    "My name is Shinjini",
    "My plushie's name is Bruno",
    "My friend loves my plushie",
    "I love it too!",
    "My plushie is very cute",
    "Don't you all think my plushie is really cute?"
]

# tokenize sentences
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print('The word index is\n', word_index)

# generate sequences out of tokens
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=10)
print('Training data sequences are\n', sequences)
print('Training data padded sequences are\n', padded)

# testing
test_data = ["But I really love my plushie", "My friend wants a plushie too!"]
test_seq = tokenizer.texts_to_sequences(test_data)
test_padded = pad_sequences(test_seq, maxlen=10)
print('Testing data sequences are\n', test_seq)
print('Testing data padded sequences are\n', test_padded)
with open('./clean_test_review_okt.pkl', 'wb') as f:
    pickle.dump(clean_test_review, f)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_review)
train_sequences = tokenizer.texts_to_sequences(clean_train_review)
test_sequences = tokenizer.texts_to_sequences(clean_test_review)

word_vocab = tokenizer.word_index

MAX_SEQUENCE_LENGTH = 8
train_inputs = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
train_labels = np.array(df_train['label'])  # labels of the training data
test_inputs = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_labels = np.array(df_test['label'])

x_train, x_val, y_train, y_val = train_test_split(train_inputs, train_labels, test_size=0.2)

from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Flatten, Concatenate
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
    for word_index in range(len(sentence)):
        word = sentence[word_index]
        word_vector = word2vec_model.wv[word]
        sentence_vector.append(word_vector)
    word2vec_features.append(sentence_vector)

word2vec_features = np.asarray(word2vec_features)
print("the shape of sentence embedding is ", word2vec_features.shape)

# pad the input sentence encoding
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LEN = 10
padded_sentence_encoding = pad_sequences(word2vec_features,
                                         padding="post",
                                         truncating="post",
                                         maxlen=MAX_LEN,
                                         dtype='float32')  # keep the float word vectors (the int32 default would truncate them)
print("padded sentence shape is ", padded_sentence_encoding.shape)

# prepare DA tagging embedding
da_label_vectors = []
for index in range(len(Tags_List)):
    tag_vector = []
    # for class 'ES'
    if Tags_List[index] == 'ES':
        tag_vector = [1, 0, 0, 0, 0, 0, 0, 0, 0]
    # for class 'EO'
total_words = len(tokenizer.word_index) + 1
print(tokenizer.word_index)
print(total_words)

input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# create predictors and label
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

model = Sequential()
model.add(Embedding(total_words, 240, input_length=max_sequence_len - 1))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(75))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
y = df['points'].values

# One-hot encode categorical columns
encoded_countries = pd.get_dummies(df['country'])
encoded_provinces = pd.get_dummies(df['province'])

# Embedding for titles
titles = []
for index, row in df.iterrows():
    titles.append(row['title'])

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(titles)
print("Fit tokenizer on wine titles")
word_index = tokenizer.word_index
titles_sequences = tokenizer.texts_to_sequences(titles)
titles_padded = pad_sequences(titles_sequences, maxlen=max_length,
                              padding=padding_type, truncating=trunc_type)

# Embedding for descriptions
descriptions = []
for index, row in df.iterrows():
    descriptions.append(row['description'])

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(descriptions)
print("Fit tokenizer on wine descriptions")
word_index = tokenizer.word_index
descriptions_sequences = tokenizer.texts_to_sequences(descriptions)
descriptions_padded = pad_sequences(descriptions_sequences, maxlen=max_length,
                                    padding=padding_type, truncating=trunc_type)

# Drop preprocessed columns and unwanted columns
df = df.drop(['taster_twitter_handle', 'taster_name', 'region_2',
Train_corpus = corpus[:spliting]
Train_lable = lable[:spliting]
Test_corpus = corpus[spliting:]
Test_lable = lable[spliting:]

# %% text tokenizing and padding
tokenizer = Tokenizer(num_words=10000, oov_token='<oov>')
tokenizer.fit_on_texts(Train_corpus)

# tokenizing and padding the training set
train_sequences = tokenizer.texts_to_sequences(Train_corpus)
train_padding = pad_sequences(train_sequences, padding='post', maxlen=max_len)

# tokenizing and padding the test set
test_sequences = tokenizer.texts_to_sequences(Test_corpus)
test_padding = pad_sequences(test_sequences, padding='post', maxlen=max_len)

# %% Classification model definition
# It is a sequential model:
# layer 1   : Embedding layer
# layer 2   : Bidirectional LSTM layer
# layer 3   : Bidirectional LSTM layer
# layers 4-6: fully connected layers (Dense layers)
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(10000, 32, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, activation='tanh')),
def select_best_replacement(self, pos, seq_cur, seq, l_orig, target, replace_list, l):
    # build a candidate sequence for every replacement word (keep the original when w == 0
    # or the candidate equals the current word)
    infor_list = [self.replace(seq_cur, pos, w)
                  if w != 0 and seq[pos].strip() != self.dataset.inv_dict[w]
                  else (([self.tokenizer.cls_token_id] +
                         self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(' '.join(seq).strip())) +
                         [self.tokenizer.sep_token_id]), seq_cur, l)
                  for w in replace_list]
    n_seq_list = len(infor_list)
    new_seq_list = []
    cur_seq_list = []
    l_bert_list = []
    for i in range(n_seq_list):
        new_seq_list.append(infor_list[i][0])
        cur_seq_list.append(infor_list[i][1])
        l_bert_list.append(infor_list[i][2])

    l_bert_list = torch.tensor(l_bert_list)
    sort = torch.argsort(l_bert_list, descending=True)
    l_max_bert = torch.max(l_bert_list)
    new_seq_list = pad_sequences(new_seq_list, maxlen=l_max_bert, padding='post')
    new_seq_list_tensor = torch.tensor(new_seq_list)[sort].type(torch.LongTensor).to(self.device)
    replace_list = replace_list[sort]
    l_tensor = l_bert_list[sort].type(torch.LongTensor)
    l_tensor = l_tensor.to(self.device)

    # score every candidate sequence
    self.neighbour_model.eval()
    with torch.no_grad():
        new_seq_preds = self.neighbour_model.pred(new_seq_list_tensor, l_tensor, False)[1].cpu().detach().numpy()
    new_seq_scores = new_seq_preds[:, target]

    # score the current (unmodified) sequence
    seq_np = np.expand_dims([self.tokenizer.cls_token_id] +
                            self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(' '.join(seq_cur).strip())) +
                            [self.tokenizer.sep_token_id], axis=0)
    seq_tensor = torch.tensor(seq_np).type(torch.LongTensor).to(self.device)
    l_tensor = torch.tensor([seq_tensor.shape[1]]).to(self.device)
    self.model.eval()
    with torch.no_grad():
        orig_score = self.model.pred(seq_tensor, l_tensor, False)[1].cpu().detach().numpy()[0, target]
    new_seq_scores -= orig_score
    new_seq_scores[self.top_n1:] = -10000000

    if self.use_lm:
        # build the language-model context around the replacement position
        prefix = ['']
        suffix = ['']
        if pos > 0 and pos <= self.n_prefix:
            prefix = [seq_cur[pos - i - 1] for i in range(int(pos))[::-1]]
        elif pos > self.n_prefix:
            prefix = [seq_cur[pos - i - 1] for i in range(self.n_prefix)[::-1]]
        if self.use_suffix and pos < l_orig - self.n_suffix:
            suffix = [seq_cur[pos + i] for i in range(1, self.n_suffix + 1)]
        elif self.use_suffix and pos < l_orig:
            suffix = [seq_cur[pos + i] for i in range(1, l_orig - pos)]

        word_list = [prefix + [self.dataset.inv_dict[w]] + suffix
                     if w in self.dataset.inv_dict
                     else prefix + ['UNK'] + suffix
                     for w in replace_list]
        # keep only the top_n2 candidates ranked by the language model
        replace_words_scores = self.lm.get_probs(word_list)
        new_words_scores = np.array(replace_words_scores)
        rank_replaces_by_lm = np.argsort(new_words_scores)
        filtered_words_idx = rank_replaces_by_lm[self.top_n2:]
        new_seq_scores[filtered_words_idx] = -10000000

    if np.max(new_seq_scores) > 0:
        return cur_seq_list[np.argsort(new_seq_scores)[-1]]
    return seq_cur
    x_left = SimpleRNN(128, return_sequences=True)(embedding_left)
    x_right = SimpleRNN(128, return_sequences=True, go_backwards=True)(embedding_right)
    x_right = Lambda(lambda x: K.reverse(x, axes=1))(x_right)
    x = Concatenate(axis=2)([x_left, embedding_current, x_right])
    x = Conv1D(64, kernel_size=1, activation='tanh')(x)
    x = GlobalMaxPooling1D()(x)
    output = Dense(self.class_num, activation=self.last_activation)(x)
    model = Model(inputs=[input_current, input_left, input_right], outputs=output)
    return model


labelIds, reviewIds, uniqueword_len = gen_data(filePathTrain)
one_hot_labelIds = keras.utils.to_categorical(labelIds, num_classes)
reviewIds = sequence.pad_sequences(reviewIds, maxlen)
print(reviewIds.shape)
print(one_hot_labelIds.shape)
print(type(reviewIds))

textrcnn = RCNN(maxlen, uniqueword_len, embedding_dims, num_classes)
model = textrcnn.get_model()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# build the shifted left/right context inputs for the RCNN
left_temp = np.array([reviewIds[:, 0]])
left_temp2 = np.c_[left_temp.T, np.array(reviewIds)]
left = left_temp2[:, 0:-1]
right_temp = np.array([reviewIds[:, -1]])
right_temp2 = np.c_[right_temp.T, np.array(reviewIds)]
right = right_temp2[:, 1:]
def train():
    args = parser.parse_args()
    learning_rate = args.learning_rate
    nlayer = args.nlayer
    bidirection = args.bidirection
    save_path = args.save_path
    kept_prob = args.kept_prob
    MAX_VOCAB_SIZE = 50000

    with open(('aux_files/dataset_%d.pkl' % MAX_VOCAB_SIZE), 'rb') as f:
        dataset = pickle.load(f)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    embedding_matrix = np.load('aux_files/embeddings_glove_%d.npy' % (MAX_VOCAB_SIZE))
    embedding_matrix = torch.tensor(embedding_matrix.T).to(device)  # pytorch

    max_len = 400
    padded_train_raw = pad_sequences(dataset.train_seqs2, maxlen=max_len, padding='post')
    padded_test_raw = pad_sequences(dataset.test_seqs2, maxlen=max_len, padding='post')

    # TrainSet
    data_set_train = Data_infor(padded_train_raw, dataset.train_y)
    num_train = len(data_set_train)
    indx = list(range(num_train))
    all_train_set = Subset(data_set_train, indx)
    train_indx = random.sample(indx, int(num_train * 0.8))
    vali_indx = [i for i in indx if i not in train_indx]
    train_set = Subset(data_set_train, train_indx)
    vali_set = Subset(data_set_train, vali_indx)

    # TestSet
    data_set_test = Data_infor(padded_test_raw, dataset.test_y)
    num_test = len(data_set_test)
    indx = list(range(num_test))
    # indx = random.sample(indx, SAMPLE_SIZE)
    test_set = Subset(data_set_test, indx)

    batch_size = 64
    hidden_size = 128
    all_train_loader = DataLoader(all_train_set, batch_size=batch_size, shuffle=True)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    vali_loader = DataLoader(vali_set, batch_size=len(vali_indx) // batch_size)
    test_loader = DataLoader(test_set, batch_size=int(num_test / 10), shuffle=True)

    best_save_path = os.path.join(
        save_path,
        'best_lstm_' + str(kept_prob) + '_' + str(learning_rate) + '_' + str(max_len))
    rnn = SentimentAnalysis(batch_size, embedding_matrix, hidden_size, kept_prob, nlayer, bidirection)
    rnn = rnn.to(device)

    # class my_loss(nn.Module):
    #     def __init__(self):
    #         super().__init__()
    #         self.relu = nn.ReLU()
    #     def forward(self, x, y):
    #         loss = torch.mean((1-y)*x + torch.log(1+torch.exp(-abs(x))) + self.relu(-x))
    #         return loss
    criterion = nn.CrossEntropyLoss()
    optimiser = torch.optim.AdamW(rnn.parameters(), lr=learning_rate)
    # optimiser = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
    epoches = 20
    best_epoch = 0
    best_acc = 0
    patience = 15

    for epoch in range(epoches):
        test_pred = torch.tensor([])
        test_targets = torch.tensor([])
        train_pred = torch.tensor([])
        train_targets = torch.tensor([])
        test_loss = []
        train_loss = []

        rnn.train()
        for batch_index, (seqs, length, target) in enumerate(all_train_loader):
            seqs = seqs.type(torch.LongTensor)
            len_order = torch.argsort(length, descending=True)
            length = length[len_order]
            seqs = seqs[len_order]
            target = target[len_order].type(torch.LongTensor)
            optimiser.zero_grad()
            seqs, target, length = seqs.to(device), target.to(device), length.to(device)
            output, pred_out = rnn(seqs, length, True)
            loss = criterion(output, target)
            loss.backward()
            optimiser.step()
            train_pred = torch.cat((train_pred, pred_out.type(torch.float).cpu()), dim=0)
            train_targets = torch.cat((train_targets, target.type(torch.float).cpu()))
            train_loss.append(loss)
            if batch_index % 100 == 0:
                print('Train Batch:{}, Train Loss:{:.4f}.'.format(batch_index, loss.item()))
        train_accuracy = rnn.evaluate_accuracy(train_pred.detach().numpy(),
                                               train_targets.detach().numpy())
        print('Epoch:{}, Train Accuracy:{:.4f}, Train Mean loss:{:.4f}.'.format(
            epoch, train_accuracy, sum(train_loss) / len(train_loss)))

        rnn.eval()
        with torch.no_grad():
            for batch_index, (seqs, length, target) in enumerate(test_loader):
                seqs = seqs.type(torch.LongTensor)
                len_order = torch.argsort(length, descending=True)
                length = length[len_order]
                seqs = seqs[len_order]
                target = target[len_order].type(torch.LongTensor)
                seqs, target, length = seqs.to(device), target.to(device), length.to(device)
                output, pred_out = rnn(seqs, length, False)
                test_pred = torch.cat((test_pred, pred_out.type(torch.float).cpu()), dim=0)
                test_targets = torch.cat((test_targets, target.type(torch.float).cpu()))
                loss = criterion(output, target)
                test_loss.append(loss.item())
                if batch_index % 100 == 0:
                    print('Vali Batch:{}, Validation Loss:{:.4f}.'.format(batch_index, loss.item()))
            accuracy = rnn.evaluate_accuracy(test_pred.numpy(), test_targets.numpy())
            print('Epoch:{}, Vali Accuracy:{:.4f}, Vali Mean loss:{:.4f}.'.format(
                epoch, accuracy, sum(test_loss) / len(test_loss)))
        print('\n\n')

        # # best save
        # if accuracy > best_acc:
        #     best_acc = accuracy
        #     best_epoch = epoch
        #     torch.save(rnn.state_dict(), best_save_path)
        # # early stop
        # if epoch - best_epoch >= patience:
        #     print('Early stopping')
        #     print('Best epoch: {}, Best accuracy: {:.4f}.'.format(best_epoch, best_acc))
        #     break
        torch.save(rnn.state_dict(), best_save_path)

    rnn.load_state_dict(torch.load(best_save_path))
    rnn.to(device)
    rnn.eval()
    test_pred = torch.tensor([])
    test_targets = torch.tensor([])
    test_loss = []
    with torch.no_grad():
        for batch_index, (seqs, length, target) in enumerate(test_loader):
            seqs = seqs.type(torch.LongTensor)
            len_order = torch.argsort(length, descending=True)
            length = length[len_order]
            seqs = seqs[len_order]
            target = target[len_order]
            seqs, target, length = seqs.to(device), target.to(device), length.to(device)
            output, pred_out = rnn(seqs, length, False)
            test_pred = torch.cat((test_pred, pred_out.type(torch.float).cpu()), dim=0)
            test_targets = torch.cat((test_targets, target.type(torch.float).cpu()))
            loss = criterion(output, target)
            test_loss.append(loss.item())
    accuracy = rnn.evaluate_accuracy(test_pred.numpy(), test_targets.numpy())
    print('Test Accuracy:{:.4f}, Test Mean loss:{:.4f}.'.format(
        accuracy, sum(test_loss) / len(test_loss)))
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # GPU number to use

# set parameters
max_features = 6000   # max_features: maximum number of words
max_length = 400
# The training data is built from the 6,000 most frequent words.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
wind = imdb.get_word_index()
revind = dict((v, k) for k, v in wind.items())

# Pad sequences for computational efficiency
x_train = sequence.pad_sequences(x_train, maxlen=max_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_length)

# Deep learning architecture parameters
batch_size = 32
embedding_dims = 60
num_kernels = 260     # number of convolution filters
kernel_size = 3       # convolution filter size
hidden_dims = 300
epochs = 10
nOutput = 1

xInput = Input(batch_shape=(None, max_length))
xEmbed = Embedding(max_features, embedding_dims)(xInput)
    Y_train = []
    X_train = []
    pattern_words_dict = {}
    # loop through each sentence in our intents patterns
    for intent in data_as_an['intents']:
        for entities in intent['entities']:
            if entities:
                pattern_words_dict[entities] = intent['tag'][0:2]
        for sentences in intent['ask']:
            Y_train.append(intent['tag'])
            X_train.append(word2charArr(sentences, pattern_words_dict))
    return np.array(X_train), np.array(Y_train), pattern_words_dict


X_train, Y_train, pattern_words_dict = sentence2vec()
X_train = pad_sequences(X_train, maxlen=100)
highest_unicode = 8100
X_train = np.where(X_train <= highest_unicode, X_train, 0)
print(X_train, X_train.shape, highest_unicode)

from sklearn import preprocessing
cate_enc = preprocessing.LabelEncoder()
label = Y_train
Y_train = cate_enc.fit_transform(Y_train)
print(Y_train.shape)
print(Y_train)
print(len(np.unique(Y_train)))

model = Sequential()
model.add(Embedding(highest_unicode + 1, 60, input_length=X_train.shape[1]))
    sentences.append(item['field_0'])  ## insert json keys here
    labels.append(item['field_1'])

train_sentences = sentences[0:TRAIN_SIZE]
test_sentences = sentences[TRAIN_SIZE:]
train_labels = labels[0:TRAIN_SIZE]
test_labels = labels[TRAIN_SIZE:]

## Assign tokens for the words, and convert the sentences to token sequences.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOK)
tokenizer.fit_on_texts(train_sentences)
wordIndex = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH,
                             padding=PADDING_TYPE, truncating=TRUNC_TYPE)
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH,
                            padding=PADDING_TYPE, truncating=TRUNC_TYPE)

# Convert to ndarrays
train_padded = np.array(train_padded)
train_labels = np.array(train_labels)
test_padded = np.array(test_padded)
test_labels = np.array(test_labels)

## The neural net architecture
quacker = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH),
sequences = list()
for line in String.split('.'):  # split the text into sentences on '.'
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i + 1]
        sequences.append(sequence)

print('Number of samples to use for training: %d' % len(sequences))
# inspect the samples
print(sequences)

# pad every sample to the length of the longest one
max_len = max(len(l) for l in sequences)  # length of the longest sample across all samples
print('Maximum sample length: {}'.format(max_len))

sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences)

sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]
# X keeps everything except the last value of each list
# y keeps only the last value of each list; it is the label

# the separated X values
print(X)
# the separated y values
print(y)
# label separation complete
# Use LabelEncoder
lbl_encoder = LabelEncoder()
lbl_encoder.fit(training_labels)
training_labels = lbl_encoder.transform(training_labels)

# Vectorize the data using Tokenizer
vocab_size = 1000
embedding_dim = 16
max_len = 20
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)

# Neural network training
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

epochs = 500
history = model.fit(padded_sequences, np.array(training_labels), epochs=epochs)
def load_lstm_inv_data():
    # ============= training set =================
    train_sequences = list()
    for index, group in train.groupby(by='fragment_id'):
        train_sequences.append(group[use_fea].values)

    # find the maximum sequence length
    len_sequences = []
    for one_seq in train_sequences:
        len_sequences.append(len(one_seq))
    print(pd.Series(len_sequences).describe())  # the longest sequence has 61 steps

    # pad the sequences
    to_pad = 61
    train_new_seq = []
    for one_seq in train_sequences:
        len_one_seq = len(one_seq)
        last_val = one_seq[-1]
        n = to_pad - len_one_seq
        # to_concat = np.repeat(last_val, n).reshape(len(use_fea), n).transpose()
        # new_one_seq = np.concatenate([one_seq, to_concat])
        if n != 0:
            to_concat = one_seq[:n]
            new_one_seq = np.concatenate([one_seq, to_concat])
        else:
            new_one_seq = one_seq
        train_new_seq.append(new_one_seq)
    train_final_seq = np.stack(train_new_seq)  # final_seq.shape (314, 129, 4)
    print("train_final_seq.shape", train_final_seq.shape)

    # truncate
    seq_len = 60
    train_final_seq = sequence.pad_sequences(train_final_seq, maxlen=seq_len,
                                             padding='post', dtype='float', truncating='post')
    print("train_final_seq.shape", train_final_seq.shape)

    # ============= test set =================
    test_sequences = list()
    for index, group in test.groupby(by='fragment_id'):
        test_sequences.append(group[use_fea].values)

    # pad to the maximum length
    to_pad = 61
    test_new_seq = []
    for one_seq in test_sequences:
        len_one_seq = len(one_seq)
        last_val = one_seq[-1]
        n = to_pad - len_one_seq
        # to_concat = np.repeat(last_val, n).reshape(len(use_fea), n).transpose()
        # new_one_seq = np.concatenate([one_seq, to_concat])
        if n != 0:
            to_concat = one_seq[:n]
            new_one_seq = np.concatenate([one_seq, to_concat])
        else:
            new_one_seq = one_seq
        test_new_seq.append(new_one_seq)
    test_final_seq = np.stack(test_new_seq)
    print("test_final_seq.shape", test_final_seq.shape)

    # truncate
    seq_len = 60
    test_final_seq = sequence.pad_sequences(test_final_seq, maxlen=seq_len,
                                            padding='post', dtype='float', truncating='post')
    print("test_final_seq.shape", test_final_seq.shape)

    return train_final_seq, y_train, test_final_seq, seq_len, len(use_fea)
        token = " " + word + " "
        sentence = sentence.replace(token, " ")
        sentence = sentence.replace("  ", " ")
    sentences.append(sentence)

# print(len(sentences))
# print(sentences[0])
# print(labels[0])

tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(len(word_index))

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
padded = padded / 2442.
print(padded[0])
print(padded.shape)

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
label_word_index = label_tokenizer.word_index
label_seq = label_tokenizer.texts_to_sequences(labels)
# label_seq = [item for sublist in label_seq for item in sublist]
# label_seq = np.array(label_seq)
label_seq = keras.utils.to_categorical(label_seq)
print(label_seq[:10])
print(label_word_index)

model = keras.models.Sequential()
tokenizer.fit_on_texts(sentences)
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}
train_to_idx = tokenizer.texts_to_sequences(sentences)

train_inputs = []
y_label = []
for i in range(len(train_to_idx)):
    for j in range(1, len(train_to_idx[i])):
        train_inputs.append(train_to_idx[i][:j])
        y_label.append(train_to_idx[i][j])

train_inputs_pad = pad_sequences(train_inputs, maxlen=7)
df_train = pd.DataFrame({'x_emb': train_inputs_pad, 'label': y_label})

train = np.array(train_inputs_pad)
label = np.array(y_label).reshape(-1, 1)
vocab_size = len(word2idx) + 1

x_train, x_test, y_train, y_test = train_test_split(train, label, test_size=0.1)

x_input = Input(shape=(7, ))  # what is the difference between using batch_shape and shape?
x_emb = Embedding(input_dim=vocab_size, output_dim=8, name='emb')(x_input)

# H-network
# TEST_DATA_FILE = 'liwc_test.csv'
train = pd.read_csv(r"liwc_input.csv")
test = pd.read_csv(r"liwc_test.csv")

list_sentences_train = train["text"].fillna("_na_").values
y_train = np.array(train['rating'])
y_test = np.array(test['rating'])
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)
list_sentences_test = test["text"].fillna("_na_").values

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
unigrams = pad_sequences(list_tokenized_train, maxlen=maxlen)
unigrams_t = pad_sequences(list_tokenized_test, maxlen=maxlen)

liwc_scaler = preprocessing.StandardScaler()
liwc = liwc_scaler.fit_transform(train.loc[:, "WC":"OtherP"])
liwc_t = liwc_scaler.transform(test.loc[:, "WC":"OtherP"])
X_t = np.hstack(unigrams)
X_te = np.hstack(unigrams_t)


def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))
all_embs = np.stack(embeddings_index.values())