def tokenlize_text(max_num_words, max_seq_length, x_train):
    """Tokenize text.

    Vectorize a text corpus by transforming each text into a sequence of
    integers.

    Args:
        max_num_words: Int, maximum number of words kept in the dictionary.
        max_seq_length: Int, length of each output sequence; shorter texts are
            padded and longer texts are truncated.
        x_train: List of text data.

    Returns:
        x_train: Tokenized input data.
        word_index: Dictionary mapping each word to its integer index.
    """
    from keras_preprocessing.sequence import pad_sequences
    from keras_preprocessing.text import Tokenizer

    print("tokenizing texts...")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)
    word_index = tokenizer.word_index
    x_train = pad_sequences(sequences, maxlen=max_seq_length)
    print("data read and converted to sequences of length %d" % max_seq_length)
    return x_train, word_index
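# A minimal usage sketch of the helper above (not part of the original);
# the sample texts and sizes are illustrative only.
sample_texts = [
    "the cat sat on the mat",
    "dogs and cats are friends",
]

padded, word_index = tokenlize_text(max_num_words=1000,
                                    max_seq_length=8,
                                    x_train=sample_texts)
print(padded.shape)   # (2, 8)
print(word_index)     # e.g. {'the': 1, 'cat': 2, ...}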
from keras.models import load_model
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
import numpy as np

model = load_model('sentiment_model.h5')

test_data = [
    "A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump"
]

max_features = 200
max_len = 28

# Caveat: fitting the tokenizer on the test sentence alone produces word
# indices that have no relation to the vocabulary the model was trained on.
# In practice the tokenizer fitted on the training data should be reused here
# (see the sketch below).
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(test_data)
X = tokenizer.texts_to_sequences(test_data)
X = pad_sequences(X, maxlen=max_len)

class_names = ['positive', 'negative']

preds = model.predict(X)
print(preds)
# predict_classes() is only available on Sequential models and was removed in
# newer Keras releases; it maps the predicted probabilities to class indices.
classes = model.predict_classes(X)
print(classes)
print(class_names[classes[0]])
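# A minimal sketch (not from the original snippet) of persisting the tokenizer
# fitted on the training corpus and reloading it at inference time, so that
# test sentences are encoded with the same word indices the model saw during
# training. The file name 'tokenizer.pkl' and the placeholder texts are
# assumptions for illustration.
import pickle

from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer

train_texts = ["great movie", "terrible plot", "loved it"]  # placeholder data

tokenizer = Tokenizer(num_words=200, split=' ')
tokenizer.fit_on_texts(train_texts)
with open('tokenizer.pkl', 'wb') as f:        # save after training
    pickle.dump(tokenizer, f)

with open('tokenizer.pkl', 'rb') as f:        # reload at inference time
    tokenizer = pickle.load(f)
X = pad_sequences(tokenizer.texts_to_sequences(["what a great film"]),
                  maxlen=28)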
#print(dict(zip(labels,responses))) enc = LabelEncoder() enc.fit(training_labels) ## 'Y' | Dependent Variable training_labels = enc.transform(training_labels) vocab_size = 8000 max_len = 24 trunc_type = "post" oov_token = "<OOV>" tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token) tokenizer.fit_on_texts(training_sentences) word_index = tokenizer.word_index sequences = tokenizer.texts_to_sequences(training_sentences) padded = pad_sequences(sequences, truncating=trunc_type, maxlen=max_len) classes = len(labels) ''' print(type(padded)) print('\n\n') print(type(sequences),'\n') print(padded[5], '\n\n', sequences[5]) print(word_index['whats'], word_index['up']) print(training_sentences[5]) ''' embeddings_index = {} with open('../glove.6B/glove.6B.200d.txt', encoding='utf-8') as f: for line in f:
def run_keras_experiment(): print('Reading files') # Reading File Section - This should change full = pd.read_csv("data/hindi_dataset.tsv", sep='\t', names=['text_id', 'text', 'task_1', 'task_2', 'task_3']) is_hof = full['task_1'] == 'HOF' full = full[is_hof] train, test = train_test_split(full, test_size=0.2) print('Completed reading') ############# print("Train shape : ", train.shape) print("Test shape : ", test.shape) # Variables TEXT_COLUMN = "text" LABEL_COLUMN = "task_3" configParser = configparser.RawConfigParser() configFilePath = "config.txt" configParser.read(configFilePath) EMBEDDING_FILE = configParser.get('hindi_task_3_model-config', 'EMBEDDING_FILE') MODEL_PATH = configParser.get('hindi_task_3_model-config', 'MODEL_PATH') PREDICTION_FILE = configParser.get('hindi_task_3_model-config', 'PREDICTION_FILE') print(train.head()) print("Removing URLs") train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: remove_url(x)) test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: remove_url(x)) print(train.head()) print("Removing usernames") train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: remove_names(x)) test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: remove_names(x)) print(train.head()) # # print("Identifying names") # # train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: entity_recognizing(x)) # test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: entity_recognizing(x)) # print(train.head()) print("Converting to lower-case") train[TEXT_COLUMN] = train[TEXT_COLUMN].str.lower() test[TEXT_COLUMN] = test[TEXT_COLUMN].str.lower() print(train.head()) print("Cleaning punctuation marks") train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: clean_text(x)) test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: clean_text(x)) print(train.head()) train['doc_len'] = train[TEXT_COLUMN].apply( lambda words: len(words.split(" "))) max_seq_len = np.round(train['doc_len'].mean() + train['doc_len'].std()).astype(int) embed_size = 300 # how big is each word vector max_features = None # how many unique words to use (i.e num rows in embedding vector) maxlen = max_seq_len # max number of words in a question to use #99.99% # fill up the missing values X = train[TEXT_COLUMN].fillna("_na_").values X_test = test[TEXT_COLUMN].fillna("_na_").values # Tokenize the sentences tokenizer = Tokenizer(num_words=max_features, filters='') tokenizer.fit_on_texts(list(X)) X = tokenizer.texts_to_sequences(X) X_test = tokenizer.texts_to_sequences(X_test) # Pad the sentences X = pad_sequences(X, maxlen=maxlen) X_test = pad_sequences(X_test, maxlen=maxlen) # Get the target values Y = train[LABEL_COLUMN].values le = LabelEncoder() le.fit(Y) encoded_Y = le.transform(Y) word_index = tokenizer.word_index max_features = len(word_index) + 1 print('Loading Embeddings') embedding_matrix = get_emb_matrix(word_index, max_features, EMBEDDING_FILE) print('Finished loading Embeddings') print('Start Training') kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True) bestscore = [] y_test = np.zeros((X_test.shape[0], )) for i, (train_index, valid_index) in enumerate(kfold.split(X, encoded_Y)): X_train, X_val, Y_train, Y_val = X[train_index], X[ valid_index], encoded_Y[train_index], encoded_Y[valid_index] filepath = MODEL_PATH checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=2, save_best_only=True, mode='min') reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.6, patience=1, min_lr=0.0001, verbose=2) earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=2, 
verbose=2, mode='auto') callbacks = [checkpoint, reduce_lr] model = capsule(maxlen, max_features, embed_size, embedding_matrix, 1) if i == 0: print(model.summary()) model.fit( X_train, Y_train, batch_size=64, epochs=20, validation_data=(X_val, Y_val), verbose=2, callbacks=callbacks, ) model.load_weights(filepath) y_pred = model.predict([X_val], batch_size=64, verbose=2) y_test += np.squeeze(model.predict([X_test], batch_size=64, verbose=2)) / 5 f1, threshold = f1_smart(np.squeeze(Y_val), np.squeeze(y_pred)) print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(f1, threshold)) bestscore.append(threshold) print('Finished Training') y_test = y_test.reshape((-1, 1)) pred_test_y = (y_test > np.mean(bestscore)).astype(int) test['predictions'] = le.inverse_transform(pred_test_y) # save predictions file_path = PREDICTION_FILE test.to_csv(file_path, sep='\t', encoding='utf-8') print('Saved Predictions') # post analysis tn, fp, fn, tp = confusion_matrix(test[LABEL_COLUMN], test['predictions']).ravel() weighted_f1 = f1_score(test[LABEL_COLUMN], test['predictions'], average='weighted') accuracy = accuracy_score(test[LABEL_COLUMN], test['predictions']) weighted_recall = recall_score(test[LABEL_COLUMN], test['predictions'], average='weighted') weighted_precision = precision_score(test[LABEL_COLUMN], test['predictions'], average='weighted') print("Confusion Matrix (tn, fp, fn, tp) {} {} {} {}".format( tn, fp, fn, tp)) print("Accuracy ", accuracy) print("Weighted F1 ", weighted_f1) print("Weighted Recall ", weighted_recall) print("Weighted Precision ", weighted_precision)
train_text, test_text, train_label, test_label = train_test_split( tweets['Tweet'], tweets['label'], random_state=2020, test_size=0.25, stratify=tweets['label']) # seq_len = [len(i.split()) for i in train_text] # pd.Series(seq_len).hist(bins=30) # plt.show() max_words = 20000 max_len = 100 tok = Tokenizer(num_words=max_words) tok.fit_on_texts(train_text) sequences = tok.texts_to_sequences(train_text) sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len) model = RNN() model.summary() model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy']) #model.fit(sequences_matrix,train_label,batch_size=128,epochs=10, #validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)]) checkpoint = ModelCheckpoint("lstm_model", monitor="val_acc", save_best_only=True, mode='max',
class KerasTokenizerAdapter(FitTransformMixin): def __init__(self, **kwargs): kwargs['oov_token'] = '<mis>' tok = Tokenizer(oov_token=kwargs['oov_token']) word_counts = kwargs.pop('word_counts', tok.word_counts) word_docs = kwargs.pop('word_docs', tok.word_docs) index_docs = kwargs.pop('index_docs', tok.index_docs) index_word = kwargs.pop('index_word', tok.index_word) word_index = kwargs.pop('word_index', tok.word_index) self._encoder = Tokenizer(**kwargs) self._encoder.word_counts = word_counts self._encoder.word_docs = word_docs self._encoder.index_docs = index_docs self._encoder.word_index = word_index self._encoder.index_word = index_word def fit(self, texts): self._encoder.fit_on_texts(['<eos> <pad>']) self._encoder.fit_on_texts(texts) special = {'<pad>': 0, '<mis>': 1, '<eos>': 2} vocab = sorted([( k, f, ) for k, f in self._encoder.word_counts.items() if k not in special], reverse=True, key=lambda x: x[1]) first_slot = max(special.values()) + 1 word_index = special.copy() word_index.update( {k: idx + first_slot for idx, (k, _) in enumerate(vocab)}) self._encoder.word_index = word_index self._encoder.index_word = {idx: k for k, idx in word_index.items()} def transform(self, X: pd.Series) -> List[List[int]]: def _transform(X: pd.Series) -> List[List[int]]: eos = self._encoder.word_index['<eos>'] tokens = self._encoder.texts_to_sequences(X) for i in range(len(tokens)): tokens[i].append(eos) return tokens logger.debug('KerasTokenizerAdapter::transform - Start') try: return _transform(X) finally: logger.debug('KerasTokenizerAdapter::transform - Done') @property def params(self): return { 'num_words': self._encoder.num_words, 'filters': self._encoder.filters, 'lower': self._encoder.lower, 'split': self._encoder.split, 'char_level': self._encoder.char_level, 'oov_token': self._encoder.oov_token, 'document_count': self._encoder.document_count, 'word_counts': dict(self._encoder.word_counts), 'word_docs': dict(self._encoder.word_docs), 'index_docs': dict(self._encoder.index_docs), 'index_word': self._encoder.index_word, 'word_index': self._encoder.word_index }
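# The adapter above rebuilds word_index so that '<pad>', '<mis>' and '<eos>'
# occupy fixed slots 0-2. Below is a standalone sketch (an assumption-only
# illustration, not the project's API) of the same re-indexing idea with a
# plain Tokenizer, for readers without the FitTransformMixin base class.
from keras_preprocessing.text import Tokenizer

texts = ["a b a c", "b d"]  # toy corpus

tok = Tokenizer(oov_token='<mis>')
tok.fit_on_texts(texts)

special = {'<pad>': 0, '<mis>': 1, '<eos>': 2}
vocab = sorted(
    [(w, c) for w, c in tok.word_counts.items() if w not in special],
    key=lambda x: x[1], reverse=True)

word_index = dict(special)
word_index.update({w: i + len(special) for i, (w, _) in enumerate(vocab)})
tok.word_index = word_index
tok.index_word = {i: w for w, i in word_index.items()}

# '<eos>' can now be appended to every sequence with a stable id of 2, and
# unseen words map to '<mis>' (id 1).
print(tok.texts_to_sequences(["a e"]))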
testMovie_df = pd.read_csv('test.tsv', delimiter='\t', encoding='utf-8')
trainMovie_df = pd.read_csv('train.tsv', delimiter='\t', encoding='utf-8')

# Keeping only the necessary columns - cleaning the data set
trainMovie_df = trainMovie_df.drop(columns=['PhraseId', 'SentenceId'])
testMovie_df = testMovie_df.drop(columns=['PhraseId', 'SentenceId'])

# Keep only alphanumerics and whitespace (the original pattern used 'a-zA-z',
# which accidentally also matched the punctuation between 'Z' and 'a' in ASCII).
trainMovie_df['Phrase'] = trainMovie_df['Phrase'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))
testMovie_df['Phrase'] = testMovie_df['Phrase'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))

max_features = 2000

# Fit the tokenizer on the training phrases only, then reuse it for the test
# phrases so both splits share the same word indices.
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(trainMovie_df['Phrase'].values)
X_train = tokenizer.texts_to_sequences(trainMovie_df['Phrase'].values)
X_train = pad_sequences(X_train)
X_test = tokenizer.texts_to_sequences(testMovie_df['Phrase'].values)
X_test = pad_sequences(X_test, maxlen=X_train.shape[1])  # was padding X_train again
print("handling data")

# Creating the model
embed_dim = 256
lstm_out = 156

# Design the model using classification
# Model defined
model = Sequential()
# Input layer of the model for processing
# Load the pickled objects
with open('data.pkl', 'rb') as f:
    train_data = pickle.load(f)
    test_data = pickle.load(f)

max_len = 6000
labels = to_categorical(np.asarray(train_data['label'].tolist()), num_classes=8)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['apis'].tolist())
tokenizer.fit_on_texts(test_data['apis'].tolist())
vocab = tokenizer.word_index
x_train_word_ids = tokenizer.texts_to_sequences(train_data['apis'].tolist())
x_test_word_ids = tokenizer.texts_to_sequences(test_data['apis'].tolist())
x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=max_len)
x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=max_len)


def text_cnn():
    kernel_size = [2, 4, 6, 8, 10]
    conv_activation = 'relu'
    _input = Input(shape=(max_len,), dtype='int32')
    _embed = Embedding(304, 256, input_length=max_len)(_input)
    _embed = SpatialDropout1D(0.15)(_embed)
    warppers = []
    for _kernel_size in kernel_size:
# max_length = 64 # padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post') # print(padded_train) # test_ids = df1['id'] # test_ids = test['Code'] # train_labels = train['Code'] # print(train_labels) # encoded_train_labels = train_labels # le.inverse_transform(encoded_train_labels) # integer encode the documents encoded_test = t.texts_to_sequences(test['Desc']) max_length = 64 padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post') # print(padded_test) # test_labels = test['Code'] # print(test_labels) # padded_test = joblib.load('padded_test.vec') # test_labels = joblib.load('test_labels.vec') # padded_train = joblib.load('padded_train.vec') # encoded_train_labels = joblib.load('encoded_train_labels.vec') # le = joblib.load('label_encoder_le.vec') # LOAD WORDEMBEDDING
def run(self):
    # Path to toxicity_annotated_comments.merged.shuf.cleaned-68MB,_160k-rows.tsv
    raw_data_file = sys.argv[1]
    vocab_size = int(sys.argv[2])
    # Optional path to embeddings file to create embeddings matrix.
    if len(sys.argv) > 3:
        embeddings_file = sys.argv[3]
    else:
        embeddings_file = None

    # Load raw data.
    print("Loading raw data")
    raw_df = pd.read_csv(raw_data_file, sep='\t')

    # Split data.
    print("Splitting data")
    splits = ['train', 'test', 'dev']
    split_labels = {}
    raw_split_text = {}
    for split in splits:
        labels = raw_df.loc[raw_df['split'] == split]['Label'].tolist()
        text = raw_df.loc[raw_df['split'] == split]['comment'].tolist()
        split_labels[split] = labels
        raw_split_text[split] = text

    # Setup tokenizer.
    print("Setting up tokenizer.")
    all_text = raw_df['comment'].tolist()
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(all_text)

    # Tokenize text.
    print("Tokenizing")
    split_text = {}
    for split in splits:
        print("Tokenizing", split)
        tokenized = tokenizer.texts_to_sequences(raw_split_text[split])
        split_text[split] = tokenized

    # Save to tokens file. pickle needs binary mode in Python 3
    # (the original opened these files with 'w').
    print("Saving to file")
    with open('toxicity_labels.pkl', 'wb') as my_file:
        pickle.dump(split_labels, my_file)
    with open('toxicity_tokens_{}_words.pkl'.format(vocab_size), 'wb') as my_file:
        pickle.dump(split_text, my_file)

    # Create embeddings matrix
    # Code copied from https://www.kaggle.com/tunguz/bi-gru-lstm-cnn-poolings-fasttext.
    if embeddings_file is not None:
        max_features = vocab_size
        embed_size = 300

        print("Loading embeddings")
        embedding_index = dict(
            self.get_coefs(*o.strip().split(" ")) for o in open(embeddings_file))

        print("Creating embedding matrix")
        word_index = tokenizer.word_index
        # Word indices start at 1, so the matrix needs len(word_index) + 1 rows
        # when the vocabulary is smaller than max_features.
        nb_words = min(max_features, len(word_index) + 1)
        embedding_matrix = np.zeros((nb_words, embed_size))
        for word, i in list(word_index.items()):
            if i >= nb_words:
                continue
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        embeddings_matrix_filename = 'embeddings_matrix_{}.pkl'.format(vocab_size)
        with open(embeddings_matrix_filename, 'wb') as my_file:
            pickle.dump(embedding_matrix, my_file)

    print("Done.")
from sklearn.model_selection import train_test_split import matplotlib.pyplot as plt from tensorflow.python.keras.layers import Embedding from tensorflow.python.keras.utils.np_utils import to_categorical # Read file train = pd.read_csv("train.tsv", sep="\t") # Assign Value X = train['Phrase'].values y = train['Sentiment'].values # Tokenizing tokenizer = Tokenizer(num_words=2000) tokenizer.fit_on_texts(X) X = tokenizer.texts_to_sequences(X) X = pad_sequences(X) # Encoding le = preprocessing.LabelEncoder() y = le.fit_transform(y) y = to_categorical(y) # Training and testing x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1000) # CNN layers model = Sequential()
import numpy as np import tensorflow as tf import unidecode from keras_preprocessing.text import Tokenizer tf.enable_eager_execution() file_path = ".\\Datasets\\Shakespear.txt" text = unidecode.unidecode(open(file_path).read()) tokenizer = Tokenizer() tokenizer.fit_on_texts([text]) encoded = tokenizer.texts_to_sequences([text])[0] vocab_size = len(tokenizer.word_index) + 1 word2idx = tokenizer.word_index idx2word = tokenizer.index_word sequences = list() for i in range(1, len(encoded)): sequence = encoded[i - 1:i + 1] sequences.append(sequence) sequences = np.array(sequences) X, Y = sequences[:, 0], sequences[:, 1] X = np.expand_dims(X, 1) Y = np.expand_dims(Y, 1)
seq = df_history.seq.iloc[0] seq_array = np.array(seq.split(' ')) all_sequences = seq_array.copy() for i in range(1, len(df_history)): seq = df_history.seq.iloc[i] seq_array = np.array(seq.split(' ')) all_sequences = np.concatenate([all_sequences, seq_array]) # use Keras' tokenizer to translate the str representations of airports into integers # with a mapping kept in the tokenizer. vocab_size will be a parameter for the network. tokenizer = Tokenizer(lower=False, char_level=False) # fit_on_texts builds the dict tokenizer.fit_on_texts(all_sequences.tolist()) sequences_from_tokenizer = np.array( tokenizer.texts_to_sequences(all_sequences)) acf_x = cal_acf(sequences_from_tokenizer, k=100) plot_acf_data(acf_x, title=f'ACF for all data with length {len(all_sequences)}') adf_test(pd.Series(sequences_from_tokenizer.reshape(-1)), name=f'ADF_Full') #%% Check a random sequence for an aircraft #i = 4582 i = (np.random.randint(low=0, high=len(df_history), size=1))[0] seq = df_history.seq.iloc[i] seq_array = np.array(seq.split(' ')) # use Keras' tokenizer to translate the str representations of airports into integers # with a mapping kept in the tokenizer. vocab_size will be a parameter for the network. tokenizer = Tokenizer(lower=False, char_level=False)
print('Pre-processing') def text_cleaning(text): text = re.sub('[^A-Za-z0-9]', ' ', text.lower()) text = ' '.join(text.split()) return text data['question1'] = data['question1'].apply(text_cleaning) data['question2'] = data['question2'].apply(text_cleaning) tokenizer = Tokenizer(num_words=max_nb_words, oov_token='oov_token_placeholder') tokenizer.fit_on_texts( list(data['question1'].values) + list(data['question2'].values)) sequences_1 = tokenizer.texts_to_sequences(data['question1'].values) sequences_2 = tokenizer.texts_to_sequences(data['question2'].values) word_index = tokenizer.word_index print('Found %s unique tokens' % len(word_index)) x1 = pad_sequences(sequences_1, maxlen=max_seq_len) x2 = pad_sequences(sequences_2, maxlen=max_seq_len) y = data['is_duplicate'].values ######################################## # retrieval embeddings ######################################## print('Indexing word vectors') word2vec = {} fin = io.open(file_emb, 'r', encoding='utf-8', newline='\n', errors='ignore') for line in fin:
if embedding_vector is not None: embedding_matrix[i] = embedding_vector num_words += 1 return embedding_matrix corpus = readCorpusData.readCorpusFromFile("../data/final_corpus.txt", 1000) docs = [] for post in corpus: docs.append(post["text"]) t = Tokenizer() t.fit_on_texts(docs) vocab_size = len(t.word_index) + 1 encoded_docs = t.texts_to_sequences(docs) for i in range(len(corpus)): corpus[i]["text_sequence"] = encoded_docs[i] with open('../data/final_corpus_dictionary_max_length_1000.pickle', 'wb') as handle: pickle.dump(corpus, handle, protocol=pickle.HIGHEST_PROTOCOL) corpus = readCorpusData.readCorpusFromFile("../data/final_corpus.txt", 500) docs = [] for post in corpus: docs.append(post["text"]) encoded_docs = t.texts_to_sequences(docs) for i in range(len(corpus)): corpus[i]["text_sequence"] = encoded_docs[i] with open('../data/final_corpus_dictionary_max_length_500.pickle',
def data_pre(data):
    # Build the labels: one class index per document, flattened across the
    # classes (the original kept only the first class's labels).
    label = [class_idx for class_idx, docs in enumerate(data) for _ in docs]
    label = to_categorical(label)
    # Word segmentation with jieba
    context = []
    for i in data:
        for j in i:
            context.append(jieba.lcut(j))
    # Build the vocabulary
    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(context)
    train_tags_title = tokenizer.texts_to_sequences(context)
    train_tags_title_preprocessed = pad_sequences(train_tags_title,
                                                  maxlen=45,
                                                  padding='post')
    # Pre-trained word vectors
    # embedding_matrix = np.zeros((278028, 30), dtype=np.float32)
    # f = open('wiki.zh.text.vector', encoding='utf-8')
    # f = f.readlines()
    # for text in f:
    #     text = text.split()
    #     if text[0] in context:
    #         embedding_matrix[context[text[0]]] = text[1:]
    # Model
    x_1 = Input(shape=(45, ))  # input is a padded sequence of 45 word indices
    # The embedding input_dim must cover the vocabulary (num_words above), not
    # the sequence length (the original used 45 here); it maps each index to a
    # dense vector and has to be the first layer.
    embed_1 = Embedding(input_dim=20000, output_dim=45)(x_1)
    L_1 = (LSTM(64))(embed_1)  # the first call builds the layer (64 = output dim), the second applies it
    L_1 = Dropout(0.5)(L_1)  # reduce overfitting; 0.5 is the fraction of inputs dropped
    L_1 = Dense(9, activation='softmax')(L_1)  # 9 output classes
    model_one = Model(x_1, L_1)  # x_1 is the input, L_1 the output
    model_one.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])  # 'binary_crossentropy'
    history = model_one.fit(train_tags_title_preprocessed,
                            label,
                            batch_size=512,
                            epochs=20,
                            validation_split=0.1,
                            shuffle=True)
    # Plot the accuracy history
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model acc')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
    # Plot the loss history
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
tokenizer = Tokenizer(num_words=None,
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n',
                      lower=True,
                      split=" ",
                      char_level=False)
# print(train_apis)
# Enrich the tokenizer's dictionary with both the training and the test set so
# the later steps can share one word index.
tokenizer.fit_on_texts(train_apis)
# print(train_apis)
# print(test_apis)
tokenizer.fit_on_texts(test_apis)
# print(test_apis)
# print(tokenizer.word_index)
# Get the current word-index dictionary if needed:
# vocal = tokenizer.word_index
train_apis = tokenizer.texts_to_sequences(train_apis)  # map each token to its integer index
test_apis = tokenizer.texts_to_sequences(test_apis)
# print(test_apis)
# pad_sequences returns a dense array; by default it pads and truncates at the
# front, here both are done at the end ('post').
train_apis = pad_sequences(train_apis, inputLen, padding='post', truncating='post')
# print(test_apis)
test_apis = pad_sequences(test_apis, inputLen, padding='post', truncating='post')
# print(test_apis)
K = argp.k evaluate(model, ge, X_test, targets_test, tmode='k', k=K) else: raise Exception('You should pass a threshold mode') elif args.emode == 'keras': sources_train, targets_train = read_file( '../processed_data/games_train.json', emode='keras') sources_test, targets_test = read_file( '../processed_data/games_test.json', emode='keras') tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True) tokenizer.fit_on_texts(sources_train) word_index = tokenizer.word_index x_train = tokenizer.texts_to_sequences(sources_train) x_train = pad_sequences(x_train, maxlen=TOKENS_MAX_LENGTH) ge = GenresEncoder('../processed_data/genres') y_train = ge.transform(targets_train) print(x_train.shape) print(y_train.shape) model = train(x_train, y_train, (TOKENS_MAX_LENGTH, ), ge.num_genres, batch_size=BATCH_SIZE, max_epoch=100, use_es=True, emode='keras')
# https://wikidocs.net/22660
text = """경마장에 있는 말이 뛰고 있다\n 그의 말이 법이다\n 가는 말이 고와야 오는 말이 곱다\n"""

from keras_preprocessing.text import Tokenizer

token = Tokenizer()
token.fit_on_texts([text])
encoded = token.texts_to_sequences([text])
# encoded = token.texts_to_sequences([text])[0]
print(encoded)

vocab_size = len(token.word_index) + 1  # 12
# The Keras tokenizer's integer encoding starts at index 1, but Keras one-hot
# arrays are indexed from 0, so the array has to be one larger than the actual
# vocabulary; hence the +1 above.
print('Vocabulary size: %d' % vocab_size)
print(token.word_index)

# Build the training data.
sequences = list()
for line in text.split('\n'):  # split the text into sentences on '\n'
    encoded = token.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i + 1]
        sequences.append(sequence)
print('Number of training samples: %d' % len(sequences))
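# A sketch of the usual next step in this kind of tutorial (not part of the
# snippet above): pad the n-gram sequences to a common length and split them
# into inputs and next-word labels for a language model.
import numpy as np
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

max_len = max(len(s) for s in sequences)
padded = pad_sequences(sequences, maxlen=max_len, padding='pre')
X, y = padded[:, :-1], padded[:, -1]   # all but the last token -> predict the last
y = to_categorical(y, num_classes=vocab_size)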
epoch = 3 dropout = 0.1 num_filters = 60 #split to train and val #train_df, val_df = train_test_split(train_df, test_size=0.05, random_state=2018) #fill up the missing values train_X = train_df['clean_text'].fillna('_na_').values #1175509 #val_X = val_df['clean_text'].fillna('_na_').values #130613 test_X = test_df['clean_text'].fillna('_na_').values #56370 #Tokenize the sequences tokenizer = Tokenizer(num_words=words_size) tokenizer.fit_on_texts(list(train_X) + list(test_X)) train_X = tokenizer.texts_to_sequences(train_X) #val_X =tokenizer.texts_to_sequences(val_X) test_X = tokenizer.texts_to_sequences(test_X) #Pad the sequences train_X = pad_sequences(train_X, maxlen=max_len) #val_X = pad_sequences(val_X, maxlen=max_len) test_X = pad_sequences(test_X, maxlen=max_len) #Get the target values train_y = train_df['target'].values #1175509 #val_y = val_df['target'].values #130613 #numpy2tensor tensor_X = torch.from_numpy(train_X) tensor_y = torch.from_numpy(train_y)
train['doc_len'].std()).astype(int) embed_size = 300 # how big is each word vector max_features = None # how many unique words to use (i.e num rows in embedding vector) maxlen = max_seq_len # max number of words in a question to use #99.99% # fill up the missing values X = train[TEXT_COLUMN].fillna("_na_").values X_test = test[TEXT_COLUMN].fillna("_na_").values X_test_2019 = test_2019[TEXT_COLUMN].fillna("_na_").values # Tokenize the sentences tokenizer = Tokenizer(num_words=max_features, filters='') tokenizer.fit_on_texts(list(X)) X = tokenizer.texts_to_sequences(X) X_test = tokenizer.texts_to_sequences(X_test) X_test_2019 = tokenizer.texts_to_sequences(X_test_2019) # Pad the sentences X = pad_sequences(X, maxlen=maxlen) X_test = pad_sequences(X_test, maxlen=maxlen) X_test_2019 = pad_sequences(X_test_2019, maxlen=maxlen) # Get the target values Y = train[LABEL_COLUMN].values le = LabelEncoder() le.fit(Y) encoded_Y = le.transform(Y)
plt.legend() plt.grid(True) plt.savefig(filename) os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' df = pd.read_csv("sample_training2_old.csv") texts = df.iloc[:, 0].to_list() tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK') tk.fit_on_texts(texts) print(tk.word_index) print("word index len: ", len(tk.word_index)) sequences = tk.texts_to_sequences(texts) print(texts[0]) print(sequences[0]) lens = [len(x) for i, x in enumerate(sequences)] print(lens) print("max: ", max(lens)) sum_ser = reduce(lambda x, y: x + y, lens) print("sum ", sum_ser) avg_len = (sum_ser * 1.0) / (len(lens)) print("avg_len: ", avg_len) data = pad_sequences(sequences, maxlen=1400, padding='post') print() print(data[0])
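# A small sketch (not in the original): sequences_to_texts inverts the
# char-level encoding (each id maps back to a character, joined by spaces),
# which is handy for spot-checking the tokenization before padding.
decoded = tk.sequences_to_texts(sequences[:1])
print(texts[0])
print(decoded[0])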
def training(): train = pd.read_csv(os.path.join(data_path, 'train.csv')) train = train.reindex(np.random.permutation(train.index)) train = train[['text', 'drug', 'sentiment']] # print(train.head()) train['text_comb'] = train['text'] + train['drug'] # sns.factorplot(x="sentiment", data=train, kind="count", size=6, aspect=1.5, palette="PuBuGn_d") # plt.show() train.text_comb = train.text_comb.apply(remove_stopwords) # print(train.head()) x_train, x_test, y_train, y_test = train_test_split(train.text_comb, train.sentiment, test_size=0.2, random_state=37) # print('# Train data samples:', x_train.shape[0]) # print('# Test data samples:', x_test.shape[0]) assert x_train.shape[0] == y_train.shape[0] assert x_test.shape[0] == y_test.shape[0] # Converting words to numbers tk = Tokenizer(num_words=NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ") tk.fit_on_texts(x_train) # print('Fitted tokenizer on {} documents'.format(tk.document_count)) # print('{} words in dictionary'.format(tk.num_words)) # print('Top 5 most common words are:', collections.Counter(tk.word_counts).most_common(5)) x_train_seq = tk.texts_to_sequences(x_train) x_test_seq = tk.texts_to_sequences(x_test) # print('"{}" is converted into {}'.format(x_train[0], x_train_seq[0])) x_train_oh = one_hot_seq(x_train_seq) x_test_oh = one_hot_seq(x_test_seq) # print('"{}" is converted into {}'.format(x_train_seq[0], x_train_oh[0])) # print('For this example we have {} features with a value of 1.'.format(x_train_oh[0].sum())) y_train_oh = to_categorical(y_train) y_test_oh = to_categorical(y_test) # print('"{}" is converted into {}'.format(y_train[0], y_train_oh[0])) # Splitting of a validation set x_train_rest, x_valid, y_train_rest, y_valid = train_test_split( x_train_oh, y_train_oh, test_size=0.2, random_state=37) assert x_valid.shape[0] == y_valid.shape[0] assert x_train_rest.shape[0] == y_train_rest.shape[0] # print('Shape of validation set:', x_valid.shape) # Baseline model base_model = models.Sequential() base_model.add( layers.Dense(64, activation='relu', input_shape=(NB_WORDS, ))) base_model.add(layers.Dense(64, activation='relu')) base_model.add(layers.Dense(3, activation='softmax')) base_model.summary() base_history = deep_model(base_model, x_train_rest, y_train_rest, x_valid, y_valid) # eval_metric(base_history, 'loss') # eval_metric(base_history, 'acc') # Handling over-fitting reduced_model = models.Sequential() reduced_model.add( layers.Dense(32, activation='relu', input_shape=(NB_WORDS, ))) reduced_model.add(layers.Dense(3, activation='softmax')) reduced_model.summary() reduced_history = deep_model(reduced_model, x_train_rest, y_train_rest, x_valid, y_valid) # compare_loss_with_baseline(reduced_history, 'Reduced Model', base_history) # Adding regularization reg_model = models.Sequential() reg_model.add( layers.Dense(64, kernel_regularizer=regularizers.l2(0.001), activation='relu', input_shape=(NB_WORDS, ))) reg_model.add( layers.Dense(64, kernel_regularizer=regularizers.l2(0.001), activation='relu')) reg_model.add(layers.Dense(3, activation='softmax')) reg_model.summary() reg_history = deep_model(reg_model, x_train_rest, y_train_rest, x_valid, y_valid) # compare_loss_with_baseline(reg_history, 'Regularized Model', base_history) # Adding dropout layers drop_model = models.Sequential() drop_model.add( layers.Dense(64, activation='relu', input_shape=(NB_WORDS, ))) drop_model.add(layers.Dropout(0.5)) drop_model.add(layers.Dense(64, activation='relu')) drop_model.add(layers.Dropout(0.5)) 
drop_model.add(layers.Dense(3, activation='softmax')) drop_model.summary() drop_history = deep_model(drop_model, x_train_rest, y_train_rest, x_valid, y_valid) # compare_loss_with_baseline(drop_history, 'Dropout Model', base_history) # Training on the full train data and evaluation on test data base_results = test_model(base_model, NB_START_EPOCHS, x_train_oh, y_train_oh, x_test_oh, y_test_oh) print('Test accuracy of baseline model: {0:.2f}%\n'.format( base_results[1] * 100)) reduced_results = test_model(reduced_model, NB_START_EPOCHS, x_train_oh, y_train_oh, x_test_oh, y_test_oh) print('Test accuracy of reduced model: {0:.2f}%\n'.format( reduced_results[1] * 100)) reg_results = test_model(reg_model, NB_START_EPOCHS, x_train_oh, y_train_oh, x_test_oh, y_test_oh) print('Test accuracy of regularized model: {0:.2f}%\n'.format( reg_results[1] * 100)) drop_results = test_model(drop_model, NB_START_EPOCHS, x_train_oh, y_train_oh, x_test_oh, y_test_oh) print('Test accuracy of dropout model: {0:.2f}%\n'.format(drop_results[1] * 100)) base_model.save(os.path.join('./data/', 'base_model.h5')) reduced_model.save(os.path.join('./data/', 'reduced_model.h5')) reg_model.save(os.path.join('./data/', 'reg_model.h5')) drop_model.save(os.path.join('./data/', 'drop_model.h5'))
tokenizer = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', split=' ', char_level=False, oov_token=None) tokenizer.fit_on_texts(files) tokenizer.fit_on_texts(outfiles) # with open("wordsdic.pkl", 'wb') as f: # pickle.dump(tokenizer, f) vocab = tokenizer.word_index print(tokenizer.word_index) print(len(vocab)) x_train_word_ids = tokenizer.texts_to_sequences(files) x_out_word_ids = tokenizer.texts_to_sequences(outfiles) x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen) x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen) # with open('datasets.pkl', 'wb') as f: # pickle.dump(x_train_padded_seqs, f) # pickle.dump(x_out_padded_seqs, f) # pickle.dump(labels, f) # with open('datasets.pkl', 'rb') as f: # x_train_padded_seqs = pickle.load(f) # # x_test_padded_seqs = pickle.load(f) # x_out_padded_seqs = pickle.load(f)
querytweet['Query'].split()) <= max_query_len: cleaned_tweet.append(querytweet['Tweet']) cleaned_query.append(querytweet['Query']) x_train, x_validation, y_train, y_validation = train_test_split(cleaned_tweet, cleaned_query, test_size=0.1, random_state=0, shuffle=True) #prepare a tokenizer for tweets on training data x_tokenizer = Tokenizer() x_tokenizer.fit_on_texts(list(x_train)) #convert text sequences to one hot encoding sequences x_tr_seq = x_tokenizer.texts_to_sequences(x_train) x_val_seq = x_tokenizer.texts_to_sequences(x_validation) #post-padding text sequences x_train = pad_sequences(x_tr_seq, maxlen=max_tweet_len, padding='post') x_validation = pad_sequences(x_val_seq, maxlen=max_tweet_len, padding='post') #size of vocabulary ( +1 for padding token) x_vocab_size = len(x_tokenizer.word_counts.items()) + 1 print("Size of vocabulary in X = {}".format(x_vocab_size)) #prepare a tokenizer for queries on training data y_tokenizer = Tokenizer() y_tokenizer.fit_on_texts(list(y_train))
gru_output_size = 70 # Training batch_size = 128 epochs = 1 print('Loading data...') (x_train, y_train), (x_val, y_val), (x_test, y_test) = sentiment_140_neg.load_data() print('Fitting tokenizer...') tokenizer = Tokenizer() tokenizer.fit_on_texts(np.concatenate((x_train, x_val, x_test))) print('Convert text to sequences') x_train = tokenizer.texts_to_sequences(x_train) x_val = tokenizer.texts_to_sequences(x_val) x_test = tokenizer.texts_to_sequences(x_test) print(len(x_train), 'train sequences') print(len(x_val), 'validation sequences') print(len(x_test), 'test sequences') print('Pad sequences (samples x time)') x_train = sequence.pad_sequences(x_train, maxlen=maxlen) x_val = sequence.pad_sequences(x_val, maxlen=maxlen) x_test = sequence.pad_sequences(x_test, maxlen=maxlen) print('x_train shape:', x_train.shape) print('x_val shape:', x_val.shape)
test = pd.read_csv('../input/scenic_score_prediction/predict_first.csv')

# Word segmentation with jieba
def participle(data):
    data['word'] = data['Discuss'].map(lambda x: jieba.lcut(x))

participle(train)
participle(test)

max_features = 80000  # vocabulary size
token = Tokenizer(num_words=max_features)
token.fit_on_texts(train.word.values)
train['Discuss_seq'] = token.texts_to_sequences(train.word.values)
test['Discuss_seq'] = token.texts_to_sequences(test.word.values)

maxlen = 150

def get_keras_data(data):
    return {'Discuss_seq': pad_sequences(data.Discuss_seq, maxlen=maxlen)}

x_train = get_keras_data(train)
x_test = get_keras_data(test)
y_train = train.Score.values

embed_size = 200  # embedding size
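# Because get_keras_data returns a dict keyed by 'Discuss_seq', the model is
# expected to have an Input layer with that same name; Keras matches dict keys
# to input-layer names at fit/predict time. A minimal sketch of such an input
# branch follows (the layer choices are assumptions for illustration only).
from keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense
from keras.models import Model

inp = Input(shape=(maxlen,), name='Discuss_seq')
emb = Embedding(max_features, embed_size)(inp)
out = Dense(1)(GlobalAveragePooling1D()(emb))
model = Model(inputs=inp, outputs=out)
model.compile(loss='mse', optimizer='adam')
# model.fit(x_train, y_train, ...)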
for i in range(len(finaldf)):
    print(i, finaldf['title'][i])
print(len(finaldf))

finaldf = finaldf.sample(frac=1)
X_train, X_test, y_train, y_test = train_test_split(df5.text, df5.target,
                                                    test_size=0.3,
                                                    random_state=37)

tk = Tokenizer(num_words=10000,
               filters='!"#$%&()*+,-./:;<=>?@[\]^_`{"}~\t\n',
               lower=True,
               split=" ")
tk.fit_on_texts(X_train)
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=100)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=100)

model = Sequential()  # initializing the Sequential model
print(len(tk.index_word))
# Embedding layer: maps each word index to a 32-dimensional vector; input
# sequences are 100 tokens long. Tokenizer indices start at 1, so the input
# dimension needs one extra row.
model.add(Embedding(len(tk.index_word) + 1, 32, input_length=100))
model.add(LSTM(100))
# CNN
# model.add(Conv1D(16, 4, padding='valid', activation='relu'))
# model.add(MaxPooling1D())
# model.add(Conv1D(32, 4, activation='relu'))
'tSentimentScore', 'label' ]) except: print('ERRORRRRRR!!!!') if True: train_emo_feat = pd.read_csv( 'datasets/train_articles_emotion_features.csv', index_col='index') dev_emo_feat = pd.read_csv( 'datasets/dev_articles_emotion_features.csv', index_col='index') # pd.read_csv() print("Extracting tokens...") tokenizer = Tokenizer(num_words=MAX_NB_WORDS) tokenizer.fit_on_texts(articles + dev_articles) articles_id = tokenizer.texts_to_sequences(articles) dev_articles_id = tokenizer.texts_to_sequences(dev_articles) # print(articles_id) # print(dev_articles_id) wordIndex = tokenizer.word_index print("Found %s unique tokens." % len(wordIndex)) print("Populating embedding matrix...") embeddingMatrix = getEmbeddingMatrix(wordIndex) train_seq_len = [] dev_seq_len = [] for x in articles_id: train_seq_len.append(len(x))
def train(max_length=128, embeddings_size=100, validation_split=0.2, model_path='model'): """ Trains the model :param model_path: the path to the folder containing the model :param validation_split: percentage of the samples kept for validation :param max_length: maximum sequence length :param embeddings_size: the size of the embeddings :return: """ titles, descriptions, types = load_dataset('dataset/movies_metadata.csv') tokenizer = Tokenizer() tokenizer.fit_on_texts(titles + descriptions) label_binarizer = MultiLabelBinarizer() types = label_binarizer.fit_transform(types) with open(model_path + '/tokenizer.pkl', 'wb') as f: pickle.dump(tokenizer, f) with open(model_path + '/label_binarizer.pkl', 'wb') as f: pickle.dump(label_binarizer, f) # Converts texts to sequences of token ids titles = tokenizer.texts_to_sequences(titles) descriptions = tokenizer.texts_to_sequences(descriptions) # Pads the sequences with zeros titles = pad_sequences(titles, padding='post', maxlen=max_length) descriptions = pad_sequences(descriptions, padding='post', maxlen=max_length) # Split the dataset to train and validation train_num = int((1 - validation_split) * len(titles)) train_titles, train_descriptions, train_types = titles[: train_num], descriptions[: train_num], types[: train_num] val_titles, val_descriptions, val_types = titles[train_num:], descriptions[ train_num:], types[train_num:] total_labels = len(label_binarizer.classes_) model = get_model(embeddings_size=embeddings_size, tokenizer=tokenizer, total_labels=total_labels) class MyCallback(keras.callbacks.Callback): """ A custom Keras callback for running the evaluation every k batches and storing the best model """ def __init__(self, model, val_data, label_binarizer): super(MyCallback, self).__init__() self.model = model self.val_data = val_data self.label_binarizer = label_binarizer self.f1 = 0 def on_batch_end(self, batch, logs={}): if (batch + 1) % 100 == 0: precision, recall, f1, acc, avg_precision, avg_recall, avg_f1 = evaluate( self.model, val_data=self.val_data, label_binarizer=self.label_binarizer) logs['micro precision'] = precision logs['micro recall'] = recall logs['micro F1'] = f1 logs['macro precision'] = avg_precision logs['macro recall'] = avg_recall logs['macro F1'] = avg_f1 logs['accuracy'] = acc if f1 > self.f1: print(str(self.f1) + ' -> ' + str(f1)) self.f1 = f1 model.save('model/model.h5') # A tensorboard callback to visualize the metrics tensorboard_callback = keras.callbacks.TensorBoard( log_dir="logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S"), update_freq='batch', write_grads=True, write_graph=True, write_images=True) callback = MyCallback(model=model, val_data={ 'titles': val_titles, 'descriptions': val_descriptions, 'types': val_types }, label_binarizer=label_binarizer) model.fit([train_titles, train_descriptions], [train_types], epochs=32, verbose=True, batch_size=64, callbacks=[callback, tensorboard_callback])
def load_data():
    user_seq = json.load(open(DATA_PATH, 'r'))

    with open(TRAIN_DATA, "r") as f:
        train_data = f.readlines()
    scores = []
    for raw_data in train_data:
        raw = raw_data.split(',')
        scores.append({raw[0]: raw[1].strip("\n")})

    user_tweet = []
    text = []
    text_score = []
    for user in user_seq:
        for seq in user:
            for score in scores:
                if seq["id_str"] in score.keys() and seq["text"] != "":
                    # user_tweet.append({"post_content": seq["text"], "post_time": seq["time"],
                    #                    "score": score[seq["id_str"]]})
                    text.append(str(seq["text"]).strip("\n"))
                    text_score.append(score[seq["id_str"]])
                    break

    tokenizer = Tokenizer(num_words=MAX_WORDS_NUM)
    tokenizer.fit_on_texts(text)
    # print(tokenizer.word_index)
    x_seq = tokenizer.texts_to_sequences(text)
    x_train = pad_sequences(x_seq, maxlen=MAX_LEN)
    # print(x_train)
    y_train = np.array(text_score)
    # print(y_train)

    with open(TEST_DATA, "r") as ft:
        test_data = ft.readlines()
    test_scores = []
    for raw_data in test_data:
        raw = raw_data.split(',')
        test_scores.append({raw[0]: raw[1].strip("\n")})

    test_text = []
    test_score = []
    for user in user_seq:
        for seq in user:
            for score in test_scores:
                if seq["id_str"] in score.keys() and seq["text"] != "":
                    # user_tweet.append({"post_content": seq["text"], "post_time": seq["time"],
                    #                    "score": score[seq["id_str"]]})
                    test_text.append(str(seq["text"]).strip("\n"))
                    test_score.append(score[seq["id_str"]])
                    break

    # Reuse the tokenizer fitted on the training texts; the original fitted a
    # fresh Tokenizer on the test texts, which assigned word indices that did
    # not match the ones used for x_train.
    test_seq = tokenizer.texts_to_sequences(test_text)
    x_test = pad_sequences(test_seq, maxlen=MAX_LEN)
    # print(x_test)
    y_test = np.array(test_score)
    # print(y_test)
    return (x_train, y_train), (x_test, y_test)