def preprocess_data():
    """Load the StockTwits SPY sentiment file and produce train/val/test splits.

    The gzipped CSV is read, every message is cleaned with
    ``utl.preprocess_ST_message``, vocabulary lookup tables are built from
    the cleaned text, a few length statistics are printed, empty messages
    are dropped, messages and labels are encoded, messages are zero-padded
    with ``seq_len=244`` and the result is split with ``split_frac=0.80``.

    :return: ``(train_x, val_x, test_x, train_y, val_y, test_y, vocab_to_int)``
    """
    # Load the raw data set.
    frame = pd.read_csv(
        "data/StockTwits_SPY_Sentiment_2017.gz",
        encoding="utf-8",
        compression="gzip",
        index_col=0,
    )

    # Raw message texts and their sentiment labels.
    raw_messages = frame.message.values
    labels = frame.sentiment.values

    # Clean every message, then build the vocabulary from all cleaned text.
    messages = np.array(
        [utl.preprocess_ST_message(raw) for raw in raw_messages])
    lexicon = " ".join(messages).split()
    vocab_to_int, _ = utl.create_lookup_tables(lexicon)

    # Report length statistics (lengths are whatever len() gives per message).
    lengths = [len(entry) for entry in messages]
    length_counts = Counter(lengths)
    print("Zero-length messages: {}".format(length_counts[0]))
    print("Maximum message length: {}".format(max(length_counts)))
    print("Average message length: {}".format(np.mean(lengths)))

    # Drop empties, encode, pad and split.
    messages, labels = utl.drop_empty_messages(messages, labels)
    encoded_messages = utl.encode_ST_messages(messages, vocab_to_int)
    encoded_labels = utl.encode_ST_labels(labels)
    padded_messages = utl.zero_pad_messages(encoded_messages, seq_len=244)

    splits = utl.train_val_test_split(
        padded_messages, encoded_labels, split_frac=0.80)
    train_x, val_x, test_x, train_y, val_y, test_y = splits
    return train_x, val_x, test_x, train_y, val_y, test_y, vocab_to_int
def preprocess_data(self):
    """Build the vocabulary tables and convert the training data to arrays.

    Sets ``self.vocab_to_int`` / ``self.int_to_vocab`` from ``self.counter``,
    replaces every entry of ``self.X_train`` with the result of
    ``self.string_to_vocab(entry, self.max_sentence_length)`` and finally
    converts ``self.X_train`` and ``self.Y_train`` to float NumPy arrays.

    :return: None; the instance attributes are updated in place.
    """
    self.vocab_to_int, self.int_to_vocab = create_lookup_tables(self.counter)

    # Index-based on purpose: each row is overwritten in place, and the
    # container type of X_train is not visible from here.
    for row in range(len(self.X_train)):
        self.X_train[row] = self.string_to_vocab(
            self.X_train[row], self.max_sentence_length)

    # ``np.float`` was only an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so use ``float`` directly.
    self.X_train = np.array(self.X_train, dtype=float)
    self.Y_train = np.array(self.Y_train, dtype=float)
def preprocess_data(text):
    """
    Replace lookup symbols with their tokens, lowercase, split into words
    and encode the words as integer ids.

    :param text: raw text
    :return: tuple ``(int_text, vocab_to_int, int_to_vocab, token_dict)``
    """
    token_dict = token_lookup()

    # Surround every replacement token with spaces so split() isolates it.
    for symbol in token_dict:
        text = text.replace(symbol, ' {} '.format(token_dict[symbol]))

    words = text.lower().split()

    # The special words are added to the vocabulary but not to the text.
    vocabulary_source = words + list(SPECIAL_WORDS.values())
    vocab_to_int, int_to_vocab = create_lookup_tables(vocabulary_source)

    int_text = []
    for current in words:
        int_text.append(vocab_to_int[current])

    return int_text, vocab_to_int, int_to_vocab, token_dict
def _test_lookup_tables():
    """Check that the two tables from create_lookup_tables invert each other.

    Builds the tables from a three-sentence toy series and asserts that
    ``int_to_vocab[vocab_to_int[word]] == word`` for every word.

    :raises AssertionError: if any word does not round-trip.
    """
    sample = pd.Series([
        "this is a toy",
        "I mean not really a toy",
        "I mean a toy vocabulary",
    ])
    vocab_to_int, int_to_vocab = create_lookup_tables(sample)

    # Collect every entry whose reverse lookup does not give the word back.
    missmatches = []
    for word, index in vocab_to_int.items():
        recovered = int_to_vocab[index]
        if recovered != word:
            missmatches.append((word, index, index, recovered))

    # The message is only formatted when the assertion actually fails.
    assert not missmatches,\
        'Found {} missmatche(s). First missmatch: vocab_to_int[{}] = {} and int_to_vocab[{}] = {}'.format(len(missmatches), *missmatches[0])
def read_data_from_file(data_path):
    """Read a text corpus and return it encoded as integer word ids.

    Subsampling of frequent words is computed and reported, but the
    function returns the full, un-subsampled id sequence.

    :param data_path: path of the text file to read
    :return: tuple ``(int_words, int_to_vocab, vocab_to_int, n_vocab)``
    """
    maybe_download()
    with open(data_path) as source:
        text = source.read()

    ###########################################################
    # ------------------- Preprocessing -----------------------
    # Done by utils.preprocess; per the original author's notes it
    # tokenizes punctuation (e.g. period -> <PERIOD>) and removes rare
    # words (not verifiable from this function).
    words = utils.preprocess(text)

    # Quick look at the processed data
    print('First 30 words:', words[:30])
    print('Total words:', len(words))
    print('Total unique words:', len(set(words)))

    # Two dictionaries: word -> id and id -> word
    vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
    n_vocab = len(int_to_vocab)

    # Encode the corpus as ids
    int_words = [vocab_to_int[token] for token in words]

    ###########################################################
    # ------------------- Subsampling -------------------------
    # Very frequent words are dropped at random. A word is discarded
    # with probability
    #   P(w) = 1 - sqrt(threshold / frequency(w))
    threshold = 1e-5  # FLAGS.drop_word_threshold
    total_count = len(int_words)
    drop_probability = {}
    for word_id, occurrences in Counter(int_words).items():
        frequency = occurrences / total_count
        drop_probability[word_id] = 1 - np.sqrt(threshold / frequency)

    # One random draw per word, in corpus order
    train_words = []
    for word_id in int_words:
        if random.random() < (1 - drop_probability[word_id]):
            train_words.append(word_id)

    print('After subsampling, first 30 words:', train_words[:30])
    print('After subsampling, total words:', len(train_words))

    # The subsampled list is deliberately NOT returned: the original author
    # found that subsampling hurt by eliminating contextual information.
    return int_words, int_to_vocab, vocab_to_int, n_vocab
def read_data_from_file(data_path: str) -> tuple:
    """
    Read a text corpus and build the subsampled list of training word ids.

    :param data_path: path of the text file to read
    :return: tuple ``(train_words, int_to_vocab, vocab_to_int, n_vocab)``
    """
    maybe_download()
    with open(data_path) as corpus_file:
        text = corpus_file.read()

    # utils.preprocess replaces special punctuation in the text with
    # designated tokens (per the original comment).
    words = utils.preprocess(text)
    print('First 30 words:', words[:30])
    print('Total words:', len(words))
    print('Total unique words:', len(set(words)))

    # Build the word -> id and id -> word dictionaries. The original comment
    # says words are ordered by descending frequency and low-frequency words
    # are filtered out -- TODO confirm against utils.
    vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
    n_vocab = len(int_to_vocab)

    # Turn the word sequence into the matching id sequence.
    int_words = [vocab_to_int[token] for token in words]

    ###########################################################
    # ------------------- Subsampling -------------------------
    # Very frequent words are dropped at random. A word is discarded
    # with probability
    #   P(w) = 1 - sqrt(threshold / frequency(w))
    threshold = FLAGS.drop_word_threshold
    total_count = len(int_words)

    # Relative frequency of every word id, then its discard probability.
    prob_s = {}
    for word_id, occurrences in Counter(int_words).items():
        relative_frequency = occurrences / total_count
        prob_s[word_id] = 1 - np.sqrt(threshold / relative_frequency)

    # One random draw per word, in corpus order.
    train_words = []
    for word_id in int_words:
        if random.random() < (1 - prob_s[word_id]):
            train_words.append(word_id)

    print('After subsampling, first 30 words:', train_words[:30])
    print('After subsampling, total words:', len(train_words))

    return train_words, int_to_vocab, vocab_to_int, n_vocab
def create_neural_network():
    """Build the lookup tables, preprocess the data and train the RNN model.

    Rebinds the module-level ``vocab_to_int`` / ``int_to_vocab`` from the
    module-level ``counter``, calls ``preprocess_data()``, prints the data
    set shapes and the vocabulary size, then compiles and fits ``RNN()``.
    """
    global vocab_to_int, int_to_vocab, counter
    vocab_to_int, int_to_vocab = create_lookup_tables(counter)
    preprocess_data()

    # Report the shapes of the module-level data sets.
    print('X_train', X_train.shape)
    print('Y_train', Y_train.shape)
    print('X_test', X_test.shape)
    print('Y_test', Y_test.shape)
    print('size of vocabulary', len(vocab_to_int))

    model = RNN()
    model.summary()
    model.compile(
        loss='binary_crossentropy',
        optimizer=RMSprop(),
        metrics=['accuracy'],
    )

    # Early stopping watches the validation loss of the 20% held-out split.
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001)
    model.fit(
        X_train,
        Y_train,
        batch_size=128,
        epochs=10,
        validation_split=0.2,
        callbacks=[early_stopping],
    )
def preprocess(text):
    """Encode raw text as word ids and randomly subsample frequent words.

    :param text: raw text, handed to ``utils.preprocess``
    :return: dict with keys ``'train_words'`` (subsampled id list),
        ``'vocab_to_int'``, ``'int_to_vocab'`` and ``'freqs'`` (relative
        frequency of every word id)
    """
    # Word list and its integer encoding
    words = utils.preprocess(text)
    vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
    int_words = [vocab_to_int[token] for token in words]

    ## Subsampling
    # A word id is discarded with probability
    #   1 - sqrt(threshold / frequency)
    threshold = 1e-5
    total_count = len(int_words)
    freqs = {}
    p_drop = {}
    for word_id, occurrences in Counter(int_words).items():
        freqs[word_id] = occurrences / total_count
        p_drop[word_id] = 1 - np.sqrt(threshold / freqs[word_id])

    # One random draw per word, in text order, decides whether it is kept
    train_words = []
    for word_id in int_words:
        if random.random() < (1 - p_drop[word_id]):
            train_words.append(word_id)

    preprocessed = {
        'train_words': train_words,
        'vocab_to_int': vocab_to_int,
        'int_to_vocab': int_to_vocab,
        'freqs': freqs,
    }
    return preprocessed
df["clean_tweet"] = tweets # Get cleaned tweets df["word_count"] = df.clean_tweet.apply( lambda x: len(x.split())) # Get their word count # Remove outliers old_tweet = df.loc[df.word_count == df.word_count.max(), ].tweet.values[0] new_tweet = old_tweet[:old_tweet.find("\r")] df.loc[df.word_count == df.word_count.max(), "tweet"] = new_tweet df.loc[df.word_count == df.word_count.max(), "clean_tweet"] = preprocess(new_tweet) df.loc[df.word_count == df.word_count.max(), "word_count"] = len(preprocess(new_tweet).split()) print("Testing create lookup table function...\n") _test_lookup_tables() vocab_to_int, int_to_vocab = create_lookup_tables(tweets) print("Testing padding function...\n") _test_pad_tweets() MAX_LENGTH = df.word_count.max() pad_tweets = create_pad_fn(MAX_LENGTH) df["padded_tweets"] = df.clean_tweet.map(pad_tweets) print("Testing hate classification function...\n") _test_hate_classification() print("Testing change hate labels function...\n") _test_hate_labels(tweets, raw_labels) tweets_ints = np.array([[vocab_to_int[word] for word in tweet.split()]
def word_mapping(words):
    """Build the lookup tables for ``words`` and encode the words as ids.

    :param words: sequence of words
    :return: tuple ``(vocab_to_int, int_to_vocab, int_words)`` where
        ``int_words`` holds ``vocab_to_int[word]`` for every word, in order
    """
    tables = utils.create_lookup_tables(words)
    vocab_to_int, int_to_vocab = tables

    int_words = []
    for token in words:
        int_words.append(vocab_to_int[token])

    return vocab_to_int, int_to_vocab, int_words
pbar.hook) if not isdir(dataset_folder_path): with zipfile.ZipFile(dataset_filename) as zip_ref: zip_ref.extractall(dataset_folder_path) # 解压 import os with open(os.path.join(dataset_folder_path, 'text8')) as f: text = f.read() # words为文本中的所有单词序列 words = utils.preprocess(text) # 大写转小写,以及符号替换,去掉低频词 print(words[:30]) print("Total words: {}".format(len(words))) #16680599 print("Unique words: {}".format(len(set(words)))) #63641 vocab_to_int, int_to_vocab = utils.create_lookup_tables( words) # word ->index , index -> word int_words = [vocab_to_int[word] for word in words] from collections import Counter import random # 计算丢弃概率,与其出现频率正相关 # 注意:丢词是针对文本里的所有单词而言,而非针对某个窗口 threshold = 1e-5 word_counts = Counter(int_words) total_count = len(int_words) freqs = {word: count / total_count for word, count in word_counts.items()} p_drop = {word: 1 - np.sqrt(threshold / freqs[word]) for word in word_counts} train_words = [ word for word in int_words if random.random() < (1 - p_drop[word]) ]
import pickle # load ascii text and covert to lowercase filename = "wonderland.txt" raw_text = codecs.open(filename, encoding = "utf8", errors ='replace').read() raw_text = raw_text.lower() # print(raw_text) # create mapping of unique chars to integers words, sentences = utils.preprocess(raw_text) unique_words = sorted(list(set(words))) # print(words) word_to_int, int_to_word = utils.create_lookup_tables(unique_words) # print(words_to_int) # print(int_to_words) n_vocab = len(unique_words) # print ("Total Vocab: ", n_vocab) # prepare the dataset of input to output pairs encoded as integers seq_length = 3 dataX = [] dataY = [] for sentence in sentences: sentence_words = sentence.split() if(len(sentence_words)>seq_length):
def create_neural_network():
    """Build an LSTM graph with TensorFlow 1.x placeholders and run training.

    Rebinds the module-level ``vocab_to_int`` / ``int_to_vocab`` from the
    module-level ``counter``, calls ``preprocess_data()``, prints the data
    set shapes, builds the graph and runs the training loop, printing the
    loss and timing of every batch.

    Several names used here (``batch_size``, ``num_steps``,
    ``learning_rate``, ``grad_clip``, ``epochs``, ``encoded``,
    ``max_sentence_length``, ``model``) are not defined in this function
    and must come from module scope.
    """
    global vocab_to_int, int_to_vocab
    # ``counter`` is only read here. The training loop below keeps its own
    # ``step`` variable: assigning to ``counter`` inside this function would
    # make it local and turn this read into an UnboundLocalError.
    vocab_to_int, int_to_vocab = create_lookup_tables(counter)
    preprocess_data()

    print('X_train', X_train.shape)
    print('Y_train', Y_train.shape)
    print('X_test', X_test.shape)
    print('Y_test', Y_test.shape)
    print('size of vocabulary', len(vocab_to_int))  #124188

    sequence_length = max_sentence_length
    embedding_length = len(vocab_to_int)
    num_classes = 2
    print('sequence_length', sequence_length)
    print('embedding_length', embedding_length)

    input_data = tf.placeholder(
        tf.float32, [None, sequence_length, embedding_length])
    inputs = tf.placeholder(tf.int32, [batch_size, num_steps], name='inputs')
    targets = tf.placeholder(tf.int32, [batch_size, num_steps],
                             name='targets')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    hidden_vector_size = 100
    rnn_cell = tf.contrib.rnn.LSTMCell(hidden_vector_size)

    # Zero initial state built from the input tensor so that its first
    # dimension follows the (dynamic) first dimension of ``input_data``.
    initial_zero_h = tf.matmul(
        tf.reduce_mean(tf.zeros_like(input_data), 2),
        tf.zeros([sequence_length, hidden_vector_size]))
    initial_state = tf.contrib.rnn.LSTMStateTuple(initial_zero_h,
                                                  initial_zero_h)
    outputs, state = tf.nn.dynamic_rnn(rnn_cell,
                                       input_data,
                                       initial_state=initial_state,
                                       dtype=tf.float32)

    prediction, logits = build_output(outputs, hidden_vector_size,
                                      num_classes)
    loss = build_loss(logits, targets, hidden_vector_size, num_classes)
    optimizer = build_optimizer(loss, learning_rate, grad_clip)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Use the line below to load a checkpoint and resume training
        #saver.restore(sess, 'checkpoints/______.ckpt')
        step = 0
        for e in range(epochs):
            # Train network
            # NOTE(review): this re-runs the variable initializer at the
            # start of every epoch and uses its result as the initial
            # state -- looks unintended; confirm what should be fed here.
            new_state = sess.run(tf.global_variables_initializer())
            for x, y in get_batches(encoded, batch_size, num_steps):
                step += 1
                start = time.time()
                # NOTE(review): ``model`` is not created in this function
                # and ``keep_prob`` is the placeholder defined above, not a
                # number -- confirm the intended feeds and fetches.
                feed = {
                    input_data: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state
                }
                batch_loss, new_state, _ = sess.run(
                    [model.loss, model.final_state, model.optimizer],
                    feed_dict=feed)
                end = time.time()
                print('Epoch: {}/{}... '.format(e + 1, epochs),
                      'Training Step: {}... '.format(step),
                      'Training loss: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end - start)))
def create_neural_network():
    """Build a single-layer LSTM classifier graph and train it in batches.

    Rebinds the module-level ``vocab_to_int`` / ``int_to_vocab`` from the
    module-level ``counter``, calls ``preprocess_data()``, prints the data
    set shapes, builds the TensorFlow 1.x graph and runs the training loop,
    printing accuracy and loss on the current batch every tenth step.
    """
    global vocab_to_int, int_to_vocab, counter
    vocab_to_int, int_to_vocab = create_lookup_tables(counter)
    preprocess_data()

    print('X_train', X_train.shape)
    print('Y_train', Y_train.shape)
    print('X_test', X_test.shape)
    print('Y_test', Y_test.shape)
    print('size of vocabulary', len(vocab_to_int))

    # Hyper-parameters
    time_steps = 128  # only referenced by the commented-out reshape below
    num_units = 128  # hidden LSTM units
    n_input = 500  # width of one input row
    learning_rate = 0.001  # learning rate for adam
    n_classes = 2  # number of output classes
    batch_size = 128  # size of batch

    tf.reset_default_graph()

    # Output projection applied to the last RNN output
    # NOTE(review): the weight matrix has n_input rows while the LSTM has
    # num_units hidden units -- confirm the shapes line up.
    out_weights = tf.Variable(tf.random_normal([n_input, n_classes]))
    out_bias = tf.Variable(tf.random_normal([n_classes]))

    x = tf.placeholder("float", [None, n_input])
    y = tf.placeholder("float", [None, n_classes])

    #input = tf.unstack(x, n_input, 0)
    lstm_layer = BasicLSTMCell(num_units, forget_bias=1)
    outputs, _ = rnn.rnn(lstm_layer, x, dtype=tf.float32)
    prediction = tf.matmul(outputs[-1], out_weights) + out_bias

    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=prediction, labels=y)
    loss = tf.reduce_mean(cross_entropy)
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    # model evaluation
    correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        step = 1
        # The step counter advances once per batch, so the 800 limit is
        # only re-checked after a full pass over the batches.
        while step < 800:
            batches = batch_features_labels(X_train, Y_train, batch_size)
            for batch_x, batch_y in batches:
                print('batch_x', batch_x.shape)
                print('batch_y', batch_y.shape)
                #batch_x = batch_x.reshape((batch_size,time_steps,n_input))
                batch_feed = {x: batch_x, y: batch_y}
                sess.run(opt, feed_dict=batch_feed)

                if step % 10 == 0:
                    acc = sess.run(accuracy, feed_dict=batch_feed)
                    los = sess.run(loss, feed_dict=batch_feed)
                    print("For iter ", step)
                    print("Accuracy ", acc)
                    print("Loss ", los)
                    print("__________________")

                step = step + 1
def create_neural_network():
    """Build a multi-layer LSTM graph with the build_* helpers and train it.

    Rebinds the module-level ``vocab_to_int`` / ``int_to_vocab`` from the
    module-level ``counter``, calls ``preprocess_data()``, prints the data
    set shapes, builds the TensorFlow 1.x graph and runs ``epochs`` passes
    over the training batches, printing the loss and timing of every batch.
    """
    global vocab_to_int, int_to_vocab
    # ``counter`` is only read. The training loop uses its own local
    # ``step`` so the module-level vocabulary counter is no longer
    # overwritten with an int.
    vocab_to_int, int_to_vocab = create_lookup_tables(counter)
    preprocess_data()

    print('X_train', X_train.shape)
    print('Y_train', Y_train.shape)
    print('X_test', X_test.shape)
    print('Y_test', Y_test.shape)
    print('size of vocabulary', len(vocab_to_int))

    epochs = 20
    #sequence_length = max_sentence_length
    #embedding_length = len(vocab_to_int)
    num_classes = 2
    grad_clip = 5
    batch_size = 10  # Sequences per batch
    num_steps = 500  # Number of sequence steps per batch
    lstm_size = 128  # Size of hidden layers in LSTMs
    num_layers = 2  # Number of LSTM layers
    learning_rate = 0.01  # Learning rate
    # Dropout keep probability fed during training. Kept under its own name
    # because ``keep_prob`` below is rebound to the placeholder.
    dropout_keep_prob = 0.5

    tf.reset_default_graph()

    # Build the input placeholder tensors
    inputs, targets, keep_prob = build_inputs(batch_size, num_steps)

    # Build the LSTM cell
    cell, initial_state = build_lstm(lstm_size, num_layers, batch_size,
                                     keep_prob)

    ### Run the data through the RNN layers
    # First, one-hot encode the input tokens
    # NOTE(review): the one-hot depth is num_classes (2), not the vocabulary
    # size -- confirm this is intended for token-id inputs.
    x_one_hot = tf.one_hot(inputs, num_classes)
    print('inputs', inputs.shape)
    print('num_classes', num_classes)
    print('x_one_hot', x_one_hot.shape)

    # Run each sequence step through the RNN with tf.nn.dynamic_rnn
    outputs, state = tf.nn.dynamic_rnn(cell,
                                       x_one_hot,
                                       initial_state=initial_state)
    print('outputs', outputs.shape)
    final_state = state

    # Get softmax predictions and logits
    prediction, logits = build_output(outputs, lstm_size, num_classes)

    # Loss and optimizer (with gradient clipping)
    loss = build_loss(logits, targets, lstm_size, num_classes)
    optimizer = build_optimizer(loss, learning_rate, grad_clip)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        step = 0
        for e in range(epochs):
            # Train network: start every epoch from the initial LSTM state
            new_state = sess.run(initial_state)
            for x, y in batch_features_labels(X_train, Y_train, batch_size):
                print('x', x.shape)
                print('y', y.shape)
                step += 1
                start = time.time()
                feed = {
                    inputs: x,
                    targets: y,
                    keep_prob: dropout_keep_prob,
                    initial_state: new_state
                }
                # The final state of this batch seeds the next batch.
                batch_loss, new_state, _ = sess.run(
                    [loss, final_state, optimizer], feed_dict=feed)
                end = time.time()
                print('Epoch: {}/{}... '.format(e + 1, epochs),
                      'Training Step: {}... '.format(step),
                      'Training loss: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end - start)))
# Read the whole corpus into memory as one string.
with open('data/text8') as f:
    text = f.read()

# Preprocessing: utils.preprocess turns the raw text into an ordered list
# of words (per the original note, it replaces symbols with text tokens).
words = utils.preprocess(text)
unique_words = set(words)
print("Total words: {}".format(len(words)))  # 16,680,599
print("Unique words: {}".format(len(unique_words)))  # 63,641

# Lookup tables in both directions, e.g.
#   vocab_to_int['a'] = 5   (the index of token 'a')
#   int_to_vocab[5] = 'a'
vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)

# The entire corpus expressed as token numbers.
int_words = [vocab_to_int[token] for token in words]

# Subsampling
from collections import Counter
import random

threshold = 1e-5
word_counter = Counter(int_words)
number_of_words = len(int_words)
frequencies = {}
encoding="utf-8", compression="gzip", index_col=0) # get messages and sentiment labels messages = data.message.values labels = data.sentiment.values # View sample of messages with sentiment print(data[:10]) messages = np.array( [utl.preprocess_ST_message(message) for message in messages]) full_lexicon = " ".join(messages).split() vocab_to_int, int_to_vocab = utl.create_lookup_tables(full_lexicon) messages_lens = Counter([len(x) for x in messages]) print("Zero-length messages: {}".format(messages_lens[0])) print("Maximum message length: {}".format(max(messages_lens))) print("Average message length: {}".format(np.mean([len(x) for x in messages]))) messages, labels = utl.drop_empty_messages(messages, labels) messages = utl.encode_ST_messages(messages, vocab_to_int) labels = utl.encode_ST_labels(labels) messages = utl.zero_pad_messages(messages, seq_len=244) train_x, val_x, test_x, train_y, val_y, test_y = utl.train_val_test_split( messages, labels, split_frac=0.80) print("Data Set Size")