def get_sequence_of_tokens(corpus, refresh=True):
    """
    Tokenize the corpus and turn it into integer sequences.

    :param corpus: iterable of text documents to tokenize
    :param refresh: if True, fit a new tokenizer; otherwise load the saved tokenizer.json
    :return: vocabulary size (including the padding index) and the list of sequences
    """
    # tokenization
    if refresh:
        tokenizer = Tokenizer()
        # fit the tokenizer on the text
        tokenizer.fit_on_texts(corpus)
    else:
        with open("tokenizer.json", 'r') as tj:
            tokenizer = tokenizer_from_json(json.load(tj))

    tokenizer_json = tokenizer.to_json()
    with open('tokenizer.json', 'w') as fobj:
        json.dump(tokenizer_json, fobj)

    index_dict = tokenizer.word_index
    seq = tokenizer.texts_to_sequences(corpus)

    # calculate the vocab size (+1 for the reserved padding index)
    total_words = len(tokenizer.word_index) + 1
    print(total_words)
    return total_words, seq
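# Illustrative usage sketch, not part of the original source: assumes json, Tokenizer and
# tokenizer_from_json are already imported at module level; the sample corpus is hypothetical.
sample_corpus = ["the quick brown fox", "jumped over the lazy dog"]
total_words, sequences = get_sequence_of_tokens(sample_corpus, refresh=True)
# a later run can reuse the tokenizer.json written above
total_words, sequences = get_sequence_of_tokens(sample_corpus, refresh=False)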
def prepare_input_data(dataframe, save, maxlen=50, max_words=10000, model_name=None):
    """
    Prepare the data to be fed to the model.

    Parameters
    ----------
    dataframe: pandas.DataFrame
    save: bool
    maxlen: int
        Maximum number of tokens per tweet
    max_words: int
        Maximum number of words for the tokenizer
    model_name: str
        If save, save the variables needed for prediction, including the name of the model
        in the output file

    Returns
    -------
    X: numpy.array
        Array of tweets fit for model input
    y: pandas.DataFrame
        One-hot encoding of the labels
    """
    tokenizer = Tokenizer(num_words=max_words,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^\'_`{|}~\t\n')
    tokenizer.fit_on_texts(dataframe['tweets'])
    dataframe = dataframe.sample(frac=1).reset_index(drop=True)
    X = tokenizer.texts_to_sequences(dataframe['tweets'])
    X = pad_sequences(X, maxlen=maxlen)
    y = pd.get_dummies(dataframe['emojis'])
    if save:
        saved_tokenizer = tokenizer.to_json()
        with open(f'models_utils/{model_name}_tokenizer.json', 'w', encoding='utf-8') as jsonfile:
            json.dump(saved_tokenizer, jsonfile, ensure_ascii=False)
        emojis = [emoji for emoji in dataframe['emojis']]
        emojis_indices = {}
        for i in range(len(emojis)):
            if emojis[i] in emojis_indices.keys():
                emojis_indices[emojis[i]].append(i)
            else:
                emojis_indices[emojis[i]] = [i]
        with open(f'models_utils/{model_name}_emojis_indices.json', 'w') as jsonfile:
            json.dump(emojis_indices, jsonfile)
    return X, y
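# Illustrative usage sketch, not part of the original source: a hypothetical call assuming a
# dataframe with 'tweets' and 'emojis' columns and an existing models_utils/ directory.
df = pd.DataFrame({'tweets': ['love this', 'so sad today'], 'emojis': ['😍', '😢']})
X, y = prepare_input_data(df, save=True, maxlen=50, max_words=10000, model_name='baseline')
print(X.shape, y.shape)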
def WordtoInt(arr: List[List[str]]):
    t = Tokenizer(num_words=500)
    t.fit_on_texts(arr)
    sequences = t.texts_to_sequences(arr)
    # print('sequences : ', sequences, '\n')
    # print('word_index : ', t.word_index)
    tokenizer_json = t.to_json()
    with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    # print(tokenizer_json)
    return sequences
def fit_on_text(data):
    """
    1. Updates the internal vocabulary based on a list of texts according to the given params.
    2. Sets the max length for pad_sequences and the embedding layer. The Embedding layer can
       process sequences of heterogeneous length if you don't pass an explicit input_length
       argument to the layer. If the max is too big, short reviews end up with too many padded
       values, which in turn decreases accuracy, so the maximum length was originally set to
       (max - avg).
    3. This method creates the vocabulary index based on word frequency. Given something like
       "the boy drove on the road.", it creates a dictionary such that word_index["the"] = 1,
       word_index["boy"] = 2, and so on; 0 is reserved for padding. Each word gets a unique
       integer value, and a lower integer means a more frequent word.

    Arguments:
        data (list): list of lists of tokens

    Returns:
        max_length for padding.
        token object => a word_index dictionary with indices as values; the lowest index is
        the most frequent word.
    """
    print("Inside fit on text..")
    # length_list = [len(seq) for seq in data]
    # avg = sum(length_list) / len(length_list)
    # max_length = int(max(length_list) - avg) / 5  # max - average number of words in each sentence
    max_length = 450  # defined after finding the optimal value
    # print("Max length for pad sequences: ", max_length)
    token = Tokenizer()  # defining the Tokenizer object
    list_of_strings_full_data = [' '.join(seq[:]) for seq in data]  # making a list of strings for the tokenizer
    token.fit_on_texts(list_of_strings_full_data)
    tokenizer_json = token.to_json()
    with io.open(os.path.join("models", 'tokenizer.json'), 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    return max_length, token
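# Illustrative usage sketch, not part of the original source: feeding a hypothetical nested
# token list to fit_on_text, then padding with the returned tokenizer and max_length
# (assumes pad_sequences is imported alongside Tokenizer).
docs = [['great', 'movie'], ['terrible', 'plot', 'and', 'acting']]
max_length, token = fit_on_text(docs)
padded = pad_sequences(token.texts_to_sequences([' '.join(d) for d in docs]), maxlen=max_length)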
def preprocess(self, tokenizer_string=None):
    """
    Preprocess the textual data.

    Returns
    -------
    x_train: The processed, sequenced training data.
    y_train: Processed training labels.
    x_val: The processed, sequenced validation data.
    y_val: Processed validation labels.
    word_index: A dictionary containing the word tokens and their indices for the sequencing.
    """
    if tokenizer_string is None:
        tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS)
        tokenizer.fit_on_texts(self.texts)
        self.tokenizer_string = tokenizer.to_json()
    else:
        self.tokenizer_string = tokenizer_string
        from keras.preprocessing.text import tokenizer_from_json
        tokenizer = tokenizer_from_json(tokenizer_string)

    sequences = tokenizer.texts_to_sequences(self.texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(self.labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # split the data into a training set and a validation set
    if (self.VALIDATION_SPLIT):
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        num_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])
        x_train = data[:-num_validation_samples]
        y_train = labels[:-num_validation_samples]
        x_val = data[-num_validation_samples:]
        y_val = labels[-num_validation_samples:]
    else:
        x_train = data
        y_train = labels
        x_val = None
        y_val = None
    return x_train, y_train, x_val, y_val, word_index
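# Illustrative usage sketch, not part of the original source: reusing the JSON string that
# preprocess() stores in self.tokenizer_string so a second object (e.g. at inference time)
# sequences text with the same vocabulary; train_dataset and inference_dataset are hypothetical
# instances of the owning class.
x_train, y_train, x_val, y_val, word_index = train_dataset.preprocess()
_ = inference_dataset.preprocess(tokenizer_string=train_dataset.tokenizer_string)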
def fetch_tokenizer(sentences):
    global tokenizer
    if tokenizer is not None:
        return tokenizer
    if os.path.isfile('models/tokenizer.json'):
        with open('models/tokenizer.json') as f:
            data = json.load(f)
            tokenizer = tokenizer_from_json(data)
    else:
        tokenizer = Tokenizer(num_words=vocabulary_size)
        tokenizer.fit_on_texts(sentences)
        tokenizer_json = tokenizer.to_json()
        with io.open('models/tokenizer.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    return tokenizer
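# Illustrative usage sketch, not part of the original source: fetch_tokenizer relies on
# module-level `tokenizer` and `vocabulary_size` globals, so a caller might look like this
# (the sentences and vocabulary size are hypothetical).
tokenizer = None
vocabulary_size = 20000
tok = fetch_tokenizer(["first training sentence", "second training sentence"])
print(tok.texts_to_sequences(["second sentence"]))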
def load_tokenizer(texts=None, num_words=MAX_WORDS):
    file = os.path.join(DATA_HOME, SAVE_DIR, __TOKENIZER_FILE.format(num_words))

    # tokenizer config file exists: load it and return the tokenizer
    if os.path.exists(file):
        print('loading tokenizer')
        with open(file, 'r') as f:
            return tokenizer_from_json(f.readline())

    if texts is None:
        texts, _ = load_raw_text()  # load the review data

    tokenizer = Tokenizer(num_words=num_words)  # honour the num_words argument rather than the module default
    print('fitting tokenizer')
    tokenizer.fit_on_texts(texts)

    tokenizer_json = tokenizer.to_json()  # renamed to avoid shadowing the json module
    print('saving tokenizer')
    with open(file, 'w') as f:
        f.write(tokenizer_json)
    return tokenizer
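# Illustrative usage sketch, not part of the original source: the first call fits and caches the
# tokenizer, later calls read the cached JSON back; MAX_WORDS, DATA_HOME, SAVE_DIR and
# load_raw_text are assumed to be defined elsewhere in the module.
tokenizer = load_tokenizer()                     # fits (or loads) using the default review texts
tokenizer = load_tokenizer(num_words=MAX_WORDS)  # a subsequent call hits the cached file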
def get_preprocessor(x1, x2):
    max_vocab = 10000
    tokenizer = Tokenizer(num_words=max_vocab)
    tokenizer.fit_on_texts(x1)
    x_train = tokenizer.texts_to_sequences(x1)
    x_test = tokenizer.texts_to_sequences(x2)
    x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, padding='post', maxlen=256)
    x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, padding='post', maxlen=256)

    # writing the tokenizer to a json file
    with open(tokenizer_path, 'w') as f:
        tokenizer_json_string = tokenizer.to_json()
        f.write(tokenizer_json_string)

    return x_train, x_test
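# Illustrative usage sketch, not part of the original source: get_preprocessor expects raw
# train/test texts plus a module-level tokenizer_path; the path and texts below are hypothetical.
tokenizer_path = 'tokenizer.json'
x_train, x_test = get_preprocessor(["good film", "bad film"], ["okay film"])
print(x_train.shape, x_test.shape)  # both padded to length 256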
testdat = traindat
traincat = cat[:trainlen]
# valcat = cat[trainlen:trainlen+vallen]
# testcat = cat[trainlen+vallen:]
valcat = traincat
testcat = traincat

tokenizer = Tokenizer(lower=False, num_words=vocab_size, oov_token="UNK")
tokenizer.fit_on_texts(traindat)

# with open("data/abnb_pets_tok.json") as f:
#     data = json.load(f)
#     tokenizer = tokenizer_from_json(json.dumps(data))

# deployment config
tokenizer_json = tokenizer.to_json()  # This is not working correctly it seems
with open("cities.json", "w", encoding="utf-8") as f:
    f.write(tokenizer_json)

# Xtrain = tokenizer.texts_to_matrix(traindat, mode='freq')
Ytrain = np.asarray(traincat)
Yval = np.asarray(valcat)
# Xtest = tokenizer.texts_to_matrix(testdat, mode='freq')
Ytest = np.asarray(testcat)
Xtrain = tokenizer.texts_to_sequences(traindat)
Xval = tokenizer.texts_to_sequences(valdat)
class SiameseXSimilarity(Similarity):
    """Siamese neural network similarity with extra feature."""

    def __init__(self):
        """Segmentation and normalization are not allowed in Siamese similarity."""
        super().__init__()
        self.max_sequence_length = 10
        self.embedding_dimension = 50
        self.number_lstm_units = 50
        self.number_dense_units = 50
        self.rate_drop_lstm = 0.17
        self.rate_drop_dense = 0.25
        self.activation_function = 'relu'
        self.epochs = 20
        self.model_cache = 'cache/models/siamx'
        self.tokenizer_cache = 'cache/models/tokenizerx'

    def similarity(self, x, y):
        return self.run_similarity([x, y])

    def run_similarity(self, df):
        """Predict similarity for each pair."""
        comments1, comments2, word_counts, name_similarities = self.features(df)
        return np.array(
            list(
                self.model.predict(
                    [comments1, comments2, word_counts, name_similarities]).ravel()))

    def load(self, cache):
        """Load trained model."""
        self.model = load_model(self.model_cache)
        with open(self.tokenizer_cache) as f:
            self.tokenizer = tokenizer_from_json(json.load(f))
        super().load(cache)

    def train(self, df, verbose=False, cache=None):
        """Define and train the neural network."""
        # Flatten list of comment pairs
        pairs = df[['comment1', 'comment2']]
        comments = list(np.ravel(pairs))
        comments = list(map(text_to_word_sequence, comments))

        # Train word2vec embeddings
        word2vec = Word2Vec(comments, min_count=1, size=self.embedding_dimension)

        # Train tokenizer
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(comments)
        if cache:
            with open(self.tokenizer_cache, 'w') as f:
                json.dump(self.tokenizer.to_json(), f)
        word_index = self.tokenizer.word_index
        vocab_size = len(word_index) + 1

        # Generate embedding matrix
        self.embedding_matrix = np.zeros((vocab_size, self.embedding_dimension))
        for word, i in word_index.items():
            self.embedding_matrix[i] = word2vec.wv[word]
        del word2vec

        # Define the neural network
        # Word embedding layer
        embedding_layer = Embedding(vocab_size,
                                    self.embedding_dimension,
                                    weights=[self.embedding_matrix],
                                    input_length=self.max_sequence_length,
                                    trainable=False)

        # LSTM encoder
        lstm_layer = Bidirectional(
            LSTM(self.number_lstm_units,
                 dropout=self.rate_drop_lstm,
                 recurrent_dropout=self.rate_drop_lstm))

        # LSTM encoder layer for the 1st comment
        input1 = Input(shape=(self.max_sequence_length, ), dtype='int32')
        embedding1 = embedding_layer(input1)
        lstm1 = lstm_layer(embedding1)

        # LSTM encoder layer for the 2nd comment
        input2 = Input(shape=(self.max_sequence_length, ), dtype='int32')
        embedding2 = embedding_layer(input2)
        lstm2 = lstm_layer(embedding2)

        # Word count layer
        input3 = Input(shape=(3, ))
        dense3 = Dense(int(self.number_dense_units / 2),
                       activation=self.activation_function)(input3)

        # Name similarity layer
        input4 = Input(shape=(1, ))
        dense4 = Dense(int(self.number_dense_units / 5),
                       activation=self.activation_function)(input4)

        # Merge the two LSTM layers and the dense word count and name similarity layers
        merged = concatenate([lstm1, lstm2, dense3, dense4])

        # [Normalization + dropout + dense] x2
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        merged = Dense(self.number_dense_units,
                       activation=self.activation_function)(merged)
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        output = Dense(1, activation='sigmoid')(merged)

        # Initialize the model
        self.model = Model(inputs=[input1, input2, input3, input4], outputs=output)
        self.model.compile(loss='binary_crossentropy',
                           optimizer='nadam',
                           metrics=['acc'])

        # Define early stopping callback
        es = EarlyStopping(patience=3)

        # Extract features
        comments1, comments2, word_counts, name_similarities = self.features(df)
        labels = df['label'].to_numpy()

        # Train the model
        self.model.fit([comments1, comments2, word_counts, name_similarities],
                       labels,
                       epochs=self.epochs,
                       validation_split=0.1,
                       callbacks=[es],
                       verbose=0)

        # Save model
        if cache:
            self.model.save(self.model_cache)
        super().train(df, labels, verbose, cache)

    def features(self, df):
        """Get features from comment pairs: tokenized sequences, word counts, name similarities."""
        comments1 = df['comment1'].to_numpy()
        comments2 = df['comment2'].to_numpy()
        comments1 = self.tokenizer.texts_to_sequences(comments1)
        comments2 = self.tokenizer.texts_to_sequences(comments2)
        word_counts = [[
            len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))
        ] for x1, x2 in zip(comments1, comments2)]
        comments1 = pad_sequences(comments1, self.max_sequence_length)
        comments2 = pad_sequences(comments2, self.max_sequence_length)
        names = df[['name1', 'name2']].to_numpy()
        return comments1, comments2, np.array(word_counts), self.get_name_similarities(names)

    def get_name_similarities(self, name_pairs):
        """Get char LCS similarity for each name pair."""
        return np.array([
            difflib.SequenceMatcher(None, n1, n2).ratio()
            for n1, n2 in name_pairs
        ])
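# Illustrative usage sketch, not part of the original source: training the Siamese model on a
# labelled pair dataframe and scoring the same pairs; the column names follow the class above,
# the data values are hypothetical, and pandas is assumed to be imported as pd.
pairs = pd.DataFrame({
    'comment1': ['fix the login bug', 'update docs'],
    'comment2': ['login bug fix', 'bump version'],
    'name1': ['alice', 'bob'],
    'name2': ['alice', 'carol'],
    'label': [1, 0],
})
sim = SiameseXSimilarity()
sim.train(pairs)
print(sim.run_similarity(pairs))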
class LSTM_network():
    """
    Train a BLSTM network on a cleartext password dataset.
    """

    def __init__(self):
        # load variables from the config file
        self.model_name = variables['model']['name']
        self.gpu_count = variables['model']['gpu_count']
        self.bucket = variables['S3']['bucket_name']
        self.folder = variables['S3']['folder']
        self.tokenizer_name = variables['S3']['tokenizer_name']
        self.training_params = variables['S3']['training_params']
        self.history_pkl = variables['S3']['history_pkl']

        # parse the arguments
        parser = argparse.ArgumentParser()
        parser.add_argument('--epochs', type=int, default=10)
        parser.add_argument('--batch_size', type=int, default=128)
        parser.add_argument('--hidden_units', type=int, default=100)
        parser.add_argument('--training', type=str)
        args, _ = parser.parse_known_args()

        # store the arguments as variables
        self.epochs = args.epochs
        self.batch_size = args.batch_size
        self.hidden_units = args.hidden_units
        self.training_path = args.training
        self.output_location = '%s/%s/output' % (self.bucket, self.folder)

    def data_load(self):
        """
        Load and clean the dataset from a specified location in S3.

        Parameters
        ----------
        training_path : str
            The path to the password dataset in S3.

        Returns
        -------
        data
            The cleaned dataset containing all of the passwords.
        """
        # read the dataset from an S3 bucket and store it as a pandas dataframe
        self.data = pd.read_csv(self.training_path, usecols=[0])
        # drop the rows with NaN values
        self.data = self.data.dropna()
        # get rid of duplicate rows
        self.data = self.data.drop_duplicates()
        # truncate the dataset
        self.data = self.data.head(10000)

    def parse_data(self):
        """
        Parse the data and determine some dataset properties.

        Parameters
        ----------
        data
            The cleaned dataset containing all of the passwords.

        Returns
        -------
        data_length : int
            The number of passwords in the dataset.
        unique_characters : list
            A list of the unique characters in the dataset.
        vocabulary_size : int
            The number of unique characters in the dataset.
        max_length : int
            The length of the longest password in the dataset.
        """
        self.data_length = len(self.data)
        self.unique_characters = list(set(''.join(self.data['Password'])))
        self.vocabulary_size = len(self.unique_characters)
        self.max_length = self.data['Password'].str.len().max()

    def tokenization(self):
        """
        Tokenize the characters in the passwords.

        Parameters
        ----------
        data : pd.DataFrame
            The dataframe containing the passwords.
        vocabulary_size : int
            The number of unique characters in the dataset.
        max_length : int
            The length of the longest password.
        bucket : str
            The name of the S3 bucket in which the results are stored.
        training_params : str
            The name of the pickle object to store in S3.
        tokenizer_name : str
            The name of the tokenizer object to be stored in S3.

        Returns
        -------
        tokenizer :
            The Keras tokenizer object.
        character_to_ix :
            The character-to-index dictionary.
        ix_to_character :
            The index-to-character dictionary.
        data : pd.DataFrame
            The dataset, including the tokenized passwords.
""" # get the password column as its own array passwords = self.data['Password'] # define the tokenizer self.tokenizer = Tokenizer(num_words=None, oov_token='UNK', char_level=True) # generate the tokenized passwords self.tokenizer.fit_on_texts(passwords) # generate the character-to-index dictionary self.character_to_ix = self.tokenizer.word_index # generate the index-to-character dictionary too self.ix_to_character = {i: j for j, i in self.character_to_ix.items()} # persist the tokenizer with s3.open('%s/%s' % (self.output_location, self.tokenizer_name), 'w') as f: f.write(json.dumps(self.tokenizer.to_json(), ensure_ascii=False)) # save the index-to-character dictionary and self.vocabulary_size values with s3.open('%s/%s' % (self.output_location, self.training_params), 'wb') as f: pickle.dump( [self.ix_to_character, self.vocabulary_size, self.max_length], f) # this encodes the passwords tokens = self.tokenizer.texts_to_sequences(passwords) # save the tokenized passwords in a column of the dataframe self.data['Tokenized'] = tokens # turn the tokenized column into a column of arrays (not lists) self.data['Tokenized'] = self.data['Tokenized'].apply( lambda x: np.array(x)) # this gets rid of the <PAD> character self.data['Output'] = self.data['Tokenized'] - 1 def model_construction(self): """ Construct the model. Parameters ---------- vocabulary_size : int The number of unique characters in the dataset. max_length : int The length of the longest password. hidden_units : int The number of hidden units in the LSTM network. Outputs ------- model : The Keras model. """ # handle model loading # build the model self.model = Sequential() self.model.add( Embedding( input_dim=self.vocabulary_size + 1, # vocabulary size plus an extra element for <PAD> output_dim=int(self.vocabulary_size**( 1. / 4)), # size of embeddings; fourth root of cardinality input_length=self.max_length - 1)) # length of the padded sequences self.model.add(Bidirectional(LSTM(self.hidden_units)) ) # size of hidden layer; n_h ~= n_s / (2(n_i + n_o)) self.model.add(Dense(self.vocabulary_size, activation='softmax')) # output self.model.compile('rmsprop', 'categorical_crossentropy') log.info(self.model.summary()) def model_training(self): """ Train the model. The dataset of tokenized passwords is split, using a sliding window, into sublists of sequences of each password. The sliding window step is handled by the generator defined in generator.py. This process is used to generate additional data that allows the network to learn the expected character given an input sequence. This is ultimately how the probability of a given password is calculated. Parameters ---------- data : pd.DataFrame The dataset containing the passwords. vocabulary_size : int The number of unique characters in the dataset. max_length : int The length of the longest password. batch_size : int The number of samples to train during a single iteration. epoch_size : int The number of steps to train the model. model : The Keras model created in model_construction. bucket : str The name of the S3 bucket in which the results are stored. folder : str The name of the folder in the above S3 bucket in which the results are stored. history_pkl : str The name of the pickle object to store in S3. model_name : str The name of the model to be store in S3. Returns ------- history : obj The Keras history object. 
""" # define the generator parameters paramaters = { 'vocabulary_size': self.vocabulary_size, 'max_length': self.max_length, 'batch_size': self.batch_size, 'shuffle': True } # split the data into training and testing sets training, testing = train_test_split(self.data, test_size=0.1) # check memory log.info("these are the memory stats prior to training: ") log.info(psutil.virtual_memory()) log.info("starting training of model") # define the generators for the training and test datasets training_generator = DataGenerator(training, **paramaters) test_generator = DataGenerator(testing, **paramaters) log.info(psutil.virtual_memory()) # callbacks during training save_checkpoint = ModelCheckpoint(filepath='%s.h5' % self.model_name, monitor='val_accuracy', save_best_only=True) early_stopping = EarlyStopping(monitor='loss', patience=5) # add support for multiple GPUs if self.gpu_count > 1: self.model = multi_gpu_model(self.model, gpus=self.gpu_count) # train the network self.history = self.model.fit_generator( generator=training_generator, validation_data=test_generator, epochs=self.epochs, steps_per_epoch=(len(training) // self.batch_size), validation_steps=(len(testing) // self.batch_size), callbacks=[save_checkpoint, early_stopping], use_multiprocessing=True, workers=multiprocessing.cpu_count(), max_queue_size=multiprocessing.cpu_count() * 2, verbose=1).history # save the history variable with s3.open('%s/%s' % (self.output_location, self.history_pkl), 'wb') as f: pickle.dump(self.history, f) # save the hdf5 model in an S3 bucket self.model.save('%s.h5' % self.model_name) with open('%s.h5' % self.model_name, "rb") as f: client.upload_fileobj(Fileobj=f, Bucket=self.bucket, Key='%s/output/%s.h5' % (self.folder, self.model_name)) # save Keras model for Tensorflow Serving in /opt/ml/model/1 sess = K.get_session() tf.saved_model.simple_save( sess, os.path.join(os.environ['SM_MODEL_DIR'], '1'), inputs={'inputs': self.model.input}, outputs={t.name: t for t in self.model.outputs}) log.info("finished training model") def password_probability(self, password): """ Calculate the probability of a given password. This works by determining the product of the individual probabilities of a given character conditional to the appearance of the preceding characters. Parameters ---------- password : str The password whose probability is to be calculated. model : The Keras model. tokenizer : The Keras tokenizer object. ix_to_character : dict The index-to-character dictionary. data : pd.DataFrame The dataset, including the tokenized passwords. Returns ------- float The probability of the password. """ # tokenize the password token = self.tokenizer.texts_to_sequences([password])[0] x_test = DataGenerator.slide_window(token) x_test = np.array(x_test) y_test = token - 1 # determine the probabilities of the permutations of the characters probabilities = self.model.predict(x_test, verbose=0) # multiply all of the conditional probabilities together in the password password_probability = 0 for index, probability in enumerate(probabilities): char_probability = probability[ y_test[index]] # get the probability from the model password_probability += np.log( char_probability) # use log to avoid roundoff errors # calculate the perplexity to account for varying password lengths password_length = len(password) password_probability /= -password_length password_probability = np.exp( password_probability) # recover the raw probability return password_probability
model.summary()
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

num_epochs = 100
train_padded = np.asarray(train_padded).astype(np.float32)
training_label_seq = np.asarray(training_label_seq).astype(np.float32)
print(train_padded)
history = model.fit(train_padded, np.array(training_label_seq),
                    batch_size=5, epochs=num_epochs, verbose=1)
model.save('chatbot_model.h5', history)
print("model creation completed")

tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

label_tokenizer_json = label_tokenizer.to_json()
with io.open('label_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(label_tokenizer_json, ensure_ascii=False))
print("tokenizer saved to folder")

labels = ['greeting', 'goodbye', 'thanks', 'options', 'adverse_drug',
          'blood_pressure', 'blood_pressure_search', 'pharmacy_search',
          'hospital_search']

txt = ["Hi"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(
def train():
    # Load survey info
    print('Loading Dataframe')
    # df = pd.read_csv('data/survey.csv', sep="\t",
    #                  header=None, names=["intent", "valid"])
    df = pd.read_csv('data/cumulative.csv')
    clean_df(df)

    # create X (input) and Y (expected)
    X = df.intent
    Y = df.valid

    # create new Label encoder
    label_encoder = LabelEncoder()
    Y = label_encoder.fit_transform(Y)
    YES_VAL = label_encoder.transform(["yes"])
    NO_VAL = label_encoder.transform(["no"])
    Y = Y.reshape(-1, 1)

    # Train/Test split based on config
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=config['TRAIN_TEST_SPLIT'])

    print('-- Some sample intents --')
    print(X.tail(5))

    # Data processing
    # make tokenizer
    tokenizer = Tokenizer(num_words=config['TOKENIZER_VOCAB_SIZE'],
                          oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train)

    print("df size before augmentation: %d" % len(X_train.index))

    ### Data Augmentation
    aug_config = config['AUG']
    delta = []

    # sentence variations
    sentence_var_config = aug_config['SENTENCE_VAR']
    print("performing sentence variation augmentation %d times" %
          sentence_var_config['TOTAL'])
    for row in df.sample(sentence_var_config['TOTAL']).iterrows():
        intent = row[1]["intent"]
        valid = YES_VAL if row[1]["valid"] == "yes" else NO_VAL
        variations = data_proc.getVariations(
            intent, sentence_var_config['VARS_PER'],
            sentence_var_config['MUTATION_PROB'])
        delta += [[v, valid] for v in variations]

    # sentence negations (only on df yes cols)
    sentence_neg_config = aug_config['SENTENCE_NEG']
    print("performing sentence negation augmentation %d times" %
          sentence_neg_config['TOTAL'])
    for row in df[df.valid == "yes"].sample(
            sentence_neg_config['TOTAL']).iterrows():
        intent = row[1]["intent"]
        neg = data_proc.negation(intent)
        delta += [[neg, NO_VAL]]

    # shuffled sentences
    shuffle_config = aug_config['SHUFFLE']
    print("performing sentence shuffle augmentation %d times" %
          shuffle_config['TOTAL'])
    for row in df[df.intent.str.split().apply(len) > 3].sample(
            shuffle_config['TOTAL']).iterrows():
        intent = data_proc.randShuffle(row[1]["intent"])
        delta += [[intent, NO_VAL]]

    # garbage sentences
    garbage_config = aug_config['GARBAGE']
    print("performing garbage sentence augmentation %d times" %
          garbage_config['TOTAL'])
    for _ in range(garbage_config['TOTAL']):
        intent = data_proc.literalGarbage(garbage_config['LENGTH_LOWER_BOUND'],
                                          garbage_config['LENGTH_UPPER_BOUND'])
        delta += [[intent, NO_VAL]]

    # vocab mix sentences
    vocab_mix_config = aug_config['VOCAB_GARBAGE']
    print("performing vocab mix sentence augmentation %d times" %
          vocab_mix_config['TOTAL'])
    t_tokenizer = Tokenizer(num_words=config['TOKENIZER_VOCAB_SIZE'],
                            oov_token="<OOV>")
    t_tokenizer.fit_on_texts(df.intent)
    delta += [[intent, NO_VAL] for intent in data_proc.vocabGarbage(
        vocab_mix_config['TOTAL'], vocab_mix_config['TOPK'],
        t_tokenizer.word_counts)]

    appendDF = pd.DataFrame(delta, columns=['intent', 'valid'])
    X_train = X_train.append(appendDF.intent)
    Y_train = np.append(Y_train, appendDF.valid)
    print("df size after augmentation: %d" % len(X_train.index))

    seqs = tokenizer.texts_to_sequences(X_train)
    padded_seqs = sequence.pad_sequences(seqs, maxlen=config['SEQUENCE_MAX_LENGTH'])

    # Load Network Architecture
    model = net.RNN(config['SEQUENCE_MAX_LENGTH'], config['TOKENIZER_VOCAB_SIZE'])
    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer=RMSprop(),
                  metrics=['accuracy'])

    # Model Training
    Y_train = np.asarray(Y_train).astype('float32')
    model.fit(padded_seqs, Y_train,
              batch_size=config['BATCH_SIZE'],
              epochs=config['NUM_EPOCHS'],
              validation_split=config['VALIDATION_SPLIT'])

    # Run model on test set
    test_seqs = tokenizer.texts_to_sequences(X_test)
    padded_test_seqs = sequence.pad_sequences(
        test_seqs, maxlen=config['SEQUENCE_MAX_LENGTH'])
    accr = model.evaluate(padded_test_seqs, Y_test)
    print('Test set\n Loss: {:0.4f}\n Accuracy: {:0.2f}'.format(
        accr[0], accr[1] * 100))

    # Print some example classifications from intent list
    seq = tokenizer.texts_to_sequences(df.intent.tail(config['TAIL_SIZE']))
    padded_seq = sequence.pad_sequences(seq, maxlen=config['SEQUENCE_MAX_LENGTH'])
    preds = model.predict(padded_seq)
    out = list(
        zip(df.intent.tail(config['TAIL_SIZE']),
            df.valid.tail(config['TAIL_SIZE']), preds))
    for obs in out:
        print('Intent: %s Actual Class: %s Predicted Class: %s' %
              (obs[0], obs[1], "yes" if obs[2][0] > 0.5 else "no"))

    # Define Model Name
    model_name = "acc%.2f" % (accr[1] * 100)
    os.mkdir('models/' + model_name)

    # save weights as HDF5
    model.save("models/" + model_name + "/weights.h5")
    print("Saved model to disk")

    # save model as JSON
    model_json = model.to_json()
    with open("models/" + model_name + "/model.json", "w") as file:
        file.write(model_json)

    # save tokenizer as JSON
    tokenizer_json = tokenizer.to_json()
    with open("models/" + model_name + "/tokenizer.json", 'w', encoding='utf-8') as file:
        file.write(json.dumps(tokenizer_json, ensure_ascii=True))

    # write training details to YAML
    detail_dict = {
        'TOKENIZER_VOCAB_SIZE': config['TOKENIZER_VOCAB_SIZE'],
        'SEQUENCE_MAX_LENGTH': config['SEQUENCE_MAX_LENGTH'],
        'BATCH_SIZE': config['BATCH_SIZE'],
        'NUM_EPOCHS': config['NUM_EPOCHS'],
        'TRAIN_TEST_SPLIT': config['TRAIN_TEST_SPLIT'],
        'VALIDATION_SPLIT': config['VALIDATION_SPLIT'],
        'TRAINED_AT': datetime.datetime.now()
    }
    with open("models/" + model_name + "/details.yml", "w") as file:
        documents = yaml.dump(detail_dict, file)
def get_train_test_data(self):
    if not self.file_num == 7:
        # Get sequence Tokenizer
        tokenizer = Tokenizer(oov_token=self.oov_token)
        tokenizer.fit_on_texts(self.articles)
        sequences = tokenizer.texts_to_sequences(self.articles)
        article_sequences = pad_sequences(sequences,
                                          maxlen=self.max_len,
                                          truncating=self.trunc_type,
                                          padding=self.padding_type)
        word_index = tokenizer.word_index
        vocab_size = len(word_index)

        # Train test split
        split_index = int(len(article_sequences) * self.train_test_split)
        train_sequences = np.array(article_sequences[0:split_index])
        test_sequences = np.array(article_sequences[split_index:])

        # Get label Tokenizer
        label_tokenizer = Tokenizer()
        label_tokenizer.fit_on_texts(self.labels)
        label_sequences = label_tokenizer.texts_to_sequences(self.labels)
        train_label = np.array(label_sequences[0:split_index])
        test_label = np.array(label_sequences[split_index:])

        # Get GloVe pre-trained word embeddings
        embeddings_index = {}
        with open('glove/glove.6B.100d.txt', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        embeddings_matrix = np.zeros((vocab_size + 1, self.embedding_dim))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embeddings_matrix[i] = embedding_vector

        # serialize objects to local storage
        tokenizer_json = tokenizer.to_json()
        label_tokenizer_json = label_tokenizer.to_json()

        # Save Tokenizer
        if not os.path.exists('./pre-trained/tokenizer.json'):
            with open('./pre-trained/tokenizer.json', 'w', encoding='utf-8') as f:
                f.write(json.dumps(tokenizer_json, ensure_ascii=False))
        if not os.path.exists('./pre-trained/label_tokenizer_json.json'):
            with open('./pre-trained/label_tokenizer_json.json', 'w', encoding='utf-8') as f:
                f.write(json.dumps(label_tokenizer_json, ensure_ascii=False))

        # Save train/test sequences and embeddings matrix
        if not os.path.exists('./pre-trained/train_sequences.npy'):
            np.save('./pre-trained/train_sequences.npy', train_sequences)
        if not os.path.exists('./pre-trained/train_label.npy'):
            np.save('./pre-trained/train_label.npy', train_label)
        if not os.path.exists('./pre-trained/test_sequences.npy'):
            np.save('./pre-trained/test_sequences.npy', test_sequences)
        if not os.path.exists('./pre-trained/test_label.npy'):
            np.save('./pre-trained/test_label.npy', test_label)
        if not os.path.exists('./pre-trained/embeddings_matrix.npy'):
            np.save('./pre-trained/embeddings_matrix.npy', embeddings_matrix)
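# Illustrative usage sketch, not part of the original source: reading the serialized tokenizers
# back for inference, assuming json is imported and tokenizer_from_json comes from
# keras.preprocessing.text. The files were written as JSON-encoded strings above, so json.load
# recovers the string that tokenizer_from_json expects.
with open('./pre-trained/tokenizer.json', encoding='utf-8') as f:
    restored_tokenizer = tokenizer_from_json(json.load(f))
with open('./pre-trained/label_tokenizer_json.json', encoding='utf-8') as f:
    restored_label_tokenizer = tokenizer_from_json(json.load(f))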
import string

import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable
# map each character to a unique index (character -> index, so lookups by character work below)
token_index = dict(zip(characters, range(1, len(characters) + 1)))

max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results[i, j, index] = 1
print(len(results[1]))

from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
tokenizer = Tokenizer(num_words=1000)
print(tokenizer.to_json())
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
print(sequences)
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
print(one_hot_results[0])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# one-hot encoding with the hashing trick
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
dimensionality = 1000
max_length = 10
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.
class BiLSTM:
    def __init__(self,
                 epochs=5,
                 batch_size=36,
                 max_seq_len=25,
                 fit_verbose=2,
                 print_summary=True,
                 load_model_path=None,
                 tokenizer_path=None):
        self.epochs = epochs
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len
        self.fit_verbose = fit_verbose
        self.print_summary = print_summary
        self.encoder = LabelEncoder()
        if load_model_path:
            self.model = load_model(load_model_path)
            with open(tokenizer_path) as f:
                data = json.load(f)
                self.tokenizer = tokenizer_from_json(data)
        else:
            self.model = self.model_1b
            self.tokenizer = Tokenizer()

    def train(self, X_train, y_train, X_dev, y_dev):
        self.tokenizer.fit_on_texts(X_train)
        X_train = self.tokenizer.texts_to_sequences(X_train)
        X_train = pad_sequences(X_train, maxlen=self.max_seq_len)
        X_dev = self.tokenizer.texts_to_sequences(X_dev)
        X_dev = pad_sequences(X_dev, maxlen=self.max_seq_len)
        y_train = self.encoder.fit_transform(y_train)
        y_train = to_categorical(y_train)
        y_dev = self.encoder.fit_transform(y_dev)
        y_dev = to_categorical(y_dev)

        m = self.model()

        y_train_int = np.argmax(y_train, axis=1)
        cws = class_weight.compute_class_weight('balanced',
                                                np.unique(y_train_int),
                                                y_train_int)

        if self.print_summary:
            print(m.summary())

        m.fit(X_train, y_train,
              validation_data=(X_dev, y_dev),
              epochs=self.epochs,
              batch_size=self.batch_size,
              verbose=self.fit_verbose)

        predictions = m.predict(X_dev, verbose=1)
        print('Validation Loss:', log_loss(y_dev, predictions))
        print('Validation Accuracy',
              (predictions.argmax(axis=1) == y_dev.argmax(axis=1)).mean())
        print('Validation F1 Score:',
              f1_score(y_dev.argmax(axis=1),
                       predictions.argmax(axis=1),
                       average='weighted'))

        m.save('models/bilstm.keras')
        tokenizer_json = self.tokenizer.to_json()
        with io.open('models/bilstm-tokenizer.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(tokenizer_json, ensure_ascii=False))
        self.model = m

    def model_1b(self):
        """Using a Bidirectional LSTM."""
        model = Sequential()
        model.add(
            Embedding(input_dim=(len(self.tokenizer.word_counts) + 1),
                      output_dim=128,
                      input_length=self.max_seq_len))
        model.add(SpatialDropout1D(0.3))
        model.add(
            Bidirectional(LSTM(128, dropout=0.25, recurrent_dropout=0.25)))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(2, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        return model

    def test(self, X_test, y_test=None):
        X_test = self.tokenizer.texts_to_sequences(X_test)
        X_test = pad_sequences(X_test, maxlen=self.max_seq_len)
        predictions = self.model.predict(X_test, verbose=1)
        if y_test is not None:
            y_test = self.encoder.fit_transform(y_test)
            y_test = to_categorical(y_test)
            print('Test Loss:', log_loss(y_test, predictions))
            print('Test Accuracy',
                  (predictions.argmax(axis=1) == y_test.argmax(axis=1)).mean())
            print('Test F1 Score:',
                  f1_score(y_test.argmax(axis=1),
                           predictions.argmax(axis=1),
                           average='weighted'))
        predictions = np.argmax(predictions, axis=1)
        np.savetxt("preds/bilstm-preds.txt", predictions, fmt='%d')
        return predictions
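# Illustrative usage sketch, not part of the original source: training a BiLSTM from scratch and
# then reloading it from the model and tokenizer files the class writes; the train/dev texts and
# labels below are hypothetical.
clf = BiLSTM(epochs=5)
clf.train(X_train=['great product', 'awful service'], y_train=['pos', 'neg'],
          X_dev=['pretty good'], y_dev=['pos'])
reloaded = BiLSTM(load_model_path='models/bilstm.keras',
                  tokenizer_path='models/bilstm-tokenizer.json')
print(reloaded.test(['awful service']))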