Example #1
def get_sequence_of_tokens(corpus, refresh=True):
    """
    :param corpus:
    :param refresh:
    :return:
    """
    # tokenization
    if refresh:
        tokenizer = Tokenizer()
        # fit the tokenizer on the text
        tokenizer.fit_on_texts(corpus)
    else:
        with open("tokenizer.json", 'r') as tj:
            tokenizer = tokenizer_from_json(json.load(tj))

    tokenizer_json = tokenizer.to_json()

    with open('tokenizer.json', 'w') as fobj:
        json.dump(tokenizer_json, fobj)

    index_dict = tokenizer.word_index
    seq = tokenizer.texts_to_sequences(corpus)
    # calculate the vocab size
    total_words = len(tokenizer.word_index) + 1
    print(total_words)
    return total_words, seq
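A minimal usage sketch (not part of the original source), assuming `json`, `Tokenizer` and `tokenizer_from_json` are imported as in the snippet above; the second call reloads the tokenizer the first call wrote to tokenizer.json, so the sequences should match.

# Hypothetical toy corpus, for illustration only.
corpus = ["the cat sat on the mat", "the dog ate my homework"]

total_words, seq = get_sequence_of_tokens(corpus, refresh=True)       # fit and save
total_words_2, seq_2 = get_sequence_of_tokens(corpus, refresh=False)  # reload from tokenizer.json
assert seq == seq_2 and total_words == total_words_2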
Example #2
def prepare_input_data(dataframe,
                       save,
                       maxlen=50,
                       max_words=10000,
                       model_name=None):
    """
        Prepare the data to be fed to the model.

        Parameters
        ----------
            dataframe: pandas.DataFrame
                DataFrame with 'tweets' and 'emojis' columns
            save: bool
                Whether to save the tokenizer and emoji indices to disk
            maxlen: int
                Maximum number of tokens per tweet
            max_words: int
                Maximum number of words kept by the tokenizer
            model_name: str
                If save is True, the model name is used in the names of the
                saved files needed for prediction
        
        Returns
        -------
            X: numpy.array
                Array of tweets fit for model input
            y: pandas.DataFrame
                Onehot encoding of the labels
    """
    tokenizer = Tokenizer(num_words=max_words,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^\'_`{|}~\t\n')
    tokenizer.fit_on_texts(dataframe['tweets'])
    dataframe = dataframe.sample(frac=1).reset_index(drop=True)
    X = tokenizer.texts_to_sequences(dataframe['tweets'])
    X = pad_sequences(X, maxlen=maxlen)
    y = pd.get_dummies(dataframe['emojis'])

    if save:
        saved_tokenizer = tokenizer.to_json()
        with open(f'models_utils/{model_name}_tokenizer.json',
                  'w',
                  encoding='utf-8') as jsonfile:
            json.dump(saved_tokenizer, jsonfile, ensure_ascii=False)

        emojis = [emoji for emoji in dataframe['emojis']]
        emojis_indices = {}
        for i in range(len(emojis)):
            if emojis[i] in emojis_indices.keys():
                emojis_indices[emojis[i]].append(i)
            else:
                emojis_indices[emojis[i]] = [i]

        with open(f'models_utils/{model_name}_emojis_indices.json',
                  'w') as jsonfile:
            json.dump(emojis_indices, jsonfile)

    return X, y
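A hedged usage sketch with a hypothetical three-row frame; it assumes `pandas as pd` plus the keras imports used above, and passes save=False so nothing is written under models_utils/.

import pandas as pd

# Toy frame mirroring the expected 'tweets'/'emojis' columns (illustrative only).
toy = pd.DataFrame({'tweets': ['i love this song', 'so tired today', 'great match tonight'],
                    'emojis': ['❤️', '😴', '⚽']})
X, y = prepare_input_data(toy, save=False, maxlen=10, max_words=100)
print(X.shape)          # (3, 10): padded sequences of length maxlen
print(list(y.columns))  # one one-hot column per distinct emoji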
Example #3
def WordtoInt(arr: List[List[str]]):
    t = Tokenizer(num_words=500)
    t.fit_on_texts(arr)
    sequences = t.texts_to_sequences(arr)
    # print('sequences : ', sequences, '\n')
    # print('word_index : ', t.word_index)
    tokenizer_json = t.to_json()
    with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    # print(tokenizer_json)
    return sequences
Example #4
def fit_on_text(data):
    """
    1. Updates internal vocabulary based on a list of texts according to given params.
    2. Setting max length for pad_sequences and embedding layer.
            Embedding layer is capable of processing sequence of heterogenous length, if you don't pass an explicit input_length 
                argument to the layer).
            If max is big, reveiws will have too many padded values left for short reviews and decreases the accuracy in turn. 4
            so setting a maximum length by (max-avg).
    3. This method creates the vocabulary index based on word frequency.

        So if you give it something like, "the boy drove on the road." It will create a dictionary 
                            s.t. 
                                 word_index["the"] = 1;
                                 word_index["boy"] = 2;
                                 0 is reserved for padding. 
                                 
        so this way, each word gets a unique integer value.
        So lower integer means more frequent word.


    Args:
        data(list)

    Arguments:
        data(list): list of list of tokens 
    
    Returns: 
        Max_length for padding.
        token object.=> A dictionary word_index with values as indexes . lowest one is most frequent.
    """

    print("Inside fit on text..")

    # length_list=[len(seq) for seq in data]
    # avg=sum(length_list)/len(length_list)
    # max_length= int(max(length_list)-avg)/5                                 #max-average number of words in each sentence.
    max_length = 450  #defining after finding optimal value
    # print("Max length for pad sequences: ",max_length)

    token = Tokenizer()  #Defining the Tokenizer object

    # Join each list of tokens back into a single string for the tokenizer object
    list_of_strings_full_data = [' '.join(seq) for seq in data]

    token.fit_on_texts(list_of_strings_full_data)

    tokenizer_json = token.to_json()
    with io.open(os.path.join("models", 'tokenizer.json'),
                 'w',
                 encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))

    return max_length, token
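The behaviour the docstring describes can be checked directly; a short sketch, assuming `Tokenizer` is imported as above.

demo = Tokenizer()
demo.fit_on_texts(["the boy drove on the road"])
print(demo.word_index)
# -> {'the': 1, 'boy': 2, 'drove': 3, 'on': 4, 'road': 5}
# 'the' occurs twice, so it gets the lowest index; index 0 stays free for padding.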
Example #5
    def preprocess(self, tokenizer_string=None):
        """
        Preprocess the textual data.

        Returns
        -------
        x_train: The processed-sequenced training data.
        y_train: Processed training labels
        x_val: The processed-sequenced validation data
        y_val: processed validation labels
        word_index: A dictionary containing the word-tokens and their indices for the sequencing.
        """
        if tokenizer_string is None:
            tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS)
            tokenizer.fit_on_texts(self.texts)
            self.tokenizer_string = tokenizer.to_json()
        else:
            self.tokenizer_string = tokenizer_string
            from keras.preprocessing.text import tokenizer_from_json
            tokenizer = tokenizer_from_json(tokenizer_string)
        sequences = tokenizer.texts_to_sequences(self.texts)

        word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))

        data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
        labels = to_categorical(np.asarray(self.labels))

        print('Shape of data tensor:', data.shape)
        print('Shape of label tensor:', labels.shape)

        # split the data into a training set and a validation set
        if (self.VALIDATION_SPLIT):
            indices = np.arange(data.shape[0])
            np.random.shuffle(indices)

            data = data[indices]
            labels = labels[indices]
            num_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])

            x_train = data[:-num_validation_samples]
            y_train = labels[:-num_validation_samples]
            x_val = data[-num_validation_samples:]
            y_val = labels[-num_validation_samples:]
        else:
            x_train = data
            y_train = labels
            x_val = None
            y_val = None

        return x_train, y_train, x_val, y_val, word_index
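The tokenizer_string round trip that preprocess() relies on can be exercised in isolation; a minimal sketch, assuming the same keras imports as above.

texts = ["one short document", "another short document"]
fitted = Tokenizer(num_words=1000)
fitted.fit_on_texts(texts)
tokenizer_string = fitted.to_json()   # plain JSON string, e.g. stored alongside the model

restored = tokenizer_from_json(tokenizer_string)
assert restored.texts_to_sequences(texts) == fitted.texts_to_sequences(texts)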
Example #6
def fetch_tokenizer(sentences):
    global tokenizer
    if tokenizer is not None:
        return tokenizer
    if os.path.isfile('models/tokenizer.json'):
        with open('models/tokenizer.json') as f:
            data = json.load(f)
            tokenizer = tokenizer_from_json(data)
    else:
        tokenizer = Tokenizer(num_words=vocabulary_size)
        tokenizer.fit_on_texts(sentences)

        tokenizer_json = tokenizer.to_json()
        with io.open('models/tokenizer.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(tokenizer_json, ensure_ascii=False))

    return tokenizer
Example #7
def load_tokenizer( texts=None, num_words=MAX_WORDS ):
  file = os.path.join( DATA_HOME, SAVE_DIR, __TOKENIZER_FILE.format( num_words ) )
  # tokenizer config file exists. load it and return tokenizer
  if os.path.exists( file ):
    print( 'loading tokenizer' )
    with open( file, 'r' ) as f:
      return tokenizer_from_json( f.readline() )

  if texts is None:
    texts, _ = load_raw_text()  # load the review data
  tokenizer = Tokenizer( num_words=num_words )
  print( 'fitting tokenizer' )
  tokenizer.fit_on_texts( texts )
  tokenizer_json = tokenizer.to_json()
  print( 'saving tokenizer' )
  with open( file, 'w' ) as f:
    f.write( tokenizer_json )

  return tokenizer
Example #8
def get_preprocessor(x1, x2):

    max_vocab = 10000
    tokenizer = Tokenizer(num_words=max_vocab)
    tokenizer.fit_on_texts(x1)
    x_train = tokenizer.texts_to_sequences(x1)
    x_test = tokenizer.texts_to_sequences(x2)
    x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
                                                            padding='post',
                                                            maxlen=256)
    x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                           padding='post',
                                                           maxlen=256)

    # writing the tokenizer to a JSON file
    with open(tokenizer_path, 'w') as f:
        tokenizer_json_string = tokenizer.to_json()
        f.write(tokenizer_json_string)

    return x_train, x_test
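At prediction time the saved file can be read back so unseen text goes through the same vocabulary and padding; a sketch under the assumption that `tokenizer_path`, `tf` and `tokenizer_from_json` are available as above.

# Reload the tokenizer written by get_preprocessor and preprocess new text
# with the same settings (post-padding, maxlen=256).
with open(tokenizer_path) as f:
    reloaded = tokenizer_from_json(f.read())

new_seq = reloaded.texts_to_sequences(["an unseen example sentence"])
new_seq = tf.keras.preprocessing.sequence.pad_sequences(new_seq, padding='post', maxlen=256)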
Example #9
testdat = traindat

traincat = cat[:trainlen]
#valcat = cat[trainlen:trainlen+vallen]
#testcat = cat[trainlen+vallen:]
valcat = traincat
testcat = traincat

tokenizer = Tokenizer(lower=False, num_words=vocab_size, oov_token="UNK")
tokenizer.fit_on_texts(traindat)
#with open("data/abnb_pets_tok.json") as f:
#    data = json.load(f)
#    tokenizer = tokenizer_from_json(json.dumps(data))

# deployment config
tokenizer_json = tokenizer.to_json()

# This is not working correctly it seems
with open("cities.json", "w", encoding="utf-8") as f:
    f.write(tokenizer_json)

#Xtrain = tokenizer.texts_to_matrix(traindat, mode='freq')
Ytrain = np.asarray(traincat)

Yval = np.asarray(valcat)

#Xtest = tokenizer.texts_to_matrix(testdat, mode='freq')
Ytest = np.asarray(testcat)

Xtrain = tokenizer.texts_to_sequences(traindat)
Xval = tokenizer.texts_to_sequences(valdat)
Example #10
class SiameseXSimilarity(Similarity):
    """Siamese neural network similarity with extra feature."""
    def __init__(self):
        """Segmentation and normalization are not allowed in Siamese similarity."""

        super().__init__()
        self.max_sequence_length = 10
        self.embedding_dimension = 50
        self.number_lstm_units = 50
        self.number_dense_units = 50
        self.rate_drop_lstm = 0.17
        self.rate_drop_dense = 0.25
        self.activation_function = 'relu'
        self.epochs = 20
        self.model_cache = 'cache/models/siamx'
        self.tokenizer_cache = 'cache/models/tokenizerx'

    def similarity(self, x, y):
        return self.run_similarity([x, y])

    def run_similarity(self, df):
        """Predict similarity for each pair."""

        comments1, comments2, word_counts, name_similarities = self.features(
            df)
        return np.array(
            list(
                self.model.predict(
                    [comments1, comments2, word_counts,
                     name_similarities]).ravel()))

    def load(self, cache):
        """Load trained model."""

        self.model = load_model(self.model_cache)
        with open(self.tokenizer_cache) as f:
            self.tokenizer = tokenizer_from_json(json.load(f))
        super().load(cache)

    def train(self, df, verbose=False, cache=None):
        """Define and train the neural network."""

        # Flatten list of comment pairs
        pairs = df[['comment1', 'comment2']]
        comments = list(np.ravel(pairs))
        comments = list(map(text_to_word_sequence, comments))
        # Train word2vec embeddings
        word2vec = Word2Vec(comments,
                            min_count=1,
                            size=self.embedding_dimension)
        # Train tokenizer
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(comments)
        if cache:
            with open(self.tokenizer_cache, 'w') as f:
                json.dump(self.tokenizer.to_json(), f)
        word_index = self.tokenizer.word_index
        vocab_size = len(word_index) + 1
        # Generate embedding matrix
        self.embedding_matrix = np.zeros(
            (vocab_size, self.embedding_dimension))
        for word, i in word_index.items():
            self.embedding_matrix[i] = word2vec.wv[word]
        del word2vec

        # Define the neural network
        # Word embedding layer
        embedding_layer = Embedding(vocab_size,
                                    self.embedding_dimension,
                                    weights=[self.embedding_matrix],
                                    input_length=self.max_sequence_length,
                                    trainable=False)
        # LSTM encoder
        lstm_layer = Bidirectional(
            LSTM(self.number_lstm_units,
                 dropout=self.rate_drop_lstm,
                 recurrent_dropout=self.rate_drop_lstm))
        # LSTM encoder layer for the 1st comment
        input1 = Input(shape=(self.max_sequence_length, ), dtype='int32')
        embedding1 = embedding_layer(input1)
        lstm1 = lstm_layer(embedding1)
        # LSTM encoder layer for the 2nd comment
        input2 = Input(shape=(self.max_sequence_length, ), dtype='int32')
        embedding2 = embedding_layer(input2)
        lstm2 = lstm_layer(embedding2)
        # Word count layer
        input3 = Input(shape=(3, ))
        dense3 = Dense(int(self.number_dense_units / 2),
                       activation=self.activation_function)(input3)
        # Name similarity layer
        input4 = Input(shape=(1, ))
        dense4 = Dense(int(self.number_dense_units / 5),
                       activation=self.activation_function)(input4)
        # Merge two LSTM layers and dense word count and name similarity layers
        merged = concatenate([lstm1, lstm2, dense3, dense4])
        # [Normalization + dropout + dense] x2
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        merged = Dense(self.number_dense_units,
                       activation=self.activation_function)(merged)
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        output = Dense(1, activation='sigmoid')(merged)
        # Initialize the model
        self.model = Model(inputs=[input1, input2, input3, input4],
                           outputs=output)
        self.model.compile(loss='binary_crossentropy',
                           optimizer='nadam',
                           metrics=['acc'])
        # Define early stopping callback
        es = EarlyStopping(patience=3)

        # Extract features
        comments1, comments2, word_counts, name_similarities = self.features(
            df)
        labels = df['label'].to_numpy()
        # Train the model
        self.model.fit([comments1, comments2, word_counts, name_similarities],
                       labels,
                       epochs=self.epochs,
                       validation_split=0.1,
                       callbacks=[es],
                       verbose=0)

        # Save model
        if cache:
            self.model.save(self.model_cache)

        super().train(df, labels, verbose, cache)

    def features(self, df):
        """Get features from comment pairs: tokenized sequences, word counts, name similarities."""

        comments1 = df['comment1'].to_numpy()
        comments2 = df['comment2'].to_numpy()
        comments1 = self.tokenizer.texts_to_sequences(comments1)
        comments2 = self.tokenizer.texts_to_sequences(comments2)
        word_counts = [[
            len(set(x1)),
            len(set(x2)),
            len(set(x1).intersection(x2))
        ] for x1, x2 in zip(comments1, comments2)]
        comments1 = pad_sequences(comments1, self.max_sequence_length)
        comments2 = pad_sequences(comments2, self.max_sequence_length)

        names = df[['name1', 'name2']].to_numpy()

        return comments1, comments2, np.array(
            word_counts), self.get_name_similarities(names)

    def get_name_similarities(self, name_pairs):
        """Get char LCS similarity for each name pair."""

        return np.array([
            difflib.SequenceMatcher(None, n1, n2).ratio()
            for n1, n2 in name_pairs
        ])
Example #11
class LSTM_network():
    """
    Train a BLSTM network on a cleartext password dataset.

    """
    def __init__(self):

        # load variables from the config file
        self.model_name = variables['model']['name']
        self.gpu_count = variables['model']['gpu_count']
        self.bucket = variables['S3']['bucket_name']
        self.folder = variables['S3']['folder']
        self.tokenizer_name = variables['S3']['tokenizer_name']
        self.training_params = variables['S3']['training_params']
        self.history_pkl = variables['S3']['history_pkl']

        # parse the arguments
        parser = argparse.ArgumentParser()
        parser.add_argument('--epochs', type=int, default=10)
        parser.add_argument('--batch_size', type=int, default=128)
        parser.add_argument('--hidden_units', type=int, default=100)
        parser.add_argument('--training', type=str)
        args, _ = parser.parse_known_args()

        # store the arguments as variables
        self.epochs = args.epochs
        self.batch_size = args.batch_size
        self.hidden_units = args.hidden_units
        self.training_path = args.training
        self.output_location = '%s/%s/output' % (self.bucket, self.folder)

    def data_load(self):
        """
        Load and clean the dataset from a specified location in S3.


        Parameters
        ----------
        training_path : str
            The path to the password dataset in S3.

        Returns
        -------
        data
            The cleaned dataset containing all of the passwords.

        """

        # read the dataset from an S3 bucket and store it as a pandas dataframe
        self.data = pd.read_csv(self.training_path, usecols=[0])

        # drop the rows with NaN values
        self.data = self.data.dropna()

        # get rid of duplicate rows
        self.data = self.data.drop_duplicates()

        # truncate dataset
        self.data = self.data.head(10000)

    def parse_data(self):
        """
        Parse the data and determine some dataset properties.


        Parameters
        ----------
        data
            The cleaned dataset containing all of the passwords.

        Returns
        -------
        data_length : int
            The number of passwords in the dataset.
        unique_characters : int
            A sorted list of the unique characters in the dataset.
        vocabulary_size : int
            The number of unique characters in the dataset.
        max_length : int
            The length of the longest password in the dataset.

        """

        self.data_length = len(self.data)
        self.unique_characters = list(set(''.join(self.data['Password'])))
        self.vocabulary_size = len(self.unique_characters)
        self.max_length = self.data['Password'].str.len().max()

    def tokenization(self):
        """
        Tokenize the characters in the passwords.


        Parameters
        ----------
        data : pd.DataFrame
            The dataframe containing the passwords.
        vocabulary_size : int
            The number of unique characters in the dataset.
        max_length : int
            The length of the longest password.
        bucket : str
            The name of the S3 bucket in which the results are stored.
        training_params : str
            The name of the pickle object to store in S3.
        tokenizer_name : str
            The name of the tokenizer object to be stored in S3.


        Returns
        -------
        tokenizer : 
            The Keras tokenizer object.
        character_to_ix : 
            The character-to-index dictionary.
        ix_to_character : 
            The index-to-character dictionary.
        data : pd.DataFrame
            The dataset, including the tokenized passwords.

        """

        # get the password column as its own array
        passwords = self.data['Password']

        # define the tokenizer
        self.tokenizer = Tokenizer(num_words=None,
                                   oov_token='UNK',
                                   char_level=True)

        # generate the tokenized passwords
        self.tokenizer.fit_on_texts(passwords)

        # generate the character-to-index dictionary
        self.character_to_ix = self.tokenizer.word_index

        # generate the index-to-character dictionary too
        self.ix_to_character = {i: j for j, i in self.character_to_ix.items()}

        # persist the tokenizer
        with s3.open('%s/%s' % (self.output_location, self.tokenizer_name),
                     'w') as f:
            f.write(json.dumps(self.tokenizer.to_json(), ensure_ascii=False))

        # save the index-to-character dictionary and self.vocabulary_size values
        with s3.open('%s/%s' % (self.output_location, self.training_params),
                     'wb') as f:
            pickle.dump(
                [self.ix_to_character, self.vocabulary_size, self.max_length],
                f)

        # this encodes the passwords
        tokens = self.tokenizer.texts_to_sequences(passwords)

        # save the tokenized passwords in a column of the dataframe
        self.data['Tokenized'] = tokens

        # turn the tokenized column into a column of arrays (not lists)
        self.data['Tokenized'] = self.data['Tokenized'].apply(
            lambda x: np.array(x))

        # this gets rid of the <PAD> character
        self.data['Output'] = self.data['Tokenized'] - 1

    def model_construction(self):
        """
        Construct the model.


        Parameters
        ----------
        vocabulary_size : int
            The number of unique characters in the dataset.
        max_length : int
            The length of the longest password.
        hidden_units : int
            The number of hidden units in the LSTM network.

        Outputs
        -------
        model : 
            The Keras model.

        """

        # handle model loading

        # build the model
        self.model = Sequential()
        self.model.add(
            Embedding(
                input_dim=self.vocabulary_size +
                1,  # vocabulary size plus an extra element for <PAD> 
                output_dim=int(self.vocabulary_size**(
                    1. / 4)),  # size of embeddings; fourth root of cardinality
                input_length=self.max_length -
                1))  # length of the padded sequences
        self.model.add(Bidirectional(LSTM(self.hidden_units))
                       )  # size of hidden layer; n_h ~= n_s / (2(n_i + n_o))
        self.model.add(Dense(self.vocabulary_size,
                             activation='softmax'))  # output
        self.model.compile('rmsprop', 'categorical_crossentropy')

        log.info(self.model.summary())

    def model_training(self):
        """
        Train the model.

        The dataset of tokenized passwords is split, using a sliding window, into 
        sublists of sequences of each password. The sliding window step is handled
        by the generator defined in generator.py. This process is used to generate 
        additional data that allows the network to learn the expected character given 
        an input sequence. This is ultimately how the probability of a given password
        is calculated.


        Parameters
        ----------
        data : pd.DataFrame
            The dataset containing the passwords.
        vocabulary_size : int
            The number of unique characters in the dataset.
        max_length : int
            The length of the longest password.
        batch_size : int
            The number of samples to train during a single iteration.
        epoch_size : int
            The number of steps to train the model.
        model : 
            The Keras model created in model_construction.
        bucket : str
            The name of the S3 bucket in which the results are stored.
        folder : str
            The name of the folder in the above S3 bucket in which the results are stored.
        history_pkl : str
            The name of the pickle object to store in S3.
        model_name : str
            The name of the model to be stored in S3.


        Returns
        -------
        history : obj
            The Keras history object.


        """

        # define the generator parameters
        parameters = {
            'vocabulary_size': self.vocabulary_size,
            'max_length': self.max_length,
            'batch_size': self.batch_size,
            'shuffle': True
        }

        # split the data into training and testing sets
        training, testing = train_test_split(self.data, test_size=0.1)

        # check memory
        log.info("these are the memory stats prior to training: ")
        log.info(psutil.virtual_memory())

        log.info("starting training of model")

        # define the generators for the training and test datasets
        training_generator = DataGenerator(training, **parameters)
        test_generator = DataGenerator(testing, **parameters)
        log.info(psutil.virtual_memory())

        # callbacks during training
        save_checkpoint = ModelCheckpoint(filepath='%s.h5' % self.model_name,
                                          monitor='val_accuracy',
                                          save_best_only=True)
        early_stopping = EarlyStopping(monitor='loss', patience=5)

        # add support for multiple GPUs
        if self.gpu_count > 1:
            self.model = multi_gpu_model(self.model, gpus=self.gpu_count)

        # train the network
        self.history = self.model.fit_generator(
            generator=training_generator,
            validation_data=test_generator,
            epochs=self.epochs,
            steps_per_epoch=(len(training) // self.batch_size),
            validation_steps=(len(testing) // self.batch_size),
            callbacks=[save_checkpoint, early_stopping],
            use_multiprocessing=True,
            workers=multiprocessing.cpu_count(),
            max_queue_size=multiprocessing.cpu_count() * 2,
            verbose=1).history

        # save the history variable
        with s3.open('%s/%s' % (self.output_location, self.history_pkl),
                     'wb') as f:
            pickle.dump(self.history, f)

        # save the hdf5 model in an S3 bucket
        self.model.save('%s.h5' % self.model_name)
        with open('%s.h5' % self.model_name, "rb") as f:
            client.upload_fileobj(Fileobj=f,
                                  Bucket=self.bucket,
                                  Key='%s/output/%s.h5' %
                                  (self.folder, self.model_name))

        # save Keras model for Tensorflow Serving in /opt/ml/model/1
        sess = K.get_session()
        tf.saved_model.simple_save(
            sess,
            os.path.join(os.environ['SM_MODEL_DIR'], '1'),
            inputs={'inputs': self.model.input},
            outputs={t.name: t
                     for t in self.model.outputs})

        log.info("finished training model")

    def password_probability(self, password):
        """
        Calculate the probability of a given password. This works by 
        determining the product of the individual probabilities of a 
        given character conditional to the appearance of the preceding
        characters.


        Parameters
        ----------
        password : str
            The password whose probability is to be calculated.
        model : 
            The Keras model.
        tokenizer : 
            The Keras tokenizer object.
        ix_to_character : dict
            The index-to-character dictionary.
        data : pd.DataFrame
            The dataset, including the tokenized passwords.

        Returns
        -------
        float
            The probability of the password.

        """

        # tokenize the password
        token = self.tokenizer.texts_to_sequences([password])[0]
        x_test = DataGenerator.slide_window(token)
        x_test = np.array(x_test)
        y_test = np.array(token) - 1  # shift so indices match the model's output classes

        # determine the probabilities of the permutations of the characters
        probabilities = self.model.predict(x_test, verbose=0)

        # multiply all of the conditional probabilities together in the password
        password_probability = 0
        for index, probability in enumerate(probabilities):
            char_probability = probability[
                y_test[index]]  # get the probability from the model
            password_probability += np.log(
                char_probability)  # use log to avoid roundoff errors

        # calculate the perplexity to account for varying password lengths
        password_length = len(password)
        password_probability /= -password_length
        password_probability = np.exp(
            password_probability)  # recover the raw probability

        return password_probability
Example #12
model.summary()

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
num_epochs = 100
train_padded = np.asarray(train_padded).astype(np.float32)
training_label_seq = np.asarray(training_label_seq).astype(np.float32)

print(train_padded)
history = model.fit(train_padded, np.array(training_label_seq),
                    batch_size=5, epochs=num_epochs, verbose=1)
model.save('chatbot_model.h5')  # history is returned by fit(); it is not an argument to save()
print("model creation completed")


tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

label_tokenizer_json = label_tokenizer.to_json()
with io.open('label_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(label_tokenizer_json, ensure_ascii=False))

print("tokenizer saved to folder")

labels = ['greeting', 'goodbye', 'thanks', 'options', 'adverse_drug',
          'blood_pressure', 'blood_pressure_search', 'pharmacy_search', 'hospital_search']

txt = ["Hi"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(
Example #13
def train():
    # Load survey info
    print('Loading Dataframe')
    # df = pd.read_csv('data/survey.csv', sep="\t",
    #                   header=None, names=["intent", "valid"])
    df = pd.read_csv('data/cumulative.csv')

    clean_df(df)

    # create X (input) and Y (expected)
    X = df.intent
    Y = df.valid

    # create new Label encoder
    label_encoder = LabelEncoder()
    Y = label_encoder.fit_transform(Y)
    YES_VAL = label_encoder.transform(["yes"])
    NO_VAL = label_encoder.transform(["no"])
    Y = Y.reshape(-1, 1)

    # Train/Test split based on config
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=config['TRAIN_TEST_SPLIT'])

    print('-- Some sample intents --')
    print(X.tail(5))

    # Data processing
    # make tokenizer
    tokenizer = Tokenizer(num_words=config['TOKENIZER_VOCAB_SIZE'],
                          oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train)

    print("df size before augmentation: %d" % len(X_train.index))

    ### Data Augmentation
    aug_config = config['AUG']
    delta = []

    # sentence variations
    sentence_var_config = aug_config['SENTENCE_VAR']
    print("performing sentence variation augmentation %d times" %
          sentence_var_config['TOTAL'])
    for row in df.sample(sentence_var_config['TOTAL']).iterrows():
        intent = row[1]["intent"]
        valid = YES_VAL if row[1]["valid"] == "yes" else NO_VAL
        variations = data_proc.getVariations(
            intent, sentence_var_config['VARS_PER'],
            sentence_var_config['MUTATION_PROB'])
        delta += [[v, valid] for v in variations]

    # sentence negations (only on df yes cols)
    sentence_neg_config = aug_config['SENTENCE_NEG']
    print("performing sentence negation augmentation %d times" %
          sentence_neg_config['TOTAL'])
    for row in df[df.valid == "yes"].sample(
            sentence_neg_config['TOTAL']).iterrows():
        intent = row[1]["intent"]
        neg = data_proc.negation(intent)
        delta += [[neg, NO_VAL]]

    # shuffled sentences
    shuffle_config = aug_config['SHUFFLE']
    print("performing sentence shuffle augmentation %d times" %
          shuffle_config['TOTAL'])
    for row in df[df.intent.str.split().apply(len) > 3].sample(
            shuffle_config['TOTAL']).iterrows():
        intent = data_proc.randShuffle(row[1]["intent"])
        delta += [[intent, NO_VAL]]

    # garbage sentences
    garbage_config = aug_config['GARBAGE']
    print("performing garbage sentence augmentation %d times" %
          garbage_config['TOTAL'])
    for _ in range(garbage_config['TOTAL']):
        intent = data_proc.literalGarbage(garbage_config['LENGTH_LOWER_BOUND'],
                                          garbage_config['LENGTH_UPPER_BOUND'])
        delta += [[intent, NO_VAL]]

    # vocab mix sentences
    vocab_mix_config = aug_config['VOCAB_GARBAGE']
    print("performing vocab mix sentence augmentation %d times" %
          vocab_mix_config['TOTAL'])
    t_tokenizer = Tokenizer(num_words=config['TOKENIZER_VOCAB_SIZE'],
                            oov_token="<OOV>")
    t_tokenizer.fit_on_texts(df.intent)
    delta += [[intent, NO_VAL] for intent in data_proc.vocabGarbage(
        vocab_mix_config['TOTAL'], vocab_mix_config['TOPK'],
        t_tokenizer.word_counts)]

    appendDF = pd.DataFrame(delta, columns=['intent', 'valid'])
    X_train = X_train.append(appendDF.intent)
    Y_train = np.append(Y_train, appendDF.valid)
    print("df size after augmentation: %d" % len(X_train.index))

    seqs = tokenizer.texts_to_sequences(X_train)
    padded_seqs = sequence.pad_sequences(seqs,
                                         maxlen=config['SEQUENCE_MAX_LENGTH'])

    # Load Network Architecture
    model = net.RNN(config['SEQUENCE_MAX_LENGTH'],
                    config['TOKENIZER_VOCAB_SIZE'])
    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer=RMSprop(),
                  metrics=['accuracy'])

    # Model Training
    Y_train = np.asarray(Y_train).astype('float32')
    model.fit(padded_seqs,
              Y_train,
              batch_size=config['BATCH_SIZE'],
              epochs=config['NUM_EPOCHS'],
              validation_split=config['VALIDATION_SPLIT'])

    # Run model on test set
    test_seqs = tokenizer.texts_to_sequences(X_test)
    padded_test_seqs = sequence.pad_sequences(
        test_seqs, maxlen=config['SEQUENCE_MAX_LENGTH'])
    accr = model.evaluate(padded_test_seqs, Y_test)
    print('Test set\n  Loss: {:0.4f}\n  Accuracy: {:0.2f}'.format(
        accr[0], accr[1] * 100))

    # Print some example classifications from intent list
    seq = tokenizer.texts_to_sequences(df.intent.tail(config['TAIL_SIZE']))
    padded_seq = sequence.pad_sequences(seq,
                                        maxlen=config['SEQUENCE_MAX_LENGTH'])
    preds = model.predict(padded_seq)
    out = list(
        zip(df.intent.tail(config['TAIL_SIZE']),
            df.valid.tail(config['TAIL_SIZE']), preds))
    for obs in out:
        print('Intent: %s   Actual Class: %s   Predicted Class: %s' %
              (obs[0], obs[1], "yes" if obs[2][0] > 0.5 else "no"))

    # Define Model Name
    model_name = "acc%.2f" % (accr[1] * 100)
    os.mkdir('models/' + model_name)

    # save weights as HDF5
    model.save("models/" + model_name + "/weights.h5")
    print("Saved model to disk")

    # save model as JSON
    model_json = model.to_json()
    with open("models/" + model_name + "/model.json", "w") as file:
        file.write(model_json)

    # save tokenizer as JSON
    tokenizer_json = tokenizer.to_json()
    with open("models/" + model_name + "/tokenizer.json",
              'w',
              encoding='utf-8') as file:
        file.write(json.dumps(tokenizer_json, ensure_ascii=True))

    # write training details to YAML
    detail_dict = {
        'TOKENIZER_VOCAB_SIZE': config['TOKENIZER_VOCAB_SIZE'],
        'SEQUENCE_MAX_LENGTH': config['SEQUENCE_MAX_LENGTH'],
        'BATCH_SIZE': config['BATCH_SIZE'],
        'NUM_EPOCHS': config['NUM_EPOCHS'],
        'TRAIN_TEST_SPLIT': config['TRAIN_TEST_SPLIT'],
        'VALIDATION_SPLIT': config['VALIDATION_SPLIT'],
        'TRAINED_AT': datetime.datetime.now()
    }

    with open("models/" + model_name + "/details.yml", "w") as file:
        documents = yaml.dump(detail_dict, file)
Example #14
    def get_train_test_data(self):
        if not self.file_num == 7:
            # Get sequence Tokenizer
            tokenizer = Tokenizer(oov_token=self.oov_token)
            tokenizer.fit_on_texts(self.articles)
            sequences = tokenizer.texts_to_sequences(self.articles)
            article_sequences = pad_sequences(sequences,
                                              maxlen=self.max_len,
                                              truncating=self.trunc_type,
                                              padding=self.padding_type)
            word_index = tokenizer.word_index
            vocab_size = len(word_index)
            # Train test split
            split_index = int(len(article_sequences) * self.train_test_split)
            train_sequences = np.array(article_sequences[0:split_index])
            test_sequences = np.array(article_sequences[split_index:])
            # Get label Tokenizer
            label_tokenizer = Tokenizer()
            label_tokenizer.fit_on_texts(self.labels)
            label_sequences = label_tokenizer.texts_to_sequences(self.labels)
            train_label = np.array(label_sequences[0:split_index])
            test_label = np.array(label_sequences[split_index:])
            # Get Glove pre-trained word embedding
            embeddings_index = {}
            with open('glove/glove.6B.100d.txt', encoding='utf-8') as f:
                for line in f:
                    values = line.split()
                    word = values[0]
                    coefs = np.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs

            embeddings_matrix = np.zeros((vocab_size + 1, self.embedding_dim))
            for word, i in word_index.items():
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None:
                    embeddings_matrix[i] = embedding_vector
            # serialize objects to local storage
            tokenizer_json = tokenizer.to_json()
            label_tokenizer_json = label_tokenizer.to_json()
            # Save Tokenizer
            if not os.path.exists('./pre-trained/tokenizer.json'):
                with open('./pre-trained/tokenizer.json',
                          'w',
                          encoding='utf-8') as f:
                    f.write(json.dumps(tokenizer_json, ensure_ascii=False))
            if not os.path.exists('./pre-trained/label_tokenizer_json.json'):
                with open('./pre-trained/label_tokenizer_json.json',
                          'w',
                          encoding='utf-8') as f:
                    f.write(
                        json.dumps(label_tokenizer_json, ensure_ascii=False))
            # Save train test sequence and embeddings matrix
            if not os.path.exists('./pre-trained/train_sequences.npy'):
                np.save('./pre-trained/train_sequences.npy', train_sequences)
            if not os.path.exists('./pre-trained/train_label.npy'):
                np.save('./pre-trained/train_label.npy', train_label)
            if not os.path.exists('./pre-trained/test_sequences.npy'):
                np.save('./pre-trained/test_sequences.npy', test_sequences)
            if not os.path.exists('./pre-trained/test_label.npy'):
                np.save('./pre-trained/test_label.npy', test_label)
            if not os.path.exists('./pre-trained/embeddings_matrix.npy'):
                np.save('./pre-trained/embeddings_matrix.npy',
                        embeddings_matrix)
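A sketch of restoring the artefacts saved above for a later run; it assumes `np`, `json`, `tokenizer_from_json` and `keras.layers.Embedding` are importable in that environment.

with open('./pre-trained/tokenizer.json', encoding='utf-8') as f:
    tokenizer = tokenizer_from_json(json.load(f))   # json.load undoes the json.dumps above

train_sequences = np.load('./pre-trained/train_sequences.npy')
train_label = np.load('./pre-trained/train_label.npy')
embeddings_matrix = np.load('./pre-trained/embeddings_matrix.npy')

# The matrix was saved with shape (vocab_size + 1, embedding_dim), so it can
# seed a frozen Embedding layer directly.
embedding_layer = Embedding(input_dim=embeddings_matrix.shape[0],
                            output_dim=embeddings_matrix.shape[1],
                            weights=[embeddings_matrix],
                            trainable=False)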
Example #15
import string
import numpy as np
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable
token_index = dict(zip(characters, range(1, len(characters) + 1)))  # map characters to indices
max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results[i, j, index] = 1
print(len(results[1]))

from keras.preprocessing.text import Tokenizer
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
tokenizer = Tokenizer(num_words=1000)
print(tokenizer.to_json())
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
print(sequences)
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
print(one_hot_results[0])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
dimensionality = 1000
max_length = 10
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.
Example #16
class BiLSTM:
    def __init__(self,
                 epochs=5,
                 batch_size=36,
                 max_seq_len=25,
                 fit_verbose=2,
                 print_summary=True,
                 load_model_path=None,
                 tokenizer_path=None):
        self.epochs = epochs
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len
        self.fit_verbose = fit_verbose
        self.print_summary = print_summary
        self.encoder = LabelEncoder()

        if load_model_path:
            self.model = load_model(load_model_path)
            with open(tokenizer_path) as f:
                data = json.load(f)
                self.tokenizer = tokenizer_from_json(data)
        else:
            self.model = self.model_1b
            self.tokenizer = Tokenizer()

    def train(self, X_train, y_train, X_dev, y_dev):
        self.tokenizer.fit_on_texts(X_train)

        X_train = self.tokenizer.texts_to_sequences(X_train)
        X_train = pad_sequences(X_train, maxlen=self.max_seq_len)

        X_dev = self.tokenizer.texts_to_sequences(X_dev)
        X_dev = pad_sequences(X_dev, maxlen=self.max_seq_len)

        y_train = self.encoder.fit_transform(y_train)
        y_train = to_categorical(y_train)

        y_dev = self.encoder.fit_transform(y_dev)
        y_dev = to_categorical(y_dev)

        m = self.model()

        y_train_int = np.argmax(y_train, axis=1)
        cws = class_weight.compute_class_weight('balanced',
                                                np.unique(y_train_int),
                                                y_train_int)

        if self.print_summary:
            print(m.summary())
        m.fit(X_train,
              y_train,
              validation_data=(X_dev, y_dev),
              epochs=self.epochs,
              batch_size=self.batch_size,
              verbose=self.fit_verbose)
        predictions = m.predict(X_dev, verbose=1)
        print('Validation Loss:', log_loss(y_dev, predictions))
        print('Validation Accuracy',
              (predictions.argmax(axis=1) == y_dev.argmax(axis=1)).mean())
        print(
            'Validation F1 Score:',
            f1_score(y_dev.argmax(axis=1),
                     predictions.argmax(axis=1),
                     average='weighted'))
        m.save('models/bilstm.keras')
        tokenizer_json = self.tokenizer.to_json()
        with io.open('models/bilstm-tokenizer.json', 'w',
                     encoding='utf-8') as f:
            f.write(json.dumps(tokenizer_json, ensure_ascii=False))

        self.model = m

    def model_1b(self):
        """
        Using a Bidirectional LSTM.
        """
        model = Sequential()
        model.add(
            Embedding(input_dim=(len(self.tokenizer.word_counts) + 1),
                      output_dim=128,
                      input_length=self.max_seq_len))
        model.add(SpatialDropout1D(0.3))
        model.add(
            Bidirectional(LSTM(128, dropout=0.25, recurrent_dropout=0.25)))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(2, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        return model

    def test(self, X_test, y_test=None):
        X_test = self.tokenizer.texts_to_sequences(X_test)
        X_test = pad_sequences(X_test, maxlen=self.max_seq_len)

        predictions = self.model.predict(X_test, verbose=1)
        if y_test is not None:
            y_test = self.encoder.fit_transform(y_test)
            y_test = to_categorical(y_test)
            print('Test Loss:', log_loss(y_test, predictions))
            print('Test Accuracy',
                  (predictions.argmax(axis=1) == y_test.argmax(axis=1)).mean())
            print(
                'Test F1 Score:',
                f1_score(y_test.argmax(axis=1),
                         predictions.argmax(axis=1),
                         average='weighted'))
        predictions = np.argmax(predictions, axis=1)
        np.savetxt("preds/bilstm-preds.txt", predictions, fmt='%d')
        return predictions
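A hedged sketch of reusing what train() persists: the constructor's load_model_path/tokenizer_path branch restores both the network and the fitted tokenizer, so test() can run without retraining. The paths are the ones written above; the input sentence is illustrative.

clf = BiLSTM(load_model_path='models/bilstm.keras',
             tokenizer_path='models/bilstm-tokenizer.json')
preds = clf.test(["an example sentence to classify"])   # no labels: just predict
print(preds)  # predicted class indices, also written to preds/bilstm-preds.txt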