def test_tokenizer_oov_flag(self):
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown

    # Default, without OOV flag
    tokenizer = preprocessing_text.Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    self.assertEqual(len(x_test_seq[0]), 4)  # discards 2 OOVs

    # With OOV feature
    tokenizer = preprocessing_text.Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    self.assertEqual(len(x_test_seq[0]), 6)  # OOVs marked in place
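A minimal sketch of what the flag changes, assuming only the standard Keras preprocessing import: the OOV token is inserted into the vocabulary (at index 1 in current Keras versions), and every unknown word in new text maps to that index instead of being dropped.

from tensorflow.keras.preprocessing import text

tokenizer = text.Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(['This text has only known words'])

print(tokenizer.word_index['<unk>'])  # the OOV token gets its own index (1)
print(tokenizer.texts_to_sequences(['This text has some unknown words']))
# known words keep their indices; 'some' and 'unknown' both map to the OOV index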
def sequence_vectorize(train_texts, val_texts):
    """ Vectorizes texts as sequence vectors

	1 text = 1 sequence vector with fixed length

	# Returns
		x_train, x_val, word_index: vectorized training and validation texts and word index dictionary

	"""
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Sequences shorter than max_length are padded at the beginning;
    # longer sequences are truncated at the beginning.

    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    return x_train, x_val, tokenizer.word_index
Example #3
def sequence_vectorize(train_texts, val_texts):
    """Vectorizes texts as sequence vectors.
    1 text = 1 sequence vector with fixed length.
    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.
    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index
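A hedged usage sketch for this helper: it relies on module-level TOP_K and MAX_SEQUENCE_LENGTH constants and the Keras `text`/`sequence` imports, so a minimal calling context (with illustrative values matching those used elsewhere in these examples) might look like this.

from tensorflow.keras.preprocessing import text, sequence

TOP_K = 20000               # keep only the 20k most frequent words
MAX_SEQUENCE_LENGTH = 500   # cap on the padded/truncated sequence length

x_train, x_val, word_index = sequence_vectorize(
    ['first training document', 'second training document goes on a bit longer'],
    ['a validation document'])
print(x_train.shape, x_val.shape, len(word_index))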
Example #4
def sequentialize_data(train_contents, val_contents=None):
    """Vectorize data into ngram vectors.

    Args:
        train_contents: training instances
        val_contents: validation instances
        y_train: labels of train data.

    Returns:
        sparse ngram vectors of train, valid text inputs.
    """
    tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)
    tokenizer.fit_on_texts(train_contents)
    x_train = tokenizer.texts_to_sequences(train_contents)

    if val_contents:
        x_val = tokenizer.texts_to_sequences(val_contents)

    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQ_LENGTH:
        max_length = MAX_SEQ_LENGTH

    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    if val_contents:
        x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    word_index = tokenizer.word_index
    num_features = min(len(word_index) + 1, MAX_VOCAB_SIZE)
    if val_contents:
        return x_train, x_val, word_index, num_features, tokenizer, max_length
    else:
        return x_train, word_index, num_features, tokenizer, max_length
Example #5
def tokenizer_from_json(json_string):
    """Parses a JSON tokenizer configuration file and returns a
    tokenizer instance.
    # Arguments
        json_string: JSON string encoding a tokenizer configuration.
    # Returns
        A Keras Tokenizer instance
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get('config')

    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    index_docs = json.loads(config.pop('index_docs'))
    # Integer indexing gets converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))

    tokenizer = text.Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer
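A minimal round-trip sketch (assuming the same `text` import the function above uses, and a Keras version whose Tokenizer provides to_json()): serialize a fitted tokenizer and rebuild it with tokenizer_from_json().

from tensorflow.keras.preprocessing import text

tokenizer = text.Tokenizer(num_words=100)
tokenizer.fit_on_texts(['the cat sat on the mat', 'the dog sat on the log'])

json_string = tokenizer.to_json()           # configuration plus vocabulary statistics
restored = tokenizer_from_json(json_string)
assert restored.word_index == tokenizer.word_index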
  def test_tokenizer_unicode(self):
    texts = [
        u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz'
    ]
    tokenizer = preprocessing_text.Tokenizer(num_words=5)
    tokenizer.fit_on_texts(texts)

    self.assertEqual(len(tokenizer.word_counts), 5)
Example #7
def train_and_evaluate(output_dir, hparams):
    # Ensure the FileWriter cache is clear for the TensorBoard events file.
    tf.summary.FileWriterCache.clear()

    # Load Data
    ((train_texts, train_labels),
     (test_texts,
      test_labels)) = load_hacker_news_data(hparams['train_data_path'],
                                            hparams['eval_data_path'])

    # Create vocabulary from training corpus.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Generate vocabulary file from tokenizer object to enable
    # creating a native tensorflow lookup table later (see vectorize_sentences())
    # The output directory must exist before we can use tf.gfile.Open.
    tf.gfile.MkDir(output_dir)
    global VOCAB_FILE_PATH
    VOCAB_FILE_PATH = os.path.join(output_dir, 'vocab.txt')
    with tf.gfile.Open(VOCAB_FILE_PATH, 'wb') as f:
        f.write("{},0\n".format(PADWORD))  # map padword to 0
        for word, index in tokenizer.word_index.items():
            if index < TOP_K:  # only save mappings for TOP_K words
                f.write("{},{}\n".format(word, index))

    # Create estimator
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=500)
    estimator = keras_estimator(model_dir=output_dir,
                                config=run_config,
                                learning_rate=hparams['learning_rate'],
                                embedding_path=hparams['embedding_path'],
                                word_index=tokenizer.word_index)

    # Create TrainSpec
    train_steps = hparams['num_epochs'] * len(
        train_texts) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(train_texts,
                                  train_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=train_steps)

    # Create EvalSpec
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(test_texts,
                                  test_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=10)

    # Start training
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #8
def seq2int(data: 'tuple from getData()', mod: 'concat name of model data'):
    """ 
        Assing string data to random integrals,
        takes the tuple from getData(),
        returns tranform tuple and word_index.
        input: getData() ==> (train_data, y_train):80%, (test_data, y_test):20%
        output: int(train_data, y_train), int(test_data, y_test), word_index
    """
    (x_train, y_train), (x_test, y_test) = data

    try:
        # Check whether the cached data already exists; there is no reason
        # to build it more than once.
        x_train = joblib.load(
            os.path.join(pre_data_dir, mod,
                         '{0}_x_train_sequence.pkl'.format(mod)))
        x_test = joblib.load(
            os.path.join(pre_data_dir, mod,
                         '{0}_x_test_sequence.pkl'.format(mod)))
        tokenizer = joblib.load(
            os.path.join(pre_data_dir, mod, '{0}_word_index.pkl'.format(mod)))

        return (x_train, y_train), (x_test, y_test), tokenizer[0]

    except Exception:
        # The cached files are missing or unreadable, so build everything from scratch.
        # Maximum number of distinct words to keep
        TOP_K = 20000
        # Maximum length for a sequence of word indices
        MAX_SEQUENCE_LENGTH = 500
        # Instantiate the tokenizer
        tokenizer = text.Tokenizer(num_words=TOP_K)
        # Fit on the training set
        tokenizer.fit_on_texts(x_train)
        # Transform the train/test sets
        x_train = tokenizer.texts_to_sequences(x_train)
        x_test = tokenizer.texts_to_sequences(x_test)
        # checking if the longest data is longer than the MAX_SEQUENCE_LENGTH
        # since we work with tweets, MAX_SEQUENCE_LENGTH is always greater.
        max_length = len(max(x_train, key=len))
        if max_length > MAX_SEQUENCE_LENGTH:
            max_length = MAX_SEQUENCE_LENGTH
        # Pad sequences so every row has the same length
        x_train = sequence.pad_sequences(x_train, maxlen=max_length)
        x_test = sequence.pad_sequences(x_test, maxlen=max_length)

        # Save the processed arrays
        joblib.dump(
            x_train,
            os.path.join(pre_data_dir, mod,
                         '{0}_x_train_sequence.pkl'.format(mod)))
        joblib.dump(
            x_test,
            os.path.join(pre_data_dir, mod,
                         '{0}_x_test_sequence.pkl'.format(mod)))
        joblib.dump([tokenizer.word_index, max_length],
                    os.path.join(pre_data_dir, mod,
                                 '{0}_word_index.pkl'.format(mod)))

        return (x_train, y_train), (x_test, y_test), tokenizer.word_index
def tokenized_seq_vectors(X_train, X_test):
    tokenizer = text.Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    maxlength = len(max(X_train, key=len))
    X_train = sequence.pad_sequences(X_train, maxlen=maxlength)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlength)
    index = tokenizer.word_index
    return X_train, X_test, index
Example #10
def sequentialize_data(train_contents):
    MAX_VOCAB_SIZE = 200000
    tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)
    tokenizer.fit_on_texts(train_contents)
    x_train = tokenizer.texts_to_sequences(train_contents)
    max_length = len(max(x_train, key=len))
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    word_index = tokenizer.word_index
    num_features = min(len(word_index) + 1, MAX_VOCAB_SIZE)
    return x_train, word_index, num_features, tokenizer, max_length
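For context, a small usage sketch of this minimal variant, assuming `text` and `sequence` come from the Keras preprocessing module as in the surrounding snippets.

contents = ['the cat sat on the mat', 'the dog sat on the log and stayed there']
x_train, word_index, num_features, tokenizer, max_length = sequentialize_data(contents)
print(x_train.shape)    # (2, max_length); shorter rows are padded at the beginning
print(num_features)     # len(word_index) + 1, capped at MAX_VOCAB_SIZE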
def train_and_evaluate(output_dir, hparams):
    tf.summary.FileWriterCache.clear() # ensure filewriter cache is clear for TensorBoard events file

    # Load Data
    ((train_texts, train_labels), (test_texts, test_labels)) = load_hacker_news_data(
        hparams['train_data_path'], hparams['eval_data_path'])

    # Create vocabulary from training corpus.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Save token dictionary to use during prediction time
    pickle.dump(tokenizer, open('tokenizer.pickled', 'wb'))

    # Create estimator
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=500)
    # TODO: create estimator
    estimator = keras_estimator(
        model_dir=output_dir,
        config=run_config,
        learning_rate=hparams['learning_rate'],
        embedding_path=hparams['embedding_path'],
        word_index=tokenizer.word_index
    )

    # Create TrainSpec
    train_steps = hparams['num_epochs'] * len(train_texts) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=input_fn(
            train_texts,
            train_labels,
            tokenizer,
            hparams['batch_size'],
            mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=train_steps
    )

    # Create EvalSpec
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=input_fn(
            test_texts,
            test_labels,
            tokenizer,
            hparams['batch_size'],
            mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=10
    )

    # Start training
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #12
def train_and_evaluate(output_dir, hparams):

    # Ensure the FileWriter cache is clear for the TensorBoard events file.
    tf.compat.v1.summary.FileWriterCache.clear()

    (train_texts, train_labels), (test_texts, test_labels) = load_train_eval(
        hparams['train_data_path'], hparams['eval_data_path'])

    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(train_texts)

    # The output directory must exist before we can use tf.io.gfile.GFile.
    tf.io.gfile.mkdir(output_dir)
    global VOCAB_FILE_PATH
    VOCAB_FILE_PATH = os.path.join(output_dir, 'vocab.txt')
    with tf.io.gfile.GFile(VOCAB_FILE_PATH, 'wb') as f:
        f.write("{},0\n".format(PADWORD))  # map padword to 0
        for word, index in tokenizer.word_index.items():
            # write a line for every word in the vocabulary
            f.write("{},{}\n".format(word, index))

    runconfig = tf.estimator.RunConfig(save_checkpoints_steps=500)

    estimator = keras_estimator(model_dir=output_dir,
                                config=runconfig,
                                learning_rate=hparams['learning_rate'],
                                embedding_path=hparams['embedding_path'],
                                word_index=tokenizer.word_index,
                                embedding_dim=hparams['embedding_dim'])

    # Create TrainSpec
    train_steps = hparams['num_epochs'] * len(
        train_texts) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(train_texts,
                                  train_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=train_steps)

    # Create EvalSpec
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(test_texts,
                                  test_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=10)

    # Start training
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #13
def sequence_vectorizer(X_train, X_val):
    tokenizer = text.Tokenizer(num_words=maximum_features)
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_val = tokenizer.texts_to_sequences(X_val)
    maximum_length = len(max(X_train, key=len))
    if (maximum_length > max_sequence_length):
        maximum_length = max_sequence_length
    print(maximum_length)
    X_train = sequence.pad_sequences(X_train, maxlen=maximum_length)
    X_val = sequence.pad_sequences(X_val, maxlen=maximum_length)
    return X_train, X_val, tokenizer.word_index, tokenizer
Example #14
  def test_sequential_fit(self):
    texts = [
        'The cat sat on the mat.', 'The dog sat on the log.',
        'Dogs and cats living together.'
    ]
    word_sequences = [['The', 'cat', 'is', 'sitting'],
                      ['The', 'dog', 'is', 'standing']]
    tokenizer = preprocessing_text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)

    self.assertEqual(tokenizer.document_count, 5)

    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences)
Example #15
 def __init__(self, num_words = 2000, maxlen = 200, padding = 'post',
     truncating = 'post'):
     # Maximum number of words to keep
     self.num_words = num_words
     # Tokenizer
     self.tokenizer = text.Tokenizer(
         num_words = self.num_words,
         lower = True,
         split = ' '
     )
     # Maximum length of the output vector
     self.maxlen = maxlen
     # Pad with zeros
     self.padding = padding
     # Truncate sentences
     self.truncating = truncating
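To show how settings like these are typically consumed (a hedged sketch using the same values, independent of whatever methods the surrounding class defines):

from tensorflow.keras.preprocessing import text, sequence

tokenizer = text.Tokenizer(num_words=2000, lower=True, split=' ')
tokenizer.fit_on_texts(['the cat sat on the mat'])
seqs = tokenizer.texts_to_sequences(['the dog sat on the log'])
padded = sequence.pad_sequences(seqs, maxlen=200, padding='post', truncating='post')
print(padded.shape)     # (1, 200); zeros are appended after the sequence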
Example #16
def vectorize_data(training_text, validation_text, test_text):
  glyphs = " abcdefghijklmnopqrstuvwxyz"
  #trn = [' '.join([j for j in i]) for i in training_text]
  #val = [' '.join([j for j in i]) for i in validation_text]

  tokenizer = text.Tokenizer(lower=True, char_level=True, oov_token='@')
  tokenizer.fit_on_texts(training_text + validation_text + test_text)

  train = tokenizer.texts_to_sequences(training_text)
  validate = tokenizer.texts_to_sequences(validation_text)
  testing = tokenizer.texts_to_sequences(test_text)
  glyph_dictionary = tokenizer.word_index
  train = sequence.pad_sequences(train, maxlen=MAX_WORD_LENGTH, padding='post')
  validate = sequence.pad_sequences(validate, maxlen=MAX_WORD_LENGTH, padding='post')
  testing = sequence.pad_sequences(testing, maxlen=MAX_WORD_LENGTH, padding='post')
  return train, validate, testing, glyph_dictionary, tokenizer
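For reference, a small sketch of what char-level tokenization with an '@' OOV glyph produces; MAX_WORD_LENGTH is a module-level constant in the snippet above, so 12 below is just an illustrative value.

from tensorflow.keras.preprocessing import text, sequence

tok = text.Tokenizer(lower=True, char_level=True, oov_token='@')
tok.fit_on_texts(['cat', 'dog'])
seqs = tok.texts_to_sequences(['cab!'])     # 'b' and '!' were never seen, so both map to '@'
print(sequence.pad_sequences(seqs, maxlen=12, padding='post'))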
Example #17
def tokenize_vectorize(trainTexts, testTexts):
    # Tokenization and vectorization for sequence models. This approach assumes
    # that word order matters in the text, which suits CNN and RNN models.
    # Create vocabulary with the training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K, lower=False)
    tokenizer.fit_on_texts(trainTexts.text)
    word_index_text = tokenizer.word_index
    # Extend the vocabulary with the training titles. Note that fit_on_texts
    # accumulates, so word_index now reflects both text and title vocabularies.
    tokenizer.fit_on_texts(trainTexts.title)
    word_index_title = tokenizer.word_index
    #Vectorize the training and validation texts
    trainSetText = tokenizer.texts_to_sequences(trainTexts.text)
    testSetText = tokenizer.texts_to_sequences(testTexts.text)
    trainSetTitle = tokenizer.texts_to_sequences(trainTexts.title)
    testSetTitle = tokenizer.texts_to_sequences(testTexts.title)

    #Get max sequence length
    max_length = len(max(trainSetText, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    #Fix sequence length to max value.
    #The sequence is padded in the beginning if shorter than the length
    #and longer sequences are truncated
    trainSetText = sequence.pad_sequences(trainSetText, maxlen=max_length)
    trainSetTitle = sequence.pad_sequences(trainSetTitle, maxlen=max_length)
    testSetText = sequence.pad_sequences(testSetText, maxlen=max_length)
    testSetTitle = sequence.pad_sequences(testSetTitle, maxlen=max_length)
    trainSetText = numpy.array(trainSetText)
    trainSetTitle = numpy.array(trainSetTitle)
    testSetText = numpy.array(testSetText)
    testSetTitle = numpy.array(testSetTitle)

    #Commented out to fit with other changes to the classifier model
    #trainSetText =  trainSetText.reshape((trainSetText.shape[0], trainSetText.shape[1], 1))
    #testSetText =  testSetText.reshape((testSetText.shape[0], testSetText.shape[1], 1))
    #trainSetTitle = trainSetTitle.reshape((trainSetTitle.shape[0], trainSetTitle.shape[1], 1))
    #testSetTitle =  testSetTitle.reshape((testSetTitle.shape[0], testSetTitle.shape[1], 1))
    #Shape should be 35918, 500, 1

    X_train = [trainSetText, trainSetTitle]
    X_test = [testSetText, testSetTitle]

    #Labels- Converting labels to binary vectors
    Y_train = to_categorical(trainTexts.label, num_classes=2)
    Y_test = to_categorical(testTexts.label, num_classes=2)

    return X_train, X_test, Y_train, Y_test, word_index_text, word_index_title
Example #18
def train_and_evaluate(output_dir, hparams):
    # Load Data
    ((train_texts, train_labels), (test_texts, test_labels)) = load_hacker_news_data(
        hparams['train_data_path'], hparams['eval_data_path'])

    # Create vocabulary from training corpus.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Save token dictionary to use during prediction time
    pickle.dump(tokenizer, open('tokenizer.pickled', 'wb'))

    # Create estimator
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=1000)
    estimator = keras_estimator(
        model_dir=output_dir,
        config=run_config,
        learning_rate=hparams['learning_rate'],
        embedding_path=hparams['embedding_path'],
        word_index=tokenizer.word_index
    )

    # Create TrainSpec
    train_steps = hparams['num_epochs'] * len(train_texts) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=input_fn(
            train_texts,
            train_labels,
            tokenizer,
            hparams['batch_size'],
            mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=train_steps
    )

    # Create EvalSpec
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=input_fn(
            test_texts,
            test_labels,
            tokenizer,
            hparams['batch_size'],
            mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=10
    )

    # Start training
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #19
def train_and_predict(X_train, X_test, Y_train, Y_test):
    max_words = 1226
    tokenize = text.Tokenizer(num_words=max_words, char_level=False)
    tokenize.fit_on_texts(X_train)

    x_train = tokenize.texts_to_matrix(X_train)
    x_test = tokenize.texts_to_matrix(X_test)

    encoder = LabelEncoder()
    encoder.fit(Y_train)
    y_train = encoder.transform(Y_train)
    y_test = encoder.transform(Y_test)

    num_classes = np.max(y_train) + 1
    y_train = utils.to_categorical(y_train, num_classes)
    y_test = utils.to_categorical(y_test, num_classes)

    batch_size = 32
    epochs = 1000

    # Build the model
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words, )))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    tensorboard = TensorBoard(log_dir="logs/{}".format(time.time()))

    history = model.fit(x_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.1,
                        callbacks=[tensorboard])

    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    print('Test accuracy:', score[1])
Example #20
    def test_tokenizer(self):
        texts = [
            'The cat sat on the mat.', 'The dog sat on the log.',
            'Dogs and cats living together.'
        ]
        tokenizer = preprocessing_text.Tokenizer(num_words=10)
        tokenizer.fit_on_texts(texts)

        sequences = []
        for seq in tokenizer.texts_to_sequences_generator(texts):
            sequences.append(seq)
        self.assertLess(np.max(np.max(sequences)), 10)
        self.assertEqual(np.min(np.min(sequences)), 1)

        tokenizer.fit_on_sequences(sequences)

        for mode in ['binary', 'count', 'tfidf', 'freq']:
            matrix = tokenizer.texts_to_matrix(texts, mode)
            self.assertEqual(matrix.shape, (3, 10))
def sequence_vectorize(train_texts, val_texts, k=1):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.
        k: int, suffix used in the saved tokenizer's filename.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    print('Tokenizing')
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)
    print('Vectorizing')
    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    print('Padding/Truncating Sequences')
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    # Save Tokenizer to Disk
    print('Saving Tokenizer')
    tokenConfig = tokenizer.to_json()
    with open('amazon_sepcnn_' + str(k) + 'k_tokenizer.json', 'w') as f:
        f.write(tokenConfig)

    return x_train, x_val, tokenizer.word_index
Example #22
    def __init__(self,
                 vectorizer_mode=TFIDF_MODE,
                 max_features=60000,
                 verbose=False):
        self.le = LabelEncoder()
        self.stemmer = SnowballStemmer("english")
        self.vectorizer_mode = vectorizer_mode
        self.max_features = max_features
        self.verbose = verbose

        if vectorizer_mode == TFIDF_MODE:

            self.tfidf = TfidfVectorizer(stop_words=stopwords.words('english'),
                                         max_features=self.max_features,
                                         ngram_range=(1, 2),
                                         token_pattern=token_pattern,
                                         tokenizer=custom_tokenizer)

        elif vectorizer_mode == EMBEDDING_MODE:
            self.tokenizer = text.Tokenizer(num_words=self.max_features)
Example #23
def sequence_vectorize(train_texts, val_texts, number_of_features, max_sequence_length):

    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=number_of_features)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > max_sequence_length:
        max_length = max_sequence_length

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index
Example #24
def sequence_vectorize(train, val, test, num_words=10000, max_seq_length=100):
    """
    Vectorizes texts as sequence vectors.
    1 text = 1 sequence vector with fixed length.

    Args:
        train: list, training speeches
        val: list, validation speeches
        test: list, test speeches

    Kwargs:
        num_words: int, number of words to keep
        max_seq_length: int, make all sequences of this length

    Returns:
        x_train, x_val, x_test, word_index: vectorized training, validation, test
            speeches and word index dictionary.
    """

    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=num_words,
                               lower=True,
                               oov_token='<unk>')
    tokenizer.fit_on_texts(train)

    # Vectorize training and validation texts.
    # Transforms each text to a sequence of integers.
    x_train = tokenizer.texts_to_sequences(train)
    x_val = tokenizer.texts_to_sequences(val)
    x_test = tokenizer.texts_to_sequences(test)

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_seq_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_seq_length)
    x_test = sequence.pad_sequences(x_test, maxlen=max_seq_length)

    return x_train, x_val, x_test, tokenizer.word_index
Example #25
def sequence_vectorize(pre, hyp, top_k=20000, max_seq_len=500):
    """Vectorizes texts as sequence vectors."""

    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=top_k)
    tokenizer.fit_on_texts(pre + hyp)

    # Vectorize premise and hypothesis texts.
    x_pre = tokenizer.texts_to_sequences(pre)
    x_hyp = tokenizer.texts_to_sequences(hyp)

    # Get max sequence length.
    max_length = len(max(x_hyp + x_pre, key=len))
    if max_length > max_seq_len:
        max_length = max_seq_len

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_hyp = sequence.pad_sequences(x_hyp, maxlen=max_length).astype('float32')
    x_pre = sequence.pad_sequences(x_pre, maxlen=max_length).astype('float32')
    return x_hyp, x_pre, tokenizer
Example #26
def vectorize_data(train_data, test_data):
    """
    Construye la secuencias de palabras para cada aviso de propiedad. Las palabras se encuentran indexadas.
    :param train_data:  dataframe que almacena los avisos del conjunto de entrenamiento
    :param test_data:   dataframe que almacena los avisos del conjunto de prueba
    :return:            x_train: lista de las secuencias palabras (índices) de los textos de entrenamiento
                        x_test:  lista de las secuencias palabras (índices) de los textos de prueba
                        tokenizer_obj: objeto de la clase Tokenizer
    """
    # Límite del vector de características
    limit_words = 20000

    # Une el nombre y descripcion de una propiedad en un solo texto
    train_texts = (
        train_data['nombre'].apply(lambda x: x if x is not np.nan else ' ') +
        ' ' + train_data['descripcion'].apply(lambda x: x
                                              if x is not np.nan else ' '))

    test_texts = (
        test_data['nombre'].apply(lambda x: x if x is not np.nan else ' ') +
        ' ' +
        test_data['descripcion'].apply(lambda x: x if x is not np.nan else ' ')
    )

    # Vocabulary
    tokenizer_obj = text.Tokenizer(limit_words)
    tokenizer_obj.fit_on_texts(train_texts.to_list())

    # Vectorize both sets
    x_train = tokenizer_obj.texts_to_sequences(train_texts.to_list())
    x_test = tokenizer_obj.texts_to_sequences(test_texts.to_list())

    # Get the maximum sequence length and pad both sets to that value
    max_dimension = max(len(max(x_train, key=len)), len(max(x_test, key=len)))

    x_train = sequence.pad_sequences(x_train, maxlen=max_dimension)
    x_test = sequence.pad_sequences(x_test, maxlen=max_dimension)

    return x_train, x_test, tokenizer_obj
def sequence_vectorize(texts):
    """Vectorizes texts as sequence vectors.
    # Arguments
        train_texts: list, training text strings.
    # Returns
        x_train, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(texts)

    # Vectorize text.
    vectors = tokenizer.texts_to_sequences(texts)

    # Get max sequence length.
    max_length = len(max(vectors, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Add padding to sequences.
    padded_vectors = sequence.pad_sequences(vectors, maxlen=max_length)

    return padded_vectors, tokenizer.word_index
Example #28
def create_tokenizer(input_ds,
                     top_words,
                     preprocess_fn=None,
                     key='text',
                     oov_token=None,
                     tokenizer=None):
    '''
    Creates a keras.preprocessing.text.Tokenizer object based on the
    input dataset, the top number of words, and an optional NLP
    preprocessing function.

    args:
        input_ds: list of dicts; each dict contains the text to tokenize
            under the key given by `key`
        key: str, default 'text', name of the field holding the text
        top_words: int, top number of words to keep in the vocabulary
        preprocess_fn: function, default None; if given, each text in input_ds
            is also passed through this function and the result is used to
            train the tokenizer (the raw text is included as well)
        oov_token: str, default None; if not None, the token used to replace
            out-of-vocabulary words
        tokenizer: keras.preprocessing.text.Tokenizer, default None; an existing
            tokenizer to train further on input_ds. If passed, the top_words and
            oov_token args are ignored.

    returns: keras.preprocessing.text.Tokenizer object
    '''
    word_list = []  #list of texts
    for item in input_ds:
        words = item[key]
        word_list.append(words)
        if preprocess_fn:
            preprocessed_words = preprocess_fn(words)
            word_list.append(preprocessed_words)
    if tokenizer is None:
        tokenizer = text.Tokenizer(num_words=top_words, oov_token=oov_token)
    tokenizer.fit_on_texts(word_list)
    return tokenizer
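A short, hedged usage sketch with a made-up dataset and an illustrative lower-casing preprocessor; it assumes `text` is the Keras preprocessing module, as in the other snippets.

docs = [{'text': 'The cat sat on the mat.'}, {'text': 'Dogs and cats living together.'}]
tok = create_tokenizer(docs, top_words=1000,
                       preprocess_fn=lambda s: s.lower(),
                       oov_token='<unk>')
print(tok.texts_to_sequences(['an unseen cat']))  # unknown words map to the OOV index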
Example #29
def train_and_evaluate(output_dir, hparams):
    # ensure filewriter cache is clear for TensorBoard events file
    tf.summary.FileWriterCache.clear()

    # Load Data
    ((train_texts, train_labels),
     (test_texts, test_labels)) = load_review_data(hparams['train_data_path'],
                                                   hparams['eval_data_path'])

    # Create vocabulary from training corpus.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Generate vocabulary file from tokenizer object to enable
    # creating a native tensorflow lookup table later (used in vectorize_sentences())
    # The output directory must exist before we can use tf.gfile.Open.
    tf.gfile.MkDir(output_dir)
    global VOCAB_FILE_PATH
    VOCAB_FILE_PATH = os.path.join(output_dir, 'vocab.txt')
    with tf.gfile.Open(VOCAB_FILE_PATH, 'wb') as f:
        f.write("{},0\n".format(PADWORD))  # map padword to 0
        for word, index in tokenizer.word_index.items():
            if index < TOP_K:  # only save mappings for TOP_K words
                f.write("{},{}\n".format(word, index))

    # Create estimator
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=100,
                                        save_summary_steps=100)

    if hparams['model_type'] == 'CNN':
        estimator = keras_CNN_estimator(
            model_dir=output_dir,
            config=run_config,
            learning_rate=hparams['learning_rate'],
            filters=hparams['filters'],
            dropout_rate=hparams['dropout_rate'],
            embedding_dim=hparams['embedding_dim'],
            kernel_size=hparams['kernel_size'],
            pool_size=hparams['pool_size'],
            embedding_path=hparams['embedding_path'],
            word_index=tokenizer.word_index)
    elif hparams['model_type'] == 'LSTM':
        estimator = keras_LSTM_estimator(
            model_dir=output_dir,
            config=run_config,
            learning_rate=hparams['learning_rate'],
            dropout_rate=hparams['dropout_rate'],
            embedding_dim=hparams['embedding_dim'],
            embedding_path=hparams['embedding_path'],
            word_index=tokenizer.word_index)
    elif hparams['model_type'] == 'BiDirect':
        estimator = keras_BiDirect_estimator(
            model_dir=output_dir,
            config=run_config,
            learning_rate=hparams['learning_rate'],
            dropout_rate=hparams['dropout_rate'],
            embedding_dim=hparams['embedding_dim'],
            embedding_path=hparams['embedding_path'],
            word_index=tokenizer.word_index)

    ### Add evaluating metric
    #estimator = tf.contrib.estimator.add_metrics(estimator, my_acc)

    ### Add early stopping
    early_stopping = tf.estimator.experimental.stop_if_no_decrease_hook(
        estimator,
        metric_name='loss',
        max_steps_without_decrease=1000,
        min_steps=100)

    # Create TrainSpec
    train_steps = hparams['num_epochs'] * len(
        train_texts) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(train_texts,
                                  train_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.TRAIN),
        hooks=[early_stopping],
        max_steps=train_steps)

    # Create EvalSpec
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(test_texts,
                                  test_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=10)

    # Start training
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #30
    def train(self, texts: List[str], target: List[int]) -> None:

        from tensorflow.python.keras.models import Sequential  #type: ignore
        from tensorflow.python.keras.layers import Embedding, Dense, LSTM, GlobalMaxPool1D  #type: ignore
        from tensorflow.keras.optimizers import Adam  #type: ignore
        from tensorflow.keras.callbacks import History  #type: ignore

        if self.downsampling:
            texts, target = downsample(texts, target, self.downsampling_ratio)

        if self.verbose:
            print('1. Vectorizing texts')

        NUMBER_OF_FEATURES: int = 20000
        self.tokenizer = text.Tokenizer(num_words=NUMBER_OF_FEATURES)
        self.tokenizer.fit_on_texts(texts)
        vocabulary: Dict[str, int] = self.tokenizer.word_index

        if self._max_sequence_length == 0:
            self._max_sequence_length = len(max(texts, key=len))

        vectorized_texts: array = self.vectorize_texts(texts)

        if self.embedding_location == '':
            if self.verbose:
                print('2. Skip (no embeddings)')
                print('3. Skip (no embeddings)')
        else:
            if self.verbose:
                print('2. Loading word embeddings')

            embedding_dictionary: Dict[
                str, List[float]] = load_embedding_dictionary(
                    self.embedding_location)
            # Infer the embedding dimensionality from one of the entries.
            nr_of_embedding_features: int = len(
                list(embedding_dictionary.values())[1])

            if self.verbose:
                print('3. Creating embedding matrix')

            embedding_matrix: array = create_embedding_matrix_for_vocabulary(
                embedding_dictionary, vocabulary)

        if self.verbose:
            print('4. Building up model')

        #Define a simple LSTM model with a pretrained embedding layer
        model: Sequential = Sequential()

        if self.embedding_location == '':
            #Add an empty embedding layer if we have no pretrained embeddings
            EMPTY_EMBEDDING_LAYER_SIZE: int = 300
            model.add(
                Embedding(len(vocabulary) + 1, EMPTY_EMBEDDING_LAYER_SIZE))

        else:
            model.add(
                Embedding(input_dim=len(vocabulary) + 1,
                          output_dim=nr_of_embedding_features,
                          input_length=vectorized_texts.shape[1],
                          weights=[embedding_matrix],
                          trainable=False))

        model.add(LSTM(16, return_sequences=True))
        model.add(LSTM(16, return_sequences=True))
        model.add(LSTM(16, return_sequences=True))
        model.add(GlobalMaxPool1D())

        model.add(Dense(256))
        model.add(Dense(256))

        model.add(Dense(1, activation='sigmoid'))

        #Compile the model
        optimizer: Adam = Adam(lr=self.learning_rate)
        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['acc'])

        if self.verbose:
            print('5. training the model')

        history: History = model.fit(
            vectorized_texts,
            target,
            epochs=self.learning_epochs,
            #validation_data=(test_vectors, test_target),
            verbose=1,  # Logs once per epoch.
            batch_size=self.learning_batch_size)

        self.model = model