Example #1
def test_tokenizer_serde_fitting():
    sample_texts = [
        'There was a time that the pieces fit, but I watched them fall away',
        'Mildewed and smoldering, strangled by our coveting',
        'I\'ve done the math enough to know the dangers of our second guessing'
    ]
    tokenizer = text.Tokenizer(num_words=100)
    tokenizer.fit_on_texts(sample_texts)

    seq_generator = tokenizer.texts_to_sequences_generator(sample_texts)
    sequences = [seq for seq in seq_generator]
    tokenizer.fit_on_sequences(sequences)

    tokenizer_json = tokenizer.to_json()
    recovered = text.tokenizer_from_json(tokenizer_json)

    assert tokenizer.char_level == recovered.char_level
    assert tokenizer.document_count == recovered.document_count
    assert tokenizer.filters == recovered.filters
    assert tokenizer.lower == recovered.lower
    assert tokenizer.num_words == recovered.num_words
    assert tokenizer.oov_token == recovered.oov_token

    assert tokenizer.word_docs == recovered.word_docs
    assert tokenizer.word_counts == recovered.word_counts
    assert tokenizer.word_index == recovered.word_index
    assert tokenizer.index_word == recovered.index_word
    assert tokenizer.index_docs == recovered.index_docs
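
These snippets reference the Keras text-preprocessing module (under names such as text, sequence, kt, ks) without showing their imports. A minimal setup sketch, assuming the keras.preprocessing import path (the same API also ships as the standalone keras_preprocessing package):

import numpy as np
from keras.preprocessing import text, sequence

# Smoke test for the assumed imports: fit a tokenizer and round-trip it
# through JSON, as Example #1 above does.
tok = text.Tokenizer(num_words=10)
tok.fit_on_texts(['a small sanity check'])
assert text.tokenizer_from_json(tok.to_json()).word_index == tok.word_index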
Example #2
    def get_tokenizer(self):
        '''
        Build a Tokenizer from the stop-word-filtered corpus, or load it from
        disk if a serialized copy already exists.
        :return: tokenizer
        '''
        if not os.path.exists(self.tokenizer_path):
            self.remove_stop_word_list = self.get_remove_stop_word()
            tokenizer = text.Tokenizer(num_words=self.embedded_matrix_size,
                                       lower=False,
                                       char_level=False)
            tokenizer.fit_on_texts(self.remove_stop_word_list)
            tokenizer_json = tokenizer.to_json()
            with open(self.tokenizer_path, "w") as f:
                f.write(tokenizer_json)
                print("Saved tokenizer JSON to '{}'".format(
                    self.tokenizer_path))
            return tokenizer
        else:
            print("To switch datasets, delete {} manually; the tokenizer "
                  "will be regenerated the next time the code runs.".format(
                      self.tokenizer_path))
            with open(self.tokenizer_path, "r") as f:
                tokenizer_json = f.read()
            tokenizer = text.tokenizer_from_json(tokenizer_json)
            print("Loaded tokenizer JSON from '{}'".format(
                self.tokenizer_path))
            return tokenizer
Example #3
    def __init__(self,
                 train_data_path='data/token_train.tsv',
                 max_document_length=100,
                 vocabulary_size=5000,
                 embedding_size=300,
                 dropout_keep_prob=0.5,
                 lr=1e-4,
                 batch_size=50,
                 num_epochs=5,
                 dev_size=0.2):
        self.dev_size = dev_size
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.lr = lr
        self.dropout_keep_prob = dropout_keep_prob
        self.embedding_size = embedding_size
        self.vocabulary_size = vocabulary_size
        self.max_document_length = max_document_length
        self.train_data_path = train_data_path
        self.pickle_path = os.path.splitext(train_data_path)[0] + '.model.p'
        self._tokenizer = text.Tokenizer(num_words=vocabulary_size,
                                         char_level=False,
                                         filters='')

        raw_x, raw_y = self._aggregate_raw_data_from_dir(self.train_data_path)
        self._tokenizer.fit_on_texts(raw_x)

        self.x_train, self.y_train = self._prepare_from_raw_data(raw_x, raw_y)
        self.model = self._create_model()
Example #4
def test_tokenizer_unicode():
    sample_texts = [u'ali veli kırk dokuz elli',
                    u'ali veli kırk dokuz elli veli kırk dokuz']
    tokenizer = text.Tokenizer(num_words=5)
    tokenizer.fit_on_texts(sample_texts)

    assert len(tokenizer.word_counts) == 5
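
A side note on num_words: it does not cap what the tokenizer records while fitting (word_counts and word_index keep every word seen); it only drops indices >= num_words when texts are transformed. A short sketch with made-up text illustrating that behaviour (an aside, not part of the original test):

from keras.preprocessing import text

tokenizer = text.Tokenizer(num_words=3)
tokenizer.fit_on_texts(['a b c d e a b a b c'])
print(len(tokenizer.word_index))                 # 5 -- every word is indexed
print(tokenizer.texts_to_sequences(['a b c d e']))
# [[1, 2]] -- only indices strictly below num_words survive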
Example #5
def test_tokenizer_oov_flag():
    """Test of Out of Vocabulary (OOV) flag in text.Tokenizer
    """
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown

    # Default, without OOV flag
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 4  # discards 2 OOVs

    # With OOV feature
    tokenizer = text.Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 6  # OOVs marked in place
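
A short follow-up sketch to the test above (an aside, assuming keras_preprocessing's default behaviour of giving the oov_token index 1 in word_index): the two unknown words both surface as that index in the output sequence.

from keras.preprocessing import text

tokenizer = text.Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(['This text has only known words'])
seq = tokenizer.texts_to_sequences(['This text has some unknown words'])[0]
oov_index = tokenizer.word_index['<unk>']    # 1 by default
assert seq.count(oov_index) == 2             # 'some' and 'unknown'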
Example #6
def test_tokenizer_serde_no_fitting():
    tokenizer = text.Tokenizer(num_words=100)

    tokenizer_json = tokenizer.to_json()
    recovered = text.tokenizer_from_json(tokenizer_json)

    assert tokenizer.get_config() == recovered.get_config()

    assert tokenizer.word_docs == recovered.word_docs
    assert tokenizer.word_counts == recovered.word_counts
    assert tokenizer.word_index == recovered.word_index
    assert tokenizer.index_word == recovered.index_word
    assert tokenizer.index_docs == recovered.index_docs
Example #7
def test_tokenizer_lower_flag():
    """Tests for `lower` flag in text.Tokenizer
    """
    # word level tokenizer with sentences as texts
    word_tokenizer = text.Tokenizer(lower=True)
    texts = [
        'The cat sat on the mat.', 'The dog sat on the log.',
        'Dog and Cat living Together.'
    ]
    word_tokenizer.fit_on_texts(texts)
    expected_word_counts = OrderedDict([('the', 4), ('cat', 2), ('sat', 2),
                                        ('on', 2), ('mat', 1), ('dog', 2),
                                        ('log', 1), ('and', 1), ('living', 1),
                                        ('together', 1)])
    assert word_tokenizer.word_counts == expected_word_counts

    # word level tokenizer with word_sequences as texts
    word_tokenizer = text.Tokenizer(lower=True)
    word_sequences = [['The', 'cat', 'is', 'sitting'],
                      ['The', 'dog', 'is', 'standing']]
    word_tokenizer.fit_on_texts(word_sequences)
    expected_word_counts = OrderedDict([('the', 2), ('cat', 1), ('is', 2),
                                        ('sitting', 1), ('dog', 1),
                                        ('standing', 1)])
    assert word_tokenizer.word_counts == expected_word_counts

    # char level tokenizer with sentences as texts
    char_tokenizer = text.Tokenizer(lower=True, char_level=True)
    texts = [
        'The cat sat on the mat.', 'The dog sat on the log.',
        'Dog and Cat living Together.'
    ]
    char_tokenizer.fit_on_texts(texts)
    expected_word_counts = OrderedDict([('t', 11), ('h', 5), ('e', 6),
                                        (' ', 14), ('c', 2), ('a', 6),
                                        ('s', 2), ('o', 6), ('n', 4), ('m', 1),
                                        ('.', 3), ('d', 3), ('g', 5), ('l', 2),
                                        ('i', 2), ('v', 1), ('r', 1)])
    assert char_tokenizer.word_counts == expected_word_counts
Example #8
def test_sequential_fit():
    texts = [
        'The cat sat on the mat.', 'The dog sat on the log.',
        'Dogs and cats living together.'
    ]
    word_sequences = [['The', 'cat', 'is', 'sitting'],
                      ['The', 'dog', 'is', 'standing']]

    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)

    assert tokenizer.document_count == 5

    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences)
Example #9
def test_tokenizer():
    sample_texts = ['The cat sat on the mat.',
                    'The dog sat on the log.',
                    'Dogs and cats living together.']
    tokenizer = text.Tokenizer(num_words=10)
    tokenizer.fit_on_texts(sample_texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(sample_texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        tokenizer.texts_to_matrix(sample_texts, mode)
Example #10
def get_word2vec(dataframe):
    # Segment each title into words with jieba
    wordList = dataframe["title"].apply(lambda x: list(jieba.cut(x)))
    texts = []
    stoplist = []  # stop-word list (empty here, so nothing is actually removed)
    # Drop stop words
    for words in wordList:
        line = [word for word in words if word not in stoplist]
        texts.append(line)
    # Length of the longest segmented title
    maxlen = 0
    for line in texts:
        if maxlen < len(line):
            maxlen = len(line)
    max_words = 50000
    # Use the Keras Tokenizer to map words to integer indices, then pad the
    # sequences to a common length
    tokenizer = text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    data_w = tokenizer.texts_to_sequences(texts)
    word2vec = sequence.pad_sequences(data_w, maxlen=maxlen)
    return word2vec
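
One detail about the padding step above: pad_sequences pads and truncates at the front by default, so shorter sequences are left-padded with zeros; pass padding='post' for trailing zeros. A minimal sketch with made-up sequences:

from keras.preprocessing import sequence

seqs = [[5, 2, 9], [7]]
print(sequence.pad_sequences(seqs, maxlen=4))
# [[0 5 2 9]
#  [0 0 0 7]]
print(sequence.pad_sequences(seqs, maxlen=4, padding='post'))
# [[5 2 9 0]
#  [7 0 0 0]]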
Example #11
    def __init__(self):
        """Constructor for the selector."""
        logging.info('Initializing tokenizer..')

        words, embedding_matrix = self._build_embedding_matrix()
        self.tokenizer = text.Tokenizer(num_words=len(words), lower=False)
        # Tokenizer treats each item in a nested list as a token.
        self.tokenizer.fit_on_texts([[word] for word in words])

        # Prepend a row of zeros to the embedding matrix; it will be used for
        # out-of-vocabulary words.
        embedding_matrix = np.concatenate(
            [np.zeros((1, embedding_matrix.shape[1])), embedding_matrix])

        assert len(words) == len(self.tokenizer.word_index), (
            'embeddings_matrix and tokenizer.word_index do not have the same size:'
            ' {} and {}, respectively'.format(len(words),
                                              len(self.tokenizer.word_index)))
        assert all([
            self.tokenizer.word_index[word] == i + 1
            for i, word in enumerate(words)
        ]), ('embeddings_matrix and tokenizer.word_index are not aligned.')

        self.model = self._build_model(embedding_matrix)
Example #12
def load_dataset(test_sen=None):

    EMBEDDING_FILE = config.EMBEDDING_300

    df = pd.read_csv(config.DATA_PATH)
    X = df["content"].values
    Y = df["label"].values
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.3,
                                                        random_state=109)

    # data preprocessing
    print(X[0])
    puncList = ["।", "”", "“", "’"]
    x = "".join(puncList)
    filterString = x + '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n০১২৩৪৫৬৭৮৯'
    tokenizer = text.Tokenizer(
        num_words=50000,
        filters=filterString,
        lower=False,
    )
    tokenizer.fit_on_texts(x_train)
    train_idx = tokenizer.texts_to_sequences(x_train)
    test_idx = tokenizer.texts_to_sequences(x_test)
    word_index = tokenizer.word_index

    embeddings_index = {}
    for line in open(EMBEDDING_FILE, encoding="utf-8"):
        val = line.split()
        embeddings_index[val[0]] = np.asarray(val[1:], dtype='float32')
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    x_train = sequence.pad_sequences(train_idx,
                                     maxlen=32,
                                     padding='post',
                                     truncating='post')
    x_test = sequence.pad_sequences(test_idx,
                                    maxlen=32,
                                    padding='post',
                                    truncating='post')

    test_size = len(x_test)

    dev_size = int(test_size * 0.1)

    x_dev = x_test[:dev_size]
    x_test = x_test[dev_size:]
    y_dev = y_test[:dev_size]
    y_test = y_test[dev_size:]

    x_train = torch.tensor(x_train, dtype=torch.long)
    y_train = torch.tensor(y_train, dtype=torch.float32)

    train = TensorDataset(x_train, y_train)
    train_iter = DataLoader(train, batch_size=32)

    x_test = torch.tensor(x_test, dtype=torch.long)
    y_test = torch.tensor(y_test, dtype=torch.float32)

    test = TensorDataset(x_test, y_test)
    test_iter = DataLoader(test, batch_size=32)

    x_dev = torch.tensor(x_dev, dtype=torch.long)
    y_dev = torch.tensor(y_dev, dtype=torch.float32)

    valid = TensorDataset(x_dev, y_dev)
    valid_iter = DataLoader(valid, batch_size=32)
    word_embeddings = embedding_matrix
    vocab_size = 50000

    return vocab_size, word_embeddings, train_iter, valid_iter, test_iter
Example #13
# Data Loading
train = pd.read_csv('../Data/MELD.Raw/train_sent_emo.csv', dtype=str)
validation = pd.read_csv('../Data/MELD.Raw/dev_sent_emo.csv', dtype=str)
test = pd.read_csv('../Data/MELD.Raw/test_sent_emo.csv', dtype=str)

# Create IDs that match the per-utterance image files (diaX_uttY.jpg)
train["ID"] = 'dia' + train["Dialogue_ID"] + '_utt' + train[
    "Utterance_ID"] + '.jpg'
validation["ID"] = 'dia' + validation["Dialogue_ID"] + '_utt' + validation[
    "Utterance_ID"] + '.jpg'
test["ID"] = 'dia' + test["Dialogue_ID"] + '_utt' + test[
    "Utterance_ID"] + '.jpg'

# Text Features
tokenizer = kt.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train['Utterance'])

vocab_size = len(tokenizer.word_index) + 1

train_tokens = tokenizer.texts_to_sequences(train['Utterance'])
text_features = pd.DataFrame(ks.pad_sequences(train_tokens, maxlen=200))

validation_tokens = tokenizer.texts_to_sequences(validation['Utterance'])
validation_features = pd.DataFrame(
    ks.pad_sequences(validation_tokens, maxlen=200))


# Data Pipeline
def train_generator(features, batch):
    train_generator = ki.ImageDataGenerator(rescale=1. / 255.)
Example #14
    def trainer(dict_csv='test.csv'):
        data = pd.read_csv(dict_csv, engine='python')
        train_size = int(len(data) * .7)
        train_posts = data['documents'][:train_size]
        train_tags = data['tags'][:train_size]
        test_posts = data['documents'][train_size:]
        test_tags = data['tags'][train_size:]
        posts = data['documents']
        dlp_data = {'filename': [], 'tags': []}
        vocab_size = 10000
        tokenize = text.Tokenizer(num_words=vocab_size)
        tokenize.fit_on_texts(train_posts)
        # save token
        with open('tokenizer.pickle', 'wb') as handle:
            pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)
            print("Saving tokenizer with name tokenizer.pickle")

        x_train = tokenize.texts_to_matrix(train_posts)
        x_test = tokenize.texts_to_matrix(test_posts)
        x_post = tokenize.texts_to_matrix(posts)

        encoder = preprocessing.LabelBinarizer()
        encoder.fit(train_tags)
        y_train = encoder.transform(train_tags)
        y_test = encoder.transform(test_tags)
        text_labels = encoder.classes_

        num_labels = len(np.unique(y_train))
        batch_size = 1024
        model = Sequential()

        # Hidden layers; added to push accuracy up
        model.add(Dense(512, input_shape=(vocab_size,)))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dense(128))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dense(512))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dense(128))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dense(64))
        model.add(BatchNormalization())
        model.add(Activation('relu'))

        model.add(Dense(num_labels))
        model.add(BatchNormalization())
        model.add(Activation('sigmoid'))

        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        history = model.fit(x_train, y_train,
                            batch_size=batch_size,
                            epochs=256,
                            verbose=1,
                            validation_split=0.1)

        score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
        model_json = model.to_json()
        with open("model.json", "w") as json_file:
            json_file.write(model_json)
            print("\n Saved h5 json model to disk with name model.json ")

        model.save_weights("model.h5")
        print("\n Saved model to disk with name model.h5")
        print("Training done")

        pred = model.predict(np.array(x_post))
        pred = pred > 0.5
        for i in range(0, len(posts)):
            print('Document: %s, predicted tag: %s' % (data['filename'][i], text_labels[np.argmax(pred[i])]))
            dlp_data['filename'].append(data['filename'][i])
            dlp_data['tags'].append(text_labels[np.argmax(pred[i])])

        df = pd.DataFrame(dlp_data, columns=['filename', 'tags'])
        df.to_csv('dlp.csv', encoding="utf-8")

        print('Saved predictions to dlp.csv')
        json_file = open('model.json', 'r')
        loaded_json_model = json_file.read()
        json_file.close()
        loaded_model = model_from_json(loaded_json_model)

        loaded_model.load_weights("model.h5")
        print("Loaded model from disk")

        loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        score = loaded_model.evaluate(x_test, y_test, verbose=1)
        print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1] * 100))
Example #15
def train_lstm(positive_tweets, negative_tweets):
    """Return an LSTM model fitted on positive_tweets and negative_tweets.

    Keyword arguments:
    positive_tweets -- the file (.csv) that contains the positive tweets
    negative_tweets -- the file (.csv) that contains the negative tweets
    """
    pos_df = pd.read_csv(positive_tweets, index_col=0)
    neg_df = pd.read_csv(negative_tweets, index_col=0)

    train = pd.concat([pos_df, neg_df])

    #Randomize order
    train = train.sample(frac=1, random_state=1)

    train = train.dropna()

    # The maximum number of words to be used. (most frequent)
    MAX_NB_WORDS = 450000
    # Max number of words in each tweet.
    MAX_SEQUENCE_LENGTH = 50
    EMBEDDING_DIM = 300
    tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train.tweets.values)
    word_index = tokenizer.word_index

    # number of words that appear at least 5 times
    words = len([
        k for k in tokenizer.word_index.keys() if tokenizer.word_counts[k] > 4
    ])

    # The maximum number of words to be used. (most frequent)
    MAX_NB_WORDS = words
    # Max number of words in each tweet.
    MAX_SEQUENCE_LENGTH = 50
    EMBEDDING_DIM = 300
    tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train.tweets.values)
    word_index = tokenizer.word_index

    # Convert tweets to sequences of word indices
    X = tokenizer.texts_to_sequences(train.tweets.values)
    X = sequence.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

    Y = train.label

    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.20,
                                                        random_state=1)

    #Build model
    batch_size = 8192

    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
    model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # Keep the best model seen during training
    es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5)
    mc = ModelCheckpoint('best_model.h5',
                         monitor='val_acc',
                         mode='max',
                         verbose=1,
                         save_best_only=True)

    #Training
    model.fit(X_train,
              Y_train,
              batch_size=batch_size,
              epochs=250,
              validation_data=(X_test, Y_test),
              callbacks=[es, mc])

    saved_model = load_model('best_model.h5')

    return saved_model