Example #1
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


def prepare_data():
    data = pd.read_csv("dataset.csv")
    X = data.loc[:, 'sub']
    y = data.loc[:, 'severity']
    # train_test_split returns (train_X, test_X, train_y, test_y) in this order
    train_X, test_X, train_y, test_y = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    t = Tokenizer()
    t.fit_on_texts(train_X)
    vocab_size = len(t.word_index) + 1

    encoded_train_X = t.texts_to_sequences(train_X)
    encoded_test_X = t.texts_to_sequences(test_X)

    max_len = 20

    padded_train_X = pad_sequences(encoded_train_X,
                                   maxlen=max_len,
                                   padding='post')
    padded_test_X = pad_sequences(encoded_test_X,
                                  maxlen=max_len,
                                  padding='post')

    # Load a custom embedding file: one word followed by its vector per line
    embedding_index = dict()
    with open('all_embeddings.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = coefs

    # The second dimension must match the length of the vectors in the
    # embedding file, not max_len
    embedding_dim = len(next(iter(embedding_index.values())))
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in t.word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return vocab_size, padded_train_X, train_y, padded_test_X, test_y, embedding_matrix
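A minimal usage sketch (not part of the original; the sigmoid head and training settings are placeholder assumptions) showing how the returned embedding_matrix could seed a frozen Keras Embedding layer:

from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten

vocab_size, train_X, train_y, test_X, test_y, embedding_matrix = prepare_data()

model = Sequential()
# weights=[embedding_matrix] initialises the layer with the pre-trained vectors;
# trainable=False keeps them frozen during training
model.add(Embedding(vocab_size,
                    embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=20,
                    trainable=False))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))  # placeholder output for the 'severity' target
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_y, epochs=5, validation_data=(test_X, test_y))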
Example #2
# Assumes the job-posting DataFrame 'data' has already been loaded
from jieba import analyse
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import torch
import torch.nn as nn

data.head(5)


# Extract keywords
def key_word_extract(row):
    return " ".join(
        analyse.extract_tags(row, topK=50, withWeight=False, allowPOS=()))


data['key_extract'] = data.Job_Description.apply(key_word_extract)
data.head(5)
#  -------------------------- 3. Tokenize and extract keywords --------------------------------
#  -------------------------- 4. Build the dictionary and use it --------------------------------
token = Tokenizer(num_words=2000)
token.fit_on_texts(data['key_extract'])  # words are ranked by frequency; the 2000 most frequent enter the dictionary
Job_Description_Seq = token.texts_to_sequences(
    data['key_extract'])  # convert the text to lists of integer indices
Job_Description_Seq_Padding = sequence.pad_sequences(
    Job_Description_Seq, maxlen=50)  # truncate/pad so every sequence has length 50
x_train = Job_Description_Seq_Padding
y_train = data['label'].tolist()  # convert the Series to a plain list


#  -------------------------- 4. Build the dictionary and use it --------------------------------
#  -------------------------- 5. Train the model ---------------------------------------
# /--------------------------method1-class-----------------------------------------/#
class Model1(nn.Module):
    def __init__(self):
        super(Model1, self).__init__()
        # torch.nn.Embedding takes (num_embeddings, embedding_dim);
        # there is no input_length argument as in Keras
        self.embedding = torch.nn.Embedding(num_embeddings=2000,
                                            embedding_dim=32)
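The original snippet breaks off after the embedding layer. As a purely illustrative sketch (none of this is in the source), a minimal forward() that would at least make the module callable could look like:

    def forward(self, x):
        # x: (batch, 50) token indices -> (batch, 50, 32) embedded vectors
        embedded = self.embedding(x)
        # flatten to (batch, 50 * 32) so a classifier head could be attached
        return embedded.reshape(embedded.size(0), -1)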
Example #3
import pickle
import random

import numpy as np
import keras
from keras.models import Model
from keras.layers import (Input, Embedding, LSTM, Bidirectional, Dense,
                          Concatenate)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


class Network:
    def __init__(self, env, maxlen=50, sample_size=300000, num_words=100000):

        texts = env.env_core.get_all_snippets()
        self.tokenizer = Tokenizer(num_words=num_words,
                                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                                   oov_token='UNK')

        if len(texts) < sample_size:
            sample_size = 50000  #int(len(texts)/2)

        self.tokenizer.fit_on_texts(random.sample(texts, sample_size))
        self.maxlen = maxlen
        self.voca_size = num_words
        self.voca = self.tokenizer.word_index.keys()

        #self.tensor_shape = tensor_shape
        self.model = self._create_model(self.maxlen, self.voca_size)
        self.model.compile(loss=keras.losses.mean_squared_error,
                           optimizer=keras.optimizers.Adam(),
                           metrics=['accuracy'])

    #TODO PA: we can play with the NN model. The used model in the MIT paper is: linear, RELU, linear, RELU, linear
    """
    our NN model for predicting Q(s,a):
    
        Layer (type)                 Output Shape              Param #   
    =================================================================
    input_1 (InputLayer)         (None, 28)                0         
    _________________________________________________________________
    dense_1 (Dense)              (None, 10)                280       
    _________________________________________________________________
    dropout_1 (Dropout)          (None, 10)                0         
    _________________________________________________________________
    dense_2 (Dense)              (None, 10)                110       
    _________________________________________________________________
    dropout_2 (Dropout)          (None, 10)                0         
    _________________________________________________________________
    dense_3 (Dense)              (None, 1)                 11        
    ================================================================= 
    """

    @staticmethod
    def _create_model(maxlen, voca_size):
        input_ = Input(shape=(maxlen, ), dtype='int32')
        action_ = Input(shape=(5, ))
        state_plus_ = Input(shape=(5, ))
        embeddings_ = Embedding(voca_size, 64)(input_)
        lstm_ = Bidirectional(LSTM(64, dropout=0.2,
                                   recurrent_dropout=0.2))(embeddings_)
        dense_1 = Dense(32)(lstm_)
        concat_ = Concatenate(axis=-1)([dense_1, state_plus_])
        dense_2 = Dense(32)(concat_)
        concat_2 = Concatenate(axis=-1)([dense_2, action_])
        dense_3 = Dense(16)(concat_2)
        outputs_ = Dense(1, activation='linear')(dense_3)
        return Model(inputs=[input_, state_plus_, action_], outputs=outputs_)

    def pad_sequence(self, text):
        x_train = self.tokenizer.texts_to_sequences([text])
        return pad_sequences(x_train, maxlen=self.maxlen)

    def fit(self, texts_actions, y_train, epochs, batch_size, callbacks=None):
        texts, state_plus, actions = texts_actions
        x_train = self.tokenizer.texts_to_sequences(texts)
        x_train = pad_sequences(x_train, maxlen=self.maxlen)

        if callbacks is None:
            hist = self.model.fit(
                [np.array(x_train),
                 np.array(state_plus),
                 np.array(actions)],
                np.array(y_train),  # batch_size=batch_size,
                epochs=epochs,
                verbose=1)  # could be 1
        else:
            hist = self.model.fit(
                [np.array(x_train),
                 np.array(state_plus),
                 np.array(actions)],
                np.array(y_train),  # batch_size=batch_size,
                epochs=epochs,
                callbacks=callbacks,
                verbose=1)  # could be 1

        return hist.history

    def fit_generator(self, gen, steps_per_epoch, epochs):
        self.model.fit_generator(
            generator=gen,
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            verbose=1)

    # Desc: (loss, accuracy)
    def evaluate(self, x_test, y_test):
        return self.model.evaluate(x_test, y_test, verbose=0)

    def save_weights(self, path):
        self.model.save_weights(filepath=path)
        pickle.dump(self.tokenizer, open("tokenizer.pickle", "wb"))

    def load_weights(self, path):
        self.model.load_weights(filepath=path, by_name=False)
        self.tokenizer = pickle.load(open("tokenizer.pickle", "rb"))

    def save_model(self, path):
        self.model.save(path)

    def load_model(self, path):
        self.model = keras.models.load_model(path)

    def predict(self, text_action):
        text, state_plus, action = text_action

        x_train = self.tokenizer.texts_to_sequences(text)
        x_train = pad_sequences(x_train, maxlen=self.maxlen)

        return self.model.predict(
            [np.array(x_train),
             np.array(state_plus),
             np.array(action)])
Example #4
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Assumes train_df/test_df, classes_to_predict, raw_*_comments, preprocess_text,
# MAX_NB_WORDS and MAX_SEQUENCE_LENGTH are defined earlier in the script.
y = train_df[classes_to_predict].values
#y_test_predicted = test_df[classes_to_predict].values

processed_train_comments = []
for comment in raw_train_comments:
    processed_train_comments.append(preprocess_text(comment))
    
processed_test_comments = []    
for comment in raw_test_comments:
    processed_test_comments.append(preprocess_text(comment))
        

tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(processed_train_comments + processed_test_comments)

train_sequences = tokenizer.texts_to_sequences(processed_train_comments)
test_sequences = tokenizer.texts_to_sequences(processed_test_comments)

print('Found %s tokens in text.' % len(tokenizer.word_index))

train_data = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)
final_test_data = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH)

print('shape of train_data (to be split further into final_train_data + final_validation_data) ready for feeding to the network is %s' % (train_data.shape,))
print('shape of final_test_data ready for feeding to the network is %s' % (final_test_data.shape,))
print('shape of label (y) is %s' % (y.shape,))



##################################################
## preparing word embeddings.
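The example stops at this comment. A hedged sketch of the usual continuation (the GloVe file name and EMBEDDING_DIM below are assumptions, not from the source) that builds an embedding matrix from tokenizer.word_index:

import numpy as np

EMBEDDING_DIM = 300  # assumed; must match the vector length in the embedding file

embeddings_index = {}
with open('glove.840B.300d.txt', encoding='utf-8') as f:  # hypothetical file
    for line in f:
        values = line.rstrip().split(' ')
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

num_words = min(MAX_NB_WORDS, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector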
Example #5
from jieba import analyse
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import Input, Embedding, Conv1D, MaxPool1D, Flatten, Dropout


# Extract keywords
def key_word_extract(row):
    return " ".join(
        analyse.extract_tags(row, topK=50, withWeight=False, allowPOS=()))


job_detail_pd[
    'Job_Description_key_word'] = job_detail_pd.Job_Description.apply(
        key_word_extract)
#  -------------------------- 3. Tokenize and extract keywords ---------------------------
#  -------------------------- 4. Build the dictionary and use it ---------------------------
token = Tokenizer(num_words=2000)
token.fit_on_texts(
    job_detail_pd['Job_Description_key_word'])  # words are ranked by frequency; the 2000 most frequent enter the dictionary
Job_Description_Seq = token.texts_to_sequences(
    job_detail_pd['Job_Description_key_word'])  # convert the text to lists of integer indices
Job_Description_Seq_Padding = sequence.pad_sequences(
    Job_Description_Seq, maxlen=50)  # truncate/pad so every sequence has length 50
x_train = Job_Description_Seq_Padding
y_train = job_detail_pd['label'].tolist()  # convert the Series to a plain list
#  -------------------------- 4. Build the dictionary and use it ----------------------------
#  -------------------------- 5. Train the model -----------------------------------
# /--------------------------method1-API-------------------------------------
inputs = Input(shape=(50, ))  # input length must match the padded sequence length (maxlen=50)
# layer instances are callable: each takes a tensor and returns a tensor
x = Embedding(output_dim=32, input_dim=2000, input_length=50)(inputs)
x = Conv1D(256, 3, padding='same', activation='relu')(x)
x = MaxPool1D(3, 3, padding='same')(x)
x = Conv1D(3, 3, padding='same', activation='relu')(x)
x = Flatten()(x)
x = Dropout(0.3)(x)
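The functional-API snippet stops at the dropout layer. A hedged completion (the output layer, loss, and fit() call below are illustrative assumptions, not part of the original):

import numpy as np
from keras.layers import Dense
from keras.models import Model

outputs = Dense(1, activation='sigmoid')(x)  # assumed binary label
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, np.array(y_train), batch_size=256, epochs=5, validation_split=0.2)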
Example #6
from keras.preprocessing.text import Tokenizer

samples = ['I study at CityU', 'I study at CityU at Seattle']

tokenizer = Tokenizer(num_words=1000)

tokenizer.fit_on_texts(samples)

sequences = tokenizer.texts_to_sequences(samples)

one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print('Sequences: ', sequences, '\n')
print('word_index: ', tokenizer.word_index)
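For reference, with a recent Keras version (ties in word frequency are broken by order of first appearance), these prints would produce output along the lines of:

# Found 5 unique tokens.
# Sequences:  [[2, 3, 1, 4], [2, 3, 1, 4, 1, 5]]
# word_index:  {'at': 1, 'i': 2, 'study': 3, 'cityu': 4, 'seattle': 5}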
Example #7
import keras.preprocessing.text as T
from keras.preprocessing.text import Tokenizer

text1='some thing to eat'
text2='some thing to drink'
texts=[text1,text2]

print(T.text_to_word_sequence(text1))  # ['some', 'thing', 'to', 'eat']
print(T.one_hot(text1, 10))  # [7, 9, 3, 4]
print(T.one_hot(text2, 10))  # [7, 9, 3, 1]

tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(texts)
print(tokenizer.word_counts)  # OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)])
print(tokenizer.word_index)  # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}
print(tokenizer.word_docs)  # {'some': 2, 'thing': 2, 'to': 2, 'drink': 1, 'eat': 1}
print(tokenizer.index_docs)  # {1: 2, 2: 2, 3: 2, 4: 1, 5: 1}

print(tokenizer.texts_to_sequences(texts))  # [[1, 2, 3, 4], [1, 2, 3, 5]]
print(tokenizer.texts_to_matrix(texts))
# [[ 0.  1.  1.  1.  1.  0.  0.  0.  0.  0.]
#  [ 0.  1.  1.  1.  0.  1.  0.  0.  0.  0.]]

import keras.preprocessing.sequence as S
print(S.pad_sequences([[1, 2, 3]], 10, padding='post'))  # [[1 2 3 0 0 0 0 0 0 0]]
---------------------
Author: vivian_ll
Source: CSDN
Original post: https://blog.csdn.net/vivian_ll/article/details/80795139
Copyright notice: this is an original article by the blogger; please include a link to the original when reposting.