Example #1
def tokenlize_text(max_num_words, max_seq_length, x_train):
    """Tokenize text.

    Vectorize a text corpus by transforming each text into a sequence of integers.

    Args:
        max_num_words: Int, max number of words in the dictionary.
        max_seq_length: Int, the length of each text sequence; padded if shorter, truncated if longer.
        x_train: List containing the text data.

    Returns:
        x_train: Tokenized input data.
        word_index: Dictionary mapping each word to its token index.
    """
    from keras_preprocessing.sequence import pad_sequences
    from keras_preprocessing.text import Tokenizer
    print("tokenlizing texts...")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)
    word_index = tokenizer.word_index
    x_train = pad_sequences(sequences, maxlen=max_seq_length)
    print("data readed and convert to %d length sequences" % max_seq_length)
    return x_train, word_index
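
# Usage sketch (hypothetical inputs; assumes the function above is in scope):
sample_texts = ["the cat sat on the mat", "dogs and cats"]
padded, word_index = tokenlize_text(max_num_words=1000, max_seq_length=10, x_train=sample_texts)
print(padded.shape)   # (2, 10)
print(word_index)     # e.g. {'the': 1, 'cat': 2, ...}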
Example #2
from keras.models import load_model
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
import numpy as np

model = load_model('sentiment_model.h5')
test_data = [
    "A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump"
]
max_features = 200
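# Note: the tokenizer below is fitted only on this single test sentence, so its word index
# will not match the one used when 'sentiment_model.h5' was trained; in practice the
# training-time tokenizer should be saved and reloaded for inference.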
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(test_data)
X = tokenizer.texts_to_sequences(test_data)
max_len = 28
X = pad_sequences(X, maxlen=max_len)
class_names = ['positive', 'negative']
preds = model.predict(X)
print(preds)
classes = model.predict_classes(X)
print(classes)
print(class_names[classes[0]])
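
# Caveat (assumption): Sequential.predict_classes has been removed in recent TF/Keras releases.
# For a softmax output like the two-class case assumed here, an equivalent is:
pred_class = int(np.argmax(preds, axis=-1)[0])
print(class_names[pred_class])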
Example #3
#print(dict(zip(labels,responses)))

enc = LabelEncoder()
enc.fit(training_labels)  ## 'Y' | Dependent Variable
training_labels = enc.transform(training_labels)

vocab_size = 8000
max_len = 24
trunc_type = "post"
oov_token = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)

padded = pad_sequences(sequences, truncating=trunc_type, maxlen=max_len)
classes = len(labels)
'''
print(type(padded))
print('\n\n')
print(type(sequences),'\n')
print(padded[5], '\n\n', sequences[5])
print(word_index['whats'], word_index['up'])
print(training_sentences[5])
'''

embeddings_index = {}
with open('../glove.6B/glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        # standard GloVe text format: a word followed by its embedding values
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
def run_keras_experiment():
    print('Reading files')

    # Reading File Section - This should change
    full = pd.read_csv("data/hindi_dataset.tsv",
                       sep='\t',
                       names=['text_id', 'text', 'task_1', 'task_2', 'task_3'])

    is_hof = full['task_1'] == 'HOF'
    full = full[is_hof]

    train, test = train_test_split(full, test_size=0.2)

    print('Completed reading')

    #############
    print("Train shape : ", train.shape)
    print("Test shape : ", test.shape)

    # Variables

    TEXT_COLUMN = "text"
    LABEL_COLUMN = "task_3"

    configParser = configparser.RawConfigParser()
    configFilePath = "config.txt"
    configParser.read(configFilePath)

    EMBEDDING_FILE = configParser.get('hindi_task_3_model-config',
                                      'EMBEDDING_FILE')
    MODEL_PATH = configParser.get('hindi_task_3_model-config', 'MODEL_PATH')
    PREDICTION_FILE = configParser.get('hindi_task_3_model-config',
                                       'PREDICTION_FILE')

    print(train.head())

    print("Removing URLs")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: remove_url(x))
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: remove_url(x))
    print(train.head())

    print("Removing usernames")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: remove_names(x))
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: remove_names(x))
    print(train.head())
    #
    # print("Identifying names")
    #
    # train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: entity_recognizing(x))
    # test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: entity_recognizing(x))
    # print(train.head())

    print("Converting to lower-case")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].str.lower()
    test[TEXT_COLUMN] = test[TEXT_COLUMN].str.lower()
    print(train.head())

    print("Cleaning punctuation marks")
    train[TEXT_COLUMN] = train[TEXT_COLUMN].apply(lambda x: clean_text(x))
    test[TEXT_COLUMN] = test[TEXT_COLUMN].apply(lambda x: clean_text(x))
    print(train.head())

    train['doc_len'] = train[TEXT_COLUMN].apply(
        lambda words: len(words.split(" ")))
    max_seq_len = np.round(train['doc_len'].mean() +
                           train['doc_len'].std()).astype(int)

    embed_size = 300  # how big is each word vector
    max_features = None  # how many unique words to use (i.e. number of rows in the embedding matrix)
    maxlen = max_seq_len  # max number of words in a question to use #99.99%

    # fill up the missing values
    X = train[TEXT_COLUMN].fillna("_na_").values
    X_test = test[TEXT_COLUMN].fillna("_na_").values

    # Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features, filters='')
    tokenizer.fit_on_texts(list(X))

    X = tokenizer.texts_to_sequences(X)
    X_test = tokenizer.texts_to_sequences(X_test)

    # Pad the sentences
    X = pad_sequences(X, maxlen=maxlen)
    X_test = pad_sequences(X_test, maxlen=maxlen)

    # Get the target values
    Y = train[LABEL_COLUMN].values

    le = LabelEncoder()

    le.fit(Y)
    encoded_Y = le.transform(Y)

    word_index = tokenizer.word_index
    max_features = len(word_index) + 1

    print('Loading Embeddings')

    embedding_matrix = get_emb_matrix(word_index, max_features, EMBEDDING_FILE)

    print('Finished loading Embeddings')

    print('Start Training')

    kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
    bestscore = []
    y_test = np.zeros((X_test.shape[0], ))
    for i, (train_index, valid_index) in enumerate(kfold.split(X, encoded_Y)):
        X_train, X_val, Y_train, Y_val = X[train_index], X[
            valid_index], encoded_Y[train_index], encoded_Y[valid_index]
        filepath = MODEL_PATH
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_loss',
                                     verbose=2,
                                     save_best_only=True,
                                     mode='min')
        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.6,
                                      patience=1,
                                      min_lr=0.0001,
                                      verbose=2)
        earlystopping = EarlyStopping(monitor='val_loss',
                                      min_delta=0.0001,
                                      patience=2,
                                      verbose=2,
                                      mode='auto')
        callbacks = [checkpoint, reduce_lr]
        model = capsule(maxlen, max_features, embed_size, embedding_matrix, 1)
        if i == 0: print(model.summary())
        model.fit(
            X_train,
            Y_train,
            batch_size=64,
            epochs=20,
            validation_data=(X_val, Y_val),
            verbose=2,
            callbacks=callbacks,
        )
        model.load_weights(filepath)
        y_pred = model.predict([X_val], batch_size=64, verbose=2)
        y_test += np.squeeze(model.predict([X_test], batch_size=64,
                                           verbose=2)) / 5
        f1, threshold = f1_smart(np.squeeze(Y_val), np.squeeze(y_pred))
        print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(f1, threshold))
        bestscore.append(threshold)

    print('Finished Training')

    y_test = y_test.reshape((-1, 1))
    pred_test_y = (y_test > np.mean(bestscore)).astype(int)
    test['predictions'] = le.inverse_transform(pred_test_y)

    # save predictions
    file_path = PREDICTION_FILE
    test.to_csv(file_path, sep='\t', encoding='utf-8')

    print('Saved Predictions')

    # post analysis
    tn, fp, fn, tp = confusion_matrix(test[LABEL_COLUMN],
                                      test['predictions']).ravel()
    weighted_f1 = f1_score(test[LABEL_COLUMN],
                           test['predictions'],
                           average='weighted')
    accuracy = accuracy_score(test[LABEL_COLUMN], test['predictions'])
    weighted_recall = recall_score(test[LABEL_COLUMN],
                                   test['predictions'],
                                   average='weighted')
    weighted_precision = precision_score(test[LABEL_COLUMN],
                                         test['predictions'],
                                         average='weighted')

    print("Confusion Matrix (tn, fp, fn, tp) {} {} {} {}".format(
        tn, fp, fn, tp))
    print("Accuracy ", accuracy)
    print("Weighted F1 ", weighted_f1)
    print("Weighted Recall ", weighted_recall)
    print("Weighted Precision ", weighted_precision)
train_text, test_text, train_label, test_label = train_test_split(
    tweets['Tweet'],
    tweets['label'],
    random_state=2020,
    test_size=0.25,
    stratify=tweets['label'])

# seq_len = [len(i.split()) for i in train_text]
# pd.Series(seq_len).hist(bins=30)
# plt.show()

max_words = 20000
max_len = 100
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(train_text)
sequences = tok.texts_to_sequences(train_text)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

#model.fit(sequences_matrix,train_label,batch_size=128,epochs=10,
#validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

checkpoint = ModelCheckpoint("lstm_model",
                             monitor="val_acc",
                             save_best_only=True,
                             mode='max')
Example #6
class KerasTokenizerAdapter(FitTransformMixin):
    def __init__(self, **kwargs):
        kwargs['oov_token'] = '<mis>'

        tok = Tokenizer(oov_token=kwargs['oov_token'])
        word_counts = kwargs.pop('word_counts', tok.word_counts)
        word_docs = kwargs.pop('word_docs', tok.word_docs)
        index_docs = kwargs.pop('index_docs', tok.index_docs)
        index_word = kwargs.pop('index_word', tok.index_word)
        word_index = kwargs.pop('word_index', tok.word_index)

        self._encoder = Tokenizer(**kwargs)
        self._encoder.word_counts = word_counts
        self._encoder.word_docs = word_docs
        self._encoder.index_docs = index_docs
        self._encoder.word_index = word_index
        self._encoder.index_word = index_word

    def fit(self, texts):
        self._encoder.fit_on_texts(['<eos> <pad>'])
        self._encoder.fit_on_texts(texts)

        special = {'<pad>': 0, '<mis>': 1, '<eos>': 2}
        vocab = sorted(
            [(k, f) for k, f in self._encoder.word_counts.items()
             if k not in special],
            reverse=True,
            key=lambda x: x[1])

        first_slot = max(special.values()) + 1

        word_index = special.copy()
        word_index.update(
            {k: idx + first_slot
             for idx, (k, _) in enumerate(vocab)})
        self._encoder.word_index = word_index
        self._encoder.index_word = {idx: k for k, idx in word_index.items()}

    def transform(self, X: pd.Series) -> List[List[int]]:
        def _transform(X: pd.Series) -> List[List[int]]:
            eos = self._encoder.word_index['<eos>']
            tokens = self._encoder.texts_to_sequences(X)

            for i in range(len(tokens)):
                tokens[i].append(eos)

            return tokens

        logger.debug('KerasTokenizerAdapter::transform - Start')
        try:
            return _transform(X)
        finally:
            logger.debug('KerasTokenizerAdapter::transform - Done')

    @property
    def params(self):
        return {
            'num_words': self._encoder.num_words,
            'filters': self._encoder.filters,
            'lower': self._encoder.lower,
            'split': self._encoder.split,
            'char_level': self._encoder.char_level,
            'oov_token': self._encoder.oov_token,
            'document_count': self._encoder.document_count,
            'word_counts': dict(self._encoder.word_counts),
            'word_docs': dict(self._encoder.word_docs),
            'index_docs': dict(self._encoder.index_docs),
            'index_word': self._encoder.index_word,
            'word_index': self._encoder.word_index
        }
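
# Usage sketch (assumes the surrounding module's imports are available:
# pandas as pd, typing.List, FitTransformMixin, and logger):
adapter = KerasTokenizerAdapter(num_words=10000)
adapter.fit(['hello world', 'hello there'])
ids = adapter.transform(pd.Series(['hello world']))  # token ids, with the <eos> id (2) appended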
testMovie_df = pd.read_csv('test.tsv', delimiter='\t', encoding='utf-8')
trainMovie_df = pd.read_csv('train.tsv', delimiter='\t', encoding='utf-8')

# Keeping only the necessary columns - cleaning the data set
trainMovie_df = trainMovie_df.drop(columns=['PhraseId', 'SentenceId'])
testMovie_df = testMovie_df.drop(columns=['PhraseId', 'SentenceId'])

trainMovie_df['Phrase'] = trainMovie_df['Phrase'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))
testMovie_df['Phrase'] = testMovie_df['Phrase'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))

max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(trainMovie_df['Phrase'].values)
X_train = tokenizer.texts_to_sequences(trainMovie_df['Phrase'].values)
X_train = pad_sequences(X_train)
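
# Note: the next line refits the tokenizer on the test phrases, which changes the word index
# built from the training data; for indices consistent with the trained model, only the
# training-fitted tokenizer should normally be used at test time.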

tokenizer.fit_on_texts(testMovie_df['Phrase'].values)
X_test = tokenizer.texts_to_sequences(testMovie_df['Phrase'].values)
X_test = pad_sequences(X_test, maxlen=X_train.shape[1])  # pad test data to the training sequence length
print("handing data")

# Creating the model
embed_dim = 256
lstm_out = 156

# Design the model using classification
# Model defined
model = Sequential()
# Input layer of the model for processing
# Load the persisted objects
with open('data.pkl', 'rb') as f:
    train_data = pickle.load(f)
    test_data = pickle.load(f)

max_len = 6000

labels = to_categorical(np.asarray(train_data['label'].tolist()), num_classes=8)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['apis'].tolist())
tokenizer.fit_on_texts(test_data['apis'].tolist())

vocab = tokenizer.word_index
x_train_word_ids = tokenizer.texts_to_sequences(train_data['apis'].tolist())
x_test_word_ids = tokenizer.texts_to_sequences(test_data['apis'].tolist())

x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=max_len)

x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=max_len)


def text_cnn():
    kernel_size = [2, 4, 6, 8, 10]
    conv_activation = 'relu'
    _input = Input(shape=(max_len,), dtype='int32')
    _embed = Embedding(304, 256, input_length=max_len)(_input)
    _embed = SpatialDropout1D(0.15)(_embed)
    warppers = []
    for _kernel_size in kernel_size:
Example #9
# max_length = 64
# padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
# print(padded_train)

# test_ids = df1['id']
# test_ids = test['Code']

# train_labels = train['Code']
# print(train_labels)
# encoded_train_labels = train_labels
# le.inverse_transform(encoded_train_labels)

# integer encode the documents
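# `t` is assumed to be a Tokenizer that was already fitted on the training descriptions
# (its creation is not shown in this snippet).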

encoded_test = t.texts_to_sequences(test['Desc'])
max_length = 64
padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')
# print(padded_test)

# test_labels = test['Code']
# print(test_labels)

# padded_test = joblib.load('padded_test.vec')
# test_labels = joblib.load('test_labels.vec')
# padded_train = joblib.load('padded_train.vec')
# encoded_train_labels = joblib.load('encoded_train_labels.vec')

# le = joblib.load('label_encoder_le.vec')

# LOAD WORDEMBEDDING
Example #10
    def run(self):

        # Path to toxicity_annotated_comments.merged.shuf.cleaned-68MB,_160k-rows.tsv
        raw_data_file = sys.argv[1]

        vocab_size = int(sys.argv[2])

        # Optional path to embeddings file to create embeddings matrix.
        if len(sys.argv) > 3:
            embeddings_file = sys.argv[3]
        else:
            embeddings_file = None

        # Load raw data.
        print("Loading raw data")
        raw_df = pd.read_csv(raw_data_file, sep='\t')

        # Split data.
        print("Splitting data")
        splits = ['train', 'test', 'dev']
        split_labels = {}
        raw_split_text = {}
        for split in splits:
            labels = raw_df.loc[raw_df['split'] == split]['Label'].tolist()
            text = raw_df.loc[raw_df['split'] == split]['comment'].tolist()
            split_labels[split] = labels
            raw_split_text[split] = text

        # Setup tokenizer.
        print("Setting up tokenizer.")
        all_text = raw_df['comment'].tolist()
        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts(all_text)

        # Tokenize text.
        print("Tokenizing")
        split_text = {}
        for split in splits:
            print(("Tokenizing", split))
            tokenized = tokenizer.texts_to_sequences(raw_split_text[split])
            split_text[split] = tokenized

        # Save to tokens file.
        print("Saving to file")
        # pickle files must be opened in binary mode
        with open('toxicity_labels.pkl', 'wb') as my_file:
            pickle.dump(split_labels, my_file)
        with open('toxicity_tokens_{}_words.pkl'.format(vocab_size), 'wb') as my_file:
            pickle.dump(split_text, my_file)

        # Create embeddings matrix
        # Code copied from https://www.kaggle.com/tunguz/bi-gru-lstm-cnn-poolings-fasttext.
        if embeddings_file is not None:
            max_features = vocab_size
            embed_size = 300
            print("Loading embeddings")

            embedding_index = dict(self.get_coefs(*o.strip().split(" "))
                                   for o in open(embeddings_file))

            print("Creating embedding matrix")
            word_index = tokenizer.word_index
            nb_words = min(max_features, len(word_index))
            embedding_matrix = np.zeros((nb_words, embed_size))
            for word, i in list(word_index.items()):
                if i >= max_features:
                    continue
                embedding_vector = embedding_index.get(word)
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector

            embeddings_matrix_filename = 'embeddings_matrix_{}.pkl'.format(vocab_size)
            with open(embeddings_matrix_filename, 'wb') as my_file:
                pickle.dump(embedding_matrix, my_file)

        print("Done.")
Example #11
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.utils.np_utils import to_categorical

# Read file
train = pd.read_csv("train.tsv", sep="\t")

# Assign Value
X = train['Phrase'].values
y = train['Sentiment'].values

# Tokenizing
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X)

# Encoding
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
y = to_categorical(y)

# Training and testing
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1000)

# CNN layers
model = Sequential()
Example #12
import numpy as np
import tensorflow as tf
import unidecode
from keras_preprocessing.text import Tokenizer

tf.enable_eager_execution()

file_path = ".\\Datasets\\Shakespear.txt"

text = unidecode.unidecode(open(file_path).read())

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

encoded = tokenizer.texts_to_sequences([text])[0]

vocab_size = len(tokenizer.word_index) + 1

word2idx = tokenizer.word_index
idx2word = tokenizer.index_word

sequences = list()

for i in range(1, len(encoded)):
    sequence = encoded[i - 1:i + 1]
    sequences.append(sequence)
sequences = np.array(sequences)
X, Y = sequences[:, 0], sequences[:, 1]
X = np.expand_dims(X, 1)
Y = np.expand_dims(Y, 1)
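
# A minimal sketch (not part of the original snippet) of a model that could be trained on
# these (X, Y) bigram pairs: embed the current word id and predict the next one.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64, input_length=1),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(vocab_size, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# model.fit(X, Y, batch_size=128, epochs=5)  # uncomment to train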
seq = df_history.seq.iloc[0]
seq_array = np.array(seq.split(' '))
all_sequences = seq_array.copy()

for i in range(1, len(df_history)):
    seq = df_history.seq.iloc[i]
    seq_array = np.array(seq.split(' '))
    all_sequences = np.concatenate([all_sequences, seq_array])

# use Keras' tokenizer to translate the str representations of airports into integers
# with a mapping kept in the tokenizer.  vocab_size will be a parameter for the network.
tokenizer = Tokenizer(lower=False, char_level=False)
# fit_on_texts builds the dict
tokenizer.fit_on_texts(all_sequences.tolist())
sequences_from_tokenizer = np.array(
    tokenizer.texts_to_sequences(all_sequences))

acf_x = cal_acf(sequences_from_tokenizer, k=100)
plot_acf_data(acf_x,
              title=f'ACF for all data with length {len(all_sequences)}')

adf_test(pd.Series(sequences_from_tokenizer.reshape(-1)), name=f'ADF_Full')

#%% Check a random sequence for an aircraft
#i = 4582
i = (np.random.randint(low=0, high=len(df_history), size=1))[0]
seq = df_history.seq.iloc[i]
seq_array = np.array(seq.split(' '))
# use Keras' tokenizer to translate the str representations of airports into integers
# with a mapping kept in the tokenizer.  vocab_size will be a parameter for the network.
tokenizer = Tokenizer(lower=False, char_level=False)
print('Pre-processing')


def text_cleaning(text):
    text = re.sub('[^A-Za-z0-9]', ' ', text.lower())
    text = ' '.join(text.split())
    return text


data['question1'] = data['question1'].apply(text_cleaning)
data['question2'] = data['question2'].apply(text_cleaning)
tokenizer = Tokenizer(num_words=max_nb_words,
                      oov_token='oov_token_placeholder')
tokenizer.fit_on_texts(
    list(data['question1'].values) + list(data['question2'].values))
sequences_1 = tokenizer.texts_to_sequences(data['question1'].values)
sequences_2 = tokenizer.texts_to_sequences(data['question2'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))
x1 = pad_sequences(sequences_1, maxlen=max_seq_len)
x2 = pad_sequences(sequences_2, maxlen=max_seq_len)
y = data['is_duplicate'].values

########################################
# retrieval embeddings
########################################

print('Indexing word vectors')
word2vec = {}
fin = io.open(file_emb, 'r', encoding='utf-8', newline='\n', errors='ignore')
for line in fin:
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        num_words += 1
    return embedding_matrix


corpus = readCorpusData.readCorpusFromFile("../data/final_corpus.txt", 1000)
docs = []
for post in corpus:
    docs.append(post["text"])

t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

encoded_docs = t.texts_to_sequences(docs)
for i in range(len(corpus)):
    corpus[i]["text_sequence"] = encoded_docs[i]

with open('../data/final_corpus_dictionary_max_length_1000.pickle',
          'wb') as handle:
    pickle.dump(corpus, handle, protocol=pickle.HIGHEST_PROTOCOL)

corpus = readCorpusData.readCorpusFromFile("../data/final_corpus.txt", 500)
docs = []
for post in corpus:
    docs.append(post["text"])
encoded_docs = t.texts_to_sequences(docs)
for i in range(len(corpus)):
    corpus[i]["text_sequence"] = encoded_docs[i]
with open('../data/final_corpus_dictionary_max_length_500.pickle',
Example #16
def data_pre(data):
    # Build the labels: class index i repeated once for each sample in class i
    label = sum([[i] * len(data[i]) for i in range(len(data))], [])
    label = to_categorical(label)
    # Segment each text with jieba
    context = []
    for i in data:
        for j in i:
            context.append(jieba.lcut(j))

    # Build the vocabulary
    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(context)

    train_tags_title = tokenizer.texts_to_sequences(context)
    train_tags_title_preprocessed = pad_sequences(train_tags_title,
                                                  maxlen=45,
                                                  padding='post')

    # Pre-trained word vectors (left commented out)
    # embedding_matrix = np.zeros((278028, 30), dtype=np.float32)
    # f = open('wiki.zh.text.vector', encoding='utf-8')
    # f = f.readlines()
    # for text in f:
    #     text = text.split()
    #     if text[0] in context:
    #         embedding_matrix[context[text[0]]] = text[1:]

    # Model
    x_1 = Input(shape=(45, ))  # input sequence length (matches maxlen=45 above)
    embed_1 = Embedding(input_dim=20000,
                        output_dim=45)(x_1)  # maps token ids to dense vectors; must be the first layer (input_dim matches num_words above)
    L_1 = (LSTM(64))(embed_1)  # the first call builds the layer (64 is the output dimensionality); the second applies it
    L_1 = Dropout(0.5)(L_1)  # guard against over-fitting; 0.5 is the fraction of inputs to drop
    L_1 = Dense(9, activation='softmax')(L_1)  # 9 output classes
    model_one = Model(x_1, L_1)  # x_1 is the input, L_1 the output
    model_one.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])  # 'binary_crossentropy'
    history = model_one.fit(train_tags_title_preprocessed,
                            label,
                            batch_size=512,
                            epochs=20,
                            validation_split=0.1,
                            shuffle=True)
    # Plot accuracy history
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model acc')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
    # Plot loss history
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
Example #17
tokenizer = Tokenizer(num_words=None,
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n',
                      lower=True,
                      split=" ",
                      char_level=False)
# print(train_apis)
# Enrich the tokenizer's dictionary with both the training and the test set for the later steps
tokenizer.fit_on_texts(train_apis)
# print(train_apis)
# print(test_apis)
tokenizer.fit_on_texts(test_apis)
# print(test_apis)
# print(tokenizer.word_index)
# # get the tokenizer's current word-index dictionary
# # vocal = tokenizer.word_index
train_apis = tokenizer.texts_to_sequences(train_apis)
# Map the tokens to their integer ids via the dictionary
test_apis = tokenizer.texts_to_sequences(test_apis)
# print(test_apis)
# Pad/truncate the sequences to a fixed length; Keras pads and truncates at the front by default (here 'post' is used)
train_apis = pad_sequences(train_apis,
                           inputLen,
                           padding='post',
                           truncating='post')
# print(test_apis)
test_apis = pad_sequences(test_apis,
                          inputLen,
                          padding='post',
                          truncating='post')

# print(test_apis)
Example #18
                    K = argp.k
                evaluate(model, ge, X_test, targets_test, tmode='k', k=K)
            else:
                raise Exception('You should pass a threshold mode')
        elif args.emode == 'keras':
            sources_train, targets_train = read_file(
                '../processed_data/games_train.json', emode='keras')
            sources_test, targets_test = read_file(
                '../processed_data/games_test.json', emode='keras')

            tokenizer = Tokenizer(num_words=MAX_NUM_WORDS,
                                  filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                                  lower=True)
            tokenizer.fit_on_texts(sources_train)
            word_index = tokenizer.word_index
            x_train = tokenizer.texts_to_sequences(sources_train)
            x_train = pad_sequences(x_train, maxlen=TOKENS_MAX_LENGTH)

            ge = GenresEncoder('../processed_data/genres')
            y_train = ge.transform(targets_train)

            print(x_train.shape)
            print(y_train.shape)

            model = train(x_train,
                          y_train, (TOKENS_MAX_LENGTH, ),
                          ge.num_genres,
                          batch_size=BATCH_SIZE,
                          max_epoch=100,
                          use_es=True,
                          emode='keras')
Example #19
# https://wikidocs.net/22660

text = """경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다\n"""

from keras_preprocessing.text import Tokenizer
token = Tokenizer()
token.fit_on_texts([text])
encoded = token.texts_to_sequences([text])
# encoded = token.texts_to_sequences([text])[0]

print(encoded)

vocab_size = len(token.word_index) + 1  # 12
# The Keras tokenizer's integer encoding starts at index 1, but Keras one-hot
# encoding indexes arrays from 0, so the array must be created one larger than
# the actual vocabulary; hence the +1 above.
print('Vocabulary size: %d' % vocab_size)

print(token.word_index)

# Build the training data.
sequences = list()
for line in text.split('\n'):  # split into sentences on \n
    encoded = token.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i + 1]
        sequences.append(sequence)

print('Number of training samples: %d' % len(sequences))
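
# A possible next step (sketch, not part of the original snippet): left-pad the n-gram
# sequences to a common length and split them into inputs and next-word labels.
from keras_preprocessing.sequence import pad_sequences
max_len = max(len(s) for s in sequences)
padded = pad_sequences(sequences, maxlen=max_len, padding='pre')
X, y = padded[:, :-1], padded[:, -1]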
Example #20
epoch = 3
dropout = 0.1
num_filters = 60

#split to train and val
#train_df, val_df = train_test_split(train_df, test_size=0.05, random_state=2018)

#fill up the missing values
train_X = train_df['clean_text'].fillna('_na_').values  #1175509
#val_X = val_df['clean_text'].fillna('_na_').values  #130613
test_X = test_df['clean_text'].fillna('_na_').values  #56370

#Tokenize the sequences
tokenizer = Tokenizer(num_words=words_size)
tokenizer.fit_on_texts(list(train_X) + list(test_X))
train_X = tokenizer.texts_to_sequences(train_X)
#val_X =tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

#Pad the sequences
train_X = pad_sequences(train_X, maxlen=max_len)
#val_X = pad_sequences(val_X, maxlen=max_len)
test_X = pad_sequences(test_X, maxlen=max_len)

#Get the target values
train_y = train_df['target'].values  #1175509
#val_y = val_df['target'].values  #130613

#numpy2tensor
tensor_X = torch.from_numpy(train_X)
tensor_y = torch.from_numpy(train_y)
                           train['doc_len'].std()).astype(int)

    embed_size = 300  # how big is each word vector
    max_features = None  # how many unique words to use (i.e. number of rows in the embedding matrix)
    maxlen = max_seq_len  # max number of words in a question to use #99.99%

    # fill up the missing values
    X = train[TEXT_COLUMN].fillna("_na_").values
    X_test = test[TEXT_COLUMN].fillna("_na_").values
    X_test_2019 = test_2019[TEXT_COLUMN].fillna("_na_").values

    # Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features, filters='')
    tokenizer.fit_on_texts(list(X))

    X = tokenizer.texts_to_sequences(X)
    X_test = tokenizer.texts_to_sequences(X_test)
    X_test_2019 = tokenizer.texts_to_sequences(X_test_2019)

    # Pad the sentences
    X = pad_sequences(X, maxlen=maxlen)
    X_test = pad_sequences(X_test, maxlen=maxlen)
    X_test_2019 = pad_sequences(X_test_2019, maxlen=maxlen)

    # Get the target values
    Y = train[LABEL_COLUMN].values

    le = LabelEncoder()

    le.fit(Y)
    encoded_Y = le.transform(Y)
Example #22
    plt.legend()
    plt.grid(True)
    plt.savefig(filename)


os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

df = pd.read_csv("sample_training2_old.csv")
texts = df.iloc[:, 0].to_list()

tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(texts)
print(tk.word_index)
print("word index len: ", len(tk.word_index))

sequences = tk.texts_to_sequences(texts)
print(texts[0])
print(sequences[0])

lens = [len(x) for i, x in enumerate(sequences)]
print(lens)
print("max: ", max(lens))
sum_ser = reduce(lambda x, y: x + y, lens)
print("sum ", sum_ser)
avg_len = (sum_ser * 1.0) / (len(lens))
print("avg_len: ", avg_len)

data = pad_sequences(sequences, maxlen=1400, padding='post')

print()
print(data[0])
Example #23
def training():
    train = pd.read_csv(os.path.join(data_path, 'train.csv'))
    train = train.reindex(np.random.permutation(train.index))
    train = train[['text', 'drug', 'sentiment']]
    # print(train.head())

    train['text_comb'] = train['text'] + train['drug']

    # sns.factorplot(x="sentiment", data=train, kind="count", size=6, aspect=1.5, palette="PuBuGn_d")
    # plt.show()

    train.text_comb = train.text_comb.apply(remove_stopwords)
    # print(train.head())

    x_train, x_test, y_train, y_test = train_test_split(train.text_comb,
                                                        train.sentiment,
                                                        test_size=0.2,
                                                        random_state=37)
    # print('# Train data samples:', x_train.shape[0])
    # print('# Test data samples:', x_test.shape[0])
    assert x_train.shape[0] == y_train.shape[0]
    assert x_test.shape[0] == y_test.shape[0]

    # Converting words to numbers
    tk = Tokenizer(num_words=NB_WORDS,
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                   lower=True,
                   split=" ")
    tk.fit_on_texts(x_train)

    # print('Fitted tokenizer on {} documents'.format(tk.document_count))
    # print('{} words in dictionary'.format(tk.num_words))
    # print('Top 5 most common words are:', collections.Counter(tk.word_counts).most_common(5))
    x_train_seq = tk.texts_to_sequences(x_train)
    x_test_seq = tk.texts_to_sequences(x_test)
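
    # one_hot_seq (defined elsewhere) is assumed to turn each token-id sequence into an
    # NB_WORDS-wide multi-hot vector, matching the input_shape=(NB_WORDS,) of the models below.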

    # print('"{}" is converted into {}'.format(x_train[0], x_train_seq[0]))
    x_train_oh = one_hot_seq(x_train_seq)
    x_test_oh = one_hot_seq(x_test_seq)

    # print('"{}" is converted into {}'.format(x_train_seq[0], x_train_oh[0]))
    # print('For this example we have {} features with a value of 1.'.format(x_train_oh[0].sum()))

    y_train_oh = to_categorical(y_train)
    y_test_oh = to_categorical(y_test)
    # print('"{}" is converted into {}'.format(y_train[0], y_train_oh[0]))

    # Splitting of a validation set
    x_train_rest, x_valid, y_train_rest, y_valid = train_test_split(
        x_train_oh, y_train_oh, test_size=0.2, random_state=37)

    assert x_valid.shape[0] == y_valid.shape[0]
    assert x_train_rest.shape[0] == y_train_rest.shape[0]

    # print('Shape of validation set:', x_valid.shape)

    # Baseline model
    base_model = models.Sequential()
    base_model.add(
        layers.Dense(64, activation='relu', input_shape=(NB_WORDS, )))
    base_model.add(layers.Dense(64, activation='relu'))
    base_model.add(layers.Dense(3, activation='softmax'))
    base_model.summary()

    base_history = deep_model(base_model, x_train_rest, y_train_rest, x_valid,
                              y_valid)

    # eval_metric(base_history, 'loss')
    # eval_metric(base_history, 'acc')

    # Handling over-fitting
    reduced_model = models.Sequential()
    reduced_model.add(
        layers.Dense(32, activation='relu', input_shape=(NB_WORDS, )))
    reduced_model.add(layers.Dense(3, activation='softmax'))
    reduced_model.summary()

    reduced_history = deep_model(reduced_model, x_train_rest, y_train_rest,
                                 x_valid, y_valid)

    # compare_loss_with_baseline(reduced_history, 'Reduced Model', base_history)

    # Adding regularization
    reg_model = models.Sequential()
    reg_model.add(
        layers.Dense(64,
                     kernel_regularizer=regularizers.l2(0.001),
                     activation='relu',
                     input_shape=(NB_WORDS, )))
    reg_model.add(
        layers.Dense(64,
                     kernel_regularizer=regularizers.l2(0.001),
                     activation='relu'))
    reg_model.add(layers.Dense(3, activation='softmax'))
    reg_model.summary()

    reg_history = deep_model(reg_model, x_train_rest, y_train_rest, x_valid,
                             y_valid)

    # compare_loss_with_baseline(reg_history, 'Regularized Model', base_history)

    # Adding dropout layers
    drop_model = models.Sequential()
    drop_model.add(
        layers.Dense(64, activation='relu', input_shape=(NB_WORDS, )))
    drop_model.add(layers.Dropout(0.5))
    drop_model.add(layers.Dense(64, activation='relu'))
    drop_model.add(layers.Dropout(0.5))
    drop_model.add(layers.Dense(3, activation='softmax'))
    drop_model.summary()

    drop_history = deep_model(drop_model, x_train_rest, y_train_rest, x_valid,
                              y_valid)

    # compare_loss_with_baseline(drop_history, 'Dropout Model', base_history)

    # Training on the full train data and evaluation on test data
    base_results = test_model(base_model, NB_START_EPOCHS, x_train_oh,
                              y_train_oh, x_test_oh, y_test_oh)
    print('Test accuracy of baseline model: {0:.2f}%\n'.format(
        base_results[1] * 100))

    reduced_results = test_model(reduced_model, NB_START_EPOCHS, x_train_oh,
                                 y_train_oh, x_test_oh, y_test_oh)
    print('Test accuracy of reduced model: {0:.2f}%\n'.format(
        reduced_results[1] * 100))

    reg_results = test_model(reg_model, NB_START_EPOCHS, x_train_oh,
                             y_train_oh, x_test_oh, y_test_oh)
    print('Test accuracy of regularized model: {0:.2f}%\n'.format(
        reg_results[1] * 100))

    drop_results = test_model(drop_model, NB_START_EPOCHS, x_train_oh,
                              y_train_oh, x_test_oh, y_test_oh)
    print('Test accuracy of dropout model: {0:.2f}%\n'.format(drop_results[1] *
                                                              100))

    base_model.save(os.path.join('./data/', 'base_model.h5'))
    reduced_model.save(os.path.join('./data/', 'reduced_model.h5'))
    reg_model.save(os.path.join('./data/', 'reg_model.h5'))
    drop_model.save(os.path.join('./data/', 'drop_model.h5'))
Example #24
tokenizer = Tokenizer(num_words=None,
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      split=' ',
                      char_level=False,
                      oov_token=None)
tokenizer.fit_on_texts(files)
tokenizer.fit_on_texts(outfiles)

# with open("wordsdic.pkl", 'wb') as f:
#     pickle.dump(tokenizer, f)

vocab = tokenizer.word_index
print(tokenizer.word_index)
print(len(vocab))
x_train_word_ids = tokenizer.texts_to_sequences(files)
x_out_word_ids = tokenizer.texts_to_sequences(outfiles)

x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=maxlen)

x_out_padded_seqs = pad_sequences(x_out_word_ids, maxlen=maxlen)

# with open('datasets.pkl', 'wb') as f:
#     pickle.dump(x_train_padded_seqs, f)
#     pickle.dump(x_out_padded_seqs, f)
#     pickle.dump(labels, f)

# with open('datasets.pkl', 'rb') as f:
#     x_train_padded_seqs = pickle.load(f)
#     # x_test_padded_seqs = pickle.load(f)
#     x_out_padded_seqs = pickle.load(f)
            querytweet['Query'].split()) <= max_query_len:
        cleaned_tweet.append(querytweet['Tweet'])
        cleaned_query.append(querytweet['Query'])

x_train, x_validation, y_train, y_validation = train_test_split(cleaned_tweet,
                                                                cleaned_query,
                                                                test_size=0.1,
                                                                random_state=0,
                                                                shuffle=True)

#prepare a tokenizer for tweets on training data
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_train))

#convert texts to sequences of integer token ids
x_tr_seq = x_tokenizer.texts_to_sequences(x_train)
x_val_seq = x_tokenizer.texts_to_sequences(x_validation)

#post-padding text sequences
x_train = pad_sequences(x_tr_seq, maxlen=max_tweet_len, padding='post')
x_validation = pad_sequences(x_val_seq, maxlen=max_tweet_len, padding='post')

#size of vocabulary ( +1 for padding token)
x_vocab_size = len(x_tokenizer.word_counts.items()) + 1

print("Size of vocabulary in X = {}".format(x_vocab_size))

#prepare a tokenizer for queries on training data
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_train))
gru_output_size = 70

# Training
batch_size = 128
epochs = 1

print('Loading data...')
(x_train, y_train), (x_val, y_val), (x_test,
                                     y_test) = sentiment_140_neg.load_data()

print('Fitting tokenizer...')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate((x_train, x_val, x_test)))

print('Convert text to sequences')
x_train = tokenizer.texts_to_sequences(x_train)
x_val = tokenizer.texts_to_sequences(x_val)
x_test = tokenizer.texts_to_sequences(x_test)

print(len(x_train), 'train sequences')
print(len(x_val), 'validation sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = sequence.pad_sequences(x_val, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

print('x_train shape:', x_train.shape)
print('x_val shape:', x_val.shape)
Example #27
test = pd.read_csv('../input/scenic_score_prediction/predict_first.csv')


# Word segmentation
def participle(data):
    data['word'] = data['Discuss'].map(lambda x: jieba.lcut(x))


participle(train)
participle(test)

max_features = 80000  ## vocabulary size

token = Tokenizer(num_words=max_features)
token.fit_on_texts(train.word.values)
train['Discuss_seq'] = token.texts_to_sequences(train.word.values)
test['Discuss_seq'] = token.texts_to_sequences(test.word.values)

maxlen = 150


def get_keras_data(data):
    return {'Discuss_seq': pad_sequences(data.Discuss_seq, maxlen=maxlen)}


x_train = get_keras_data(train)
x_test = get_keras_data(test)
y_train = train.Score.values

embed_size = 200  # embedding dimension
Example #28
    for i in range(len(finaldf)):
        print(i, finaldf['title'][i])
    print(len(finaldf))

    finaldf = finaldf.sample(frac=1)
    X_train, X_test, y_train, y_test = train_test_split(df5.text,
                                                        df5.target,
                                                        test_size=0.3,
                                                        random_state=37)
    tk = Tokenizer(num_words=10000,
                   filters='!"#$%&()*+,-./:;<=>?@[\]^_`{"}~\t\n',
                   lower=True,
                   split=" ")
    tk.fit_on_texts(X_train)
    X_train_seq = tk.texts_to_sequences(X_train)
    X_test_seq = tk.texts_to_sequences(X_test)

    X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=100)
    X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=100)
    model = Sequential()  # initializing the Sequential model
    print(len(tk.index_word))

    # Embedding layer: takes padded sequences of 100 token ids and maps each id to a 32-dimensional vector
    model.add(Embedding(len(tk.index_word), 32, input_length=100))
    model.add(LSTM(100))

    #CNN
    # model.add(Conv1D(16, 4, padding='valid', activation='relu'))
    # model.add(MaxPooling1D())
    # model.add(Conv1D(32, 4, activation='relu'))
Example #29
                    'tSentimentScore', 'label'
                ])
        except:
            print('ERRORRRRRR!!!!')

    if True:
        train_emo_feat = pd.read_csv(
            'datasets/train_articles_emotion_features.csv', index_col='index')
        dev_emo_feat = pd.read_csv(
            'datasets/dev_articles_emotion_features.csv', index_col='index')
        # pd.read_csv()

    print("Extracting tokens...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(articles + dev_articles)
    articles_id = tokenizer.texts_to_sequences(articles)
    dev_articles_id = tokenizer.texts_to_sequences(dev_articles)
    # print(articles_id)
    # print(dev_articles_id)

    wordIndex = tokenizer.word_index
    print("Found %s unique tokens." % len(wordIndex))

    print("Populating embedding matrix...")
    embeddingMatrix = getEmbeddingMatrix(wordIndex)

    train_seq_len = []
    dev_seq_len = []
    for x in articles_id:
        train_seq_len.append(len(x))
Example #30
def train(max_length=128,
          embeddings_size=100,
          validation_split=0.2,
          model_path='model'):
    """
    Trains the model
    :param model_path: the path to the folder containing the model
    :param validation_split: percentage of the samples kept for validation
    :param max_length: maximum sequence length
    :param embeddings_size: the size of the embeddings
    :return:
    """
    titles, descriptions, types = load_dataset('dataset/movies_metadata.csv')

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(titles + descriptions)

    label_binarizer = MultiLabelBinarizer()
    types = label_binarizer.fit_transform(types)

    with open(model_path + '/tokenizer.pkl', 'wb') as f:
        pickle.dump(tokenizer, f)
    with open(model_path + '/label_binarizer.pkl', 'wb') as f:
        pickle.dump(label_binarizer, f)

    # Converts texts to sequences of token ids
    titles = tokenizer.texts_to_sequences(titles)
    descriptions = tokenizer.texts_to_sequences(descriptions)

    # Pads the sequences with zeros
    titles = pad_sequences(titles, padding='post', maxlen=max_length)
    descriptions = pad_sequences(descriptions,
                                 padding='post',
                                 maxlen=max_length)

    # Split the dataset to train and validation
    train_num = int((1 - validation_split) * len(titles))
    train_titles, train_descriptions, train_types = (
        titles[:train_num], descriptions[:train_num], types[:train_num])
    val_titles, val_descriptions, val_types = (
        titles[train_num:], descriptions[train_num:], types[train_num:])

    total_labels = len(label_binarizer.classes_)

    model = get_model(embeddings_size=embeddings_size,
                      tokenizer=tokenizer,
                      total_labels=total_labels)

    class MyCallback(keras.callbacks.Callback):
        """ A custom Keras callback for running the evaluation every k batches and storing the best model """
        def __init__(self, model, val_data, label_binarizer):
            super(MyCallback, self).__init__()
            self.model = model
            self.val_data = val_data
            self.label_binarizer = label_binarizer
            self.f1 = 0

        def on_batch_end(self, batch, logs={}):
            if (batch + 1) % 100 == 0:
                precision, recall, f1, acc, avg_precision, avg_recall, avg_f1 = evaluate(
                    self.model,
                    val_data=self.val_data,
                    label_binarizer=self.label_binarizer)
                logs['micro precision'] = precision
                logs['micro recall'] = recall
                logs['micro F1'] = f1
                logs['macro precision'] = avg_precision
                logs['macro recall'] = avg_recall
                logs['macro F1'] = avg_f1
                logs['accuracy'] = acc
                if f1 > self.f1:
                    print(str(self.f1) + ' -> ' + str(f1))
                    self.f1 = f1
                    model.save('model/model.h5')

    # A tensorboard callback to visualize the metrics
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir="logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S"),
        update_freq='batch',
        write_grads=True,
        write_graph=True,
        write_images=True)
    callback = MyCallback(model=model,
                          val_data={
                              'titles': val_titles,
                              'descriptions': val_descriptions,
                              'types': val_types
                          },
                          label_binarizer=label_binarizer)

    model.fit([train_titles, train_descriptions], [train_types],
              epochs=32,
              verbose=True,
              batch_size=64,
              callbacks=[callback, tensorboard_callback])
Example #31
def load_data():
    user_seq = json.load(open(DATA_PATH, 'r'))

    scores = []
    with open(TRAIN_DATA, "r") as f:
        train_data = f.readlines()
        scores = []
        for raw_data in train_data:
            raw = raw_data.split(',')
            scores.append({raw[0]: raw[1].strip("\n")})

    user_tweet = []
    text = []
    text_score = []
    for user in user_seq:
        for seq in user:
            for score in scores:
                if seq["id_str"] in score.keys() and seq["text"] != "":
                    # user_tweet.append({"post_content": seq["text"], "post_time": seq["time"],
                    # "score": score[seq["id_str"]]})
                    text.append(str(seq["text"]).strip("\n"))
                    text_score.append(score[seq["id_str"]])
                    break

    tokenizer = Tokenizer(num_words=MAX_WORDS_NUM)
    tokenizer.fit_on_texts(text)
    # print(tokenizer.word_index)
    x_seq = tokenizer.texts_to_sequences(text)
    x_train = pad_sequences(x_seq, maxlen=MAX_LEN)
    # print(x_train)
    y_train = np.array(text_score)
    # print(y_train)

    with open(TEST_DATA, "r") as ft:
        test_data = ft.readlines()
        test_scores = []
        for raw_data in test_data:
            raw = raw_data.split(',')
            test_scores.append({raw[0]: raw[1].strip("\n")})

    test_text = []
    test_score = []
    for user in user_seq:
        for seq in user:
            for score in test_scores:
                if seq["id_str"] in score.keys() and seq["text"] != "":
                    # user_tweet.append({"post_content": seq["text"], "post_time": seq["time"],
                    # "score": score[seq["id_str"]]})
                    test_text.append(str(seq["text"]).strip("\n"))
                    test_score.append(score[seq["id_str"]])
                    break
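
    # Note: fitting a brand-new Tokenizer on the test text below yields word ids that are not
    # consistent with the training tokenizer above; reusing the training tokenizer would keep
    # the encodings comparable.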

    tokenizer = Tokenizer(num_words=MAX_WORDS_NUM)
    tokenizer.fit_on_texts(test_text)
    # print(tokenizer.word_index)
    test_seq = tokenizer.texts_to_sequences(test_text)
    x_test = pad_sequences(test_seq, maxlen=MAX_LEN)
    # print(x_test)
    y_test = np.array(test_score)
    # print(y_test)

    return (x_train, y_train), (x_test, y_test)