Example #1
def validate_model(model, epochs=1, samples=200):
    _logger.info('load data')
    train_set = pd.read_csv(config.TRAIN_SET_PATH)

    tokenizer = load_tokenizer()
    train_sentences, _, _ = load_sentences()
    x_train = encode_sentence(train_sentences[:samples],
                              padding=True,
                              max_length=config.MAX_SEQUENCE_LENGTH,
                              tokenizer=tokenizer)

    y_train = {}

    for col in constants.COLS:
        y_train[col] = np.array(label_transform(train_set[col]))[:samples]

    _logger.info('test fit model')
    history = model.fit(x_train,
                        y_train,
                        batch_size=config.BATCH_SIZE,
                        epochs=epochs,
                        verbose=1,
                        workers=config.WORKER_NUM)

    _logger.info('fit test completed successfully')
    return history
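For a quick smoke test of `validate_model`, the call site might look like the sketch below. Note that `create_model` is a hypothetical factory standing in for however the project actually builds its compiled Keras model; it is not part of the examples above.

# Hypothetical usage sketch; create_model() is an assumed helper that
# returns a compiled tf.keras.Model whose outputs match the y_train dict.
model = create_model()
history = validate_model(model, epochs=1, samples=200)
print(history.history)  # loss/metric values from the short fit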
Example #2
def x_data():
    # Encode train/validation sentences only; labels are handled elsewhere.
    tokenizer = load_tokenizer()
    train_sentences, val_sentences, _ = load_sentences()
    x_train = encode_sentence(train_sentences,
                              padding=True,
                              max_length=config.MAX_SEQUENCE_LENGTH,
                              tokenizer=tokenizer)
    x_val = encode_sentence(val_sentences,
                            padding=True,
                            max_length=config.MAX_SEQUENCE_LENGTH,
                            tokenizer=tokenizer)
    return x_train, x_val
Example #3
def validate_data():
    val_set = pd.read_csv(config.VALIDATION_SET_PATH)

    tokenizer = load_tokenizer()
    _, val_sentences, _ = load_sentences()
    x_val = encode_sentence(val_sentences,
                            padding=True,
                            max_length=config.MAX_SEQUENCE_LENGTH,
                            tokenizer=tokenizer)

    y_val = {}

    for col in constants.COLS:
        y_val[col] = np.array(label_transform(val_set[col]))

    return x_val, y_val
Example #4
def load_val_data_set():
    tokenizer = load_tokenizer()
    _, val_sentences, _ = load_sentences()
    x_val = encode_sentence(val_sentences,
                            padding=True,
                            max_length=config.MAX_SEQUENCE_LENGTH,
                            tokenizer=tokenizer)

    train_set = pd.read_csv(config.TRAIN_SET_PATH)
    val_set = pd.read_csv(config.VALIDATION_SET_PATH)

    _, y_val = transform_y_data(train_set, val_set, constants.COLS)

    return x_val, y_val
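`transform_y_data` is not shown in these snippets, but Examples 1 and 3 build the same per-column targets inline, so a sketch consistent with that pattern (an assumption, not the project's verbatim code) might be:

def transform_y_data(train_set, val_set, cols):
    # One target array per aspect column, mirroring the inline loops
    # in Examples 1 and 3 (assumed behavior).
    y_train = {col: np.array(label_transform(train_set[col])) for col in cols}
    y_val = {col: np.array(label_transform(val_set[col])) for col in cols}
    return y_train, y_val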
Example #5
def train_data():
    train_set = pd.read_csv(config.TRAIN_SET_PATH)
    val_set = pd.read_csv(config.VALIDATION_SET_PATH)

    tokenizer = load_tokenizer()
    train_sentences, val_sentences, _ = load_sentences()
    x_train = encode_sentence(train_sentences,
                              padding=True,
                              max_length=config.MAX_SEQUENCE_LENGTH,
                              tokenizer=tokenizer)
    x_val = encode_sentence(val_sentences,
                            padding=True,
                            max_length=config.MAX_SEQUENCE_LENGTH,
                            tokenizer=tokenizer)

    y_train = train_set['service_waiters_attitude']
    y_val = val_set['service_waiters_attitude']

    y_train = np.array(label_transform(y_train))
    y_val = np.array(label_transform(y_val))

    return x_train, y_train, x_val, y_val
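`label_transform` is used throughout without being shown. Assuming the AI Challenger fine-grained sentiment convention of four raw labels (1, 0, -1, -2 for positive, neutral, negative, not mentioned), a minimal one-hot sketch could be the following; the project's real implementation may differ (e.g. integer class ids instead of one-hot rows):

import numpy as np

LABEL_SET = (1, 0, -1, -2)  # assumed raw label values

def label_transform(labels):
    # Map each raw label to a one-hot row (assumed behavior).
    index = {v: i for i, v in enumerate(LABEL_SET)}
    eye = np.eye(len(LABEL_SET), dtype='float32')
    return [eye[index[v]] for v in labels]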
Example #6
def init_embedding_matrix():
    # wv_model = KeyedVectors.load(tencent_pretrained_word_embedding)
    wv_model = Word2Vec.load(corpus_word_embedding)
    tokenizer = load_tokenizer()
    load_embedding_matrix(tokenizer.word_index, wv_model)
Example #7
# encoding: utf8

import env  # noqa: F401 -- side-effect import, presumably environment/path setup
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from senti_analysis import config
from senti_analysis.preprocess import load_embedding_matrix, load_tokenizer

tencent_pretrained_word_embedding = '/Users/hotbaby/Datasets/Tencent_AILab_ChineseEmbedding.txt'
corpus_word_embedding = '/Users/hotbaby/code/github/sentiment-analysis/notebooks/w2v.model'

if __name__ == '__main__':
    wv_model = KeyedVectors.load_word2vec_format(
        tencent_pretrained_word_embedding, binary=False)
    # wv_model = Word2Vec.load(corpus_word_embedding)

    wv_model.save(config.W2V_MODEL_PATH)

    tokenizer = load_tokenizer()
    load_embedding_matrix(tokenizer.word_index, wv_model)
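Finally, `load_embedding_matrix` is only ever called here, never shown. The conventional Keras recipe is a `(len(word_index) + 1, dim)` matrix with row 0 reserved for the padding index; a sketch under that assumption (the project version presumably also persists the matrix to disk), assuming `wv_model` exposes the gensim `KeyedVectors` interface (for a full `Word2Vec` model, use `wv_model.wv`):

import numpy as np

def load_embedding_matrix(word_index, wv_model):
    # Words missing from the embedding keep zero-initialized rows.
    dim = wv_model.vector_size
    matrix = np.zeros((len(word_index) + 1, dim), dtype='float32')
    for word, i in word_index.items():
        if word in wv_model:           # KeyedVectors membership check
            matrix[i] = wv_model[word]
    return matrix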