Example #1
from utils import io, format_labels  # assumed location, mirroring Example #4's imports


def prepare(data_id, cfg_path='./config.yml'):
    print('start preparing')
    cfg = io.load_yml(cfg_path, data_id)
    data = io.load_csv(cfg['data_file'])

    # Normalise the text column name so downstream code can rely on 'text'.
    data.rename(columns={cfg['text_col']: 'text'}, inplace=True)

    # Keep only the configured columns, plus any optional extra columns.
    if 'add_col' in cfg:
        data = data[cfg['add_col'] + ['text', cfg['label_col']]]
    else:
        data = data[['text', cfg['label_col']]]

    data.dropna(subset=['text', cfg['label_col']], inplace=True)
    data.drop_duplicates(inplace=True)

    # Whitespace-token count per example.
    data['seq_length'] = data['text'].str.split().str.len()

    # Normalise labels: sort (splitting on cfg['sep']) and keep a
    # joined-string form alongside the list form.
    data['label'] = data[cfg['label_col']].apply(format_labels.sort,
                                                 args=(cfg['sep'],))
    data['str_label'] = data['label'].apply(format_labels.join)

    # One-hot encode against the full label vocabulary.
    unique_labels = format_labels.get_unique(data.label.tolist())
    data['one_hot_labels'] = data['label'].apply(format_labels.encode_onehot,
                                                 args=(unique_labels,))

    io.to_pickle(data, cfg['pkl_file'])
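
A minimal call sketch, assuming ./config.yml holds an entry keyed by the dataset id ('toxic_comments' is a hypothetical id, not from the source):

prepare('toxic_comments')                            # default config path
prepare('toxic_comments', cfg_path='conf/alt.yml')   # hypothetical alternate path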
Example #2
def load_data(cfg_path, DATA_ID):
    cfg = io.load_yml(cfg_path, DATA_ID)
    try:
        data = io.load_pickle(cfg['pkl_file'])
    except FileNotFoundError:
        # Pickle not built yet: run the preparation step, then reload.
        # Note: prepare expects the config path, not the pickle path.
        prepare.prepare(DATA_ID, cfg_path)
        data = io.load_pickle(cfg['pkl_file'])

    return data
Example #3
def load_data(configuration_path, DATA_ID):
    configuration = io.load_yml(configuration_path, DATA_ID)
    try:
        data = io.load_pickle(configuration['pkl_file'])
    except FileNotFoundError:
        # Build the pickle on first use, then reload it.
        prepare.prepare(DATA_ID, configuration_path)
        data = io.load_pickle(configuration['pkl_file'])

    return data
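
Both variants are called the same way; a minimal sketch, reusing the same hypothetical dataset id:

data = load_data('./config.yml', 'toxic_comments')
print(data[['text', 'str_label', 'seq_length']].head())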
Example #4
import classifier
import classifier_with_tfhub
from utils import tokenization
from utils import io, format_labels  # assumed location, as in Example #1
import tensorflow_hub as hub

BERT_MODEL = 'uncased_L-12_H-768_A-12'
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_' + BERT_MODEL + '/1'

USE_TPU = False

DATA_ID = 'toxic_comments'  # hypothetical id; set to your config.yml key

cfg = io.load_yml('./config.yml', DATA_ID)
data = io.load_pickle(cfg['pkl_file'])

all_labels = format_labels.get_unique_labels(data.label.tolist())
tokenizer = classifier_with_tfhub.create_tokenizer_from_hub_module(
    BERT_MODEL_HUB)

# Sample the train split first, drop it from the frame, then truncate;
# truncating before the drop would leak sampled rows into the test split.
train_sample = data.sample(frac=0.7, random_state=72)
test_values = data.drop(train_sample.index)[:20]
train_values = train_sample[:100]

TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1.0
MAX_SEQ_LENGTH = 128
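
One plausible next step, assuming `classifier` is the BERT repo's run_classifier module, so that it exposes InputExample and convert_examples_to_features (both names are assumptions here, not confirmed by the source):

# Wrap each row as an InputExample; guid is unused by BERT's input pipeline.
train_examples = [
    classifier.InputExample(guid=None, text_a=row['text'], text_b=None,
                            label=row['str_label'])
    for _, row in train_values.iterrows()
]

# Tokenise, pad/truncate to MAX_SEQ_LENGTH, and map labels to ids.
train_features = classifier.convert_examples_to_features(
    train_examples, all_labels, MAX_SEQ_LENGTH, tokenizer)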