def prepare(data_id, cfg_path='./config.yml'):
    """Build the preprocessed dataset for ``data_id`` and pickle it.

    Loads the configuration via ``io.load_yml``, reads the CSV named by
    ``cfg['data_file']``, keeps the text column (renamed to ``'text'``), the
    label column and any optional ``cfg['add_col']`` columns, drops rows with
    a missing text or label and exact duplicate rows, derives the
    ``seq_length``, ``label``, ``str_label`` and ``one_hot_labels`` columns,
    and writes the frame to ``cfg['pkl_file']`` via ``io.to_pickle``.

    Args:
        data_id: Identifier passed to ``io.load_yml`` alongside ``cfg_path``.
        cfg_path: Path to the YAML configuration file.

    Returns:
        None. The result is only persisted to ``cfg['pkl_file']``.
    """
    print('start preparing')
    cfg = io.load_yml(cfg_path, data_id)
    label_col = cfg['label_col']

    data = io.load_csv(cfg['data_file'])
    data = data.rename(columns={cfg['text_col']: 'text'})

    # Optional extra columns are placed before the text and label columns.
    if 'add_col' in cfg:
        columns = cfg['add_col'] + ['text', label_col]
    else:
        columns = ['text', label_col]

    # Take an explicit copy of the selection: without it pandas tracks the
    # frame as a slice of the raw CSV frame and the column assignments below
    # raise SettingWithCopyWarning.
    data = data[columns].copy()
    data = data.dropna(subset=['text', label_col])
    data = data.drop_duplicates()

    # Number of whitespace-separated tokens in each text.
    data['seq_length'] = data['text'].map(str.split).apply(len)

    # format_labels.* helpers are defined elsewhere; each is applied
    # element-wise, with cfg['sep'] / the label vocabulary as extra argument.
    data['label'] = data[label_col].apply(
        format_labels.sort, args=(cfg['sep'],))
    data['str_label'] = data['label'].apply(format_labels.join)
    unique_labels = format_labels.get_unique(data['label'].tolist())
    data['one_hot_labels'] = data['label'].apply(
        format_labels.encode_onehot, args=(unique_labels,))

    io.to_pickle(data, cfg['pkl_file'])
def load_data(cfg_path, DATA_ID):
    """Return the preprocessed dataset for ``DATA_ID``, building it if needed.

    The pickle named by ``cfg['pkl_file']`` is loaded directly when possible.
    If loading fails, ``prepare.prepare`` is run to (re)create the pickle and
    the load is retried once.

    Args:
        cfg_path: Path to the YAML configuration file.
        DATA_ID: Identifier passed to ``io.load_yml`` and ``prepare.prepare``.

    Returns:
        Whatever ``io.load_pickle`` returns for ``cfg['pkl_file']``.

    Raises:
        Any exception raised by ``prepare.prepare`` or by the second
        ``io.load_pickle`` call propagates to the caller.
    """
    cfg = io.load_yml(cfg_path, DATA_ID)
    pkl_file = cfg['pkl_file']
    try:
        data = io.load_pickle(pkl_file)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit are not mistaken for a missing pickle.
        # The second argument of prepare() is the *config* path; the pickle
        # path was passed here before, which made prepare() try to read the
        # pickle file as YAML.
        prepare.prepare(DATA_ID, cfg_path)
        data = io.load_pickle(pkl_file)
    return data
def load_data(configuration_path, DATA_ID):
    """Load the pickled dataset for ``DATA_ID``, preparing it on first use.

    Args:
        configuration_path: Path to the YAML configuration file.
        DATA_ID: Identifier passed to ``io.load_yml`` and ``prepare.prepare``.

    Returns:
        Whatever ``io.load_pickle`` returns for ``configuration['pkl_file']``.

    Raises:
        Any exception raised by ``prepare.prepare`` or by the retry of
        ``io.load_pickle`` propagates to the caller.
    """
    configuration = io.load_yml(configuration_path, DATA_ID)
    pickle_path = configuration['pkl_file']
    try:
        return io.load_pickle(pickle_path)
    except Exception:
        # Catch ``Exception`` instead of using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed and do not trigger
        # a full re-preparation of the dataset.
        prepare.prepare(DATA_ID, configuration_path)
        return io.load_pickle(pickle_path)
# Name of the BERT model; it is spliced into the TF-Hub module URL below.
BERT_MODEL = 'uncased_L-12_H-768_A-12'
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_' + BERT_MODEL + '/1'
# Not read anywhere in this block; presumably consumed by later setup code
# -- TODO confirm.
USE_TPU = False

import sys
import classifier
import classifier_with_tfhub
from utils import tokenization
import tensorflow_hub as hub
import pdb

# Load the configuration for DATA_ID (DATA_ID is defined outside this block)
# and the pickled dataset that configuration points to.
# NOTE(review): `io` must provide load_yml / load_pickle, so it is presumably
# a project module rather than the standard-library `io` -- confirm.
cfg = io.load_yml('./config.yml', DATA_ID)
data = io.load_pickle(cfg['pkl_file'])

# Label vocabulary derived from the `label` column of the dataset.
all_labels = format_labels.get_unique_labels(data.label.tolist())

# Tokenizer built from the TF-Hub module selected above.
tokenizer = classifier_with_tfhub.create_tokenizer_from_hub_module(
    BERT_MODEL_HUB)

# Train split: a 70% random sample (fixed seed 72), truncated to its first
# 100 rows.
train_values = data.sample(frac=0.7, random_state=72)[:100]
# Test split: first 20 rows of `data` after dropping the (truncated) train
# rows by index. Because only those 100 rows are dropped, the test rows are
# not restricted to the 30% left out of the sample.
test_values = data.drop(train_values.index)[:20]

# Hyperparameters; none of them is read within this block.
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1.0
MAX_SEQ_LENGTH = 128