def load_config(config_file='./myconfig.json'):
    """Ensure *config_file* exists (writing defaults if missing) and
    return it wrapped in an ``Option``.

    Parameters
    ----------
    config_file : str
        Path of the JSON configuration file to load or create.

    Returns
    -------
    Option
        The parsed configuration object.
    """
    if not os.path.exists(config_file):
        config_data = OrderedDict()
        ##################################################################
        # default option setting..
        config_data["data_dir"] = '~/usb/project/kakao_arena/data'
        config_data["dataset_dir"] = '~/usb/project/kakao_arena/dataset'
        ##################################################################
        # kakao default setting
        config_data["unigram_hash_size"] = 100000
        config_data["min_word_length"] = 2
        config_data["max_word_length"] = 31
        config_data["max_len"] = 32
        config_data["db_chunk_size"] = 100000
        config_data["num_workers"] = 10
        # NOTE(review): "num_preidct_workers" looks like a typo of
        # "num_predict_workers"; kept as-is because consumers may read
        # this exact key -- confirm before renaming.
        config_data["num_preidct_workers"] = 2
        config_data["embd_size"] = 128
        config_data["lr"] = 1e-4
        config_data["num_epochs"] = 100
        config_data["batch_size"] = 1024
        ##################################################################
        # `with` closes the file; the original's explicit fp.close() was
        # redundant and has been removed.
        with open(config_file, 'w') as fp:
            json.dump(config_data, fp, ensure_ascii=False, indent=4)
    return Option(config_file)
def main():
    """Train the HMCN model, keep the best checkpoint by validation
    loss, then produce dev-set predictions and pad the result file with
    "no answer" rows for any pids the evaluation did not emit.

    NOTE(review): relies on module-level names defined elsewhere in the
    file (continue_train, best_model_path, save_model_path, result_path,
    train_loader, valid_loader, dev_loader, train, evaluate, HMCN,
    Option, nn, h5py) -- confirm they are in scope before running.
    """
    opt = Option('./config.json')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # model
    model = HMCN(opt).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr,
                                 betas=(0.9, 0.999))
    num_params = sum(p.numel() for p in model.parameters())
    print('Total # of params: {:,}'.format(num_params))

    # fixed idiom: truthiness test instead of `== True`
    if continue_train:
        model.load_state_dict(torch.load(best_model_path))

    # fixed: float('inf') instead of the magic sentinel 100000., which
    # would silently skip checkpointing if early losses exceeded it
    best_loss = float('inf')
    for epoch in range(opt.num_epochs):
        train(opt, train_loader, model, criterion, optimizer, epoch)
        val_loss = evaluate(opt, valid_loader, model, criterion,
                            make_file=False)
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), best_model_path)
            print('model saved at loss: %.4f' % (best_loss))
        if (epoch + 1) % 5 == 0:
            # periodic snapshot every 5 epochs, independent of best-loss
            torch.save(model.state_dict(),
                       save_model_path + '_E%d.pth' % (epoch + 1))

    # reload the best checkpoint and write dev predictions to result_path
    model.load_state_dict(torch.load(best_model_path))
    evaluate(opt, dev_loader, model, criterion, make_file=True)

    # collect the full pid ordering from the dev h5 file
    pid_order = []
    h = h5py.File('./data/dev/data.h5py', 'r')['dev']
    pid_order.extend(h['pid'][::])

    # pad the prediction file: every pid beyond what was written gets a
    # "-1 -1 -1 -1" (no answer) row so the file covers all dev pids
    no_ans = '{pid}\t-1\t-1\t-1\t-1'
    with open(result_path, 'r') as f:
        file_len = len(f.readlines())
    print('total prediction length:', file_len)
    with open(result_path, 'a') as f:
        for pid in pid_order[file_len:]:
            f.write(no_ans.format(pid=pid))
            f.write('\n')
    print('created file at %s' % (result_path))
import time import traceback from multiprocessing import Pool from gensim.models import Doc2Vec from elasticsearch5 import Elasticsearch import tqdm import fire import h5py import numpy as np import six from six.moves import cPickle import pandas as pd from misc import get_logger, Option opt = Option('./config.json') es = Elasticsearch(hosts=opt.es_host) #TODO conf class Reader(object): def __init__(self, data_path_list, div, begin_offset, end_offset): self.div = div self.data_path_list = data_path_list self.begin_offset = begin_offset self.end_offset = end_offset def is_range(self, i): if self.begin_offset is not None and i < self.begin_offset: return False if self.end_offset is not None and self.end_offset <= i:
# Command-line options for the chunk -> tfrecord conversion step.
# NOTE(review): `parser` is created earlier in the file (outside this
# chunk) -- presumably an argparse.ArgumentParser; confirm.
parser.add_argument('--input_root', default='/data/output/tmp',
                    help='folder to load shuffled chunks')
parser.add_argument('--output_root', default='/data/output',
                    help='folder to save tfrecords')
# --shuffle is parsed from a string so "True"/"true" map to True and
# anything else to False.
parser.add_argument('--shuffle',
                    type=lambda x: (str(x).lower() == 'true'),
                    default=True,
                    help='shuffle indices in chunks')
args = parser.parse_args()

# Create the output directory on first run.
if not os.path.exists(args.output_root):
    os.makedirs(args.output_root)

opt = Option("./config.json")

# Input chunk filename pattern: shuffled chunks are preferred when
# --shuffle is on, otherwise the plain split chunks are read.
final_format = "%s_splitted.chunk.%02d"
if args.shuffle:
    final_format = "%s_shuffled.chunk.%02d"
# Output tfrecord name embeds the vocab hash size and max sequence
# length so records from different configs cannot be mixed up.
tfrecord_format = "%s" + (
    "-%d-max%d" % (opt.unigram_hash_size, opt.max_len)) + ".%02d.tfrecord"


def _bytes_feature(value):
    # Wrap a single bytes value in a tf.train.Feature (note: one value,
    # hence the [value] list).
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(value):
    # Wrap an iterable of ints in a tf.train.Feature (value is passed
    # through as the whole list, unlike _bytes_feature).
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
# See the License for the specific language governing permissions and # limitations under the License. import tensorflow as tf import keras from keras.models import Model from keras.layers.merge import dot from keras.layers import Dense, Input from keras.layers.core import Reshape from keras.layers.embeddings import Embedding from keras.layers.core import Dropout, Activation from misc import get_logger, Option opt = Option('shopping-classification/config.json') def top1_acc(x, y): return keras.metrics.top_k_categorical_accuracy(x, y, k=1) class TextOnly: def __init__(self): self.logger = get_logger('textonly') def get_model(self, num_classes, activation='sigmoid'): max_len = opt.max_len voca_size = opt.unigram_hash_size + 1 with tf.device('/gpu:0'):
import numpy as np import tensorflow as tf from tensorflow.contrib.tensorboard.plugins import projector from keras.models import load_model from keras.callbacks import ModelCheckpoint from datetime import datetime from misc import get_logger, Option from shutil import copyfile import sklearn.metrics as sklm config_file_path = './config.json' copyfile(config_file_path, 'model/config.json') # backup config file (overwrite) opt = Option(config_file_path) cate1 = json.loads(open(opt.cate1, 'r').read()) os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpu class Classifier(): def __init__(self): self.logger = get_logger('Classifier') self.num_classes = 0 def get_sample_generator(self, ds, batch_size): left = 0 limit = ds['uni'].shape[0] while True: right = min(left + batch_size, limit)
def __init__(self, conf, verbose=False):
    """Set up logging, broker-connection status and configuration.

    Parameters
    ----------
    conf : str
        Path to the JSON config file handed to ``Option``.
    verbose : bool, optional
        Extra logging flag stored for later use (default False).
    """
    # Store the flag first; the remaining fields come from project helpers.
    self.verbose = verbose
    self.logger = get_logger()
    # NOTE(review): CpCybos.get_instance() presumably returns the broker
    # API session singleton -- confirm against the CpCybos definition.
    self.status = CpCybos.get_instance()
    self.opt = Option(conf)