def compute_elmo_rep(model_dir, input_list, mtype='BiLSTMAttention'):
    '''
    Given a list of documents, return a list of embedded documents.
    Each element in the list is [sentence len] * [word embedding dim].
    '''
    config = DefaultConfig()  # just take the default config to do the prediction work
    config.set_attrs({'batch_size': 8})
    model_path = '%s/model' % model_dir
    text_processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", "emphasis",
            "censored"
        },
        fix_html=True,
        segmenter="english",
        corrector="english",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])
    listTokenized = list(text_processor.pre_process_docs(input_list))
    print('After tokenization:')
    print(listTokenized)
    # batch_to_ids expects a list of tokenized sentences,
    # e.g. [['I', 'am', 'a', 'sentence'], ['A', 'sentence']]
    tensorTokenizedCharEncoded = batch_to_ids(listTokenized)
    arrayTokenizedCharEncoded = tensorTokenizedCharEncoded.numpy().astype(
        numpy.int32)
    x = Variable(torch.from_numpy(arrayTokenizedCharEncoded).long(),
                 requires_grad=False)
    if config.on_cuda:
        x = x.cuda()
    else:
        x = x.cpu()
    model = biLSTMAttention.BiLSTMAttention(
        param_document_seq_len=tensorTokenizedCharEncoded.size(1),  # 300 in our model
        param_character_embedding_len=tensorTokenizedCharEncoded.size(2),  # depends on the setting
        param_bilstm_hidden_size=1024 // 2,  # 1024 is the ELMo size; the concatenated hidden size is supposed to match it, but any size works
        param_attention_size=(1024 // 2 * 2) // 1024 * 1024 + (1024 // 2 * 2) % 1024,  # attention size should be a smoothed representation of the character embedding
        param_class_count=5,
        param_options_file=config.options_file,
        param_weight_file=config.weight_file)
    print('Loading trained model')
    # load and save are defined in biLSTMAttention.py:
    #   load <=> model.load_state_dict(torch.load(path))
    #   save <=> torch.save(model.state_dict(), path)
    # Another way is torch.save(model, path) / model = torch.load(path),
    # which stores both the state dict and the model structure.
    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()
    elmo_dict = model.forward_obtainTrainedElmoRep(x)
    # num_output_representations == 1, so the list has a single element;
    # with num_output_representations == 2 it would contain two identical
    # tensors of shape [batch_size, seq_len, word_embedding_len]
    elmo_rep = elmo_dict['elmo_representations'][0]
    arr_elmo_rep = elmo_rep.data.cpu().numpy()
    return arr_elmo_rep
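# Hedged usage sketch (not part of the original pipeline): _demo_compute_elmo_rep
# is a hypothetical helper showing how compute_elmo_rep might be called on raw
# strings. The model directory and the sample texts are placeholders; the expected
# shape assumes a single ELMo output representation of size 1024, as above.
def _demo_compute_elmo_rep(model_dir='save/0'):
    raw_docs = [
        'This movie was surprisingly good!',
        'Terrible service, never again.',
    ]
    elmo = compute_elmo_rep(model_dir, raw_docs)
    # expected shape: [num_docs, padded_seq_len, 1024]
    print('ELMo representation shape:', elmo.shape)
    return elmo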
def predict(model_dir, mtype='BiLSTMAttention'):
    '''
    Load the model and conduct the prediction. The raw prediction is a class
    index, so 1 is added before saving.
    Predictions are saved to '%s/res.txt' % model_dir.
    '''
    model_path = '%s/model' % model_dir
    output_path = '%s/res.txt' % model_dir
    config = DefaultConfig()  # just take the default config to do the prediction work
    config.set_attrs({'batch_size': 8})
    if mtype == 'BiLSTMAttention':
        model = biLSTMAttention.BiLSTMAttention(
            param_document_seq_len=DOCUMENT_SEQ_LEN,  # 300 in our model
            param_character_embedding_len=CHARACTER_EMBEDDING_LEN,  # depends on the setting
            param_bilstm_hidden_size=1024 // 2,  # 1024 is the ELMo size; the concatenated hidden size is supposed to match it, but any size works
            param_attention_size=(1024 // 2 * 2) // 1024 * 1024 + (1024 // 2 * 2) % 1024,  # attention size should be a smoothed representation of the character embedding
            param_class_count=5,
            param_options_file=config.options_file,
            param_weight_file=config.weight_file)
    print('Loading trained model')
    if config.on_cuda:
        config.on_cuda = torch.cuda.is_available()
        if not config.on_cuda:
            print('CUDA is unavailable; the model will run on CPU instead.')
    # load and save are defined in biLSTMAttention.py:
    #   load <=> model.load_state_dict(torch.load(path))
    #   save <=> torch.save(model.state_dict(), path)
    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()
    print('Begin loading data')
    datamanager = DataManager(
        param_batch_size=config.batch_size,
        param_training_instances_size=TESTING_INSTANCES)  # the batch size makes no difference here
    datamanager.load_dataframe_from_file(TEST_SET_PATH)
    n_batch = datamanager.n_batches()
    res = numpy.array([])
    batch_index = 0
    for batch_index in range(n_batch - 1):
        (x, y) = datamanager.next_batch()
        x = Variable(torch.from_numpy(x).long(), requires_grad=False)
        if config.on_cuda:
            x = x.cuda()
        else:
            x = x.cpu()
        scores = model.forward(x)
        _, predict = torch.max(scores, 1)  # argmax over dimension 1, the class dimension
        res = numpy.append(res, predict.data.cpu().numpy(), axis=0)
        print('%d/%d' % (batch_index, n_batch))
    if TESTING_INSTANCES % config.batch_size == 0:
        datamanager.set_current_cursor_in_dataframe_zero()
    else:
        batch_index += 1  # continue from the loop's final value
        (x, y) = datamanager.tail_batch()
        x = Variable(torch.from_numpy(x).long(), requires_grad=False)
        if config.on_cuda:
            x = x.cuda()
        else:
            x = x.cpu()
        scores = model.forward(x)
        _, predict = torch.max(scores, 1)
        res = numpy.append(res, predict.data.cpu().numpy(), axis=0)
        print('%d/%d' % (batch_index, n_batch))
    res = res[:TESTING_INSTANCES]
    res = res + 1  # shift from 0-based class indices to 1-based labels
    numpy.savetxt(output_path, res, fmt='%d')
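# Hedged usage sketch (hypothetical helper, not in the original code): shows how
# predict writes 1-based class labels to '<model_dir>/res.txt' and how they might
# be read back with numpy. The directory name is a placeholder.
def _demo_predict(model_dir='save/0'):
    predict(model_dir)
    labels = numpy.loadtxt('%s/res.txt' % model_dir, dtype=numpy.int64)
    # labels[i] is the predicted class (1..5) for the i-th test instance
    print('first predictions:', labels[:10])
    return labels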
def save_elmo_rep(model_dir, input_path, output_path, mtype='BiLSTMAttention'):
    '''
    Given a tokenized, character-encoded txt file at input_path, save the
    word-embedded file to output_path.
    Each line is [sentence len] * [word embedding dim].
    '''
    model_path = '%s/model' % model_dir
    config = DefaultConfig()  # just take the default config to do the prediction work
    config.set_attrs({'batch_size': 8})
    if mtype == 'BiLSTMAttention':
        model = biLSTMAttention.BiLSTMAttention(
            param_document_seq_len=DOCUMENT_SEQ_LEN,  # 300 in our model
            param_character_embedding_len=CHARACTER_EMBEDDING_LEN,  # depends on the setting
            param_bilstm_hidden_size=1024 // 2,  # 1024 is the ELMo size; the concatenated hidden size is supposed to match it, but any size works
            param_attention_size=(1024 // 2 * 2) // 1024 * 1024 + (1024 // 2 * 2) % 1024,  # attention size should be a smoothed representation of the character embedding
            param_class_count=5,
            param_options_file=config.options_file,
            param_weight_file=config.weight_file)
    print('Loading trained model')
    if config.on_cuda:
        config.on_cuda = torch.cuda.is_available()
        if not config.on_cuda:
            print('CUDA is unavailable; the model will run on CPU instead.')
    # load and save are defined in biLSTMAttention.py:
    #   load <=> model.load_state_dict(torch.load(path))
    #   save <=> torch.save(model.state_dict(), path)
    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()
    print('Begin loading data')
    datamanager = DataManager(
        param_batch_size=config.batch_size,
        param_training_instances_size=TESTING_INSTANCES)  # the batch size makes no difference here
    datamanager.load_dataframe_from_file(input_path)
    n_batch = datamanager.n_batches()
    # res starts with shape (0, DOCUMENT_SEQ_LEN * 1024); be sure to append on axis 0
    res = numpy.empty((0, DOCUMENT_SEQ_LEN * 1024), dtype=numpy.float32)
    batch_index = 0
    for batch_index in range(n_batch - 1):
        x = datamanager.next_batch_nolabel()
        x = Variable(torch.from_numpy(x).long(), requires_grad=False)
        if config.on_cuda:
            x = x.cuda()
        else:
            x = x.cpu()
        elmo_dict = model.forward_obtainTrainedElmoRep(x)
        var_elmo_rep = elmo_dict['elmo_representations'][0]
        var_elmo_rep = var_elmo_rep.view(config.batch_size,
                                         DOCUMENT_SEQ_LEN * 1024)  # 1024 is the ELMo size, fixed
        res = numpy.append(res, var_elmo_rep.data.cpu().numpy(), axis=0)
        print('%d/%d' % (batch_index, n_batch))
    if TESTING_INSTANCES % config.batch_size == 0:
        datamanager.set_current_cursor_in_dataframe_zero()
    else:
        batch_index += 1  # continue from the loop's final value
        x = datamanager.tail_batch_nolabel()
        x = Variable(torch.from_numpy(x).long(), requires_grad=False)
        if config.on_cuda:
            x = x.cuda()
        else:
            x = x.cpu()
        elmo_dict = model.forward_obtainTrainedElmoRep(x)
        var_elmo_rep = elmo_dict['elmo_representations'][0]
        var_elmo_rep = var_elmo_rep.view(config.batch_size,
                                         DOCUMENT_SEQ_LEN * 1024)
        res = numpy.append(res, var_elmo_rep.data.cpu().numpy(), axis=0)
        print('%d/%d' % (batch_index, n_batch))
    res = res[:TESTING_INSTANCES]
    numpy.savetxt(output_path, res, fmt='%f')
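# Hedged usage sketch (hypothetical helper): save_elmo_rep flattens each document
# to one row of DOCUMENT_SEQ_LEN * 1024 floats, so the saved matrix can be
# reloaded and reshaped back to [instances, seq_len, 1024]. All paths below are
# placeholders.
def _demo_reload_saved_elmo(model_dir='save/0',
                            input_path='data/test_encoded.txt',
                            output_path='data/test_elmo.txt'):
    save_elmo_rep(model_dir, input_path, output_path)
    flat = numpy.loadtxt(output_path, dtype=numpy.float32)
    per_token = flat.reshape(-1, DOCUMENT_SEQ_LEN, 1024)
    print('reloaded ELMo shape:', per_token.shape)
    return per_token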
def train(**kwargs):
    '''
    Begin training the model.
    *args:    train(1, 2, 3, 4, 5)      => args[0] = 1, args[1] = 2, ...; args is a tuple
    **kwargs: train(a=1, b=2, c=3, d=4) => kwargs['a'] = 1, kwargs['b'] = 2, ...; kwargs is a dict
    A function taking positional, *args and **kwargs arguments must be declared as:
        def train(args, *args, **kwargs)
    '''
    saveid = latest_save_num() + 1
    save_path = '%s/%d' % (SAVE_DIR, saveid)
    print("logger save path: %s" % (save_path))
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    log_path_each_save = '%s/log.txt' % save_path
    model_path_each_save = '%s/model' % save_path
    logger = get_logger(log_path_each_save)
    config = DefaultConfig()
    config.set_attrs(kwargs)  # settings here, including whether to run on CUDA
    epochs = config.epochs
    batch_size = config.batch_size
    if config.on_cuda:  # determine whether to run on CUDA
        config.on_cuda = torch.cuda.is_available()
        if not config.on_cuda:
            logger.info('CUDA is unavailable; the model will run on CPU instead.')
    if config.model == 'BiLSTMAttention':
        model = biLSTMAttention.BiLSTMAttention(
            param_document_seq_len=DOCUMENT_SEQ_LEN,  # 300 in our model
            param_character_embedding_len=CHARACTER_EMBEDDING_LEN,  # depends on the setting
            param_bilstm_hidden_size=1024 // 2,  # 1024 is the ELMo size; the concatenated hidden size is supposed to match it, but any size works
            param_attention_size=(1024 // 2 * 2) // 1024 * 1024 + (1024 // 2 * 2) % 1024,  # attention size should be a smoothed representation of the character embedding
            param_class_count=5,
            param_options_file=config.options_file,
            param_weight_file=config.weight_file)
    if config.on_cuda:
        logger.info('Model runs on GPU')
        model = model.cuda()
        logger.info('Model initialized on GPU')
    else:
        logger.info('Model runs on CPU')
        model = model.cpu()
        logger.info('Model initialized on CPU')
    logger.info(model.modelname)  # log the model name
    logger.info(str(config.get_attrs()))  # log the configuration
    # read in the train set and the trial set
    train_data_manager = DataManager(batch_size, TRAINING_INSTANCES)  # train set
    train_data_manager.load_dataframe_from_file(TRAIN_SET_PATH)
    # set the optimizer parameters such as learning rate and weight_decay;
    # Adam is a method for stochastic optimization
    lr = config.learning_rate  # learning rate from the config, i.e. settings.py
    # params_iterator_requires_grad can only be iterated once
    params_iterator_requires_grad = filter(
        lambda trainingParams: trainingParams.requires_grad,
        model.parameters())
    optimizer = torch.optim.Adam(
        params_iterator_requires_grad,
        lr=lr,
        weight_decay=config.weight_decay  # weight decay is an L2 penalty (L2 regularization) added to the loss to avoid overfitting
    )
    # By default, the losses are averaged over observations for each minibatch.
    # If size_average is set to False, the losses are summed for each minibatch instead.
    criterion = torch.nn.CrossEntropyLoss(size_average=False)
    # once you have the loss function, you still have to train the parameters
    # of g(x), which are used for prediction
    loss_meter = meter.AverageValueMeter()  # running average of the loss
    confusion_matrix = meter.ConfusionMeter(CLASS_COUNT)
    # the confusion matrix looks like:
    #                       class1 predicted  class2 predicted  class3 predicted
    # class1 ground truth [[4,                1,                1],
    # class2 ground truth  [2,                3,                1],
    # class3 ground truth  [1,                2,                9]]
    model.train()
    pre_loss = 1e100
    best_acc = 0
    for epoch in range(epochs):
        # one epoch trains on all batches, i.e. all the data, once
        loss_meter.reset()
        confusion_matrix.reset()
        train_data_manager.reshuffle_dataframe()
        n_batch = train_data_manager.n_batches()  # ceiled, so it is "instances / batch_size + 1"
        batch_index = 0
        for batch_index in range(0, n_batch - 1):
            (x, y) = train_data_manager.next_batch()  # this operation is time consuming
            x = Variable(torch.from_numpy(x).long())  # Variables require grad by default
            y = Variable(torch.LongTensor(y), requires_grad=False)
            y = y - 1  # labels are 1-based, class indices are 0-based
            loss, scores, corrects = eval_batch(model, x, y, criterion,
                                                config.on_cuda)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.data.item())  # .item() turns a 0-dim tensor into a Python number
            confusion_matrix.add(scores.data, y.data)
            if (batch_index + 1) % 50 == 0:  # every 50 batches, display the accuracy of the batch
                accuracy = corrects.float() / config.batch_size  # cast to float: for two LongTensors, 17 / 18 == 0
                logger.info(
                    'TRAIN\tepoch: %d/%d\tbatch: %d/%d\tloss: %f\taccuracy: %f'
                    % (epoch, epochs, batch_index, n_batch,
                       loss_meter.value()[0], accuracy))  # .value()[0] is the loss value
        if TRAINING_INSTANCES % batch_size == 0:
            train_data_manager.set_current_cursor_in_dataframe_zero()
        else:
            batch_index += 1  # continue from the loop's final value
            (x, y) = train_data_manager.tail_batch()
            x = Variable(torch.from_numpy(x).long())
            y = Variable(torch.LongTensor(y), requires_grad=False)
            y = y - 1
            loss, scores, corrects = eval_batch(model, x, y, criterion,
                                                config.on_cuda)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.data.item())
            confusion_matrix.add(scores.data, y.data)
            if (batch_index + 1) % 50 == 0:
                accuracy = corrects.float() / config.batch_size
                logger.info(
                    'TRAIN\tepoch: %d/%d\tbatch: %d/%d\tloss: %f\taccuracy: %f'
                    % (epoch, epochs, batch_index, n_batch,
                       loss_meter.value()[0], accuracy))
        # after each epoch the model is evaluated
        model.eval()  # switch to evaluation mode
        confusion_matrix_value = confusion_matrix.value()
        acc = 0
        for i in range(CLASS_COUNT):
            acc += confusion_matrix_value[i][i]  # correct prediction count on the diagonal
        acc = acc / confusion_matrix_value.sum()  # overall accuracy in this epoch
        the_overall_averaged_loss_in_epoch = loss_meter.value()[0]  # value() is a length-1 sequence, so take [0]
        logger.info('epoch: %d/%d\taverage_loss: %f\taccuracy: %f' %
                    (epoch, epochs, the_overall_averaged_loss_in_epoch, acc))
        model.train()  # switch back to training mode
        # if the accuracy improved, save the model and decay the learning rate
        if acc > best_acc:
            model.save(model_path_each_save)
            logger.info('model saved to %s' % model_path_each_save)
            lr = lr * config.lr_decay
            logger.info('learning_rate changed to %f' % lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            best_acc = acc
        pre_loss = loss_meter.value()[0]
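# Hedged usage sketch (hypothetical): train() forwards its keyword arguments to
# DefaultConfig.set_attrs, so hyperparameters such as epochs, batch_size or
# learning_rate can be overridden at the call site. The exact values below are
# illustrative only; the attribute names are assumed to match those read from
# the config above.
def _demo_train():
    train(epochs=5, batch_size=8, learning_rate=1e-3)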