def process(self, data_loaders, outputs):
    """Train the initial NMT model on the BEAT training data and emit it.

    Reads the pickled training corpus and the source/target vocabularies
    from the first data loader, splits the corpus into train/dev with
    ``beat_separate_train_valid``, fills ``self.params`` with the sections
    expected by nmtpytorch, runs the main training loop, and writes the
    serialized model (a 1D uint8 array produced via ``torch.save``) to the
    ``model`` output.

    Args:
        data_loaders: BEAT data loader list; index 0 yields the training
            data and vocabularies.
        outputs: BEAT output map; the trained model is written to
            ``outputs['model']``.

    Returns:
        True, signalling BEAT to continue processing.
    """
    beat_logger.debug("############### mt_train_initial_model")
    dl = data_loaders[0]
    (data, _, end_data_index) = dl[0]

    # Separate train and dev data.
    # NOTE(review): pickle.loads on platform-provided data — trusted here
    # because it comes from the BEAT pipeline, but never use on untrusted
    # input.
    data_dict = pickle.loads(data["train_data"].text.encode("latin1"))
    self.data_dict_train, self.data_dict_dev = beat_separate_train_valid(
        data_dict)

    # Build the nmtpytorch configuration sections from the received data.
    self.params['data'] = {}
    self.params['data']['train_set'] = {}
    self.params['data']['train_set']['src'] = self.data_dict_train['src']
    self.params['data']['train_set']['trg'] = self.data_dict_train['trg']
    self.params['data']['val_set'] = {}
    self.params['data']['val_set']['src'] = self.data_dict_dev['src']
    self.params['data']['val_set']['trg'] = self.data_dict_dev['trg']
    self.params['vocabulary'] = {}
    self.params['vocabulary']['src'] = data['source_vocabulary'].text
    self.params['vocabulary']['trg'] = data['target_vocabulary'].text
    # No config file exists on the BEAT platform; a dummy path is required
    # by the Options API.
    self.params['filename'] = '/not/needed/beat_platform'
    self.params['sections'] = ['train', 'model', 'data', 'vocabulary']
    opts = Options.from_dict(self.params, {})
    setup_experiment(opts, beat_platform=True)

    dev_mgr = DeviceManager("gpu")

    # If given, seed with that value; otherwise generate a random seed.
    # BUG FIX: the seeded branch previously stored the result in an unused
    # local (`seed = fix_seed(...)`), so `opts.train['seed']` — logged
    # below for reproducibility — was never updated. Write it back in
    # both branches.
    if opts.train['seed'] > 0:
        opts.train['seed'] = fix_seed(opts.train['seed'])
    else:
        opts.train['seed'] = fix_seed()

    # Instantiate the model object named by the config (e.g. an NMT class
    # from nmtpytorch.models).
    model = getattr(models, opts.train['model_type'])(opts=opts,
                                                      beat_platform=True)

    beat_logger.info(
        "Python {} -- torch {} with CUDA {} (on machine '{}')".format(
            platform.python_version(), torch.__version__,
            torch.version.cuda, platform.node()))
    beat_logger.info("nmtpytorch {}".format(nmtpytorch.__version__))
    beat_logger.info(dev_mgr)
    beat_logger.info("Seed for further reproducibility: {}".format(
        opts.train['seed']))

    loop = MainLoop(model, opts.train, dev_mgr, beat_platform=True)
    model = loop()

    # The model is pickled with torch.save() and converted into a
    # 1D-array of uint8. Pass the model to the next block.
    outputs['model'].write({'value': model}, end_data_index)
    beat_logger.debug("############## End of mt_train_model ############")
    return True
def unsupervised_model_adaptation(source, file_id, data_dict, model,
                                  translator, current_hypothesis):
    """Placeholder for unsupervised adaptation of the translation model.

    Demonstrates how the original training material (``data_dict``) can be
    re-read at any point of the lifelong loop, both as a train/valid split
    and per-document by file id. No adaptation is implemented yet: the
    incoming ``model`` and ``translator`` are returned unchanged.

    Args:
        source: Source text of the document currently being processed.
        file_id: Identifier of that document.
        data_dict: Full training corpus, keyed by training file id.
        model: Current serialized model.
        translator: Current Translator object.
        current_hypothesis: Translation produced by ``translator``.

    Returns:
        The (possibly adapted) ``(model, translator)`` pair — currently
        the inputs, untouched.
    """
    # The training corpus stays accessible throughout lifelong adaptation.
    # beat_separate_train_valid recreates the same train/valid split as
    # during initial training (document-level structure is lost).
    train_split, valid_split = beat_separate_train_valid(data_dict)

    # --- Example: accessing one training document by its file id --------
    # Here we simply pick the third key; in real code the id would come
    # from a lookup built over data_dict.keys().
    example_id = list(data_dict.keys())[2]
    example_info = data_dict[example_id]["file_info"]
    example_time_stamp = example_info['time_stamp']
    example_supervision = example_info['supervision']
    example_source = data_dict[example_id]["source"]

    # TODO: Update the model and the translator object
    return model, translator
def process(self, inputs, data_loaders, outputs, loop_channel):
    """Translate one incoming document inside the lifelong loop.

    Lazily restores the baseline model and training data on first call,
    translates the incoming document, optionally interacts with the human
    in the loop (active/interactive supervision) to adapt the model, and
    writes the final hypothesis to ``outputs['hypothesis']``.

    Args:
        inputs: BEAT inputs carrying the document source text and its
            file_info (file_id, supervision mode, time_stamp).
        data_loaders: loader[0] provides the trained model and the pickled
            training corpus.
        outputs: BEAT outputs; the hypothesis is written here.
        loop_channel: Channel used to send requests to the user simulation
            and receive its answers.

    Returns:
        True, signalling BEAT to continue processing.
    """
    ########################################################
    # RECEIVE INCOMING INFORMATION FROM THE FILE TO PROCESS
    ########################################################

    # Access source text of the current file to process
    source = inputs["processor_lifelong_source"].data.text

    # Recreate the translation model and train/dev data (first call only;
    # cached on self afterwards).
    if self.model is None or self.data_dict_train is None:
        # Get the model after initial training
        dl = data_loaders[0]
        (data, _, end_index) = dl[0]
        # Store the baseline model as raw bytes (the loader delivers it as
        # a sequence of uint8 values).
        if self.model is None:
            model_data = data['model'].value
            self.model = struct.pack('{}B'.format(len(model_data)),
                                     *list(model_data))
        if self.data_dict_train is None:
            data_dict = pickle.loads(
                data["processor_train_data"].text.encode("latin1"))
            # Same train/valid split as used during initial training.
            self.data_dict_train, self.data_dict_dev = beat_separate_train_valid(
                data_dict)

    # Create a baseline Translator object from nmtpy
    self.translate_params['models'] = [self.model]
    self.translate_params['source'] = source
    translator = Translator(beat_platform=True, **self.translate_params)

    # Train sentence vectors for data selection (computed once, cached).
    if self.train_sen_vecs is None:
        # Get the vocab from the opts of the model
        self.src_vocab = json.loads(
            translator.instances[0].opts['vocabulary']['src'])
        # Get the embeddings from the model's weights
        self.word_embs = translator.instances[0].enc.emb.weight
        # Create the sentence embeddings for the training data
        self.train_sen_vecs = get_sen_vecs(self.data_dict_train,
                                           self.src_vocab,
                                           self.word_embs)

    # Access incoming file information.
    # See documentation for a detailed description of the mt_file_info
    file_info = inputs["processor_lifelong_file_info"].data
    file_id = file_info.file_id
    supervision = file_info.supervision
    time_stamp = file_info.time_stamp

    # NOTE(review): hard-coded user path — hypotheses are dumped here for
    # offline inspection; will fail on machines without this directory.
    path_llnmt = '/home/barrault/msc/lifelongmt/'
    for p in ('original', 'adapted'):
        if not os.path.exists(path_llnmt + '{}'.format(p)):
            os.mkdir(path_llnmt + '{}'.format(p))
    original_file = path_llnmt + 'original/{}'.format(file_id)
    adapted_file = path_llnmt + 'adapted/{}'.format(file_id)
    beat_logger.debug(
        "mt_lifelong_loop::process: received document {} ({} sentences) to translate "
        .format(file_id, len(source)))

    # Translate with the baseline model and keep a copy of the hypothesis.
    current_hypothesis = run_translation(translator, source, file_id)
    with open(original_file, 'w') as f:
        for s in current_hypothesis:
            f.write(s)
            f.write('\n')

    ###################################################################
    # Interact with the human if necessary.
    # This section exchanges information with the user simulation and
    # ends up with a new hypothesis.
    ###################################################################
    human_assisted_learning = supervision in ["active", "interactive"]

    # code not used!!
    if not human_assisted_learning:
        # In this method, see how to access initial training data to adapt
        # the model for the new incoming data.
        # NOTE(review): passes self.train_data, while the rest of this
        # method uses self.data_dict_train — looks like a stale attribute
        # name; confirm self.train_data is ever set.
        self.adapted_model, adapted_translator = unsupervised_model_adaptation(
            source, file_id, self.train_data, self.model, translator,
            current_hypothesis)
        # Update current_hypothesis with current model.
        current_hypothesis = run_translation(adapted_translator, source,
                                             file_id)

    # If human assisted learning mode is on (active or interactive
    # learning). For now, only requests of type 'reference' are allowed
    # (i.e. "give me the reference translation for sentence 'sentence_id'
    # of file 'file_id'").
    while human_assisted_learning:
        if supervision == "active":
            # The system can send a question to the human in the loop by
            # using an object of type request; the request is the question
            # asked to the user.
            request = generate_system_request_to_user(
                file_id, source, current_hypothesis, self.qe_model)
            # Send the request to the user and wait for the answer.
            message_to_user = {
                "file_id": file_id,  # ID of the file the question is related to
                "hypothesis": current_hypothesis[
                    request['sentence_id']],  # The current hypothesis
                "system_request": request,  # the question for the human in the loop
            }
            human_assisted_learning, user_answer = loop_channel.validate(
                message_to_user)

            # Take into account the user answer to generate a new
            # hypothesis and possibly update the model.
            adapted_model_data = online_adaptation(
                self.model, self.translate_params, file_id,
                request['sentence_id'], user_answer, source,
                current_hypothesis, self.data_dict_train,
                self.train_sen_vecs, self.src_vocab, self.word_embs)

            # Update the translator object with the adapted model (re-pack
            # the uint8 sequence into raw bytes, as for the baseline).
            self.adapted_model = struct.pack(
                '{}B'.format(len(adapted_model_data)),
                *list(adapted_model_data))
            self.adapted_translate_params['models'] = [self.adapted_model]
            self.adapted_translate_params['source'] = source
            adapted_translator = Translator(
                beat_platform=True, **self.adapted_translate_params)
            # Generate a new translation with the adapted model.
            new_hypothesis = run_translation(adapted_translator, source,
                                             file_id)
            with open(adapted_file, 'w') as fad:
                for s in new_hypothesis:
                    fad.write(s)
                    fad.write('\n')
            # Update the current hypothesis with the new one and leave the
            # interaction loop (single round of adaptation per document).
            current_hypothesis = new_hypothesis
            human_assisted_learning = False
        else:
            # Interactive (or unexpected) mode: no request issued, leave
            # the loop with the baseline hypothesis.
            human_assisted_learning = False

    # End of human assisted learning: send the current hypothesis.
    print("mt_lifelong_loop::process: FINISHED translated document {}: ".
          format(file_id))
    outputs["hypothesis"].write(mt_to_allies(current_hypothesis))
    if not inputs.hasMoreData():
        pass
    # always return True, it signals BEAT to continue processing
    return True