def output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=False):
    """Generate brat-format output for every dataset split present in output_filepaths.

    For each known split, the CoNLL output file is converted to brat files
    under <stats_graph_folder>/brat/<split>.
    """
    for split in ('train', 'valid', 'test', 'deploy'):
        if split not in output_filepaths:
            continue
        destination = os.path.join(stats_graph_folder, 'brat', split)
        utils.create_folder_if_not_exists(destination)
        conll_filepath = output_filepaths[split]
        # The same CoNLL file serves as both the token source and the label source.
        conll_to_brat(conll_filepath, conll_filepath, dataset_brat_folders[split], destination, overwrite=overwrite)
def predict(self, test_file_path):
    """Run NER prediction on a single text file and export the result in brat format.

    Reads the file at `test_file_path`, copies its content into the 'deploy'
    dataset folder, runs one prediction step, converts the CoNLL output to
    brat, then moves the stats folder under ../data/ for viewing with brat.

    NOTE(review): paths are split on '/' below, so this assumes POSIX-style
    paths — confirm the intended platforms.
    """
    # Not use
    text = ''
    with open(test_file_path, "r") as f:
        text = f.read()
    # Keep only the file name; the original directory is no longer needed.
    test_file_path = test_file_path.split('/')[-1]
    self.prediction_count += 1
    if self.prediction_count == 1:
        # First prediction: set up the temporary dataset folder and stats folder.
        self.parameters['dataset_text_folder'] = os.path.join('..', 'data', 'temp')
        self.stats_graph_folder, _ = self._create_stats_graph_folder(self.parameters)

    # Update the deploy folder, file, and dataset
    dataset_type = 'deploy'
    ### Delete all deployment data
    for filepath in glob.glob(os.path.join(self.parameters['dataset_text_folder'], '{0}*'.format(dataset_type))):
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)

    ### Create brat folder and file
    dataset_brat_deploy_folder = os.path.join(self.parameters['dataset_text_folder'], dataset_type)
    utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
    dataset_brat_deploy_filepath = os.path.join(dataset_brat_deploy_folder, test_file_path.format(str(self.prediction_count).zfill(5)))#self._get_dataset_brat_deploy_filepath(dataset_brat_deploy_folder)
    with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
        f.write(text)

    ### Update deploy filepaths
    dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(self.parameters, dataset_types=[dataset_type])
    self.dataset_filepaths.update(dataset_filepaths)
    self.dataset_brat_folders.update(dataset_brat_folders)
    ### Update the dataset for the new deploy set
    self.dataset.update_dataset(self.dataset_filepaths, [dataset_type])

    # Predict labels and output brat
    output_filepaths = {}
    prediction_output = train.prediction_step(self.sess, self.dataset, dataset_type, self.model, self.transition_params_trained, self.stats_graph_folder, self.prediction_count, self.parameters, self.dataset_filepaths)
    _, _, output_filepaths[dataset_type] = prediction_output
    conll_to_brat.output_brat(output_filepaths, self.dataset_brat_folders, self.stats_graph_folder, overwrite=True)

    # Print and output result
    text_filepath = os.path.join(self.stats_graph_folder, 'brat', 'deploy', os.path.basename(dataset_brat_deploy_filepath))
    annotation_filepath = os.path.join(self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(utils.get_basename_without_extension(dataset_brat_deploy_filepath)))
    text2, entities = brat_to_conll.get_entities_from_brat(text_filepath, annotation_filepath, verbose=True)
    # Sanity check: the text reconstructed from the brat files must match the input.
    assert(text == text2)
    #print (entities)
    # Move the results under ../data/ so the brat tool can pick them up.
    os.rename(self.stats_graph_folder, "../data/" + self.stats_graph_folder.split('/')[-1])
    print("Use brat tool to see result at ", "../data/" + self.stats_graph_folder.split('/')[-1])
def _create_stats_graph_folder(self, parameters):
    """Create the folder where stats and graphs are saved.

    Args:
        parameters (dict): must provide 'dataset_text_folder' and 'output_folder'.

    Returns:
        tuple: (stats_graph_folder path, experiment timestamp string).
    """
    timestamp = utils.get_current_time_in_miliseconds()
    base_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
    model_name = '{0}_{1}'.format(base_name, timestamp)
    # Ensure both the output root and the per-experiment subfolder exist.
    utils.create_folder_if_not_exists(parameters['output_folder'])
    graph_folder = os.path.join(parameters['output_folder'], model_name)
    utils.create_folder_if_not_exists(graph_folder)
    return graph_folder, timestamp
def prepare_pretrained_model_for_restoring(output_folder_name, epoch_number, model_name, delete_token_mappings=False):
    """Copy a trained run's dataset pickle, parameters and checkpoint files into ../trained_models/<model_name>."""
    src_folder = os.path.join('..', 'output', output_folder_name, 'model')
    dst_folder = os.path.join('..', 'trained_models', model_name)
    utils.create_folder_if_not_exists(dst_folder)

    # Trim and copy dataset.pickle.
    src_dataset = os.path.join(src_folder, 'dataset.pickle')
    dst_dataset = os.path.join(dst_folder, 'dataset.pickle')
    trim_dataset_pickle(src_dataset, dst_dataset, delete_token_mappings=delete_token_mappings)

    # Copy parameters.ini verbatim.
    parameters_filepath = os.path.join(src_folder, 'parameters.ini')
    shutil.copy(parameters_filepath, dst_folder)

    # (Trim and) copy the checkpoint files for the requested epoch.
    epoch_tag = str(epoch_number).zfill(5)
    if delete_token_mappings:
        src_ckpt = os.path.join(src_folder, 'model_{0}.ckpt'.format(epoch_tag))
        dst_ckpt = os.path.join(dst_folder, 'model.ckpt')
        trim_model_checkpoint(parameters_filepath, dst_dataset, src_ckpt, dst_ckpt)
    else:
        for ckpt_file in glob.glob(os.path.join(src_folder, 'model_{0}.ckpt*'.format(epoch_tag))):
            # Drop the epoch suffix from each checkpoint shard's name.
            new_name = os.path.basename(ckpt_file).replace('_' + epoch_tag, '')
            shutil.copyfile(ckpt_file, os.path.join(dst_folder, new_name))
def xml_to_brat(input_folder, output_folder, overwrite=True):
    """Convert XML documents (with TEXT and TAGS elements) in input_folder to brat text + entity files."""
    print('input_folder: {0}'.format(input_folder))
    assert os.path.exists(input_folder)
    start_time = time.time()
    if overwrite:
        # Wipe any previous conversion before regenerating.
        shutil.rmtree(output_folder, ignore_errors=True)
    utils.create_folder_if_not_exists(output_folder)
    for xml_filepath in sorted(glob.glob(os.path.join(input_folder, '*.xml'))):
        stem = utils.get_basename_without_extension(xml_filepath)
        text_filepath = os.path.join(output_folder, '{0}.txt'.format(stem))
        root = xml.etree.ElementTree.parse(xml_filepath).getroot()
        # Extract the raw document text.
        document_text = root.findtext('TEXT')
        with codecs.open(text_filepath, 'w', 'UTF-8') as f:
            f.write(document_text)
        # Extract the PHI tags; [0] because there is only one <TAGS>...</TAGS> element.
        tag_container = root.findall('TAGS')[0]
        entities = [
            {
                'label': tag.get('TYPE'),
                'text': tag.get('text'),
                'start': int(tag.get('start')),
                'end': int(tag.get('end')),
            }
            for tag in tag_container
        ]
        output_entities(output_folder, stem, entities, text_filepath, document_text, overwrite=overwrite)
    print("Time spent formatting: {0:.2f} seconds".format(time.time() - start_time))
def telegraph_link_from_zipy_site(link):
    """Scrape the image link from a zipy page, download it, and re-host it via telegraph.

    Args:
        link (str): URL of the zipy page to scrape.

    Returns:
        str or None: The re-hosted photo link, or None when no image link
        was found on the page.
    """
    parser = ZipyParser()
    # Close the HTTP response promptly instead of leaking the connection.
    with urllib.request.urlopen(link) as resp:
        parser.feed(resp.read().decode('utf-8'))
    # while not parser.image_link:
    #     chunck = resp.read(chunck_size)
    #     parser.feed(chunck.decode('utf-8'))
    image_link = parser.image_link
    if image_link is None:
        print('image not found')
        return None
    with urllib.request.urlopen(image_link) as img_resp:
        image_down = img_resp.read()
    tmp_media_path = '/tmp/il_shopping_bot'
    utils.create_folder_if_not_exists(tmp_media_path)
    tmp_path = os.path.join(tmp_media_path, 'anyf.jpg')
    with open(tmp_path, 'wb') as f:
        f.write(image_down)
    try:
        photo_link = linking.telegraph_link_media(tmp_path)
    finally:
        # Always remove the temporary file, even if re-hosting raises.
        os.remove(tmp_path)
    return photo_link
def prepare_pretrained_model_for_restoring(output_folder_name, epoch_number, model_name, delete_token_mappings=False):
    '''
    Copy the dataset.pickle, parameters.ini, and model checkpoint files after
    removing the data used for training.

    The dataset and labels are deleted from dataset.pickle by default. The
    only information about the dataset that remain in the pretrained model is
    the list of tokens that appears in the dataset and the corresponding token
    embeddings learned from the dataset.

    If delete_token_mappings is set to True, index_to_token and token_to_index
    mappings are deleted from dataset.pickle additionally, and the
    corresponding token embeddings are deleted from the model checkpoint
    files. In this case, the pretrained model would not contain any
    information about the dataset used for training the model.

    If you wish to share a pretrained model with delete_token_mappings =
    True, it is highly recommended to use some external pre-trained token
    embeddings and freeze them while training the model to obtain high
    performance. This can be done by specifying the
    token_pretrained_embedding_filepath and setting freeze_token_embeddings =
    True in parameters.ini for training.
    '''
    # Source: ../output/<run>/model — destination: ../trained_models/<model_name>.
    input_model_folder = os.path.join('..', 'output', output_folder_name, 'model')
    output_model_folder = os.path.join('..', 'trained_models', model_name)
    utils.create_folder_if_not_exists(output_model_folder)

    # trim and copy dataset.pickle
    input_dataset_filepath = os.path.join(input_model_folder, 'dataset.pickle')
    output_dataset_filepath = os.path.join(output_model_folder, 'dataset.pickle')
    trim_dataset_pickle(input_dataset_filepath, output_dataset_filepath, delete_token_mappings=delete_token_mappings)

    # copy parameters.ini
    parameters_filepath = os.path.join(input_model_folder, 'parameters.ini')
    shutil.copy(parameters_filepath, output_model_folder)

    # (trim and) copy checkpoint files
    epoch_number_string = str(epoch_number).zfill(5)
    if delete_token_mappings:
        # Token embeddings must be stripped from the checkpoint as well.
        input_checkpoint_filepath = os.path.join(input_model_folder, 'model_{0}.ckpt'.format(epoch_number_string))
        output_checkpoint_filepath = os.path.join(output_model_folder, 'model.ckpt')
        trim_model_checkpoint(parameters_filepath, output_dataset_filepath, input_checkpoint_filepath, output_checkpoint_filepath)
    else:
        # Copy every checkpoint shard, dropping the epoch suffix from the file names.
        for filepath in glob.glob(os.path.join(input_model_folder, 'model_{0}.ckpt*'.format(epoch_number_string))):
            shutil.copyfile(filepath, os.path.join(output_model_folder, os.path.basename(filepath).replace('_' + epoch_number_string, '')))
def main():
    """Train and evaluate the EntityLSTM NER model.

    Loads parameters from ./parameters.ini, loads the train/valid/test CoNLL
    files, then trains with per-epoch evaluation (including the conlleval perl
    script) and early stopping on the validation micro F1-score.
    """
    #### Parameters - start
    conf_parameters = configparser.ConfigParser()
    conf_parameters.read(os.path.join('.','parameters.ini'))
    nested_parameters = utils.convert_configparser_to_dictionary(conf_parameters)
    # Flatten the per-section dictionaries into a single parameters dict.
    parameters = {}
    for k,v in nested_parameters.items():
        parameters.update(v)
    # configparser yields strings; cast numeric/boolean parameters to their real types.
    for k,v in parameters.items():
        if k in ['remove_unknown_tokens','character_embedding_dimension','character_lstm_hidden_state_dimension','token_embedding_dimension','token_lstm_hidden_state_dimension', 'patience','maximum_number_of_epochs','maximum_training_time','number_of_cpu_threads','number_of_gpus']:
            parameters[k] = int(v)
        if k in ['dropout_rate']:
            parameters[k] = float(v)
        if k in ['use_character_lstm','is_character_lstm_bidirect','is_token_lstm_bidirect','use_crf']:
            parameters[k] = distutils.util.strtobool(v)
    pprint(parameters)

    # Load dataset
    dataset_filepaths = {}
    dataset_filepaths['train'] = os.path.join(parameters['dataset_text_folder'], 'train.txt')
    dataset_filepaths['valid'] = os.path.join(parameters['dataset_text_folder'], 'valid.txt')
    dataset_filepaths['test'] = os.path.join(parameters['dataset_text_folder'], 'test.txt')
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={'CPU': 1, 'GPU': 1},
            allow_soft_placement=True, # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False
            )
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = EntityLSTM(dataset, parameters)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            if parameters['optimizer'] == 'adam':
                optimizer = tf.train.AdamOptimizer(1e-3)
            elif parameters['optimizer'] == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(0.005)
            else:
                raise ValueError("The lr_method parameter must be either adam or sgd.")
            # https://github.com/google/prettytensor/issues/6
            # https://www.tensorflow.org/api_docs/python/framework/graph_collections
            #print('tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) ))
            #print('tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) ))
            #print('tf.get_collection(tf.GraphKeys.MODEL_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.MODEL_VARIABLES) ))
            # https://github.com/blei-lab/edward/issues/286#ref-pullrequest-181330211 : utility function to get all tensorflow variables a node depends on
            grads_and_vars = optimizer.compute_gradients(model.loss)
            # By defining a global_step variable and passing it to the optimizer we allow TensorFlow handle the counting of training steps for us.
            # The global step will be automatically incremented by one every time you execute train_op.
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Load pretrained token embeddings
            if not parameters['token_pretrained_embedding_filepath'] == '':
                load_token_embeddings(sess, model.W, dataset, parameters)

            estop = False  # early stop
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            #results['model_options'] = copy.copy(model_options)
            #results['model_options'].pop('optimizer', None)
            results['epoch'] = {}
            # save/initialize execution details
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])  #opts.train.replace('/', '_').split('.')[0] # 'conll2003en'
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])

            output_folder=os.path.join('..', 'output')
            stats_graph_folder=os.path.join(output_folder, model_name) # Folder where to save graphs
            utils.create_folder_if_not_exists(output_folder)
            print('stats_graph_folder: {0}'.format(stats_graph_folder))
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)

            step = 0
            bad_counter = 0
            previous_best_valid_f1_score = 0
            # Random CRF transition parameters until the first training step refines them.
            transition_params_trained = np.random.rand(len(dataset.unique_labels),len(dataset.unique_labels))
            try:
                while True:
                    epoch_number = math.floor(step / len(dataset.token_indices['train']))
                    print('epoch_number: {0}'.format(epoch_number))
                    epoch_start_time = time.time()
                    #print('step: {0}'.format(step))

                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers=list(range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train_step(sess, dataset, sequence_number, train_op, global_step, model, transition_params_trained, parameters)
                        step += 1
                        if sequence_number % 100 == 0:
                            print('.',end='', flush=True)
                            #break

                    # Evaluate model
                    print('step: {0}'.format(step))
                    all_predictions = {}
                    all_y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        print('dataset_type: {0}'.format(dataset_type))
                        all_predictions[dataset_type], all_y_true[dataset_type], output_filepaths[dataset_type] = evaluate_model(sess, dataset, dataset_type, model, transition_params_trained, step, stats_graph_folder, epoch_number, parameters)
                    model_options = None

                    # Save and plot results
                    # TODO: remove uidx
                    uidx = 0
                    results['epoch'][epoch_number] = []
                    results['execution_details']['num_epochs'] = epoch_number
                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('epoch_elapsed_training_time: {0:02f} seconds'.format(epoch_elapsed_training_time))
                    assess_model.assess_and_save(results, dataset, model_options, all_predictions, all_y_true, stats_graph_folder, epoch_number, uidx, epoch_start_time)
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'f1_score')
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'accuracy_score')

                    # CoNLL evaluation script
                    for dataset_type in ['train', 'valid', 'test']:
                        conll_evaluation_script = os.path.join('.', 'conlleval')
                        conll_output_filepath = '{0}_conll_evaluation.txt'.format(output_filepaths[dataset_type])
                        shell_command = 'perl {0} < {1} > {2}'.format(conll_evaluation_script, output_filepaths[dataset_type], conll_output_filepath)
                        print('shell_command: {0}'.format(shell_command))
                        #subprocess.call([shell_command])
                        os.system(shell_command)
                        conll_parsed_output = utils_nlp.get_parsed_conll_output(conll_output_filepath)
                        print('conll_parsed_output: {0}'.format(conll_parsed_output))
                        results['epoch'][epoch_number][0][dataset_type]['conll'] = conll_parsed_output
                        results['epoch'][epoch_number][0][dataset_type]['f1_conll'] = {}
                        results['epoch'][epoch_number][0][dataset_type]['f1_conll']['micro'] = results['epoch'][epoch_number][0][dataset_type]['conll']['all']['f1']
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'f1_conll', from_json=False)

                    #end_time = time.time()
                    #results['execution_details']['train_duration'] = end_time - start_time
                    #results['execution_details']['train_end'] = end_time

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1
                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break
                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break
            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                # assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            assess_model.save_results(results, stats_graph_folder)
    sess.close() # release the session's resources
import subprocess
from utils import create_folder_if_not_exists

procs = []
log_files = []
log_path = './results/logs/'
create_folder_if_not_exists(log_path)


def _launch(script, prefix):
    """Start `script` with the shared experiment flags, logging stdout/stderr to a per-prefix file."""
    log_file = open(log_path + prefix, 'w')
    log_files.append(log_file)
    command = (
        'python3 ' + script + ' '
        '--num_workers 10 --service_rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 '
        '--result_folder ./results/' + prefix + '/ '
        '--model_folder ./results/parameters/' + prefix + '/'
    )
    procs.append(subprocess.Popen(command, stdout=log_file, stderr=log_file, shell=True))


# Regular state-dependent value network
_launch('load_balance_actor_critic_train.py', 'regular_value_network')

# Multi-value network (10 values)
_launch('load_balance_actor_multi_critic_train.py', '10_value_networks')
def predict(self, text):
    """Run NER prediction on a raw text string.

    Writes `text` into the 'deploy' dataset folder, runs one prediction step,
    converts the CoNLL output to brat, and returns the entities parsed back
    from the generated brat files.

    Args:
        text (str): The raw text to annotate.

    Returns:
        The entities returned by brat_to_conll.get_entities_from_brat.
    """
    self.prediction_count += 1

    if self.prediction_count == 1:
        # First prediction: point the dataset folder at a temp location and
        # create the stats/graphs folder.
        self.parameters['dataset_text_folder'] = os.path.join(
            '.', 'data', 'temp')
        self.stats_graph_folder, _ = self._create_stats_graph_folder(
            self.parameters)

    # Update the deploy folder, file, and modeldata
    dataset_type = 'deploy'

    # Delete all deployment data
    for filepath in glob.glob(
            os.path.join(self.parameters['dataset_text_folder'],
                         '{0}*'.format(dataset_type))):
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)

    # Create brat folder and file
    dataset_brat_deploy_folder = os.path.join(
        self.parameters['dataset_text_folder'], dataset_type)
    utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
    dataset_brat_deploy_filepath = os.path.join(
        dataset_brat_deploy_folder,
        'temp_{0}.txt'.format(str(self.prediction_count).zfill(5)))  #self._get_dataset_brat_deploy_filepath(dataset_brat_deploy_folder)
    # print('over here: ',dataset_brat_deploy_filepath)
    with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
        f.write(text)

    # Update deploy filepaths
    dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(
        self.parameters, dataset_types=[dataset_type])
    self.dataset_filepaths.update(dataset_filepaths)
    self.dataset_brat_folders.update(dataset_brat_folders)

    # Update the dataset for the new deploy set
    self.modeldata.update_dataset(self.dataset_filepaths, [dataset_type])

    # Predict labels and output brat
    output_filepaths = {}
    prediction_output = train.prediction_step(
        self.sess, self.modeldata, dataset_type, self.model,
        self.transition_params_trained, self.stats_graph_folder,
        self.prediction_count, self.parameters, self.dataset_filepaths)
    _, _, output_filepaths[dataset_type] = prediction_output
    conll_to_brat.output_brat(output_filepaths, self.dataset_brat_folders,
                              self.stats_graph_folder, overwrite=True)

    # Print and output result
    text_filepath = os.path.join(
        self.stats_graph_folder, 'brat', 'deploy',
        os.path.basename(dataset_brat_deploy_filepath))
    annotation_filepath = os.path.join(
        self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(
            utils.get_basename_without_extension(
                dataset_brat_deploy_filepath)))
    text2, entities = brat_to_conll.get_entities_from_brat(
        text_filepath, annotation_filepath, verbose=True)
    # Sanity check: the text reconstructed from the brat files must match the input.
    assert (text == text2)
    return entities
def main():
    """Train and evaluate the EntityLSTM model, optionally with cross-validation.

    Loads parameters and datasets, builds the TensorFlow graph, trains with
    early stopping on the validation micro F1-score, logs to TensorBoard, and
    (when cross-validating) reports the mean F1-score over the folds.
    """
    parameters, conf_parameters = load_parameters()
    pprint(parameters)
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Number of cross-validation folds; 1 means a single plain run.
    cross_validation = parameters[
        'cross_validation'] if 'cross_validation' in parameters else 1
    valid_fscores = []
    valid_precisions = []
    valid_recalls = []

    for cv in range(0, cross_validation):
        if "als" in dataset_filepaths['train'] and cross_validation > 1:
            # Build this fold: concatenate all training shards except `cv`,
            # which becomes the validation file.
            train_files = list(range(0, cv)) + list(
                range(cv + 1, cross_validation))
            test_file = cv
            file_train = "tmp_combined.train"
            file_valid = "tmp_combined.test"
            output = []
            for i in train_files:
                with open(dataset_filepaths['train'] + "_" + str(i), "r",
                          encoding="utf-8") as file:
                    output.append(file.read())
            with open(file_train, "w", encoding="utf-8") as file:
                file.write("\n\n".join(output))
            output = []
            with open(dataset_filepaths['train'] + "_" + str(test_file), "r",
                      encoding="utf-8") as file:
                output.append(file.read())
            with open(file_valid, "w", encoding="utf-8") as file:
                file.write("\n\n".join(output))
            dataset_filepaths['train'] = file_train
            dataset_filepaths['valid'] = file_valid

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'],
                             debug=parameters['debug'])
        dataset.load_vocab_word_embeddings(parameters)
        dataset.load_dataset(dataset_filepaths, parameters)

        # Create graph and session
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False)
            session_conf.gpu_options.allow_growth = True
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Initialize and save execution details
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_train'])
                # "_small" marks runs on a reduced portion of the data.
                if 'data_to_use' in parameters:
                    model_name = '{0}_{1}'.format(
                        parameters['language'] + "_" + dataset_name + "_small",
                        results['execution_details']['time_stamp'])
                else:
                    model_name = '{0}_{1}'.format(
                        parameters['language'] + "_" + dataset_name,
                        results['execution_details']['time_stamp'])

                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)
                stats_graph_folder = os.path.join(
                    output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)

                # One TensorBoard log folder per dataset split.
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])

                #del dataset.embeddings_matrix
                if not parameters['use_pretrained_model']:
                    pickle.dump(
                        dataset,
                        open(os.path.join(model_folder, 'dataset.pickle'),
                             'wb'))
                #dataset.load_pretrained_word_embeddings(parameters)

                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(
                    model_folder
                )  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                if parameters['use_character_lstm']:
                    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
                    )
                    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                    character_list_file_path = os.path.join(
                        model_folder, 'tensorboard_metadata_characters.tsv')
                    tensorboard_character_embeddings.metadata_path = os.path.relpath(
                        character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'UTF-8')
                for token_index in range(len(dataset.index_to_token)):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                if parameters['use_character_lstm']:
                    character_list_file = codecs.open(character_list_file_path,
                                                      'w', 'UTF-8')
                    for character_index in range(dataset.alphabet_size):
                        if character_index == dataset.PADDING_CHARACTER_INDEX:
                            character_list_file.write('PADDING\n')
                        else:
                            character_list_file.write('{0}\n'.format(
                                dataset.index_to_character[character_index]))
                    character_list_file.close()

                try:
                    # Initialize the model
                    sess.run(tf.global_variables_initializer())
                    if not parameters['use_pretrained_model']:
                        model.load_pretrained_token_embeddings(
                            sess, dataset, parameters)

                    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                    bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                    previous_best_valid_f1_score = 0
                    transition_params_trained = np.random.rand(
                        len(dataset.unique_labels), len(dataset.unique_labels)
                    )  #TODO np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
                    model_saver = tf.train.Saver(
                        max_to_keep=None
                    )  #parameters['maximum_number_of_epochs']) # defaults to saving all variables
                    epoch_number = 0
                    while True:
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))
                        epoch_start_time = time.time()

                        if parameters[
                                'use_pretrained_model'] and epoch_number == 1:
                            # Restore pretrained model parameters
                            transition_params_trained = train.restore_model_parameters_from_pretrained_model(
                                parameters, dataset, sess, model, model_saver)
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            data_counter = 0
                            sub_id = 0
                            for i in tqdm(range(0, len(sequence_numbers),
                                                parameters['batch_size']),
                                          "Training",
                                          mininterval=1):
                                data_counter += parameters['batch_size']
                                if data_counter >= 20000:
                                    # Intermediate evaluation every ~20k training sequences.
                                    data_counter = 0
                                    sub_id += 0.001
                                    print("Intermediate evaluation number: ",
                                          sub_id)
                                    #model_saver.save(sess,
                                    #                 os.path.join(model_folder, 'model_{0:05d}_{1}.ckpt'.format(epoch_number, len(sequence_numbers)/4/len(sequence_numbers))))
                                    epoch_elapsed_training_time = time.time(
                                    ) - epoch_start_time
                                    print(
                                        'Training completed in {0:.2f} seconds'
                                        .format(epoch_elapsed_training_time),
                                        flush=True)
                                    y_pred, y_true, output_filepaths = train.predict_labels(
                                        sess, model, transition_params_trained,
                                        parameters, dataset,
                                        epoch_number + sub_id,
                                        stats_graph_folder, dataset_filepaths)

                                    # Evaluate model: save and plot results
                                    evaluate.evaluate_model(
                                        results, dataset, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)

                                    # Save model
                                    model_saver.save(
                                        sess,
                                        os.path.join(
                                            model_folder,
                                            'model_{0:07.3f}.ckpt'.format(
                                                epoch_number + sub_id)))

                                    # Save TensorBoard logs
                                    summary = sess.run(model.summary_op,
                                                       feed_dict=None)
                                    writers['train'].add_summary(
                                        summary, epoch_number)
                                    writers['train'].flush()
                                    utils.copytree(
                                        writers['train'].get_logdir(),
                                        model_folder)

                                    # Early stop
                                    valid_f1_score = results['epoch'][
                                        epoch_number][0]['valid']['f1_score'][
                                            'micro']
                                    # valid_precision = results['epoch'][epoch_number][0]['valid']['precision']['micro']
                                    # valid_recall = results['epoch'][epoch_number][0]['valid']['recall']['micro']
                                    # valid_fscores.append(valid_f1_score)
                                    if valid_f1_score > previous_best_valid_f1_score:
                                        bad_counter = 0
                                        previous_best_valid_f1_score = valid_f1_score
                                        # previous_best_valid_precision = valid_precision
                                        # previous_best_valid_recall = valid_recall
                                    else:
                                        bad_counter += 1

                                # Train on the next mini-batch of sequences.
                                sequence_number = sequence_numbers[
                                    i:i + parameters['batch_size']]
                                transition_params_trained, loss = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)

                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=True)

                        y_pred, y_true, output_filepaths = train.predict_labels(
                            sess, model, transition_params_trained, parameters,
                            dataset, epoch_number, stats_graph_folder,
                            dataset_filepaths)

                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred,
                                                y_true, stats_graph_folder,
                                                epoch_number, epoch_start_time,
                                                output_filepaths, parameters)

                        # Save model
                        model_saver.save(
                            sess,
                            os.path.join(
                                model_folder,
                                'model_{0:05d}.ckpt'.format(epoch_number)))

                        # Save TensorBoard logs
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stop
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['micro']
                        #valid_precision = results['epoch'][epoch_number][0]['valid']['precision']['micro']
                        #valid_recall = results['epoch'][epoch_number][0]['valid']['recall']['micro']
                        #valid_fscores.append(valid_f1_score)
                        if valid_f1_score > previous_best_valid_f1_score:
                            bad_counter = 0
                            previous_best_valid_f1_score = valid_f1_score
                            #previous_best_valid_precision = valid_precision
                            #previous_best_valid_recall = valid_recall
                        else:
                            bad_counter += 1
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(bad_counter))

                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs']:
                            break

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')
                    # remove the experiment
                    remove_experiment = input(
                        "Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    else:
                        print('Finishing the experiment')
                        end_time = time.time()
                        results['execution_details'][
                            'train_duration'] = end_time - start_time
                        results['execution_details']['train_end'] = end_time
                        evaluate.save_results(results, stats_graph_folder)
                except Exception:
                    logging.exception("")
                    remove_experiment = input(
                        "Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")

                sess.close()  # release the session's resources

        if 'cross_validation' in parameters and parameters[
                'cross_validation'] > 1:
            # Record this fold's best validation F1-score.
            valid_fscores.append(previous_best_valid_f1_score)
            #valid_precisions.append(previous_best_valid_precision)
            #valid_recalls.append(previous_best_valid_recall)

    if 'cross_validation' in parameters and parameters['cross_validation'] > 1:
        print("mean f1score:", np.mean(valid_fscores))
        #print("mean precision:", np.mean(valid_precisions))
        #print("mean recall:", np.mean(valid_recalls))
        with codecs.open(os.path.join(stats_graph_folder, "result_cv.txt"),
                         "w") as file:
            file.write("F1score " + ", ".join(map(str, valid_fscores)))
            # file.write("Precision " + valid_precisions)
            # file.write("Recall " + valid_recalls)
            file.write("Mean F1score " + str(np.mean(valid_fscores)))
def main():
    """Train and evaluate the NER model end-to-end.

    Loads parameters and the dataset, builds the TF graph/session, creates
    the output/model/TensorBoard folders, then runs the train + evaluate
    loop with early stopping on the validation micro F1 score.
    """
    parameters, conf_parameters = load_parameters()
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False
        )
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])

            # Create the experiment output folders (graphs, model checkpoints).
            output_folder=os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder=os.path.join(output_folder, model_name) # Folder where to save graphs
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            # Persist the parameter file so the run can be reproduced/resumed later.
            with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                conf_parameters.write(parameters_file)
            tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
            utils.create_folder_if_not_exists(tensorboard_log_folder)
            tensorboard_log_folders = {}
            for dataset_type in dataset_filepaths.keys():
                tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
                utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])
            pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = EntityLSTM(dataset, parameters)

            # Instantiate the writers for TensorBoard
            writers = {}
            for dataset_type in dataset_filepaths.keys():
                writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
            embedding_writer = tf.summary.FileWriter(model_folder)  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

            # Configure the TensorBoard embedding projector for token and
            # character embeddings.
            embeddings_projector_config = projector.ProjectorConfig()
            tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
            token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
            tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

            tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
            character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
            tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

            projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

            # Write metadata for TensorBoard embeddings
            token_list_file = codecs.open(token_list_file_path,'w', 'UTF-8')
            for token_index in range(dataset.vocabulary_size):
                token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
            token_list_file.close()

            character_list_file = codecs.open(character_list_file_path,'w', 'UTF-8')
            for character_index in range(dataset.alphabet_size):
                if character_index == dataset.PADDING_CHARACTER_INDEX:
                    character_list_file.write('PADDING\n')
                else:
                    character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))
            character_list_file.close()

            # Initialize the model
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
            previous_best_valid_f1_score = 0
            # CRF transition parameters start random; replaced by trained values each epoch.
            transition_params_trained = np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
            model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])  # defaults to saving all variables
            epoch_number = -1
            try:
                while True:
                    step = 0
                    epoch_number += 1
                    print('\nStarting epoch {0}'.format(epoch_number))
                    epoch_start_time = time.time()

                    if parameters['use_pretrained_model'] and epoch_number == 0:
                        # Restore pretrained model parameters
                        transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
                    elif epoch_number != 0:
                        # Train model: loop over all sequences of training set with shuffling
                        sequence_numbers=list(range(len(dataset.token_indices['train'])))
                        random.shuffle(sequence_numbers)
                        for sequence_number in sequence_numbers:
                            transition_params_trained = train.train_step(sess, dataset, sequence_number, model, transition_params_trained, parameters)
                            step += 1
                            if step % 10 == 0:
                                print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True)

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

                    # Prediction-only mode: emit brat output once and stop.
                    if parameters['use_pretrained_model'] and not parameters['train_model']:
                        conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder)
                        break

                    # Save model
                    model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                    # Save TensorBoard logs
                    summary = sess.run(model.summary_op, feed_dict=None)
                    writers['train'].add_summary(summary, epoch_number)
                    writers['train'].flush()
                    utils.copytree(writers['train'].get_logdir(), model_folder)

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                        # Keep brat output in sync with the best model so far.
                        conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)
                    else:
                        bad_counter += 1
                        print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

                    if bad_counter >= parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number >= parameters['maximum_number_of_epochs']: break

            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            # NOTE(review): the 'ok1'..'ok4' prints below look like leftover
            # debug statements — consider removing them.
            print('ok1')
            evaluate.save_results(results, stats_graph_folder)
            print('ok2')
            print('ok3')
        #sess.close() # release the session's resources
        print('ok4')
def main():
    """Train/evaluate the self-attentive sentence classifier (SelfSent).

    Reads the .ini config (optionally from argv), tokenizes the dataset with
    Stanford CoreNLP, then either runs the train/eval loop with early
    stopping on validation accuracy, or — with a pretrained model — predicts
    on the deploy set and dumps/plots the attention weights.
    """
    file_params = 'parameters_yelp_50k.ini'
    # Allow overriding the parameter file from the command line.
    if len(sys.argv) > 1 and '.ini' in sys.argv[1]:
        file_params = sys.argv[1]

    # Load config
    parameters, conf_parameters = load_parameters(parameters_filepath=os.path.join('.', file_params))
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    #check_parameter_compatiblity(parameters, dataset_filepaths)
    if parameters['seed'] != -1:
        random.seed(parameters['seed'])

    # Create annotator (CoreNLP used only for tokenization / sentence split).
    annotator = stanford_corenlp_pywrapper.CoreNLP(
        configdict={'annotators': 'tokenize, ssplit', 'ssplit.eolonly': True},
        corenlp_jars=[parameters['stanford_folder'] + '/*'])

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters, annotator)

    # Adapt train/valid/test to be multiple of batch_size
    for size in ['train_size', 'valid_size', 'test_size']:
        if parameters[size] % parameters['batch_size'] != 0:
            parameters[size] = int(parameters[size] / parameters['batch_size']) * parameters['batch_size']
            print('Changed {}'.format(size))

    # Set GPU device if more GPUs are specified
    if parameters['number_of_gpus'] > 1 and parameters['gpu_device'] != -1:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = parameters['gpu_device']

    # GPUs
    print(device_lib.list_local_devices())

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False)

        sess = tf.Session(config=session_conf)
        with sess.as_default():
            if parameters['seed'] != -1:
                tf.set_random_seed(parameters['seed'])

            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(parameters['dataset_folder'])
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])

            # Create the experiment output folders.
            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder = os.path.join(output_folder, model_name)  # Folder where to save graphs
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            # Persist the config so the run can be reproduced later.
            with open(os.path.join(model_folder, file_params), 'w') as parameters_file:
                conf_parameters.write(parameters_file)
            pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = SelfSent(dataset, parameters)

            # Initialize the model
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            bad_counter = 0  # number of epochs with no improvement on the validation test
            previous_best_valid_accuracy = 0
            previous_best_test_accuracy = 0
            model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])  # defaults to saving all variables
            epoch_number = -1
            try:
                while True:
                    epoch_number += 1
                    print('\nStarting epoch {0}'.format(epoch_number))
                    epoch_start_time = time.time()

                    if parameters['use_pretrained_model'] and epoch_number == 0:
                        # Restore pretrained model parameters and run deploy-only prediction.
                        dataset = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model_saver)
                        dataset.load_deploy(os.path.join(parameters['dataset_folder'], '{0}.json'.format('deploy')), parameters, annotator)
                        y_pred, y_true, output_filepaths, attentions = train.predict_labels(sess, model, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths, only_deploy=True)
                        y_pred = y_pred['deploy']
                        # Dump per-sample attention weights next to the deploy output.
                        with open(output_filepaths['deploy'][:output_filepaths['deploy'].rfind('/') + 1] + 'attention.txt', 'w', encoding='utf-8') as fp:
                            # Compute attention
                            tokens_with_attentions = []
                            for sample_id in range(len(y_pred)):
                                # attentions is batched; index by (batch, offset-in-batch).
                                attention = attentions[int(sample_id / parameters['batch_size'])][sample_id % parameters['batch_size']]
                                # Remove padded dimension
                                attention = attention[:dataset.token_lengths['deploy'][sample_id]]
                                # Save current attention
                                fp.write("{}\t{:05.2f}\t".format(y_pred[sample_id][0], y_pred[sample_id][1]))
                                fp.write(' '.join(dataset.tokens['deploy'][sample_id]) + '\t')
                                fp.write(' '.join([str(a) for a in attention.flatten()]) + '\n')
                                # Sum over columns (we combine all the annotation vectors)
                                attention = np.sum(attention, axis=1)
                                # Normalize to sum at 1
                                attention = attention / np.linalg.norm(attention)
                                # Keep only high confidence
                                if y_pred[sample_id][1] >= parameters['attention_visualization_conf']:
                                    tokens_with_attentions.append((y_pred[sample_id][0], y_pred[sample_id][1], dataset.tokens['deploy'][sample_id], attention))
                        # Plot attention
                        utils_plots.visualize_attention(tokens_with_attentions, dataset.unique_labels, output_filepaths['deploy'][:output_filepaths['deploy'].rfind('/') + 1], parameters['attention_visualization_conf'])
                        break
                    elif epoch_number != 0:
                        total_loss, total_accuracy = train.train_step(sess, dataset, model, parameters)
                        print('Mean loss: {:.2f}\tMean accuracy: {:.2f}'.format(np.mean(total_loss), 100.0 * np.mean(total_accuracy)), flush=True)

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                    y_pred, y_true, output_filepaths, _ = train.predict_labels(sess, model, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

                    # Save model
                    model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                    # Early stop (on validation accuracy).
                    valid_accuracy = results['epoch'][epoch_number][0]['valid']['accuracy_score']
                    if valid_accuracy > previous_best_valid_accuracy:
                        bad_counter = 0
                        previous_best_valid_accuracy = valid_accuracy
                        previous_best_test_accuracy = results['epoch'][epoch_number][0]['test']['accuracy_score']
                    else:
                        bad_counter += 1
                        print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))
                        print("Best valid with test performances in epoch " + str(epoch_number - bad_counter) + ": {:05.2f}%\t{:05.2f}%".format(previous_best_valid_accuracy, previous_best_test_accuracy))

                    if bad_counter >= parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number >= parameters['maximum_number_of_epochs']: break
            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)
        sess.close()  # release the session's resources
def conll_to_brat(conll_input_filepath, conll_output_filepath, brat_original_folder, brat_output_folder, overwrite=False):
    '''
    convert conll file in conll-filepath to brat annotations and output to brat_output_folder,
    with reference to the existing text files in brat_original_folder
    if brat_original_folder does not exist or contain any text file, then the text files are
    generated from conll files, and conll file is updated with filenames and token offsets accordingly.

    conll_input_filepath: path to conll file to convert to brat annotations
    conll_output_filepath: path to output conll file with filename and offsets that are compatible
                           with brat annotations
    brat_original_folder: folder that contains the original .txt (and .ann) files that are
                          formatted according to brat.
                          .txt files are used to check if the token offsets match and generate
                          the annotation from conll.
    brat_output_folder: folder to output the text and brat annotations
                        .txt files are copied from brat_original_folder to brat_output_folder
    overwrite: passed through to output_entities(); when True existing brat output is replaced.
    '''
    verbose = False
    dataset_type = utils.get_basename_without_extension(conll_input_filepath)
    print("Formatting {0} set from CONLL to BRAT... ".format(dataset_type), end='')

    # if brat_original_folder does not exist or have any text file
    if not os.path.exists(brat_original_folder) or len(glob.glob(os.path.join(brat_original_folder, '*.txt'))) == 0:
        assert (conll_input_filepath != conll_output_filepath)
        generate_reference_text_file_for_conll(conll_input_filepath, conll_output_filepath, brat_original_folder)

    utils.create_folder_if_not_exists(brat_output_folder)
    conll_file = codecs.open(conll_output_filepath, 'r', 'latin-1', errors='replace')

    # State machine over conll rows: accumulate tokens into `entity` until a
    # boundary (O label, new sentence, new file) is reached, then flush into
    # `entities`, which is written out per file by output_entities().
    previous_token_label = 'O'
    previous_filename = ''
    text_filepath = ''
    text = ''
    entity_id = 1
    entities = []
    entity = {}
    line_count = 0  # NOTE(review): never incremented or read — appears to be dead.
    for line in conll_file:
        line = line.strip().split(' ')
        # New sentence
        # (after strip().split(' ') an empty line yields [''], so the
        # len(line[0]) == 0 test is the one that actually fires)
        if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
            # Add the last entity
            if entity != {}:
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
                entity = {}
            previous_token_label = 'O'
            continue
        filename = str(line[1])
        # New file: flush entities of the previous file and load the new reference text.
        if filename != previous_filename:
            output_entities(brat_output_folder, previous_filename, entities, text_filepath, text, overwrite=overwrite)
            text_filepath = os.path.join(brat_original_folder, '{0}.txt'.format(filename))
            with codecs.open(text_filepath, 'r', 'latin-1', errors='replace') as f:
                text = f.read()
            previous_token_label = 'O'
            previous_filename = filename
            entity_id = 1
            entities = []
            entity = {}
        label = str(line[-1]).replace('_', '-')  # For LOCATION-OTHER
        if label == 'O':
            # Previous entity ended
            if previous_token_label != 'O':
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
                entity = {}
            previous_token_label = 'O'
            continue
        # Columns: token text (0), filename (1), start offset (2), end offset (3), label (last).
        token = {}
        token['text'] = str(line[0])
        token['start'] = int(line[2])
        token['end'] = int(line[3])
        # check that the token text matches the original
        if token['text'] != text[token['start']:token['end']].replace(' ', '-'):
            print("Warning: conll and brat text do not match.")
            print("\tCONLL: {0}".format(token['text']))
            print("\tBRAT : {0}".format(text[token['start']:token['end']]))
        token['label'] = label[2:]

        if label[:2] == 'B-':
            if previous_token_label != 'O':
                # End the previous entity
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
            # Start a new entity
            entity = token
        elif label[:2] == 'I-':
            # Entity continued
            if previous_token_label == token['label']:
                # if there is no newline between the entity and the token
                if '\n' not in text[entity['end']:token['start']]:
                    # Update entity
                    entity['text'] = entity['text'] + ' ' + token['text']
                    entity['end'] = token['end']
                else:  # newline between the entity and the token
                    # End the previous entity
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                    entity_id += 1
                    # Start a new entity
                    entity = token
            elif previous_token_label != 'O':
                # TODO: count BI or II incompatibility
                # End the previous entity
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
                # Start new entity
                entity = token
            else:  # previous_token_label == 'O'
                # TODO: count OI incompatibility
                # Start new entity
                entity = token
        previous_token_label = token['label']
    # Flush the final file's entities.
    output_entities(brat_output_folder, previous_filename, entities, text_filepath, text, overwrite=overwrite)
    conll_file.close()
    print('Done.')
def main(argv=sys.argv):
    '''
    NeuroNER main method (two-step variant with optional label adapter).

    Args:
        parameters_filepath the path to the parameters file
        output_folder the path to the output folder

    Loads config + dataset, builds the TF graph/session and TensorBoard
    artifacts, then runs the train/eval loop with early stopping on the
    weighted validation F1. When use_pretrained_model + use_adapter are set,
    the step-1 model's predictions are binarized and fed to the dataset as
    adapter label vectors before step-2 training.
    '''
    arguments = parse_arguments(argv[1:])
    parameters, conf_parameters = load_parameters(arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Initialize and save execution details.
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details']['time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
                model_name = dataset_name
                utils.create_folder_if_not_exists(parameters['output_folder'])
                stats_graph_folder = os.path.join(parameters['output_folder'], model_name)  # Folder where to save graphs
                final_weights_folder = os.path.join(parameters['output_folder'], 'weights')
                utils.create_folder_if_not_exists(stats_graph_folder)
                utils.create_folder_if_not_exists(final_weights_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                # saving the parameter setting to the output model dir. For later resuming training
                with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])
                pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
                # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                embedding_writer = tf.summary.FileWriter(model_folder)

                # Configure the TensorBoard embedding projector.
                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

                tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
                tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w', 'latin-1')
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
                token_list_file.close()

                character_list_file = codecs.open(character_list_file_path, 'w', 'latin-1')
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))
                character_list_file.close()

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(sess, dataset, parameters)

                # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                patience_counter = 0
                f1_score_best = 0
                f1_scores = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
                f1_scores_conll = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
                transition_params_trained = np.random.rand(len(dataset.unique_labels) + 2, len(dataset.unique_labels) + 2)
                model_saver = tf.train.Saver(max_to_keep=parameters['num_of_model_to_keep'])
                epoch_number = -1
                try:
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))
                        epoch_start_time = time.time()

                        # use pre-trained model and epoch_number = 0
                        if parameters['use_pretrained_model'] and epoch_number == 0:
                            if parameters['use_adapter']:
                                # Temporarily disable the adapter to run the step-1
                                # model and collect its 3-label predictions.
                                parameters['use_adapter'] = False
                                transition_params_trained = train.restore_pretrained_model(parameters, dataset, sess, model, model_saver)
                                print('Getting the 3-label predictions from the step1 model.')
                                all_pred_labels, y_pred_for_adapter, y_true_for_adapter, \
                                    output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths, for_adapter=True)
                                # use the label2idx mapping (for adapter) in the dataset to transform all_pred_labels
                                all_pred_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    all_pred_indices[dataset_type] = []
                                    for i in range(len(all_pred_labels[dataset_type])):
                                        indices = [dataset.label_adapter_to_index[label] for label in all_pred_labels[dataset_type][i]]
                                        all_pred_indices[dataset_type].append(indices)
                                # and use binarizer to transform to ndarray
                                label_binarizer_adapter = sklearn.preprocessing.LabelBinarizer()
                                label_binarizer_adapter.fit(range(max(dataset.index_to_label_adapter.keys()) + 1))
                                predicted_label_adapter_vector_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    predicted_label_adapter_vector_indices[dataset_type] = []
                                    for label_indices_sequence in all_pred_indices[dataset_type]:
                                        predicted_label_adapter_vector_indices[dataset_type].append(label_binarizer_adapter.transform(label_indices_sequence))
                                parameters['use_adapter'] = True

                            if parameters['train_model'] and parameters['add_class']:
                                # Restoring with an extra class returns a rebuilt model
                                # plus a fresh global step that must be initialized.
                                transition_params_trained, model, glo_step = \
                                    train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
                                init_new_vars_op = tf.initialize_variables([glo_step])
                                sess.run(init_new_vars_op)
                            else:
                                transition_params_trained = \
                                    train.restore_pretrained_model(parameters, dataset, sess, model, model_saver)
                            # Re-create the writers so they reference the restored graph.
                            for dataset_type in dataset_filepaths.keys():
                                writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
                            # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                            embedding_writer = tf.summary.FileWriter(model_folder)

                        # epoch_number != 0, no matter use or not use pre-trained model
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            for sequence_number in sequence_numbers:
                                transition_params_trained, W_before_crf = train.train_step(sess, dataset, sequence_number, model, transition_params_trained, parameters)
                                step += 1
                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=False)

                        if parameters['use_adapter']:  # model evaluation, using adapter
                            # pass the pred_for_adapter as label_indices vector
                            # (swap in predicted vectors, evaluate, then restore originals)
                            original_label_adapter_vector_indices = dataset.label_adapter_vector_indices
                            dataset.label_adapter_vector_indices = predicted_label_adapter_vector_indices
                            y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)
                            evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)
                            dataset.label_adapter_vector_indices = original_label_adapter_vector_indices
                        else:  # model evaluation, not using adapter
                            y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)
                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(), model_folder)

                        # Early stopping: track weighted (sklearn) and micro (conll) F1.
                        train_f1_score = results['epoch'][epoch_number][0]['train']['f1_score']['weighted']
                        valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['weighted']
                        test_f1_score = results['epoch'][epoch_number][0]['test']['f1_score']['weighted']
                        f1_scores['train-F1'].append(train_f1_score)
                        f1_scores['valid-F1'].append(valid_f1_score)
                        f1_scores['test-F1'].append(test_f1_score)

                        train_f1_score_conll = results['epoch'][epoch_number][0]['train']['f1_conll']['micro']
                        valid_f1_score_conll = results['epoch'][epoch_number][0]['valid']['f1_conll']['micro']
                        test_f1_score_conll = results['epoch'][epoch_number][0]['test']['f1_conll']['micro']
                        f1_scores_conll['train-F1'].append(train_f1_score_conll)
                        f1_scores_conll['valid-F1'].append(valid_f1_score_conll)
                        f1_scores_conll['test-F1'].append(test_f1_score_conll)

                        if valid_f1_score > f1_score_best:
                            patience_counter = 0
                            f1_score_best = valid_f1_score
                            # Save the best model
                            model_saver.save(sess, os.path.join(model_folder, 'best_model.ckpt'))
                            print('updated model to current epoch : epoch {:d}'.format(epoch_number))
                            print('the model is saved in: {:s}'.format(model_folder))
                        else:
                            patience_counter += 1
                            print("In epoch {:d}, the valid F1 is : {:f}".format(epoch_number, valid_f1_score))
                            print("The last {0} epochs have not shown improvements on the validation set.".format(patience_counter))

                        if patience_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            # save last model
                            model_saver.save(sess, os.path.join(model_folder, 'last_model.ckpt'))
                            print('the last model is saved in: {:s}'.format(model_folder))
                            break
                        if epoch_number >= parameters['maximum_number_of_epochs'] and not parameters['refine_with_crf']:
                            break

                    # Post-loop reporting: plot F1 curves and print per-epoch test F1.
                    if not parameters['use_pretrained_model']:
                        plot_name = 'F1-summary-step1.svg'
                    else:
                        plot_name = 'F1-summary-step2.svg'
                    print('Sklearn result:')
                    for k, l in f1_scores.items():
                        print(k, l)
                    print('Conll result:')
                    for k, l in f1_scores_conll.items():
                        print(k, l)
                    utils_plots.plot_f1(f1_scores, os.path.join(stats_graph_folder, '..', plot_name), 'F1 score summary')
                    # TODO: in step 1, for task a, add the best deploy data to step 2 train set, and call script
                    print('(sklearn micro) test F1:')
                    micro_f1 = ','.join([str(results['epoch'][ep][0]['test']['f1_score']['micro']) for ep in range(epoch_number + 1)])
                    print(micro_f1)
                    print('(sklearn macro) test F1:')
                    macro_f1 = ','.join([str(results['epoch'][ep][0]['test']['f1_score']['macro']) for ep in range(epoch_number + 1)])
                    print(macro_f1)
                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details']['train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type].close()
                sess.close()  # release the session's resources
def initialize_population(self, params: dict):
    """Create the sender/receiver (or single-pool agent) population.

    Builds the run's on-disk folder layout, then instantiates
    ``params.population_size`` agents, seeding each with a freshly
    generated genotype when evolution is enabled.

    Args:
        params: argparse-style namespace holding the run configuration.
    """
    base = self.run_folder
    if params.save_example_batch:
        create_folder_if_not_exists(base + "/messages")
    if params.single_pool:
        create_folder_if_not_exists(base + "/agents")
        if params.evolution:
            create_folder_if_not_exists(base + "/agents_genotype")
    else:
        create_folder_if_not_exists(base + "/senders")
        create_folder_if_not_exists(base + "/receivers")
        if params.evolution:
            create_folder_if_not_exists(base + "/senders_genotype")
            create_folder_if_not_exists(base + "/receivers_genotype")

    for agent_id in range(params.population_size):
        # Without evolution, agents are created without a genotype.
        genotype_s = None
        genotype_r = None
        if params.evolution:
            genotype_s = generate_genotype(num_nodes=params.init_nodes)
            genotype_r = generate_genotype(num_nodes=params.init_nodes)
        if params.single_pool:
            self.agents.append(
                SingleAgent(base, params, genotype=genotype_s, agent_id=agent_id))
        else:
            self.senders.append(
                SenderAgent(base, params, genotype=genotype_s, agent_id=agent_id))
            self.receivers.append(
                ReceiverAgent(base, params, genotype=genotype_r, agent_id=agent_id))
def train(self, max_number_of_epoch, model_folder, dropout_rate=0.5):
    """Train the model, checkpointing after every epoch.

    Pickles the dataset and parameters into ``model_folder``, then loops
    over epochs until either the validation F1 plateaus (more than 10
    consecutive epochs changing by less than 0.1) or the epoch counter
    exceeds ``max_number_of_epoch``.

    Args:
        max_number_of_epoch: stop once the epoch counter exceeds this value.
        model_folder: folder receiving dataset/parameters pickles and the
            ``model.ckpt`` checkpoint.
        dropout_rate: dropout passed to ``train.train_step`` (default 0.5).
    """
    start_time = time.time()
    utils.create_folder_if_not_exists(model_folder)
    # BUGFIX: previously pickle.dump(obj, open(...)) leaked the file handles;
    # use context managers so the files are closed deterministically.
    with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as pickle_file:
        pickle.dump(self.dataset, pickle_file)
    with open(os.path.join(model_folder, 'parameters.pickle'), 'wb') as pickle_file:
        pickle.dump(self.parameters, pickle_file)
    bad_counter = 0  # epochs with no improvement on the validation F1-score
    previous_best_valid_f1_score = -100
    epoch_number = -1
    while True:
        step = 0
        epoch_number += 1
        print('\nStarting epoch {0}'.format(epoch_number))
        epoch_start_time = time.time()
        if epoch_number != 0:
            # Train model: loop over all sequences of the training set, shuffled.
            sequence_numbers = list(
                range(len(self.dataset.token_indices['train'])))
            random.shuffle(sequence_numbers)
            for sequence_number in sequence_numbers:
                self.transition_params_trained = train.train_step(
                    self.sess, self.dataset, sequence_number, self.model,
                    dropout_rate)
                step += 1
                if step % 10 == 0:
                    print('Training {0:.2f}% done'.format(
                        step / len(sequence_numbers) * 100),
                          end='\r',
                          flush=True)
        epoch_elapsed_training_time = time.time() - epoch_start_time
        print('Training completed in {0:.2f} seconds'.format(
            epoch_elapsed_training_time), flush=True)
        f1_score = {}
        for data_type in ['train', 'valid', 'test']:
            if data_type not in self.dataset.label_indices.keys():
                continue
            _, _, f1_score[data_type] = train.evaluate_step(
                sess=self.sess,
                dataset_type=data_type,
                dataset=self.dataset,
                model=self.model,
                transition_params_trained=self.transition_params_trained,
                tagging_format=self.tagging_format)
        # Checkpoint every epoch (overwrites the previous checkpoint).
        self.model.saver.save(self.sess,
                              os.path.join(model_folder, 'model.ckpt'))
        # NOTE(review): [-2] presumably selects a specific average from the
        # score sequence returned by evaluate_step -- confirm against the
        # train module.
        if abs(f1_score['valid'][-2] - previous_best_valid_f1_score) < 0.1:
            bad_counter += 1
        else:
            bad_counter = 0
        if bad_counter > 10:
            break
        previous_best_valid_f1_score = f1_score['valid'][-2]
        if epoch_number > max_number_of_epoch:
            break
parameters) # Initialize and save execution details start_time = time.time() results = {} results['epoch'] = {} results['execution_details'] = {} results['execution_details']['train_start'] = start_time results['execution_details']['time_stamp'] = experiment_timestamp results['execution_details']['early_stop'] = False results['execution_details']['keyboard_interrupt'] = False results['execution_details']['num_epochs'] = 0 results['model_options'] = copy.copy(parameters) model_folder = os.path.join(stats_graph_folder, 'model') utils.create_folder_if_not_exists(model_folder) pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb')) tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs') utils.create_folder_if_not_exists(tensorboard_log_folder) tensorboard_log_folders = {} for dataset_type in dataset_filepaths.keys(): tensorboard_log_folders[dataset_type] = os.path.join( stats_graph_folder, 'tensorboard_logs', dataset_type) utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type]) # Instantiate the writers for TensorBoard writers = {} for dataset_type in dataset_filepaths.keys(): writers[dataset_type] = tf.summary.FileWriter(
def main(args):
    """Run every (trained model, test set) pair of an experiment set.

    For each pair: loads the pretrained model's pickled dataset and
    parameters, parses and re-indexes the prediction split against the
    pretrained vocabulary, restores the model, predicts labels, and appends
    the CoNLL evaluation output to a single results file in ../predictions.

    Args:
        args: argparse namespace; ``args.experiment_set`` selects the list of
            (trained_model, test) pairs from ``utils.load_experiments()``.
    """
    experiments = utils.load_experiments()
    pprint(experiments)
    time_stamp = utils.get_current_time_in_miliseconds()
    result_file = '{0}_{1}'.format(args.experiment_set + "_" + "results_",
                                   time_stamp + ".txt")
    print(result_file)
    with open(os.path.join("../predictions", result_file), "w",
              encoding="utf-8") as file:
        for elem in experiments['experiments'][args.experiment_set]:
            trained_model = elem[0]
            test = elem[1]
            print("======================")
            print("Train on {0}, test {1}".format(trained_model,test))
            print("======================")
            pretrained_model_folder = os.path.dirname(
                experiments['models'][trained_model])
            # Load the dataset pickled alongside the pretrained model.
            # BUGFIX: the pickle file handle was previously never closed.
            with open(os.path.join(pretrained_model_folder, 'dataset.pickle'),
                      'rb') as dataset_pickle:
                dataset = pickle.load(dataset_pickle)
            parameters, conf_parameters = load_parameters(
                os.path.join(pretrained_model_folder, 'parameters.ini'),
                verbose=False)
            parameters['train_model'] = False
            parameters['use_pretrained_model'] = True
            parameters['dataset_predict'] = experiments['datasets'][test]
            parameters['pretrained_model_name'] = "{0}_on_{1}".format(trained_model,test)
            parameters['pretrained_model_checkpoint_filepath'] = experiments['models'][trained_model]
            dataset_filepaths = get_valid_dataset_filepaths(parameters)
            pprint(parameters)
            # Parse the prediction split with the pretrained model's settings.
            dataset_type = "predict"
            dataset.labels[dataset_type], dataset.tokens[dataset_type], _, _, _ = \
                dataset._parse_dataset(dataset_filepaths.get(dataset_type, None),
                                       parameters['language'])
            # Rebuild token_to_index for the prediction vocabulary: tokens found
            # in the pretrained embeddings get fresh indices; unknown tokens map
            # to UNK (except fasttext, which can embed any token).
            iteration_number = 0
            dataset.token_to_index = dict()
            dataset.number_of_unknown_tokens = 0
            for token_sentence in tqdm(dataset.tokens['predict']):
                for token in token_sentence:
                    # Skip over the indices reserved for UNK and PADDING.
                    if iteration_number == dataset.UNK_TOKEN_INDEX:
                        iteration_number += 1
                    if iteration_number == dataset.PADDING_TOKEN_INDEX:
                        iteration_number += 1
                    if not utils_nlp.is_token_in_pretrained_embeddings(
                            token, dataset.vocab_embeddings, parameters):
                        if parameters['embedding_type'] == 'fasttext':
                            dataset.token_to_index[token] = iteration_number
                            iteration_number += 1
                        else:
                            dataset.token_to_index[token] = dataset.UNK_TOKEN_INDEX
                            dataset.number_of_unknown_tokens += 1
                            dataset.tokens_mapped_to_unk.append(token)
                    else:
                        if token not in dataset.token_to_index:
                            dataset.token_to_index[token] = iteration_number
                            iteration_number += 1
            # Vectorize tokens, characters and labels for every available split.
            for dataset_type in dataset_filepaths.keys():
                dataset.token_indices[dataset_type] = []
                dataset.characters[dataset_type] = []
                dataset.character_indices[dataset_type] = []
                dataset.token_lengths[dataset_type] = []
                dataset.sequence_lengths[dataset_type] = []
                dataset.longest_token_length_in_sequence[dataset_type] = []
                for token_sequence in dataset.tokens[dataset_type]:
                    dataset.token_indices[dataset_type].append(
                        [dataset.token_to_index.get(token, dataset.UNK_TOKEN_INDEX)
                         for token in token_sequence])
                    dataset.characters[dataset_type].append(
                        [list(token) for token in token_sequence])
                    dataset.character_indices[dataset_type].append(
                        [[dataset.character_to_index.get(character, dataset.UNK_CHARACTER_INDEX)
                          for character in token] for token in token_sequence])
                    dataset.token_lengths[dataset_type].append(
                        [len(token) for token in token_sequence])
                    dataset.sequence_lengths[dataset_type].append(len(token_sequence))
                    dataset.longest_token_length_in_sequence[dataset_type].append(
                        max(dataset.token_lengths[dataset_type][-1]))
                dataset.label_indices[dataset_type] = []
                for label_sequence in dataset.labels[dataset_type]:
                    dataset.label_indices[dataset_type].append(
                        [dataset.label_to_index[label] for label in label_sequence])
            # One-hot padding vector using the 'O' (outside) class.
            tmp_vector = [0] * len(dataset.unique_labels)
            tmp_vector[dataset.label_to_index["O"]] = 1
            dataset.PADDING_LABEL_VECTOR = tmp_vector
            for dataset_type in dataset_filepaths.keys():
                dataset.label_vector_indices[dataset_type] = []
                for label_indices_sequence in dataset.label_indices[dataset_type]:
                    vector_sequence = []
                    for indice in label_indices_sequence:
                        vector = [0] * len(dataset.unique_labels)
                        vector[indice] = 1
                        vector_sequence.append(vector)
                    dataset.label_vector_indices[dataset_type].append(vector_sequence)
            # Create graph and session
            with tf.Graph().as_default():
                session_conf = tf.ConfigProto(
                    intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                    inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                    device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                    allow_soft_placement=True,  # fall back to a supported device if the requested one is unavailable
                    log_device_placement=False,
                )
                session_conf.gpu_options.allow_growth = True
                sess = tf.Session(config=session_conf)
                model = EntityLSTM(dataset, parameters)
                model_saver = tf.train.Saver()
                prediction_folder = os.path.join('..', 'predictions')
                utils.create_folder_if_not_exists(prediction_folder)
                dataset_name = parameters['pretrained_model_name']
                model_name = '{0}_{1}'.format(dataset_name, time_stamp)
                prediction_folder = os.path.join(prediction_folder, model_name)
                utils.create_folder_if_not_exists(prediction_folder)
                epoch_number = 100
                with open(os.path.join(prediction_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                if parameters['use_pretrained_model']:
                    # Restore pretrained model parameters
                    transition_params_trained = train.restore_model_parameters_from_pretrained_model(
                        parameters, dataset, sess, model, model_saver)
                    model.load_pretrained_token_embeddings(sess, dataset, parameters)
                    start_time = time.time()
                    results = {}
                    results['epoch'] = {}
                    results['execution_details'] = {}
                    results['execution_details']['train_start'] = start_time
                    results['execution_details']['time_stamp'] = start_time
                    results['execution_details']['early_stop'] = False
                    results['execution_details']['keyboard_interrupt'] = False
                    results['execution_details']['num_epochs'] = epoch_number
                    results['model_options'] = copy.copy(parameters)
                    demo = parameters['pretrained_model_name'] == "demo"
                    y_pred, y_true, output_filepaths = train.predict_labels(
                        sess, model, transition_params_trained, parameters,
                        dataset, epoch_number, prediction_folder,
                        dataset_filepaths, demo=demo)
                    conll_output_file = evaluate.evaluate_model(
                        results, dataset, y_pred, y_true, prediction_folder,
                        epoch_number, start_time , output_filepaths, parameters)
                    # Append this pair's CoNLL evaluation to the results file.
                    file.write(parameters['pretrained_model_name'] + "\n")
                    with open(conll_output_file, "r") as conll_file:
                        conll = conll_file.read()
                    file.write(conll)
                    file.write("\n\n\n")
                    if parameters['pretrained_model_name'] == "demo":
                        print("============")
                        print(" Prediction ")
                        print("============")
                        i = 0
                        for sentence in dataset.tokens['predict']:
                            for token in sentence:
                                predict_label = dataset.index_to_label[y_pred['predict'][i]]
                                # Only show the label for non-'O' predictions.
                                if predict_label != "O":
                                    print(token,predict_label)
                                else:
                                    print(token)
                                i += 1
                            print("")
                else:
                    raise IOError('Set use_pretrained_model parameter to True')
def generate_reference_text_file_for_conll(conll_filepath, text_folder):
    '''
    Generates reference text files and adds the corresponding filename and
    token offsets to the CoNLL file.

    conll_filepath: path to a CoNLL-formatted file without filename and token offsets
    text_folder: folder to write the reference text files to

    Side effects: writes one ``<dataset>_text_<nnnnn>.txt`` file per document
    into ``text_folder``, saves a copy of the input file as
    ``<dataset>_original.txt`` next to it, and rewrites ``conll_filepath``
    in place with filename/start/end columns inserted after the token.
    '''
    dataset_type = utils.get_basename_without_extension(conll_filepath)
    utils.create_folder_if_not_exists(text_folder)

    def _flush_document(text_parts, base_filename):
        # Write the accumulated document text (if any) to its reference file.
        if text_parts:
            out_path = os.path.join(text_folder, '{0}.txt'.format(base_filename))
            with codecs.open(out_path, 'w', 'UTF-8') as f:
                f.write(''.join(text_parts))

    text_parts = []       # pieces of the current document's reference text
    new_conll_lines = []  # rewritten CoNLL lines (joined once at the end)
    character_index = 0
    document_count = 0
    text_base_filename = '{0}_text_{1}'.format(dataset_type,
                                               str(document_count).zfill(5))
    # BUGFIX: the input file was opened without a context manager and leaked
    # on exception; strings were also built with quadratic '+=' concatenation.
    with codecs.open(conll_filepath, 'r', 'UTF-8') as conll_file:
        for line in conll_file:
            split_line = line.strip().split(' ')
            # New document
            if '-DOCSTART-' in split_line[0]:
                new_conll_lines.append(line)
                _flush_document(text_parts, text_base_filename)
                text_parts = []
                character_index = 0
                document_count += 1
                text_base_filename = '{0}_text_{1}'.format(
                    dataset_type, str(document_count).zfill(5))
                continue
            # New sentence (blank line): newline in reference text, empty CoNLL line
            elif len(split_line) == 0 or len(split_line[0]) == 0:
                new_conll_lines.append('\n')
                if text_parts:
                    text_parts.append('\n')
                    character_index += 1
                continue
            token = split_line[0]
            start = character_index
            end = start + len(token)
            text_parts.append(token + ' ')
            character_index += len(token) + 1
            new_conll_lines.append(' '.join(
                [token, text_base_filename, str(start), str(end)]
                + split_line[1:]) + '\n')
    _flush_document(text_parts, text_base_filename)
    # Keep a copy of the original file, then overwrite it with offsets added.
    original_conll_filepath = os.path.join(
        os.path.dirname(conll_filepath),
        '{0}_original.txt'.format(dataset_type))
    shutil.copyfile(conll_filepath, original_conll_filepath)
    with codecs.open(conll_filepath, 'w', 'UTF-8') as f:
        f.write(''.join(new_conll_lines))
def main():
    """End-to-end training driver.

    Loads parameters and the dataset, builds an EntityLSTM model in a fresh
    TF graph, wires up TensorBoard (scalar writers plus token/character
    embedding projector metadata), then runs the train/predict/evaluate loop
    with early stopping on validation micro F1.
    """
    parameters, dataset_filepaths = load_parameters()
    # Load dataset
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)
    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={
                'CPU': 1,
                'GPU': 1
            },
            allow_soft_placement=
            True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)
            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(
                dataset_name, results['execution_details']['time_stamp'])
            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder = os.path.join(
                output_folder, model_name)  # Folder where to save graphs
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                  'tensorboard_logs')
            utils.create_folder_if_not_exists(tensorboard_log_folder)
            tensorboard_log_folders = {}
            for dataset_type in ['train', 'valid', 'test']:
                tensorboard_log_folders[dataset_type] = os.path.join(
                    stats_graph_folder, 'tensorboard_logs', dataset_type)
                utils.create_folder_if_not_exists(
                    tensorboard_log_folders[dataset_type])
            # NOTE(review): the file handle passed to pickle.dump is never
            # closed explicitly -- consider a with-block.
            pickle.dump(
                dataset,
                open(os.path.join(stats_graph_folder, 'dataset.pickle'), 'wb'))
            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = EntityLSTM(dataset, parameters)
            # Instantiate the writers for TensorBoard
            writers = {}
            for dataset_type in ['train', 'valid', 'test']:
                writers[dataset_type] = tf.summary.FileWriter(
                    tensorboard_log_folders[dataset_type], graph=sess.graph)
            embedding_writer = tf.summary.FileWriter(
                model_folder
            )  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
            embeddings_projector_config = projector.ProjectorConfig()
            tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
            )
            tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
            token_list_file_path = os.path.join(
                model_folder, 'tensorboard_metadata_tokens.tsv')
            tensorboard_token_embeddings.metadata_path = os.path.relpath(
                token_list_file_path, '..')
            tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
            )
            tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
            character_list_file_path = os.path.join(
                model_folder,
                'tensorboard_metadata_characters.tsv')  # 'metadata.tsv'
            tensorboard_character_embeddings.metadata_path = os.path.relpath(
                character_list_file_path, '..')
            projector.visualize_embeddings(embedding_writer,
                                           embeddings_projector_config)
            # Write metadata for TensorBoard embeddings
            token_list_file = open(token_list_file_path, 'w')
            for token_index in range(dataset.vocabulary_size):
                token_list_file.write('{0}\n'.format(
                    dataset.index_to_token[token_index]))
            token_list_file.close()
            character_list_file = open(character_list_file_path, 'w')
            print('len(dataset.character_to_index): {0}'.format(
                len(dataset.character_to_index)))
            print('len(dataset.index_to_character): {0}'.format(
                len(dataset.index_to_character)))
            for character_index in range(dataset.alphabet_size):
                if character_index == dataset.PADDING_CHARACTER_INDEX:
                    character_list_file.write('PADDING\n')
                else:
                    character_list_file.write('{0}\n'.format(
                        dataset.index_to_character[character_index]))
            character_list_file.close()
            # Initialize the model
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)
            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            step = 0
            bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
            previous_best_valid_f1_score = 0
            # CRF transition parameters start random and are refined each step.
            transition_params_trained = np.random.rand(
                len(dataset.unique_labels), len(dataset.unique_labels))
            model_saver = tf.train.Saver(
                max_to_keep=parameters['maximum_number_of_epochs']
            )  # defaults to saving all variables
            epoch_number = -1
            try:
                while True:
                    epoch_number += 1
                    #epoch_number = math.floor(step / len(dataset.token_indices['train']))
                    print('\nStarting epoch {0}'.format(epoch_number), end='')
                    epoch_start_time = time.time()
                    #print('step: {0}'.format(step))
                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, dataset, sequence_number, model,
                            transition_params_trained, parameters)
                        step += 1
                        if step % 100 == 0:
                            print('.', end='', flush=True)
                            #break
                    print('.', flush=True)
                    #print('step: {0}'.format(step))
                    # Predict labels using trained model
                    y_pred = {}
                    y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        #print('dataset_type: {0}'.format(dataset_type))
                        prediction_output = train.prediction_step(
                            sess, dataset, dataset_type, model,
                            transition_params_trained, step,
                            stats_graph_folder, epoch_number, parameters)
                        y_pred[dataset_type], y_true[
                            dataset_type], output_filepaths[
                                dataset_type] = prediction_output
                    # model_options = None
                    epoch_elapsed_training_time = time.time(
                    ) - epoch_start_time
                    print(
                        'epoch_elapsed_training_time: {0:.2f} seconds'.format(
                            epoch_elapsed_training_time))
                    results['execution_details']['num_epochs'] = epoch_number
                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                            stats_graph_folder, epoch_number,
                                            epoch_start_time, output_filepaths,
                                            parameters)
                    # Save model
                    model_saver.save(
                        sess,
                        os.path.join(model_folder,
                                     'model_{0:05d}.ckpt'.format(epoch_number))
                    )  #, global_step, latest_filename, meta_graph_suffix, write_meta_graph, write_state)
                    # Save TensorBoard logs
                    summary = sess.run(model.summary_op, feed_dict=None)
                    writers['train'].add_summary(summary, epoch_number)
                    # Early stop: stop after `patience` epochs without a new
                    # best validation micro F1.
                    valid_f1_score = results['epoch'][epoch_number][0][
                        'valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1
                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break
                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break
                    # break # debugging
            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                # assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')
            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details'][
                'train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)
            sess.close()  # release the session's resources
def save(self):
    """Export the current annotations.

    Writes, under ``self.save_folder``:
    - ``classed_images/<label>/`` per-label JPEG crops of every canvas item
      tagged with that label,
    - ``voc/Annotations/<timestamp>.xml`` — a Pascal-VOC style annotation
      file with one <object> per crop (boxes clamped to the image bounds),
    - ``voc/JPEGImages/<timestamp>.jpeg`` — a copy of the raw image.
    """
    utils.create_folder_if_not_exists(self.save_folder)
    classed_image = os.path.join(self.save_folder, "classed_images")
    utils.create_folder_if_not_exists(classed_image)
    voc_path = os.path.join(self.save_folder, "voc")
    utils.create_folder_if_not_exists(voc_path)
    jpeg_image_path = os.path.join(voc_path, "JPEGImages")
    utils.create_folder_if_not_exists(jpeg_image_path)
    annotation_path = os.path.join(voc_path, "Annotations")
    utils.create_folder_if_not_exists(annotation_path)
    # Timestamp shared by the image, its crops, and the annotation file.
    file_name = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    annotation_xml = Element('annotation')
    size_xml = SubElement(annotation_xml, 'size')
    width_xml = SubElement(size_xml, 'width')
    width_xml.text = str(self.raw_image.size[0])
    height_xml = SubElement(size_xml, 'height')
    height_xml.text = str(self.raw_image.size[1])
    depth_xml = SubElement(size_xml, 'depth')
    depth_xml.text = str(3)
    for label in self.labels_listbox.get(0, END):
        found = self.canvas.find_withtag(label)
        label_path = os.path.join(classed_image, label)
        utils.create_folder_if_not_exists(label_path)
        for j, found_index in enumerate(found):
            # Crop the item's bounding box out of the raw image.
            xmin, ymin, xmax, ymax = self.canvas.bbox(found_index)
            bbox = (xmin, ymin, xmax, ymax)
            crop = self.raw_image.crop(bbox)
            crop_path = os.path.join(
                label_path,
                str.format("{0}-{1}[{2}].jpg", label, file_name, j))
            # Clamp the box to the image bounds for the XML annotation.
            xmin = max(xmin, 0)
            ymin = max(ymin, 0)
            xmax = min(xmax, self.raw_image.size[0])
            # BUGFIX: was min(ymin, ...), which always collapsed ymax to a
            # value <= ymin, producing degenerate boxes in the annotation.
            ymax = min(ymax, self.raw_image.size[1])
            crop.save(crop_path)
            # Append one <object> entry per crop.
            object_xml = SubElement(annotation_xml, 'object')
            name_xml = SubElement(object_xml, 'name')
            name_xml.text = str(label)
            difficult_xml = SubElement(object_xml, 'difficult')
            difficult_xml.text = str(0)
            bndbox_xml = SubElement(object_xml, 'bndbox')
            xmin_xml = SubElement(bndbox_xml, 'xmin')
            xmin_xml.text = str(xmin)
            ymin_xml = SubElement(bndbox_xml, 'ymin')
            ymin_xml.text = str(ymin)
            xmax_xml = SubElement(bndbox_xml, 'xmax')
            xmax_xml.text = str(xmax)
            ymax_xml = SubElement(bndbox_xml, 'ymax')
            ymax_xml.text = str(ymax)
    with open(
            os.path.join(annotation_path,
                         str.format("{0}.xml", file_name)), 'w') as xml:
        tree = ElementTree(annotation_xml)
        tree.write(xml, encoding='unicode')
    self.raw_image.save(
        os.path.join(jpeg_image_path, str.format("{0}.jpeg", file_name)))
def main(_):
    """Train/validate/test loop for the review-classification model.

    Loads the vocabulary, GloVe embeddings and the pickled data splits,
    builds the model, then for each epoch runs training, validation and
    test passes, saving per-split results and checkpointing whenever test
    accuracy improves.

    NOTE(review): ``train_writer``, ``valid_writer`` and ``test_writer`` are
    referenced but not defined here -- presumably module-level FileWriters;
    confirm they exist before this runs.
    """
    vocab = read_vocab('data/ICLR_Review_all_with_decision-w2i.pkl')
    glove_embs = load_glove('glove.6B.{}d.txt'.format(FLAGS.emb_size),
                            FLAGS.emb_size, vocab)
    data_reader = DataReader(
        train_file='data/ICLR_Review_all_with_decision-train.pkl',
        dev_file='data/ICLR_Review_all_with_decision-dev.pkl',
        test_file='data/ICLR_Review_all_with_decision-test.pkl')

    config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    with tf.Session(config=config) as sess:
        model = Model(cell_dim=FLAGS.cell_dim,
                      att_dim=FLAGS.att_dim,
                      vocab_size=len(vocab),
                      emb_size=FLAGS.emb_size,
                      num_classes=FLAGS.num_classes,
                      dropout_rate=FLAGS.dropout_rate,
                      pretrained_embs=glove_embs)

        loss = loss_fn(model.labels, model.logits)
        train_op, global_step = train_fn(loss)
        batch_acc, total_acc, acc_update, metrics_init, predictions = eval_fn(
            model.labels, model.logits)
        summary_op = tf.summary.merge_all()
        sess.run(tf.global_variables_initializer())
        train_writer.add_graph(sess.graph)
        saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoints)

        print('\n{}> Start training'.format(datetime.now()))

        # Results are written under ./output/<start datetime>/.
        result_save_folder = str(datetime.now())
        output_folder = os.path.join('.', 'output')
        create_folder_if_not_exists(output_folder)
        stats_graph_folder = os.path.join(
            output_folder, result_save_folder)  # Folder where to save graphs
        create_folder_if_not_exists(stats_graph_folder)

        epoch = 0
        valid_step = 0
        test_step = 0
        # Scale the valid/test batch size by the train/test size ratio.
        train_test_prop = len(data_reader.train_data) / len(
            data_reader.test_data)
        test_batch_size = int(FLAGS.batch_size / train_test_prop)
        best_acc = float('-inf')

        while epoch < FLAGS.num_epochs:
            epoch += 1
            print('\n{}> Epoch: {}'.format(datetime.now(), epoch))

            # --- Training pass (streaming accuracy is reset per split) ---
            sess.run(metrics_init)
            all_labels = []
            all_y_pred = []
            for batch_docs, batch_labels in data_reader.read_train_set(
                    FLAGS.batch_size, shuffle=True):
                _step, _, _loss, _acc, _, y_pred_batch = sess.run(
                    [
                        global_step, train_op, loss, batch_acc, acc_update,
                        predictions
                    ],
                    feed_dict=model.get_feed_dict(batch_docs,
                                                  batch_labels,
                                                  training=True))
                all_labels += batch_labels
                #y_pred_batch_array = y_pred_batch.eval(session=sess)
                y_pred_batch_list = y_pred_batch.tolist()
                all_y_pred += y_pred_batch_list
                if _step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(
                                            batch_docs, batch_labels))
                    train_writer.add_summary(_summary, global_step=_step)
            print('Training accuracy = {:.2f}'.format(
                sess.run(total_acc) * 100))
            save_results(all_labels, all_y_pred, stats_graph_folder, 'train',
                         epoch)

            # --- Validation pass ---
            sess.run(metrics_init)
            all_valid_labels = []
            all_valid_y_pred = []
            for batch_docs, batch_labels in data_reader.read_valid_set(
                    test_batch_size):
                _loss, _acc, _, valid_y_pred_batch = sess.run(
                    [loss, batch_acc, acc_update, predictions],
                    feed_dict=model.get_feed_dict(batch_docs, batch_labels))
                all_valid_labels += batch_labels
                valid_y_pred_batch_list = valid_y_pred_batch.tolist()
                all_valid_y_pred += valid_y_pred_batch_list
                valid_step += 1
                if valid_step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(
                                            batch_docs, batch_labels))
                    valid_writer.add_summary(_summary,
                                             global_step=valid_step)
            print('Validation accuracy = {:.2f}'.format(
                sess.run(total_acc) * 100))
            #save_optimized_presicion(all_valid_labels, all_valid_y_pred, stats_graph_folder, 'valid', epoch)
            #save_distance_measure(all_valid_labels, all_valid_y_pred, stats_graph_folder, 'valid', epoch)
            save_results(all_valid_labels, all_valid_y_pred,
                         stats_graph_folder, 'valid', epoch)

            # --- Test pass ---
            sess.run(metrics_init)
            all_test_labels = []
            all_test_y_pred = []
            for batch_docs, batch_labels in data_reader.read_test_set(
                    test_batch_size):
                _loss, _acc, _, test_y_pred_batch = sess.run(
                    [loss, batch_acc, acc_update, predictions],
                    feed_dict=model.get_feed_dict(batch_docs, batch_labels))
                all_test_labels += batch_labels
                test_y_pred_batch_list = test_y_pred_batch.tolist()
                all_test_y_pred += test_y_pred_batch_list
                test_step += 1
                if test_step % FLAGS.display_step == 0:
                    _summary = sess.run(summary_op,
                                        feed_dict=model.get_feed_dict(
                                            batch_docs, batch_labels))
                    test_writer.add_summary(_summary, global_step=test_step)
            test_acc = sess.run(total_acc) * 100
            print('Testing accuracy = {:.2f}'.format(test_acc))
            #save_optimized_presicion(all_test_labels, all_test_y_pred, stats_graph_folder, 'test', epoch)
            #save_distance_measure(all_test_labels, all_test_y_pred, stats_graph_folder, 'test', epoch)
            save_results(all_test_labels, all_test_y_pred,
                         stats_graph_folder, 'test', epoch)

            # Checkpoint on the best TEST accuracy seen so far.
            if test_acc > best_acc:
                best_acc = test_acc
                saver.save(sess, FLAGS.checkpoint_dir)
                print('Best testing accuracy = {:.2f}'.format(best_acc))

        print("{} Optimization Finished!".format(datetime.now()))
        print('Best testing accuracy = {:.2f}'.format(best_acc))
def main(args):
    """Predict labels for a text/dataset using a single pretrained model.

    Loads parameters (optionally overriding the input text from
    ``args.file``), restores the pickled dataset of the pretrained model,
    re-indexes the prediction split against the pretrained vocabulary,
    restores the checkpoint and predicts; in "demo" mode the predictions
    are printed token by token.
    """
    parameters, conf_parameters = load_parameters()
    if args.file:
        parameters['predict_text'] = args.file
    parameters = process_input(parameters)
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    # Load dataset
    # NOTE(review): this Dataset instance (and the embeddings loaded onto it)
    # is discarded when `dataset` is reassigned from the pickle below --
    # presumably the pickled dataset already carries vocab_embeddings; verify.
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_vocab_word_embeddings(parameters)
    pretrained_model_folder = os.path.dirname(parameters['pretrained_model_checkpoint_filepath'])
    # NOTE(review): the pickle file handle is never closed -- consider a with-block.
    dataset = pickle.load(open(os.path.join(pretrained_model_folder, 'dataset.pickle'), 'rb'))
    dataset.load_dataset(dataset_filepaths, parameters)
    # Parse the prediction split with the pretrained model's settings.
    dataset_type = "predict"
    dataset.labels[dataset_type], dataset.tokens[dataset_type], _, _, _ = dataset._parse_dataset(dataset_filepaths.get(dataset_type, None), parameters['language'])
    #dataset.load_vocab_word_embeddings(parameters)
    # Rebuild token_to_index for the prediction vocabulary: known tokens get
    # fresh indices; unknown tokens map to UNK for glove, or fresh indices
    # for fasttext (which can embed any token).
    iteration_number = 0
    dataset.token_to_index = dict()
    for token_sentence in dataset.tokens['predict']:
        for token in token_sentence:
            # Skip over the indices reserved for UNK and PADDING.
            if iteration_number == dataset.UNK_TOKEN_INDEX: iteration_number += 1
            if iteration_number == dataset.PADDING_TOKEN_INDEX: iteration_number += 1
            if not utils_nlp.is_token_in_pretrained_embeddings(token, dataset.vocab_embeddings, parameters):
                if parameters['embedding_type'] == 'glove':
                    dataset.token_to_index[token] = dataset.UNK_TOKEN_INDEX
                    dataset.number_of_unknown_tokens += 1
                    dataset.tokens_mapped_to_unk.append(token)
                elif parameters['embedding_type'] == 'fasttext':
                    dataset.token_to_index[token] = iteration_number
                    iteration_number += 1
                else:
                    raise AssertionError("Embedding type not recognized")
            else:
                if token not in dataset.token_to_index:
                    dataset.token_to_index[token] = iteration_number
                    iteration_number += 1
    dataset_type = "predict"
    # Vectorize tokens, characters and labels for every available split.
    for dataset_type in dataset_filepaths.keys():
        dataset.token_indices[dataset_type] = []
        dataset.characters[dataset_type] = []
        dataset.character_indices[dataset_type] = []
        dataset.token_lengths[dataset_type] = []
        dataset.sequence_lengths[dataset_type] = []
        dataset.longest_token_length_in_sequence[dataset_type] = []
        # character_indices_padded[dataset_type] = []
        for token_sequence in dataset.tokens[dataset_type]:
            dataset.token_indices[dataset_type].append([dataset.token_to_index.get(token, dataset.UNK_TOKEN_INDEX) for token in token_sequence])
            dataset.characters[dataset_type].append([list(token) for token in token_sequence])
            dataset.character_indices[dataset_type].append(
                [[dataset.character_to_index.get(character,dataset.UNK_CHARACTER_INDEX) for character in token] for token in token_sequence])
            dataset.token_lengths[dataset_type].append([len(token) for token in token_sequence])
            dataset.sequence_lengths[dataset_type].append(len(token_sequence))
            dataset.longest_token_length_in_sequence[dataset_type].append(max(dataset.token_lengths[dataset_type][-1]))
            # character_indices_padded[dataset_type].append([ utils.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX)
            #  for temp_token_indices in character_indices[dataset_type][-1]])
        dataset.label_indices[dataset_type] = []
        for label_sequence in dataset.labels[dataset_type]:
            dataset.label_indices[dataset_type].append([dataset.label_to_index[label] for label in label_sequence])
    # One-hot padding vector using the 'O' (outside) class.
    tmp_vector = [0] * len(dataset.unique_labels)
    tmp_vector[dataset.label_to_index["O"]] = 1
    dataset.PADDING_LABEL_VECTOR = tmp_vector
    for dataset_type in dataset_filepaths.keys():
        dataset.label_vector_indices[dataset_type] = []
        for label_indices_sequence in dataset.label_indices[dataset_type]:
            vector_sequence = []
            for indice in label_indices_sequence:
                vector = [0] * len(dataset.unique_labels)
                vector[indice] = 1
                vector_sequence.append(vector)
            dataset.label_vector_indices[dataset_type].append(vector_sequence)
    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True, # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False,
            )
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        model = EntityLSTM(dataset, parameters)
        model_saver = tf.train.Saver()
        prediction_folder = os.path.join('..', 'predictions')
        utils.create_folder_if_not_exists(prediction_folder)
        dataset_name = parameters['pretrained_model_name']
        model_name = '{0}_{1}'.format(parameters["language"] + "_" + dataset_name, utils.get_current_time_in_miliseconds())
        prediction_folder = os.path.join(prediction_folder, model_name)
        utils.create_folder_if_not_exists(prediction_folder)
        epoch_number = 100
        #dataset_name = utils.get_basename_without_extension(parameters['dataset_test'])
        # Keep a copy of the parameters next to the predictions.
        with open(os.path.join(prediction_folder, 'parameters.ini'), 'w') as parameters_file:
            conf_parameters.write(parameters_file)
        if parameters['use_pretrained_model']:
            # Restore pretrained model parameters
            transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
            model.load_pretrained_token_embeddings(sess, dataset, parameters)
            demo = parameters['pretrained_model_name'] == "demo"
            y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, prediction_folder, dataset_filepaths, demo=demo)
            if parameters['pretrained_model_name'] == "demo":
                print("============")
                print(" Prediction ")
                print("============")
                # Print token (and label, when not 'O') for every prediction.
                i = 0
                for sentence in dataset.tokens['predict']:
                    for token in sentence:
                        predict_label = dataset.index_to_label[y_pred['predict'][i]]
                        if dataset.index_to_label[y_pred['predict'][i]] != "O":
                            print(token,predict_label)
                        else:
                            print(token)
                        i += 1
                    print("")
        else:
            raise IOError('Set use_pretrained_model parameter to True')
def main(argv=sys.argv):
    """Train and evaluate the sequence-labelling model (optionally with a corrector step).

    Parses CLI arguments, loads parameters and the dataset, builds the
    TensorFlow graph/session, then runs the epoch loop with patience-based
    early stopping, optional restoration of a pretrained (step-1) model for
    the corrector setup, and an optional CRF refinement phase after the
    maximum number of epochs.
    """
    arguments = parse_arguments(argv[1:])
    parameters, conf_parameters = load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(
        parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                allow_soft_placement=True,  # fall back to a supported device if the requested one is unavailable
                log_device_placement=False)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Bookkeeping for this run: timings and exit conditions.
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details']['time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                # Output folders: stats/graphs, final weights, and the model checkpoints.
                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_text_folder'])
                model_name = dataset_name
                utils.create_folder_if_not_exists(parameters['output_folder'])
                stats_graph_folder = os.path.join(
                    parameters['output_folder'], model_name)  # Folder where to save graphs
                final_weights_folder = os.path.join(
                    parameters['output_folder'], 'weights')
                utils.create_folder_if_not_exists(stats_graph_folder)
                utils.create_folder_if_not_exists(final_weights_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                    conf_parameters.write(parameters_file)

                # One TensorBoard log folder per dataset split.
                tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])

                # Persist the dataset (vocabularies/indices) next to the model.
                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                model = EntityLSTM(dataset, parameters)

                # TensorBoard writers (graph is attached to each split's writer).
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type], graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(model_folder)

                # Embedding projector config: token and character embeddings with
                # their metadata (one label per row of the embedding matrix).
                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')
                tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                character_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_characters.tsv')
                tensorboard_character_embeddings.metadata_path = os.path.relpath(
                    character_list_file_path, '..')
                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)
                # NOTE(review): metadata is written as latin-1 here while other
                # entry points in this file use UTF-8 — confirm this is intended.
                token_list_file = codecs.open(token_list_file_path, 'w', 'latin-1')
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()
                character_list_file = codecs.open(character_list_file_path, 'w', 'latin-1')
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(
                            dataset.index_to_character[character_index]))
                character_list_file.close()

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(
                        sess, dataset, parameters)

                patience_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                f1_score_best = 0
                f1_scores = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
                # CRF transition parameters start random; +2 accounts for the
                # sequence start/end states.
                transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2,
                    len(dataset.unique_labels) + 2)
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['num_of_model_to_keep'])  #, reshape= True) # defaults to saving all variables
                epoch_number = -1
                try:
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))
                        epoch_start_time = time.time()
                        if parameters['use_pretrained_model'] and epoch_number == 0:
                            # Epoch 0 with a pretrained model: restore weights
                            # instead of training.
                            if parameters['use_corrector']:
                                # Temporarily disable the corrector so the step-1
                                # model can be restored and queried for its
                                # 3-label predictions, which become the corrector's
                                # input features.
                                parameters['use_corrector'] = False
                                transition_params_trained = train.restore_pretrained_model(
                                    parameters, dataset, sess, model, model_saver)
                                print('Getting the 3-label predictions from the step1 model.')
                                all_pred_labels, y_pred_for_corrector, y_true_for_corrector, \
                                    output_filepaths = train.predict_labels(
                                        sess, model, transition_params_trained,
                                        parameters, dataset, epoch_number,
                                        stats_graph_folder, dataset_filepaths,
                                        for_corrector=True)
                                all_pred_indices = {}  #defaultdict(list)
                                for dataset_type in dataset_filepaths.keys():
                                    all_pred_indices[dataset_type] = []
                                    for i in range(len(all_pred_labels[dataset_type])):
                                        indices = [
                                            dataset.label_corrector_to_index[label]
                                            for label in all_pred_labels[dataset_type][i]
                                        ]
                                        all_pred_indices[dataset_type].append(indices)
                                # One-hot encode the predicted corrector labels.
                                label_binarizer_corrector = sklearn.preprocessing.LabelBinarizer()
                                label_binarizer_corrector.fit(
                                    range(max(dataset.index_to_label_corrector.keys()) + 1))
                                predicted_label_corrector_vector_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    predicted_label_corrector_vector_indices[dataset_type] = []
                                    for label_indices_sequence in all_pred_indices[dataset_type]:
                                        predicted_label_corrector_vector_indices[dataset_type].append(
                                            label_binarizer_corrector.transform(
                                                label_indices_sequence))
                                parameters['use_corrector'] = True
                            # NOTE(review): placement of this restore (inside the
                            # epoch-0 branch, after the corrector setup) assumed —
                            # confirm against the original layout.
                            transition_params_trained, model, glo_step = \
                                train.restore_model_parameters_from_pretrained_model(
                                    parameters, dataset, sess, model, model_saver)
                            # Re-create writers so the restored graph is logged.
                            for dataset_type in dataset_filepaths.keys():
                                writers[dataset_type] = tf.summary.FileWriter(
                                    tensorboard_log_folders[dataset_type],
                                    graph=sess.graph)
                            embedding_writer = tf.summary.FileWriter(model_folder)
                            # Only the new global-step variable needs initializing.
                            init_new_vars_op = tf.initialize_variables([glo_step])
                            sess.run(init_new_vars_op)
                        elif epoch_number != 0:
                            # Regular epoch: one pass over the shuffled training set.
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            for sequence_number in sequence_numbers:
                                transition_params_trained, W_before_crf = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                                step += 1
                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time), flush=False)

                        if parameters['use_corrector']:
                            # Evaluate with the *predicted* corrector vectors
                            # swapped in, then restore the gold ones.
                            original_label_corrector_vector_indices = dataset.label_corrector_vector_indices
                            dataset.label_corrector_vector_indices = predicted_label_corrector_vector_indices
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)
                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                                    stats_graph_folder, epoch_number,
                                                    epoch_start_time, output_filepaths,
                                                    parameters)
                            dataset.label_corrector_vector_indices = original_label_corrector_vector_indices
                        else:
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)
                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                                    stats_graph_folder, epoch_number,
                                                    epoch_start_time, output_filepaths,
                                                    parameters)

                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(), model_folder)

                        # Early stopping
                        train_f1_score = results['epoch'][epoch_number][0]['train']['f1_score']['micro']
                        valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                        test_f1_score = results['epoch'][epoch_number][0]['test']['f1_score']['micro']
                        f1_scores['train-F1'].append(train_f1_score)
                        f1_scores['valid-F1'].append(valid_f1_score)
                        f1_scores['test-F1'].append(test_f1_score)
                        if valid_f1_score > f1_score_best:
                            patience_counter = 0
                            f1_score_best = valid_f1_score
                            # Save the best model
                            model_saver.save(
                                sess, os.path.join(model_folder, 'best_model.ckpt'))
                            print('updated model to current epoch : epoch {:d}'.format(epoch_number))
                            print('the model is saved in: {:s}'.format(model_folder))
                            ### newly deleted
                        else:
                            patience_counter += 1
                            print("In epoch {:d}, the valid F1 is : {:f}".format(
                                epoch_number, valid_f1_score))
                            print("The last {0} epochs have not shown improvements on the validation set."
                                  .format(patience_counter))
                        if patience_counter >= parameters['patience']:
                            print('Early Stop!')
                            # NOTE(review): the flag is set but there is no break
                            # here, so training continues until the epoch cap.
                            results['execution_details']['early_stop'] = True
                        if epoch_number >= parameters['maximum_number_of_epochs'] and parameters['refine_with_crf']:
                            # Post-training refinement: swap in a CRF head and run
                            # a few additional epochs.
                            model = train.refine_with_crf(parameters, sess, model, model_saver)
                            print('refine model with CRF ...')
                            for additional_epoch in range(parameters['additional_epochs_with_crf']):
                                print('Additional {:d}th epoch'.format(additional_epoch))
                                sequence_numbers = list(
                                    range(len(dataset.token_indices['train'])))
                                random.shuffle(sequence_numbers)
                                for sequence_number in sequence_numbers:
                                    transition_params_trained, W_before_crf = train.train_step(
                                        sess, dataset, sequence_number, model,
                                        transition_params_trained, parameters)
                                    step += 1
                                epoch_elapsed_training_time = time.time() - epoch_start_time
                                print('Additional training completed in {0:.2f} seconds'
                                      .format(epoch_elapsed_training_time), flush=False)
                                y_pred, y_true, output_filepaths = train.predict_labels(
                                    sess, model, transition_params_trained,
                                    parameters, dataset, epoch_number,
                                    stats_graph_folder, dataset_filepaths)
                                evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                                        stats_graph_folder, epoch_number,
                                                        epoch_start_time, output_filepaths,
                                                        parameters)
                                summary = sess.run(model.summary_op, feed_dict=None)
                                writers['train'].add_summary(summary, epoch_number)
                                writers['train'].flush()
                                utils.copytree(writers['train'].get_logdir(), model_folder)
                        if epoch_number >= parameters['maximum_number_of_epochs'] and not parameters['refine_with_crf']:
                            break
                    # After the loop: plot the per-epoch F1 summary. The plot name
                    # distinguishes step-1 (fresh) from step-2 (pretrained) runs.
                    if not parameters['use_pretrained_model']:
                        plot_name = 'F1-summary-step1.svg'
                    else:
                        plot_name = 'F1-summary-step2.svg'
                    for k, l in f1_scores.items():
                        print(k, l)
                    utils_plots.plot_f1(
                        f1_scores,
                        os.path.join(stats_graph_folder, '..', plot_name),
                        'F1 score summary')
                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details']['train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type].close()
                sess.close()
def main():
    """Stand-alone training entry point driven by ./parameters.ini.

    Reads and type-converts the flat config, loads the train/valid/test
    files, builds an EntityLSTM graph, then trains with per-epoch
    evaluation and patience-based early stopping on validation F1.
    """
    #### Parameters - start
    conf_parameters = configparser.ConfigParser()
    conf_parameters.read(os.path.join('.', 'parameters.ini'))
    nested_parameters = utils.convert_configparser_to_dictionary(conf_parameters)
    # Flatten the section->options nesting into a single parameters dict.
    parameters = {}
    for k, v in nested_parameters.items():
        parameters.update(v)
    # configparser returns strings; coerce known keys to int/float/bool.
    for k, v in parameters.items():
        if k in [
                'remove_unknown_tokens', 'character_embedding_dimension',
                'character_lstm_hidden_state_dimension',
                'token_embedding_dimension',
                'token_lstm_hidden_state_dimension', 'patience',
                'maximum_number_of_epochs', 'maximum_training_time',
                'number_of_cpu_threads', 'number_of_gpus'
        ]:
            parameters[k] = int(v)
        if k in ['dropout_rate']:
            parameters[k] = float(v)
        if k in [
                'use_character_lstm', 'is_character_lstm_bidirect',
                'is_token_lstm_bidirect', 'use_crf'
        ]:
            parameters[k] = distutils.util.strtobool(v)
    pprint(parameters)

    # Load dataset
    dataset_filepaths = {}
    dataset_filepaths['train'] = os.path.join(
        parameters['dataset_text_folder'], 'train.txt')
    dataset_filepaths['valid'] = os.path.join(
        parameters['dataset_text_folder'], 'valid.txt')
    dataset_filepaths['test'] = os.path.join(
        parameters['dataset_text_folder'], 'test.txt')
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={'CPU': 1, 'GPU': 1},
            allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Instantiate model
            model = EntityLSTM(dataset, parameters)
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            #results['model_options'] = copy.copy(model_options)
            #results['model_options'].pop('optimizer', None)
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_text_folder'])  #opts.train.replace('/', '_').split('.')[0] # 'conll2003en'
            model_name = '{0}_{1}'.format(
                dataset_name, results['execution_details']['time_stamp'])
            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder = os.path.join(output_folder, model_name)  # Folder where to save graphs
            #print('stats_graph_folder: {0}'.format(stats_graph_folder))
            utils.create_folder_if_not_exists(stats_graph_folder)
            # model_folder = os.path.join(stats_graph_folder, 'model')
            # utils.create_folder_if_not_exists(model_folder)

            step = 0
            bad_counter = 0  # epochs without validation-F1 improvement
            previous_best_valid_f1_score = 0
            # CRF transition parameters start random (no +2 start/end states here).
            transition_params_trained = np.random.rand(
                len(dataset.unique_labels), len(dataset.unique_labels))
            try:
                while True:
                    # Epoch index derived from the global step count.
                    epoch_number = math.floor(
                        step / len(dataset.token_indices['train']))
                    print('\nStarting epoch {0}'.format(epoch_number), end='')
                    epoch_start_time = time.time()
                    #print('step: {0}'.format(step))
                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, dataset, sequence_number, model,
                            transition_params_trained, parameters)
                        step += 1
                        if step % 100 == 0:
                            # Progress dot every 100 sequences.
                            print('.', end='', flush=True)
                            #break
                    print('.', flush=True)
                    #print('step: {0}'.format(step))

                    # Predict labels using trained model
                    all_predictions = {}
                    all_y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        #print('dataset_type: {0}'.format(dataset_type))
                        prediction_output = train.prediction_step(
                            sess, dataset, dataset_type, model,
                            transition_params_trained, step,
                            stats_graph_folder, epoch_number, parameters)
                        all_predictions[dataset_type], all_y_true[
                            dataset_type], output_filepaths[
                                dataset_type] = prediction_output
                    # model_options = None

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('epoch_elapsed_training_time: {0:.2f} seconds'.format(
                        epoch_elapsed_training_time))
                    results['execution_details']['num_epochs'] = epoch_number

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, all_predictions,
                                            all_y_true, stats_graph_folder,
                                            epoch_number, epoch_start_time,
                                            output_filepaths)

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1
                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break
                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break
                    # break # debugging
            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                # assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)
    sess.close()  # release the session's resources
def main(languages):
    """Run a grid of cross-lingual embedding experiments.

    For every (language, embedding type, embedding language, char-LSTM)
    combination: configures parameters, redirects stdout to a per-experiment
    log file, trains an EntityLSTM with intermediate evaluation roughly every
    20k training sequences, early-stops on validation F1, and finally prunes
    checkpoints down to the best epoch.
    """
    #embeddings_type = ['polyglot', 'fasttext']
    #embeddings_type = ['fasttext', 'fasttext_noOOV']
    embeddings_type = ['fasttext_noOOV']
    character_lstm = [True]
    embedding_language = ['target', 'source']
    combination = product(languages, embeddings_type, embedding_language,
                          character_lstm)
    create_folder_if_not_exists(os.path.join("..", "log"))
    experiment_timestamp = utils.get_current_time_in_miliseconds()
    log_file = os.path.join("..", "log",
                            "experiment-{}.log".format(experiment_timestamp))
    for language, emb_type, emb_language, char_lstm in combination:
        # Fresh config per experiment; override the grid-dependent options.
        conf_parameters = load_parameters()
        conf_parameters = set_datasets(conf_parameters, language)
        conf_parameters.set('ann', 'use_character_lstm', str(char_lstm))
        conf_parameters.set('ann', 'embedding_type', emb_type)
        conf_parameters.set('ann', 'embedding_language', emb_language)
        if emb_type == 'polyglot':
            conf_parameters.set('ann', 'embedding_dimension', str(64))
        elif 'fasttext' in emb_type:
            conf_parameters.set('ann', 'embedding_dimension', str(300))
        else:
            # NOTE(review): raising a plain string is a TypeError in Python 3
            # (and "Uknown" is a typo) — should be `raise ValueError(...)`.
            raise("Uknown embedding type")
        if emb_language == 'source':
            # Cross-lingual setting: use the mapped source language's embeddings.
            conf_parameters.set('dataset', 'language',
                                constants.MAPPING_LANGUAGE[language])
        else:
            conf_parameters.set('dataset', 'language', language)
        parameters, conf_parameters = parse_parameters(conf_parameters)

        start_time = time.time()
        experiment_timestamp = utils.get_current_time_in_miliseconds()
        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)

        dataset_name = utils.get_basename_without_extension(
            parameters['dataset_train'])
        model_name = '{0}_{1}_{2}_{3}_{4}'.format(
            language, emb_type, char_lstm, emb_language,
            results['execution_details']['time_stamp'])
        # Redirect stdout of this experiment into its own log file.
        # NOTE(review): the previous stdout handle is never restored.
        sys.stdout = open(os.path.join("..", "log", model_name), "w")
        print(language, emb_type, char_lstm, emb_language)
        with open(log_file, "a") as file:
            file.write("Experiment: {}\n".format(model_name))
            file.write("Start time:{}\n".format(experiment_timestamp))
            file.write("-------------------------------------\n\n")
        pprint(parameters)
        dataset_filepaths = get_valid_dataset_filepaths(parameters)
        check_parameter_compatiblity(parameters, dataset_filepaths)
        previous_best_valid_epoch = -1

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'],
                             debug=parameters['debug'])
        dataset.load_vocab_word_embeddings(parameters)
        dataset.load_dataset(dataset_filepaths, parameters)

        # Create graph and session
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False
            )
            session_conf.gpu_options.allow_growth = True
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Initialize and save execution details
                print(model_name)
                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)
                stats_graph_folder = os.path.join(output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
                # del dataset.embeddings_matrix
                if not parameters['use_pretrained_model']:
                    pickle.dump(
                        dataset,
                        open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))
                # dataset.load_pretrained_word_embeddings(parameters)

                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type], graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(
                    model_folder)  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')
                if parameters['use_character_lstm']:
                    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                    character_list_file_path = os.path.join(
                        model_folder, 'tensorboard_metadata_characters.tsv')
                    tensorboard_character_embeddings.metadata_path = os.path.relpath(
                        character_list_file_path, '..')
                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
                for token_index in range(len(dataset.index_to_token)):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()
                if parameters['use_character_lstm']:
                    character_list_file = codecs.open(character_list_file_path,
                                                      'w', 'UTF-8')
                    for character_index in range(dataset.alphabet_size):
                        if character_index == dataset.PADDING_CHARACTER_INDEX:
                            character_list_file.write('PADDING\n')
                        else:
                            character_list_file.write('{0}\n'.format(
                                dataset.index_to_character[character_index]))
                    character_list_file.close()

                try:
                    # Initialize the model
                    sess.run(tf.global_variables_initializer())
                    if not parameters['use_pretrained_model']:
                        model.load_pretrained_token_embeddings(
                            sess, dataset, parameters)

                    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                    bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                    previous_best_valid_f1_score = -1
                    transition_params_trained = np.random.rand(
                        len(dataset.unique_labels),
                        len(dataset.unique_labels))  # TODO np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
                    model_saver = tf.train.Saver(
                        max_to_keep=None)  # parameters['maximum_number_of_epochs']) # defaults to saving all variables
                    epoch_number = 0
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))
                        epoch_start_time = time.time()
                        if parameters['use_pretrained_model'] and epoch_number == 1:
                            # Restore pretrained model parameters
                            transition_params_trained = train.restore_model_parameters_from_pretrained_model(
                                parameters, dataset, sess, model, model_saver)
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            data_counter = 0
                            sub_id = 0
                            for i in tqdm(range(0, len(sequence_numbers),
                                                parameters['batch_size']),
                                          "Training epoch {}".format(epoch_number),
                                          mininterval=1):
                                data_counter += parameters['batch_size']
                                if data_counter >= 20000:
                                    # Intermediate evaluation roughly every
                                    # 20000 sequences; sub_id makes the
                                    # checkpoint/eval id a fractional epoch.
                                    data_counter = 0
                                    sub_id += 0.001
                                    print("Intermediate evaluation number: ", sub_id)
                                    epoch_elapsed_training_time = time.time() - epoch_start_time
                                    print('Training completed in {0:.2f} seconds'.format(
                                        epoch_elapsed_training_time), flush=True)
                                    y_pred, y_true, output_filepaths = train.predict_labels(
                                        sess, model, transition_params_trained,
                                        parameters, dataset, epoch_number + sub_id,
                                        stats_graph_folder, dataset_filepaths)
                                    # Evaluate model: save and plot results
                                    evaluate.evaluate_model(
                                        results, dataset, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)
                                    # Save model
                                    model_saver.save(
                                        sess,
                                        os.path.join(model_folder,
                                                     'model_{0:07.3f}.ckpt'.format(
                                                         epoch_number + sub_id)))
                                    # Save TensorBoard logs
                                    summary = sess.run(model.summary_op, feed_dict=None)
                                    writers['train'].add_summary(summary, epoch_number)
                                    writers['train'].flush()
                                    utils.copytree(writers['train'].get_logdir(),
                                                   model_folder)
                                    # Early stop
                                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                                    if valid_f1_score > previous_best_valid_f1_score:
                                        bad_counter = 0
                                        previous_best_valid_f1_score = valid_f1_score
                                    else:
                                        bad_counter += 1
                                # One training step on a slice of sequence ids
                                # (despite the singular name, this is a batch).
                                sequence_number = sequence_numbers[i: i + parameters['batch_size']]
                                transition_params_trained, loss = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)

                        # End-of-epoch evaluation, checkpoint, and logging.
                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time), flush=True)
                        y_pred, y_true, output_filepaths = train.predict_labels(
                            sess, model, transition_params_trained, parameters,
                            dataset, epoch_number, stats_graph_folder,
                            dataset_filepaths)
                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                                stats_graph_folder, epoch_number,
                                                epoch_start_time, output_filepaths,
                                                parameters)
                        # Save model
                        model_saver.save(
                            sess,
                            os.path.join(model_folder,
                                         'model_{0:05d}.ckpt'.format(epoch_number)))
                        # Save TensorBoard logs
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(), model_folder)

                        # Early stop
                        valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                        if valid_f1_score > previous_best_valid_f1_score:
                            bad_counter = 0
                            previous_best_valid_f1_score = valid_f1_score
                            previous_best_valid_epoch = epoch_number
                        else:
                            bad_counter += 1
                            print("The last {0} epochs have not shown improvements on the validation set.".format(
                                bad_counter))
                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break
                        if epoch_number >= parameters['maximum_number_of_epochs']:
                            break
                    # Keep only the best epoch's checkpoint.
                    keep_only_best_model(model_folder, previous_best_valid_epoch,
                                         parameters['maximum_number_of_epochs'] + 1)
                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')
                    # remove the experiment
                    remove_experiment = input("Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    else:
                        print('Finishing the experiment')
                        end_time = time.time()
                        results['execution_details']['train_duration'] = end_time - start_time
                        results['execution_details']['train_end'] = end_time
                        evaluate.save_results(results, stats_graph_folder)
                    sys.stdout.close()
                except Exception:
                    # Log the traceback, then offer to delete the failed run.
                    logging.exception("")
                    remove_experiment = input("Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    sys.stdout.close()
                sess.close()  # release the session's resources
                sys.stdout.close()
senders = { 'ykaner': [0.54, 'yk12953'], 'noamh': [1 - developer_fee[0], '2'], 'rotemsd': [1 - developer_fee[0], '10'], # 'ohad1': [0.26, 'yk12953'], 'duperyuyu': [0.7, 'duper'], 'Israel_Ben_Ari': [0.5, 'buying'] } owners = {'noamh': [0.5, '2'], 'rotemsd': [0.5, '10']} token_data_f = 'token_data' senders_f = 'senders.json' owners_f = 'owners.json' utils.create_folder_if_not_exists(token_data_f) if not os.path.exists(os.path.join(token_data_f, senders_f)): with open(os.path.join(token_data_f, senders_f), 'w') as f: json.dump(senders, f, indent='\t') else: with open(os.path.join(token_data_f, senders_f), 'r') as f: senders = json.load(f) if not os.path.exists(os.path.join(token_data_f, owners_f)): with open(os.path.join(token_data_f, owners_f), 'w') as f: json.dump(senders, f, indent='\t') else: with open(os.path.join(token_data_f, owners_f), 'r') as f: senders = json.load(f) def choose_token(sender):
def fit(self):
    '''
    Train the model: run the full train/evaluate loop, checkpointing the model
    each epoch, logging to TensorBoard, and early-stopping on validation
    micro F1-score. Writes all artifacts (parameters.ini, dataset.pickle,
    checkpoints, TensorBoard logs, results) under a per-experiment stats folder.
    '''
    # Local aliases for instance state used throughout the loop.
    parameters = self.parameters
    conf_parameters = self.conf_parameters
    dataset_filepaths = self.dataset_filepaths
    dataset = self.dataset
    dataset_brat_folders = self.dataset_brat_folders
    sess = self.sess
    model = self.model
    transition_params_trained = self.transition_params_trained
    stats_graph_folder, experiment_timestamp = self._create_stats_graph_folder(parameters)

    # Initialize and save execution details
    start_time = time.time()
    results = {}
    results['epoch'] = {}
    results['execution_details'] = {}
    results['execution_details']['train_start'] = start_time
    results['execution_details']['time_stamp'] = experiment_timestamp
    results['execution_details']['early_stop'] = False
    results['execution_details']['keyboard_interrupt'] = False
    results['execution_details']['num_epochs'] = 0
    results['model_options'] = copy.copy(parameters)

    # Snapshot the configuration and the dataset so the experiment is reproducible.
    model_folder = os.path.join(stats_graph_folder, 'model')
    utils.create_folder_if_not_exists(model_folder)
    with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
        conf_parameters.write(parameters_file)
    pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

    # One TensorBoard log folder per dataset split (train/valid/test/deploy).
    tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
    utils.create_folder_if_not_exists(tensorboard_log_folder)
    tensorboard_log_folders = {}
    for dataset_type in dataset_filepaths.keys():
        tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
        utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])

    # Instantiate the writers for TensorBoard
    writers = {}
    for dataset_type in dataset_filepaths.keys():
        writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
    embedding_writer = tf.summary.FileWriter(model_folder)  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

    # Configure the embedding projector for token and character embeddings.
    embeddings_projector_config = projector.ProjectorConfig()
    tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
    tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
    token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
    tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
    character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
    tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

    projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

    # Write metadata for TensorBoard embeddings: one label per row, row index
    # must match the embedding matrix row index.
    token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
    for token_index in range(dataset.vocabulary_size):
        token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
    token_list_file.close()

    character_list_file = codecs.open(character_list_file_path, 'w', 'UTF-8')
    for character_index in range(dataset.alphabet_size):
        if character_index == dataset.PADDING_CHARACTER_INDEX:
            character_list_file.write('PADDING\n')
        else:
            character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))
    character_list_file.close()

    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
    bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
    previous_best_valid_f1_score = 0
    epoch_number = -1
    try:
        while True:
            step = 0
            epoch_number += 1
            print('\nStarting epoch {0}'.format(epoch_number))

            epoch_start_time = time.time()

            # Epoch 0 is evaluation-only (no parameter updates) to get a baseline.
            if epoch_number != 0:
                # Train model: loop over all sequences of training set with shuffling
                sequence_numbers = list(range(len(dataset.token_indices['train'])))
                random.shuffle(sequence_numbers)
                for sequence_number in sequence_numbers:
                    transition_params_trained = train.train_step(sess, dataset, sequence_number, model, parameters)
                    step += 1
                    if step % 10 == 0:
                        print('Training {0:.2f}% done'.format(step / len(sequence_numbers) * 100), end='\r', flush=True)

            epoch_elapsed_training_time = time.time() - epoch_start_time
            print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

            y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)

            # Evaluate model: save and plot results
            evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

            # Pretrained-model-only mode: emit brat output once and stop.
            if parameters['use_pretrained_model'] and not parameters['train_model']:
                conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder)
                break

            # Save model
            model.saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

            # Save TensorBoard logs
            summary = sess.run(model.summary_op, feed_dict=None)
            writers['train'].add_summary(summary, epoch_number)
            writers['train'].flush()
            utils.copytree(writers['train'].get_logdir(), model_folder)

            # Early stop: track the best validation micro F1 seen so far.
            valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
            if valid_f1_score > previous_best_valid_f1_score:
                bad_counter = 0
                previous_best_valid_f1_score = valid_f1_score
                # New best epoch: refresh brat output and keep its transition params.
                conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)
                self.transition_params_trained = transition_params_trained
            else:
                bad_counter += 1
                print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

            if bad_counter >= parameters['patience']:
                print('Early Stop!')
                results['execution_details']['early_stop'] = True
                break

            if epoch_number >= parameters['maximum_number_of_epochs']:
                break

    except KeyboardInterrupt:
        # Allow graceful manual interruption; results gathered so far are still saved.
        results['execution_details']['keyboard_interrupt'] = True
        print('Training interrupted')

    print('Finishing the experiment')
    end_time = time.time()
    results['execution_details']['train_duration'] = end_time - start_time
    results['execution_details']['train_end'] = end_time
    evaluate.save_results(results, stats_graph_folder)
    for dataset_type in dataset_filepaths.keys():
        writers[dataset_type].close()
def fit(self):
    '''
    Train the model: run the full train/evaluate loop, checkpointing the model
    each epoch, logging to TensorBoard, and early-stopping on validation
    micro F1-score. (Comments translated to English; leftover debug prints
    that dumped raw training token indices every epoch have been removed.)
    '''
    # Local aliases for instance state used throughout the loop.
    parameters = self.parameters
    conf_parameters = self.conf_parameters
    dataset_filepaths = self.dataset_filepaths
    dataset = self.dataset
    dataset_brat_folders = self.dataset_brat_folders
    sess = self.sess
    model = self.model
    transition_params_trained = self.transition_params_trained
    stats_graph_folder, experiment_timestamp = self._create_stats_graph_folder(parameters)

    # Initialize and save execution details.
    # An epoch is one full pass of the learning algorithm over the train set.
    start_time = time.time()
    results = {}
    results['epoch'] = {}
    results['execution_details'] = {}
    results['execution_details']['train_start'] = start_time          # run start time
    results['execution_details']['time_stamp'] = experiment_timestamp # experiment timestamp
    results['execution_details']['early_stop'] = False                # whether training stopped early
    results['execution_details']['keyboard_interrupt'] = False        # whether interrupted from keyboard
    results['execution_details']['num_epochs'] = 0                    # number of epochs run
    results['model_options'] = copy.copy(parameters)                  # snapshot of parameters

    # e.g. output/en.../model
    model_folder = os.path.join(stats_graph_folder, 'model')
    utils.create_folder_if_not_exists(model_folder)

    # Save parameter values to parameters.ini for reproducibility.
    with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
        conf_parameters.write(parameters_file)
    # Dump the dataset as a pickle so later runs can reuse it.
    pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

    # Create the TensorBoard log folders used for plotting later.
    tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
    utils.create_folder_if_not_exists(tensorboard_log_folder)
    tensorboard_log_folders = {}
    for dataset_type in dataset_filepaths.keys():
        tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
        utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])

    # Instantiate the TensorBoard writers: at most one each for train, test, valid, deploy.
    writers = {}
    for dataset_type in dataset_filepaths.keys():
        writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
    embedding_writer = tf.summary.FileWriter(model_folder)  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

    # Configure the embedding projector for visualizing embeddings in TensorBoard.
    embeddings_projector_config = projector.ProjectorConfig()
    tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
    tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
    token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
    # Metadata path is relative to model_folder, where the projector config lives.
    tensorboard_token_embeddings.metadata_path = 'tensorboard_metadata_tokens.tsv'

    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
    character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
    tensorboard_character_embeddings.metadata_path = 'tensorboard_metadata_characters.tsv'

    # Saves a configuration file that TensorBoard will read during startup.
    projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

    # Write tokens to a tsv file used as metadata for the token embeddings.
    token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
    for token_index in range(dataset.vocabulary_size):
        token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
    token_list_file.close()

    # Write characters to a tsv file used as metadata for the character embeddings.
    character_list_file = codecs.open(character_list_file_path, 'w', 'UTF-8')
    for character_index in range(dataset.alphabet_size):
        if character_index == dataset.PADDING_CHARACTER_INDEX:
            character_list_file.write('PADDING\n')
        else:
            character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))
    character_list_file.close()

    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
    bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
    previous_best_valid_f1_score = 0  # best F1-score seen in previous epochs
    epoch_number = -1
    try:
        while True:
            step = 0
            epoch_number += 1
            print('\nStarting epoch {0}'.format(epoch_number))

            epoch_start_time = time.time()

            # Epoch 0 is evaluation-only (no parameter updates) to get a baseline.
            if epoch_number != 0:
                # Train model: loop over all sequences of training set with shuffling
                sequence_numbers = list(range(len(dataset.token_indices['train'])))
                random.shuffle(sequence_numbers)
                # Run the actual training steps.
                for sequence_number in sequence_numbers:
                    transition_params_trained = train.train_step(sess, dataset, sequence_number, model, parameters)
                    step += 1
                    if step % 10 == 0:
                        print('Training {0:.2f}% done'.format(step / len(sequence_numbers) * 100), end='\r', flush=True)

            # Measure how long one epoch took.
            epoch_elapsed_training_time = time.time() - epoch_start_time
            print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

            y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)

            # Evaluate model: save and plot results
            evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

            # Pretrained-model-only mode: emit brat output once and stop.
            if parameters['use_pretrained_model'] and not parameters['train_model']:
                conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder)
                break

            # Save model
            model.saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

            # Save TensorBoard logs
            summary = sess.run(model.summary_op, feed_dict=None)
            writers['train'].add_summary(summary, epoch_number)
            writers['train'].flush()
            utils.copytree(writers['train'].get_logdir(), model_folder)

            # Early stop bookkeeping on validation micro F1.
            valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
            # If this epoch's score beats the best previous one, reset the counter.
            if valid_f1_score > previous_best_valid_f1_score:
                bad_counter = 0
                previous_best_valid_f1_score = valid_f1_score
                conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)
                self.transition_params_trained = transition_params_trained
            else:
                bad_counter += 1
                print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

            # Stop when bad_counter reaches the configured patience threshold.
            if bad_counter >= parameters['patience']:
                print('Early Stop!')
                results['execution_details']['early_stop'] = True
                break

            # Stop when the maximum number of epochs is reached.
            if epoch_number >= parameters['maximum_number_of_epochs']:
                break

    except KeyboardInterrupt:
        results['execution_details']['keyboard_interrupt'] = True
        print('Training interrupted')

    # Training finished: record timing and save the collected results.
    print('Finishing the experiment')
    end_time = time.time()
    results['execution_details']['train_duration'] = end_time - start_time
    results['execution_details']['train_end'] = end_time
    evaluate.save_results(results, stats_graph_folder)
    for dataset_type in dataset_filepaths.keys():
        writers[dataset_type].close()