def _read_from_config(config, fea_only):
    def _get_max_seq_length_from_config_str(config_str):
        max_seq_length = [int(e) for e in config_str.split(',')]
        if len(max_seq_length) == 1:
            max_seq_length = max_seq_length[0]
        else:
            assert len(max_seq_length) == 6
            max_seq_length_list = max_seq_length
            max_seq_length = dict()
            max_seq_length['chunk_size_fea'] = max_seq_length_list[0]
            max_seq_length['chunk_step_fea'] = max_seq_length_list[1]
            max_seq_length['chunk_size_lab'] = max_seq_length_list[2]
            max_seq_length['chunk_step_lab'] = max_seq_length_list[3]
            max_seq_length['window_shift'] = max_seq_length_list[4]
            max_seq_length['window_size'] = max_seq_length_list[5]
        return max_seq_length

    to_do = config['exp']['to_do']
    if to_do == 'train':
        max_seq_length = _get_max_seq_length_from_config_str(config['batches']['max_seq_length_train'])
    if to_do == 'valid':
        max_seq_length = _get_max_seq_length_from_config_str(config['batches']['max_seq_length_valid'])
    if to_do == 'forward':
        max_seq_length = -1  # -1 means: do not break forward sentences
        fea_only = True

    fea_dict, lab_dict, arch_dict = dict_fea_lab_arch(config, fea_only)
    seq_model = is_sequential_dict(config, arch_dict)
    return to_do, max_seq_length, fea_dict, lab_dict, arch_dict, seq_model
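
# --- Illustrative sketch (not part of the original codebase) ---
# The nested helper above accepts two formats for max_seq_length: a single
# integer, or six comma-separated values that become a chunking/windowing
# dict. The standalone re-implementation below mirrors only its
# string-parsing step, so the behavior can be checked in isolation:
def _demo_parse_max_seq_length(config_str):
    values = [int(e) for e in config_str.split(',')]
    if len(values) == 1:
        return values[0]  # a plain cap on sentence length
    assert len(values) == 6
    keys = ['chunk_size_fea', 'chunk_step_fea', 'chunk_size_lab',
            'chunk_step_lab', 'window_shift', 'window_size']
    return dict(zip(keys, values))

# Expected behavior (hypothetical values):
#   _demo_parse_max_seq_length('1000')                  -> 1000
#   _demo_parse_max_seq_length('200,100,200,100,10,20') -> dict with the six chunk/window fields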
cfg_file = sys.argv[1]
if not os.path.exists(cfg_file):
    sys.stderr.write('ERROR: The config file %s does not exist!\n' % (cfg_file))
    sys.exit(0)
else:
    config = configparser.ConfigParser()
    config.read(cfg_file)

# Reading and parsing optional arguments from the command line (e.g., --optimization,lr=0.002)
[section_args, field_args, value_args] = read_args_command_line(sys.argv, config)

# List all the features, labels, and architectures actually used in the model section
[fea_dict, lab_dict, arch_dict] = dict_fea_lab_arch(config)

# Check automatically if the model is sequential
seq_model = is_sequential_dict(config, arch_dict)

# Setting the torch, random, and numpy seeds
seed = int(config['exp']['seed'])
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Reading config parameters
use_cuda = strtobool(config['exp']['use_cuda'])
save_gpumem = strtobool(config['exp']['save_gpumem'])
multi_gpu = strtobool(config['exp']['multi_gpu'])
is_production = strtobool(config['exp']['production'])
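
# --- Illustrative sketch (assumption: values are made up, not a real cfg) ---
# The startup code above expects at least these [exp] fields; a minimal
# config can be built programmatically to see the parsing in action:
def _demo_exp_config():
    import configparser
    config = configparser.ConfigParser()
    config.read_dict({'exp': {
        'seed': '1234',
        'use_cuda': 'False',
        'save_gpumem': 'False',
        'multi_gpu': 'False',
        'production': 'False',
    }})
    return config

# Command-line overrides follow the --section,field=value pattern from the
# comment above, e.g. (hypothetical cfg path):
#   python run_exp.py cfg/my_exp.cfg --optimization,lr=0.002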
def read_lab_fea(cfg_file, fea_only, shared_list, output_folder):
    # Reading the chunk-specific cfg file (first argument, mandatory file)
    if not os.path.exists(cfg_file):
        sys.stderr.write('ERROR: The config file %s does not exist!\n' % cfg_file)
        sys.exit(0)
    else:
        config = configparser.ConfigParser()
        config.read(cfg_file)

    # Reading some cfg parameters
    to_do = config['exp']['to_do']

    if to_do == 'train':
        max_seq_length = int(config['batches']['max_seq_length_train'])  # *(int(info_file[-13:-10])+1) # increasing over the epochs
    if to_do == 'valid':
        max_seq_length = int(config['batches']['max_seq_length_valid'])
    if to_do == 'forward':
        max_seq_length = -1  # -1 means: do not break forward sentences

    [fea_dict, lab_dict, arch_dict] = dict_fea_lab_arch(config, fea_only)
    [cw_left_max, cw_right_max] = compute_cw_max(fea_dict)

    fea_index = 0
    cnt_fea = 0
    for fea in fea_dict.keys():
        # Reading the features
        fea_scp = fea_dict[fea][1]
        fea_opts = fea_dict[fea][2]
        cw_left = int(fea_dict[fea][3])
        cw_right = int(fea_dict[fea][4])

        cnt_lab = 0

        # Production case: we don't have labels (lab_name = none)
        if fea_only:
            lab_dict.update({'lab_name': 'none'})

        for lab in lab_dict.keys():
            # Production case: we don't have labels (lab_name = none)
            if fea_only:
                lab_folder = None
                lab_opts = None
            else:
                lab_folder = lab_dict[lab][1]
                lab_opts = lab_dict[lab][2]

            [data_name_fea, data_set_fea, data_end_index_fea] = load_chunk(
                fea_scp, fea_opts, lab_folder, lab_opts, cw_left, cw_right, max_seq_length, output_folder, fea_only
            )

            # Making the dimension the same for all the features (compensating for different context windows)
            labs_fea = data_set_fea[cw_left_max - cw_left:data_set_fea.shape[0] - (cw_right_max - cw_right), -1]
            data_set_fea = data_set_fea[cw_left_max - cw_left:data_set_fea.shape[0] - (cw_right_max - cw_right), 0:-1]
            data_end_index_fea = data_end_index_fea - (cw_left_max - cw_left)
            data_end_index_fea[-1] = data_end_index_fea[-1] - (cw_right_max - cw_right)

            if cnt_fea == 0 and cnt_lab == 0:
                data_set = data_set_fea
                labs = labs_fea
                data_end_index = data_end_index_fea
                data_name = data_name_fea

                fea_dict[fea].append(fea_index)
                fea_index = fea_index + data_set_fea.shape[1]
                fea_dict[fea].append(fea_index)
                fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])
            else:
                if cnt_fea == 0:
                    labs = np.column_stack((labs, labs_fea))

                if cnt_lab == 0:
                    data_set = np.column_stack((data_set, data_set_fea))
                    fea_dict[fea].append(fea_index)
                    fea_index = fea_index + data_set_fea.shape[1]
                    fea_dict[fea].append(fea_index)
                    fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])

                # Check that sentence ids are the same for all the features
                if not (data_name == data_name_fea):
                    sys.stderr.write(
                        'ERROR: different sentence ids are detected for the different features. Please check the input feature lists!\n'
                    )
                    sys.exit(0)

                # Check that end indexes are the same for all the features
                if not (data_end_index == data_end_index_fea).all():
                    sys.stderr.write('ERROR: end_index must be the same for all the sentences!\n')
                    sys.exit(0)

            cnt_lab = cnt_lab + 1
        cnt_fea = cnt_fea + 1

    cnt_lab = 0
    if not fea_only:
        for lab in lab_dict.keys():
            lab_dict[lab].append(data_set.shape[1] + cnt_lab)
            cnt_lab = cnt_lab + 1

    data_set = np.column_stack((data_set, labs))

    # Check automatically if the model is sequential
    seq_model = is_sequential_dict(config, arch_dict)

    # Randomize if the model is not sequential
    if not seq_model and to_do != 'forward':
        np.random.shuffle(data_set)

    # Split the dataset into many parts. If the dataset is too big, we can have
    # issues copying it into the shared memory (due to pickle limits)
    # N_split = 10
    # data_set = np.array_split(data_set, N_split)

    # Adding all the elements to the shared list
    shared_list.append(data_name)
    shared_list.append(data_end_index)
    shared_list.append(fea_dict)
    shared_list.append(lab_dict)
    shared_list.append(arch_dict)
    shared_list.append(data_set)
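
# --- Illustrative usage sketch (assumption: not from the original codebase) ---
# read_lab_fea passes its results back through a shared list rather than a
# return value, which suggests it is meant to run in a separate process. The
# hypothetical driver below runs it in a child process and unpacks the shared
# list in the same order the function appends to it (cfg path and output
# folder are placeholders):
def _demo_read_lab_fea(cfg_file, output_folder):
    from multiprocessing import Manager, Process

    shared_list = Manager().list()
    p = Process(target=read_lab_fea, args=(cfg_file, False, shared_list, output_folder))
    p.start()
    p.join()

    # Items arrive in the append order used above
    data_name, data_end_index, fea_dict, lab_dict, arch_dict, data_set = shared_list
    return data_name, data_end_index, fea_dict, lab_dict, arch_dict, data_set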