示例#1
0
 def _read_from_config(config, fea_only):
     """Read the sequence-length settings for the current mode from config.

     Arguments:
         config: configparser-style mapping with 'exp' and 'batches' sections.
         fea_only: if True, labels are ignored; forced to True in forward mode.

     Returns:
         (to_do, max_seq_length, fea_dict, lab_dict, arch_dict, seq_model)
         where max_seq_length is an int, a dict of chunk/window settings, or
         -1 in forward mode.

     Raises:
         ValueError: if config['exp']['to_do'] is not train/valid/forward.
     """
     def _get_max_seq_length_from_config_str(config_str):
         # A single integer means one global max length; six comma-separated
         # integers describe a chunked/windowed configuration.
         values = [int(e) for e in config_str.split(',')]
         if len(values) == 1:
             return values[0]
         assert len(values) == 6
         keys = ('chunk_size_fea', 'chunk_step_fea', 'chunk_size_lab',
                 'chunk_step_lab', 'window_shift', 'window_size')
         return dict(zip(keys, values))

     to_do = config['exp']['to_do']
     if to_do == 'train':
         max_seq_length = _get_max_seq_length_from_config_str(
             config['batches']['max_seq_length_train'])
     elif to_do == 'valid':
         max_seq_length = _get_max_seq_length_from_config_str(
             config['batches']['max_seq_length_valid'])
     elif to_do == 'forward':
         max_seq_length = -1  # so that forward sentences are not broken up
         fea_only = True
     else:
         # The original left max_seq_length unbound here, causing a NameError
         # at the return statement; fail fast with a clear message instead.
         raise ValueError("Unknown to_do mode: %s" % to_do)
     fea_dict, lab_dict, arch_dict = dict_fea_lab_arch(config, fea_only)
     seq_model = is_sequential_dict(config, arch_dict)
     return to_do, max_seq_length, fea_dict, lab_dict, arch_dict, seq_model
示例#2
0
# Reading the chunk-specific cfg file (first argument - mandatory file)
cfg_file = sys.argv[1]

if not os.path.exists(cfg_file):
    sys.stderr.write('ERROR: The config file %s does not exist!\n' %
                     (cfg_file))
    # Exit with a non-zero status so shells/callers can detect the failure
    # (the original exited with 0, signalling success on an error path).
    sys.exit(1)
else:
    config = configparser.ConfigParser()
    config.read(cfg_file)

# Reading and parsing optional arguments from command line (e.g.,--optimization,lr=0.002)
[section_args, field_args,
 value_args] = read_args_command_line(sys.argv, config)

# list all the features, labels, and architecture actually used in the model section
[fea_dict, lab_dict, arch_dict] = dict_fea_lab_arch(config)

# check automatically if the model is sequential
seq_model = is_sequential_dict(config, arch_dict)

# Setting the seed for torch, random, and numpy to make runs reproducible
seed = int(config['exp']['seed'])
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Reading config parameters
use_cuda = strtobool(config['exp']['use_cuda'])
save_gpumem = strtobool(config['exp']['save_gpumem'])
multi_gpu = strtobool(config['exp']['multi_gpu'])
is_production = strtobool(config['exp']['production'])
示例#3
0
def read_lab_fea(cfg_file, fea_only, shared_list, output_folder):
    """Load the features (and labels) described by a chunk-specific cfg file.

    Reads every feature stream listed in the config, trims all streams to a
    common context window, stacks them column-wise together with the label
    columns, shuffles the rows when the model is not sequential, and appends
    the results to ``shared_list`` (intended for use from a worker process):

        [data_name, data_end_index, fea_dict, lab_dict, arch_dict, data_set]

    Arguments:
        cfg_file: path to the chunk-specific config file (mandatory).
        fea_only: if True, no labels are loaded (production/forward mode).
        shared_list: multiprocessing-shared list the results are appended to.
        output_folder: folder passed through to load_chunk for its outputs.
    """
    # Reading chunk-specific cfg file (first argument - mandatory file)
    if not os.path.exists(cfg_file):
        sys.stderr.write('ERROR: The config file %s does not exist!\n' %
                         cfg_file)
        # Non-zero exit status: this is an error path (original used exit(0)).
        sys.exit(1)
    else:
        config = configparser.ConfigParser()
        config.read(cfg_file)

    # Reading some cfg parameters
    to_do = config['exp']['to_do']

    if to_do == 'train':
        max_seq_length = int(
            config['batches']['max_seq_length_train']
        )  # *(int(info_file[-13:-10])+1) # increasing over the epochs

    if to_do == 'valid':
        max_seq_length = int(config['batches']['max_seq_length_valid'])

    if to_do == 'forward':
        max_seq_length = -1  # so that forward sentences are not broken up

    [fea_dict, lab_dict, arch_dict] = dict_fea_lab_arch(config, fea_only)
    [cw_left_max, cw_right_max] = compute_cw_max(fea_dict)

    fea_index = 0
    cnt_fea = 0
    for fea in fea_dict.keys():

        # reading the features
        fea_scp = fea_dict[fea][1]
        fea_opts = fea_dict[fea][2]
        cw_left = int(fea_dict[fea][3])
        cw_right = int(fea_dict[fea][4])

        cnt_lab = 0

        # Production case, we don't have labels (lab_name = none)
        if fea_only:
            lab_dict.update({'lab_name': 'none'})
        for lab in lab_dict.keys():
            # Production case, we don't have labels (lab_name = none)
            if fea_only:
                lab_folder = None
                lab_opts = None
            else:
                lab_folder = lab_dict[lab][1]
                lab_opts = lab_dict[lab][2]

            [data_name_fea, data_set_fea, data_end_index_fea
             ] = load_chunk(fea_scp, fea_opts, lab_folder, lab_opts, cw_left,
                            cw_right, max_seq_length, output_folder, fea_only)

            # Making the same dimension for all the features (compensating for
            # different context windows): drop the frames that features with
            # wider windows could not produce.
            labs_fea = data_set_fea[cw_left_max -
                                    cw_left:data_set_fea.shape[0] -
                                    (cw_right_max - cw_right), -1]
            data_set_fea = data_set_fea[cw_left_max -
                                        cw_left:data_set_fea.shape[0] -
                                        (cw_right_max - cw_right), 0:-1]
            data_end_index_fea = data_end_index_fea - (cw_left_max - cw_left)
            data_end_index_fea[-1] = data_end_index_fea[-1] - (cw_right_max -
                                                               cw_right)

            if cnt_fea == 0 and cnt_lab == 0:
                # First feature/label pair initialises the accumulators.
                data_set = data_set_fea
                labs = labs_fea
                data_end_index = data_end_index_fea  # duplicate assignment in original removed
                data_name = data_name_fea

                # Record [start, end, dim] column indices of this feature.
                fea_dict[fea].append(fea_index)
                fea_index = fea_index + data_set_fea.shape[1]
                fea_dict[fea].append(fea_index)
                fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])

            else:
                if cnt_fea == 0:
                    labs = np.column_stack((labs, labs_fea))

                if cnt_lab == 0:
                    data_set = np.column_stack((data_set, data_set_fea))
                    fea_dict[fea].append(fea_index)
                    fea_index = fea_index + data_set_fea.shape[1]
                    fea_dict[fea].append(fea_index)
                    fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])

                # Checks if sentence ids are the same for all the features
                if not (data_name == data_name_fea):
                    sys.stderr.write(
                        'ERROR: different sentence ids are detected for the different features. Please check the input feature lists.\n'
                    )
                    sys.exit(1)

                # Checks if end indexes are the same for all the features
                if not (data_end_index == data_end_index_fea).all():
                    sys.stderr.write(
                        'ERROR: end_index must be the same for all the sentences.\n'
                    )
                    sys.exit(1)

            cnt_lab = cnt_lab + 1

        cnt_fea = cnt_fea + 1

    # Record, for each label, the column index where it will live after the
    # labels are stacked to the right of the features.
    cnt_lab = 0
    if not fea_only:
        for lab in lab_dict.keys():
            lab_dict[lab].append(data_set.shape[1] + cnt_lab)
            cnt_lab = cnt_lab + 1

    data_set = np.column_stack((data_set, labs))

    # check automatically if the model is sequential
    seq_model = is_sequential_dict(config, arch_dict)

    # Randomize if the model is not sequential
    if not seq_model and to_do != 'forward':
        np.random.shuffle(data_set)

    # Split dataset in many parts. If the dataset is too big, copying it into
    # the shared memory can fail (due to pickle limits).
    # N_split = 10
    # data_set = np.array_split(data_set, N_split)

    # Adding all the elements to the shared list
    shared_list.append(data_name)
    shared_list.append(data_end_index)
    shared_list.append(fea_dict)
    shared_list.append(lab_dict)
    shared_list.append(arch_dict)
    shared_list.append(data_set)