Example #1

import os
import sys
import configparser

import numpy as np

# dict_fea_lab_arch, compute_cw_max, load_chunk and is_sequential_dict are
# project helpers, assumed importable from the surrounding module.

def read_lab_fea(cfg_file, fea_only, shared_list, output_folder):
    # Reading the chunk-specific cfg file (first argument, mandatory)
    if not os.path.exists(cfg_file):
        sys.stderr.write('ERROR: The config file %s does not exist!\n' %
                         cfg_file)
        sys.exit(1)

    config = configparser.ConfigParser()
    config.read(cfg_file)

    # Reading some cfg parameters
    to_do = config['exp']['to_do']

    if to_do == 'train':
        max_seq_length = int(
            config['batches']['max_seq_length_train']
        )  # *(int(info_file[-13:-10])+1) # increasing over the epochs

    if to_do == 'valid':
        max_seq_length = int(config['batches']['max_seq_length_valid'])

    if to_do == 'forward':
        max_seq_length = -1  # -1 means: do not break forward sentences

    [fea_dict, lab_dict, arch_dict] = dict_fea_lab_arch(config, fea_only)
    [cw_left_max, cw_right_max] = compute_cw_max(fea_dict)
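    # cw_left_max / cw_right_max are the widest left / right context windows
    # across all features; each feature stream is trimmed to this common
    # span below, so that all streams can be column-stacked frame by frame.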

    fea_index = 0
    cnt_fea = 0
    for fea in fea_dict.keys():

        # reading the features
        fea_scp = fea_dict[fea][1]
        fea_opts = fea_dict[fea][2]
        cw_left = int(fea_dict[fea][3])
        cw_right = int(fea_dict[fea][4])

        cnt_lab = 0

        # Production case, we don't have labels (lab_name = none)
        if fea_only:
            lab_dict.update({'lab_name': 'none'})
        for lab in lab_dict.keys():
            if fea_only:
                lab_folder = None
                lab_opts = None
            else:
                lab_folder = lab_dict[lab][1]
                lab_opts = lab_dict[lab][2]

            [data_name_fea, data_set_fea, data_end_index_fea
             ] = load_chunk(fea_scp, fea_opts, lab_folder, lab_opts, cw_left,
                            cw_right, max_seq_length, output_folder, fea_only)

            # Trim all features to the same time dimension (compensating for different context windows)
            labs_fea = data_set_fea[cw_left_max -
                                    cw_left:data_set_fea.shape[0] -
                                    (cw_right_max - cw_right), -1]
            data_set_fea = data_set_fea[cw_left_max -
                                        cw_left:data_set_fea.shape[0] -
                                        (cw_right_max - cw_right), 0:-1]
            data_end_index_fea = data_end_index_fea - (cw_left_max - cw_left)
            data_end_index_fea[-1] = data_end_index_fea[-1] - (cw_right_max -
                                                               cw_right)

            if cnt_fea == 0 and cnt_lab == 0:
                data_set = data_set_fea
                labs = labs_fea
                data_end_index = data_end_index_fea
                data_name = data_name_fea

                fea_dict[fea].append(fea_index)
                fea_index = fea_index + data_set_fea.shape[1]
                fea_dict[fea].append(fea_index)
                fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])

            else:
                if cnt_fea == 0:
                    labs = np.column_stack((labs, labs_fea))

                if cnt_lab == 0:
                    data_set = np.column_stack((data_set, data_set_fea))
                    fea_dict[fea].append(fea_index)
                    fea_index = fea_index + data_set_fea.shape[1]
                    fea_dict[fea].append(fea_index)
                    fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])

                # Checks if lab_names are the same for all the features
                if not (data_name == data_name_fea):
                    sys.stderr.write(
                        'ERROR: different sentence ids are detected for the different features. Please check the input feature lists!\n'
                    )
                    sys.exit(1)

                # Checks if end indexes are the same for all the features
                if not (data_end_index == data_end_index_fea).all():
                    sys.stderr.write(
                        'ERROR: end_index must be the same for all the sentences!\n'
                    )
                    sys.exit(1)

            cnt_lab = cnt_lab + 1

        cnt_fea = cnt_fea + 1

    cnt_lab = 0
    if not fea_only:
        for lab in lab_dict.keys():
            lab_dict[lab].append(data_set.shape[1] + cnt_lab)
            cnt_lab = cnt_lab + 1

    data_set = np.column_stack((data_set, labs))

    # check automatically if the model is sequential
    seq_model = is_sequential_dict(config, arch_dict)

    # Randomize if the model is not sequential
    if not seq_model and to_do != 'forward':
        np.random.shuffle(data_set)

    # Split the dataset into several parts. If the dataset is too big, copying it
    # into shared memory can fail (due to pickle limits)
    # N_split=10
    # data_set=np.array_split(data_set, N_split)

    # Adding all the elements in the shared list
    shared_list.append(data_name)
    shared_list.append(data_end_index)
    shared_list.append(fea_dict)
    shared_list.append(lab_dict)
    shared_list.append(arch_dict)
    shared_list.append(data_set)
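
For context, a minimal usage sketch follows. read_lab_fea communicates its six results by appending them to a shared list, so it is typically launched in a separate process; the cfg and output paths below are hypothetical placeholders.

from multiprocessing import Manager, Process

cfg_file = 'exp/chunk1.cfg'    # hypothetical chunk-specific cfg file
output_folder = 'exp/output'   # hypothetical output folder

shared_list = Manager().list()  # proxy list visible across processes
p = Process(target=read_lab_fea,
            args=(cfg_file, False, shared_list, output_folder))  # fea_only=False: read labels too
p.start()
p.join()

# read_lab_fea appends exactly six items, in this order:
data_name, data_end_index, fea_dict, lab_dict, arch_dict, data_set = list(shared_list)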
Example #2

if to_do == 'train':
    max_seq_length = int(
        config['batches']['max_seq_length_train']
    )  # *(int(info_file[-13:-10])+1) # increasing over the epochs
    batch_size = int(config['batches']['batch_size_train'])

if to_do == 'valid':
    max_seq_length = int(config['batches']['max_seq_length_valid'])
    batch_size = int(config['batches']['batch_size_valid'])

if to_do == 'forward':
    max_seq_length = -1  # -1 means: do not break forward sentences
    batch_size = 1

start_time = time.time()

# Compute the maximum context window in the feature dict
[cw_left_max, cw_right_max] = compute_cw_max(fea_dict)

# Reading all the features and labels
[data_name, data_set,
 data_end_index] = read_lab_fea(fea_dict, lab_dict, cw_left_max, cw_right_max,
                                max_seq_length, is_production)

# Randomize if the model is not sequential
if not seq_model and to_do != 'forward':
    np.random.shuffle(data_set)

elapsed_time_reading = time.time() - start_time

# converting numpy tensors into pytorch tensors and put them on GPUs if specified
start_time = time.time()
if not save_gpumem and use_cuda:
    data_set = torch.from_numpy(data_set).float().cuda()
else:
    # keep the data on the CPU when saving GPU memory (or when CUDA is unavailable)
    data_set = torch.from_numpy(data_set).float()
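
The frame-level shuffle above is skipped for sequential models because it would scramble frames across sentence boundaries. A minimal sketch of the sentence-level alternative (shuffle_sentences is a hypothetical helper; it relies only on data_end_index holding the cumulative end index of each sentence):

import numpy as np

def shuffle_sentences(data_set, data_end_index):
    # Each sentence starts where the previous one ends (the first starts at 0).
    starts = np.concatenate(([0], data_end_index[:-1]))
    order = np.random.permutation(len(data_end_index))
    chunks = [data_set[starts[i]:data_end_index[i]] for i in order]
    # Recompute the cumulative end indices for the new sentence order.
    new_end_index = np.cumsum([c.shape[0] for c in chunks])
    return np.concatenate(chunks), new_end_index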
Example #3

import sys

import numpy as np

# load_dataset, context_window and compute_cw_max are project helpers,
# assumed importable from the surrounding module.

def _read_features_and_labels(fea_dict, lab_dict, max_seq_length, fea_only, output_folder):
     def _get_fea_config_from_dict(fea_dict_entr):
         fea_scp = fea_dict_entr[1]
         fea_opts = fea_dict_entr[2]
         cw_left = int(fea_dict_entr[3])
         cw_right = int(fea_dict_entr[4])
         return fea_scp, fea_opts, cw_left, cw_right
     def _get_lab_config_from_dict(lab_dict_entr, fea_only):
         if fea_only:
             lab_folder = None 
             lab_opts = None
         else:
             lab_folder = lab_dict_entr[1]
             lab_opts = lab_dict_entr[2]
         return lab_folder, lab_opts
     def _compensate_for_different_context_windows(data_set_fea, data_set_lab, cw_left_max, cw_left, cw_right_max, cw_right, data_end_index_fea, data_end_index_lab):
         data_set_lab = np.take(data_set_lab, range(cw_left_max-cw_left,data_set_lab.shape[0]-(cw_right_max-cw_right)), axis=0, mode='clip')
         data_set_fea = np.take(data_set_fea, range(cw_left_max-cw_left,data_set_fea.shape[0]-(cw_right_max-cw_right)), axis=0, mode='clip')
         data_end_index_fea = data_end_index_fea - (cw_left_max - cw_left)
         data_end_index_lab = data_end_index_lab - (cw_left_max - cw_left)
         data_end_index_fea[-1] = data_end_index_fea[-1] - (cw_right_max - cw_right)
         data_end_index_lab[-1] = data_end_index_lab[-1] - (cw_right_max - cw_right)
         return data_set_lab, data_set_fea, data_end_index_fea, data_end_index_lab
     def _update_data(data_set, labs, fea_dict, fea, fea_index, data_set_fea, labs_fea, cnt_fea, cnt_lab):
         if cnt_fea==0 and cnt_lab==0:
             data_set=data_set_fea
             labs=labs_fea
             fea_dict[fea].append(fea_index)
             fea_index=fea_index+data_set_fea.shape[1]
             fea_dict[fea].append(fea_index)
             fea_dict[fea].append(fea_dict[fea][6]-fea_dict[fea][5])
          elif cnt_fea==0 and (not cnt_lab==0):
              labs=np.column_stack((labs,labs_fea))
          elif (not cnt_fea==0) and cnt_lab==0:
             data_set=np.column_stack((data_set,data_set_fea))
             fea_dict[fea].append(fea_index)
             fea_index=fea_index+data_set_fea.shape[1]
             fea_dict[fea].append(fea_index)
             fea_dict[fea].append(fea_dict[fea][6]-fea_dict[fea][5])
         return data_set, labs, fea_dict, fea_index
     def _check_consistency(data_name, data_name_fea, data_end_index_fea_ini, data_end_index_fea, data_end_index_lab_ini, data_end_index_lab):
          if not (data_name == data_name_fea):
              sys.stderr.write('ERROR: different sentence ids are detected for the different features. Please check the input feature lists!\n')
              sys.exit(1)
          if not (data_end_index_fea_ini == data_end_index_fea).all():
              sys.stderr.write('ERROR: end_index must be the same for all the sentences!\n')
              sys.exit(1)
          if not (data_end_index_lab_ini == data_end_index_lab).all():
              sys.stderr.write('ERROR: end_index must be the same for all the sentences!\n')
              sys.exit(1)
     def _update_lab_dict(lab_dict, data_set):
         cnt_lab=0
         for lab in lab_dict.keys():
             lab_dict[lab].append(data_set.shape[1]+cnt_lab)
             cnt_lab=cnt_lab+1
         return lab_dict
     def _load_chunk_refac01(fea_scp,fea_opts,lab_folder,lab_opts,left,right,max_sequence_length, output_folder,fea_only=False):
         [data_name,data_set,data_lab,end_index_fea,end_index_lab]=load_dataset(fea_scp,fea_opts,lab_folder,lab_opts,left,right, max_sequence_length, output_folder, fea_only)
         # TODO: this function will currently only work well if no context window is given or fea and lab have the same time dimensionality
         # Context window
         if left!=0 or right!=0:
             data_set=context_window(data_set,left,right)
         end_index_fea = end_index_fea - left
         end_index_lab = end_index_lab - left
         end_index_fea[-1] = end_index_fea[-1] - right
         end_index_lab[-1] = end_index_lab[-1] - right
         # mean and variance normalization
         data_set=(data_set-np.mean(data_set,axis=0))/np.std(data_set,axis=0)
         # Label processing
         data_lab=data_lab-data_lab.min()
         if right>0:
             data_lab=data_lab[left:-right]
         else:
             data_lab=data_lab[left:]   
         if len(data_set.shape) == 1:
             data_set = np.expand_dims(data_set, -1)
         return [data_name, data_set, data_lab, end_index_fea, end_index_lab]
     
     cw_left_max, cw_right_max = compute_cw_max(fea_dict)
     fea_index=0
     cnt_fea=0
     data_name = None 
     data_end_index_fea_ini = None 
     data_end_index_lab_ini = None 
     data_set = None
     labs = None
     for fea in fea_dict.keys():
         fea_scp, fea_opts, cw_left, cw_right = _get_fea_config_from_dict(fea_dict[fea])
         cnt_lab=0
         if fea_only:
             lab_dict.update({'lab_name':'none'})
         for lab in lab_dict.keys():
             lab_folder, lab_opts = _get_lab_config_from_dict(lab_dict[lab], fea_only)
             data_name_fea, data_set_fea, data_set_lab, data_end_index_fea, data_end_index_lab = _load_chunk_refac01(fea_scp, fea_opts, lab_folder, lab_opts, cw_left, cw_right, max_seq_length, output_folder, fea_only)
             labs_fea, data_set_fea, data_end_index_fea, data_end_index_lab = _compensate_for_different_context_windows(data_set_fea, data_set_lab, cw_left_max, cw_left, cw_right_max, cw_right, data_end_index_fea, data_end_index_lab)
             if cnt_fea == 0 and cnt_lab == 0:
                 data_end_index_fea_ini = data_end_index_fea
                 data_end_index_lab_ini = data_end_index_lab
                 data_name = data_name_fea
             data_set, labs, fea_dict, fea_index = _update_data(data_set, labs, fea_dict, fea, fea_index, data_set_fea, labs_fea, cnt_fea, cnt_lab)
             _check_consistency(data_name, data_name_fea, data_end_index_fea_ini, data_end_index_fea, data_end_index_lab_ini, data_end_index_lab)
             cnt_lab=cnt_lab+1
         cnt_fea=cnt_fea+1
     if not fea_only:
         lab_dict = _update_lab_dict(lab_dict, data_set)
     return data_name, data_end_index_fea_ini, data_end_index_lab_ini, fea_dict, lab_dict, data_set, labs
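
Both versions record, per feature, the start column, end column and width (entries [5], [6] and [7] of each fea_dict list) and, per label, its column in the combined matrix. A minimal sketch of how those indices can be used to recover the individual streams (split_streams is a hypothetical helper; data_set is assumed to be the combined features-plus-labels matrix built in Example #1):

import numpy as np

def split_streams(data_set, fea_dict, lab_dict):
    # Feature columns: entries [5] and [6] are the start/end columns recorded
    # while the chunk was column-stacked.
    features = {fea: data_set[:, entry[5]:entry[6]]
                for fea, entry in fea_dict.items()}
    # Label columns: the last element appended to each lab_dict entry is the
    # label's column index in the combined matrix.
    labels = {lab: data_set[:, int(entry[-1])].astype(np.int64)
              for lab, entry in lab_dict.items()}
    return features, labels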