def _encoding_dataset_options(dataset_name):
    """Build the DatasetOptions that `encode` uses for one dataset.

    Args:
      dataset_name: Value stored under the 'dataset' key (e.g. '2017').

    Returns:
      A DatasetOptions instance built from a freshly created options dict,
      so no nested dict is shared between two option objects.
    """
    return DatasetOptions({
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': dataset_name,
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    })


def _encodings_to_frame(event_column, encodings, name_diag):
    """Combine the event column with one column per encoded dimension.

    Args:
      event_column: Series holding the event column of the encoded dataset.
      encodings: 2-D array indexed as `encodings[:, k]`; its second axis is
        the encoded dimension.
      name_diag: Prefix for the new columns, named '<name_diag>_dim_<k>'.

    Returns:
      A data frame with the event column followed by the encoded columns.
    """
    frame = event_column.to_frame()
    # Use the width of *this* array, so that main and secondary encodings
    # of different sizes are both written out completely.
    for k in range(encodings.shape[1]):
        frame[name_diag + '_dim_' + str(k)] = encodings[:, k]
    return frame


def _write_csv_in_chunks(df, filename, chunk_size=10000):
    """Write `df` to `filename` in slices of `chunk_size` rows.

    The first slice creates the file and writes the header; the remaining
    slices are appended without a header. An empty frame still produces a
    header-only file instead of raising IndexError.

    Args:
      df: Data frame to write.
      filename: Path of the CSV file to create.
      chunk_size: Maximum number of rows written per `to_csv` call.
    """
    # max(..., 1) guarantees one (possibly empty) slice so the header is
    # always written, even when there are no rows.
    for start in range(0, max(df.shape[0], 1), chunk_size):
        is_first = start == 0
        # NOTE(review): `line_terminator` is kept as in the original code;
        # newer pandas releases spell this keyword `lineterminator`.
        df[start:start + chunk_size].to_csv(
            filename,
            index=False,
            line_terminator='\n',
            header=is_first,
            mode='w' if is_first else 'a')


def encode(flags_obj):
    """Encode the diagnosis features of the '2017' dataset and save them.

    Builds an AutoEncoderModel in 'test' mode (training options: dataset
    '20012016', test options: dataset '2017'), calls its `encode()` method
    and writes two CSV files into the data directory of the encoding
    dataset: one with the secondary-diagnosis encodings and one with the
    main-diagnosis encodings. Each file contains the event column plus one
    column per encoded dimension and is named
    'data_<prefix>_<dataset>_<diag name>_<num dims>dim.csv'.

    Args:
      flags_obj: An object containing parsed flag values; passed through to
        AutoEncoderModel.
    """
    dataset_options_training = _encoding_dataset_options('20012016')
    dataset_options_encoding = _encoding_dataset_options('2017')

    feature_columns = FeatureColumnsAutoEncoderNZ(
        dataset_options=dataset_options_encoding)

    dict_dataset_options = {
        'train': dataset_options_training,
        'eval': None,
        'test': dataset_options_encoding
    }

    nn = AutoEncoderModel('test', dict_dataset_options, feature_columns,
                          flags_obj)
    diag_encodings = nn.encode()
    # Index 0 holds the main-diagnosis encodings, index 1 the secondary ones.
    main_diag_encodings = diag_encodings[0]
    sec_diag_encodings = diag_encodings[1]
    print('diag_encodings --> main diag: ' + str(main_diag_encodings.shape))
    print('diag_encodings --> secondary diags: ' +
          str(sec_diag_encodings.shape))

    df_encoding = Dataset(dataset_options_encoding).getDf()
    print('df_encoding: ' + str(df_encoding.shape))

    dir_data = dataset_options_encoding.getDirData()
    dataset = dataset_options_encoding.getDatasetName()
    data_prefix = dataset_options_encoding.getDataPrefix()
    name_event_column = dataset_options_encoding.getEventColumnName()
    name_main_diag = dataset_options_encoding.getNameMainDiag()
    name_sec_diag = dataset_options_encoding.getNameSecDiag()

    df_encoding_sec_diag = _encodings_to_frame(
        df_encoding[name_event_column], sec_diag_encodings, name_sec_diag)
    df_encoding_main_diag = _encodings_to_frame(
        df_encoding[name_event_column], main_diag_encodings, name_main_diag)
    print('df_encoding_main_diag: ' + str(df_encoding_main_diag.shape))
    print('df_encoding_sec_diag: ' + str(df_encoding_sec_diag.shape))

    # Each file name carries the dimensionality of its own encoding.
    filename_stem = dir_data + 'data_' + data_prefix + '_' + dataset + '_'
    filename_sec_diag_encoding = (filename_stem + name_sec_diag + '_' +
                                  str(sec_diag_encodings.shape[1]) +
                                  'dim.csv')
    filename_main_diag_encoding = (filename_stem + name_main_diag + '_' +
                                   str(main_diag_encodings.shape[1]) +
                                   'dim.csv')

    _write_csv_in_chunks(df_encoding_sec_diag, filename_sec_diag_encoding)
    _write_csv_in_chunks(df_encoding_main_diag, filename_main_diag_encoding)
'data_prefix': 'nz', 'dataset': str(year), 'encoding': 'embedding', 'grouping': 'verylightgrouping', 'newfeatures': None, 'featurereduction': { 'method': 'FUSION' } } options_dataset_year = DatasetOptions(dict_options_dataset) dataset_year = Dataset(options_dataset_year) if balanced: df_year = dataset_year.getBalancedSubSet() else: df_year = dataset_year.getDf() #df_year['main_diag'] = df_year['main_diag'].apply(convertDiagToInd) print(df_year.shape) df_all_years = df_all_years.append(df_year) print('df balanced all years: ' + str(df_all_years.shape)) encoding = options_dataset_year.getEncodingScheme() grouping = options_dataset_year.getGroupingName() featureset = options_dataset_year.getFeatureSetStr() filename_data_years = dirData + 'data_nz_' + str(min(years)) + str( max(years) ) + '_' + featureset + '_' + encoding + '_' + grouping + '.csv' df_all_years.to_csv(filename_data_years, line_terminator='\n', index=False)