# Imports for the loaders below. Module-level configuration and helpers
# (features_directory, num_datapoints, RANDOM_STATE, max_fields,
# saves_directory, log_dir, feature_set_lookup_file_name, util, logger,
# process_features_df, format_outcomes_df, join_features_and_outcomes,
# join_data_and_keep_id, get_feature_set_names_by_type, paper_tasks) are
# assumed to be defined elsewhere in this module.
import os
import pickle
from os.path import join

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample


def load_features():
    features_df_file_name = 'features_aggregate_single.csv'
    outcomes_df_file_name = 'chart_outcomes.csv'
    features_df = pd.read_csv(
        os.path.join(features_directory, features_df_file_name))
    outcomes_df = pd.read_csv(
        os.path.join(features_directory, outcomes_df_file_name))
    features_df = features_df[:num_datapoints]

    # Keep only the outcomes we predict over, identified by fid.
    outcome_variable_name = 'all_one_trace_type'
    outcomes = ['line', 'scatter', 'bar']
    outcomes_df_subset = outcomes_df[outcomes_df[outcome_variable_name].isin(
        outcomes)][['fid', outcome_variable_name]]

    final_df = pd.merge(features_df, outcomes_df_subset, on='fid',
                        how='inner')
    final_df = final_df.drop(['fid'], axis=1, inplace=False, errors='ignore')

    last_index = final_df.columns.get_loc(outcome_variable_name)
    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]
    y = pd.get_dummies(y).values.argmax(1)

    # Balance the classes by oversampling; fit_resample is the current
    # imbalanced-learn spelling of the older fit_sample.
    res = RandomOverSampler(random_state=RANDOM_STATE)
    X, y = res.fit_resample(X, y)

    # Shuffle X and y in unison, and then return.
    return util.unison_shuffle(X, y)
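# A minimal usage sketch (hypothetical, not part of the original module). It
# assumes the module-level config used above (features_directory,
# num_datapoints, RANDOM_STATE) is set, and that this load_features() is the
# one in scope (later variants in this file reuse the name).
def _demo_load_features():
    X, y = load_features()
    # X is the oversampled, shuffled feature matrix; y the integer labels.
    print('features:', X.shape, 'labels:', y.shape)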
def load_features():
    features_array = []
    outcomes_array = []
    features_sizes = [0]
    # As we load each by_field/features_{} file, we record the shape of each
    # pandas DataFrame returned and combine the DataFrames into one giant
    # matrix. Each DataFrame has a common set of aggregate features and
    # differs in the number of field features, so we merge the DataFrames on
    # the aggregate features and impute the missing features with the average
    # value from the other examples. After imputing is done, we re-separate
    # the giant DataFrame by number of columns, so that all examples with the
    # same number of columns in the original data are saved to the same file.
    for num_fields in range(1, max_fields + 1):
        features_df_file_name = 'by_field/features_{}.csv'.format(num_fields)
        outcomes_df_file_name = 'by_field/outcomes_{}.csv'.format(num_fields)
        features_df = pd.read_csv(
            os.path.join(features_directory, features_df_file_name),
            nrows=num_datapoints)
        outcomes_df = pd.read_csv(
            os.path.join(features_directory, outcomes_df_file_name))
        features_sizes.append(features_df.shape[0])
        features_array.append(features_df)
        outcomes_array.append(outcomes_df)

    # Combine features_array and outcomes_array, then delete the originals to
    # save memory; any missing features are first filled in with N/A.
    features_df = pd.concat(features_array, axis=0, ignore_index=True)
    outcomes_df = pd.concat(outcomes_array, axis=0, ignore_index=True)
    del features_array, outcomes_array

    # Drop the fid, impute any N/A entries, and re-add the fid.
    features_id_column = features_df[['fid']]
    features_df = features_df.drop(['fid'], axis=1, inplace=False,
                                   errors='ignore')
    features_df = process_features_df(features_df)
    features_df = pd.concat([features_df, features_id_column], axis=1)

    # Add a feature recording num_fields for each training example and
    # concatenate it with features_df.
    num_fields_array = []
    for num_fields in range(1, max_fields + 1):
        num_fields_array.append(
            np.full((features_sizes[num_fields],), num_fields,
                    dtype=np.int64))
    num_fields_array = np.concatenate(num_fields_array)
    assert num_fields_array.shape[0] == features_df.shape[0]
    num_fields_array = pd.DataFrame(
        {'special_original_num_fields': num_fields_array})
    features_df = pd.concat([features_df, num_fields_array], axis=1)

    # Process outcomes.
    outcome_variable_name = 'all_one_trace_type'
    outcomes = ['line', 'scatter', 'bar']
    outcomes_df_subset = outcomes_df[outcomes_df[outcome_variable_name].isin(
        outcomes)][['fid', outcome_variable_name]]

    # Join features and outcomes.
    final_df = pd.merge(features_df, outcomes_df_subset, on='fid',
                        how='inner')
    final_df = final_df.drop(['fid'], axis=1, inplace=False, errors='ignore')
    del features_df, outcomes_df_subset

    # Select the examples with each original number of fields, drop the
    # special_original_num_fields column, and save the matrices to disk.
    for num_fields in range(1, max_fields + 1):
        X_with_field = final_df[
            final_df['special_original_num_fields'] == num_fields]
        X_with_field = X_with_field.drop(['special_original_num_fields'],
                                         axis=1, inplace=False,
                                         errors='ignore')
        X = X_with_field.iloc[:, :-1]
        y = X_with_field.iloc[:, -1]
        y = pd.get_dummies(y).values.argmax(1)
        res = RandomOverSampler(random_state=RANDOM_STATE)
        X, y = res.fit_resample(X, y)
        X, y = util.unison_shuffle(X, y)
        util.save_matrices_to_disk(
            X, y, [0.1, 0.1], saves_directory,
            'field_' + str(num_fields), num_datapoints)
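# process_features_df() is defined elsewhere in this module; the comments in
# the by-field loader above describe its job as mean-imputing missing values.
# Below is a minimal sketch of that behavior under that assumption, named
# _mean_impute_sketch so it does not shadow the real helper.
def _mean_impute_sketch(df):
    # Hypothetical stand-in: fill N/A entries in numeric columns with the
    # column mean, leaving non-numeric columns untouched.
    df = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    return df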
def load_features(task):
    log_file = log_dir + 'loading_task_' + str(task['pref_id']) + '.txt'
    load_logger = logger(log_file, task)

    dataset_prediction_task_to_outcomes = {
        'all_one_trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'pie'],
        },
        'has_single_src': {
            'two': [True, False]
        },
        'num_x_axes': {
            'numeric': [i for i in range(5)]
        },
        'num_y_axes': {
            'numeric': [i for i in range(5)]
        }
    }

    field_prediction_task_to_outcomes = {
        'trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'heatmap'],
        },
        'is_xsrc': {
            'two': [True, False]
        },
        'is_ysrc': {
            'two': [True, False]
        },
        'is_x_or_y': {
            'two': ['x', 'y']
        },
        'is_single_src': {
            'two': [True, False]
        }
    }

    if task['dataset'] == 'dataset':
        task['features_df_file_name'] = \
            'features_aggregate_single_pairwise.csv'
        task['outcomes_df_file_name'] = 'chart_outcomes.csv'
        task['id_field'] = 'fid'
        prediction_task_to_outcomes = dataset_prediction_task_to_outcomes
    else:
        assert task['dataset'] == 'field'
        task['features_df_file_name'] = 'field_level_features.csv'
        task['outcomes_df_file_name'] = 'field_level_outcomes.csv'
        task['id_field'] = 'field_id'
        prediction_task_to_outcomes = field_prediction_task_to_outcomes

    features_df = pd.read_csv(
        join(features_directory, task['features_df_file_name']),
        nrows=num_datapoints)
    outcomes_df = pd.read_csv(
        join(features_directory, task['outcomes_df_file_name']),
        nrows=num_datapoints)
    with open(join(features_directory, feature_set_lookup_file_name),
              'rb') as f:
        feature_names_by_type = pickle.load(f)

    load_logger.log('Initial Features: ' + str(features_df.shape))
    load_logger.log('Initial Outcomes: ' + str(outcomes_df.shape))

    # Field-level outcomes need two derived columns before formatting.
    if task['dataset'] == 'field':
        def is_x_or_y(is_xsrc, is_ysrc):
            if is_xsrc and pd.isnull(is_ysrc):
                return 'x'
            if is_ysrc and pd.isnull(is_xsrc):
                return 'y'
            return None

        outcomes_df['is_x_or_y'] = np.vectorize(is_x_or_y)(
            outcomes_df['is_xsrc'], outcomes_df['is_ysrc'])
        outcomes_df['is_single_src'] = (outcomes_df['is_single_xsrc']
                                        | outcomes_df['is_single_ysrc'])

    outcomes_df_subset = format_outcomes_df(
        load_logger, outcomes_df, task['outcome_variable_name'],
        prediction_task_to_outcomes[task['outcome_variable_name']][
            task['prediction_task']],
        id_field=task['id_field'])

    final_df = join_features_and_outcomes(features_df, outcomes_df_subset,
                                          on=task['id_field'])
    last_index = final_df.columns.get_loc(task['outcome_variable_name'])
    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]

    load_logger.log('Final DF Shape: ' + str(final_df.shape))
    load_logger.log('Last Index: ' + str(last_index))
    load_logger.log('Intermediate Outcomes: ' + str(y.shape))
    load_logger.log('Value counts: \n' + str(y.value_counts()))

    # Delete variables to save memory.
    del final_df, outcomes_df

    # Column indices for each feature set; currently only computed for
    # inspection (the original logging of these values is disabled).
    task_types = ['dimensions', 'types', 'values', 'names']
    for task_name in task_types:
        names = get_feature_set_names_by_type(
            feature_names_by_type, task_type=task['dataset'],
            feature_set=task_name)
        indices = [X.columns.get_loc(c) for c in names if c in X.columns]

    y = pd.get_dummies(y).values.argmax(1)

    if task['sampling_mode'] == 'over':
        res = RandomOverSampler(random_state=RANDOM_STATE)
        X, y = res.fit_resample(X, y)
    elif task['sampling_mode'] == 'under':
        res = RandomUnderSampler(random_state=RANDOM_STATE)
        X, y = res.fit_resample(X, y)
    elif isinstance(task['sampling_mode'], int):
        # Resample every class to exactly task['sampling_mode'] examples.
        X_resampled_arrays, y_resampled_arrays = [], []
        for outcome in np.unique(y):
            outcome_mask = (y == outcome)
            X_resampled_outcome, y_resampled_outcome = resample(
                X[outcome_mask], y[outcome_mask],
                n_samples=task['sampling_mode'],
                random_state=RANDOM_STATE)
            X_resampled_arrays.append(X_resampled_outcome)
            y_resampled_arrays.append(y_resampled_outcome)
        X = np.concatenate(X_resampled_arrays).astype(np.float64)
        y = np.concatenate(y_resampled_arrays)
    else:
        X = X.values.astype(np.float64)

    load_logger.log('Final Features: ' + str(X.shape))
    load_logger.log('Final Outcomes: ' + str(y.shape))
    unique, counts = np.unique(y, return_counts=True)
    load_logger.log('Value counts after sampling:')
    load_logger.log_dict(dict(zip(unique, counts)))
    load_logger.log('\n')
    del load_logger

    return util.unison_shuffle(X, y)
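# An illustrative task dict for load_features(task) above; the keys mirror
# the ones the function reads. The values here are examples, not canonical
# defaults from the original code.
EXAMPLE_TASK = {
    'pref_id': 0,                                   # used to name the log file
    'dataset': 'dataset',                           # 'dataset' or 'field'
    'outcome_variable_name': 'all_one_trace_type',  # key of the outcomes map
    'prediction_task': 'three',                     # 'two', 'three', 'six', or 'numeric'
    'sampling_mode': 'over',                        # 'over', 'under', an int, or None
}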
def load_features_and_save_id(task, logger, use_seperation=False,
                              split='train'):
    # Settings for tasks.
    dataset_prediction_task_to_outcomes = {
        'all_one_trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'pie'],
        },
        'has_single_src': {
            'two': [True, False]
        },
        'num_x_axes': {
            'numeric': [i for i in range(5)]
        },
        'num_y_axes': {
            'numeric': [i for i in range(5)]
        }
    }

    field_prediction_task_to_outcomes = {
        'trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'heatmap'],
        },
        'is_xsrc': {
            'two': [True, False]
        },
        'is_ysrc': {
            'two': [True, False]
        },
        'is_x_or_y': {
            'two': ['x', 'y']
        },
        'is_single_src': {
            'two': [True, False]
        }
    }

    if task['dataset'] == 'dataset':
        task['features_df_file_name'] = \
            'features_aggregate_single_pairwise.csv'
        task['outcomes_df_file_name'] = 'chart_outcomes.csv'
        task['id_field'] = 'fid'
        prediction_task_to_outcomes = dataset_prediction_task_to_outcomes
    else:
        assert task['dataset'] == 'field'
        task['features_df_file_name'] = 'field_level_features.csv'
        task['outcomes_df_file_name'] = 'field_level_outcomes.csv'
        task['id_field'] = 'field_id'
        prediction_task_to_outcomes = field_prediction_task_to_outcomes

    # Read the original feature and outcome files.
    features_df = pd.read_csv(
        join(features_directory, task['features_df_file_name']),
        nrows=num_datapoints)
    outcomes_df = pd.read_csv(
        join(features_directory, task['outcomes_df_file_name']),
        nrows=num_datapoints)

    logger.log('Initial Features: ' + str(features_df.shape))
    logger.log('Initial Outcomes: ' + str(outcomes_df.shape))

    # Use separation by datasets: select the precomputed train/val/test rows.
    if use_seperation:
        dataset = 'vizml_full'  # alternative: 'vizml_1k'
        sep_folder = '../../VisGen/data/{}/'.format(dataset)
        if task['dataset'] == 'dataset':
            if split == 'train':
                indexes = pd.read_csv(sep_folder + 'all_indexes_train.csv')
            elif split == 'val':
                indexes = pd.read_csv(sep_folder + 'all_indexes_val.csv')
            elif split == 'test':
                indexes = pd.read_csv(sep_folder + 'all_indexes_test.csv')
            features_df = features_df.loc[indexes.dataset_f_index]
            outcomes_df = outcomes_df.loc[indexes.dataset_o_index]
        elif task['dataset'] == 'field':
            if split == 'train':
                indexes = pd.read_csv(
                    sep_folder + 'all_indexes_field_train.csv')
            elif split == 'val':
                indexes = pd.read_csv(
                    sep_folder + 'all_indexes_field_val.csv')
            elif split == 'test':
                indexes = pd.read_csv(
                    sep_folder + 'all_indexes_field_test.csv')
            features_df = features_df.loc[indexes.field_feature_index]
            outcomes_df = outcomes_df.loc[indexes.field_outcome_index]
        logger.log('Split features for ' + split + ': '
                   + str(features_df.shape))
        logger.log('Split outcomes for ' + split + ': '
                   + str(outcomes_df.shape))

    # Deal with outcomes.
    if task['dataset'] == 'field':
        def is_x_or_y(is_xsrc, is_ysrc):
            if is_xsrc and pd.isnull(is_ysrc):
                return 'x'
            if is_ysrc and pd.isnull(is_xsrc):
                return 'y'
            return None

        outcomes_df['is_x_or_y'] = np.vectorize(is_x_or_y)(
            outcomes_df['is_xsrc'], outcomes_df['is_ysrc'])
        outcomes_df['is_single_src'] = (outcomes_df['is_single_xsrc']
                                        | outcomes_df['is_single_ysrc'])

    outcomes_df_subset = paper_tasks.format_outcomes_df(
        logger, outcomes_df, task['outcome_variable_name'],
        prediction_task_to_outcomes[task['outcome_variable_name']][
            task['prediction_task']],
        id_field=task['id_field'])

    # Join features and outcomes by the fid/field_id, keeping the id column.
    final_df = join_data_and_keep_id(features_df, outcomes_df_subset,
                                     on=task['id_field'])
    last_index = final_df.columns.get_loc(task['outcome_variable_name'])
    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]

    logger.log('Final DF Shape: ' + str(final_df.shape))
    logger.log('Last Index: ' + str(last_index))
    logger.log('Intermediate Features: ' + str(X.shape))
    logger.log('Index of fid in X: ' + str(X.columns.get_loc('fid')))
    if task['dataset'] == 'field':
        logger.log('Index of field in X: '
                   + str(X.columns.get_loc('field_id')))
    logger.log('Intermediate Outcomes: ' + str(y.shape))
    logger.log('Value counts: \n' + str(y.value_counts()))
    del final_df, outcomes_df  # delete variables to save memory

    # Format outputs.
    y = pd.get_dummies(y).values.argmax(1)

    # Sample the data; the held-out test split is never resampled.
    if split == 'test':
        task['sampling_mode'] = None
    if task['sampling_mode'] == 'over':
        res = RandomOverSampler(random_state=RANDOM_STATE)
        X, y = res.fit_resample(X, y)
    elif task['sampling_mode'] == 'under':
        res = RandomUnderSampler(random_state=RANDOM_STATE)
        X, y = res.fit_resample(X, y)
    elif isinstance(task['sampling_mode'], int):
        X_resampled_arrays, y_resampled_arrays = [], []
        for outcome in np.unique(y):
            outcome_mask = (y == outcome)
            X_resampled_outcome, y_resampled_outcome = resample(
                X[outcome_mask], y[outcome_mask],
                n_samples=task['sampling_mode'],
                random_state=RANDOM_STATE)
            X_resampled_arrays.append(X_resampled_outcome)
            y_resampled_arrays.append(y_resampled_outcome)
        # No float cast here: the id columns kept in X are non-numeric.
        X = np.concatenate(X_resampled_arrays)
        y = np.concatenate(y_resampled_arrays)
    else:
        # Keep X as a DataFrame so the id columns survive unchanged.
        pass

    logger.log('Final Features: ' + str(X.shape))
    logger.log('Final Outcomes: ' + str(y.shape))
    unique, counts = np.unique(y, return_counts=True)
    logger.log('Value counts after sampling:')
    logger.log_dict(dict(zip(unique, counts)))
    logger.log('\n')

    # Preserve row order for the test split; shuffle everything else.
    if split != 'test':
        X, y = util.unison_shuffle(X, y)
    return X, y
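# A usage sketch (hypothetical): load the train and test splits of one task
# with the precomputed dataset separation. The test split keeps its original
# distribution and row order for evaluation, as the function above enforces.
def _demo_load_splits(task, logger):
    X_train, y_train = load_features_and_save_id(
        task, logger, use_seperation=True, split='train')
    X_test, y_test = load_features_and_save_id(
        task, logger, use_seperation=True, split='test')
    return (X_train, y_train), (X_test, y_test)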