Example #1
def load_features():
    features_df_file_name = 'features_aggregate_single.csv'
    outcomes_df_file_name = 'chart_outcomes.csv'

    features_df = pd.read_csv(
        os.path.join(features_directory, features_df_file_name))
    outcomes_df = pd.read_csv(
        os.path.join(features_directory, outcomes_df_file_name))

    features_df = features_df[:num_datapoints]
    outcome_variable_name = 'all_one_trace_type'
    outcomes = ['line', 'scatter', 'bar']
    outcomes_df_subset = outcomes_df[outcomes_df[outcome_variable_name].isin(
        outcomes)][['fid', outcome_variable_name]]

    final_df = pd.merge(features_df, outcomes_df_subset, on='fid', how='inner')
    final_df = final_df.drop(['fid'], axis=1, inplace=False, errors='ignore')
    final_df = final_df.sample(frac=1.0)  # assign the result; DataFrame.sample is not in-place

    last_index = final_df.columns.get_loc(outcome_variable_name)
    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]
    y = pd.get_dummies(y).values.argmax(1)

    res = RandomOverSampler(random_state=RANDOM_STATE)
    X, y = res.fit_sample(X, y)
    # shuffle X and y in unison, and then return
    return util.unison_shuffle(X, y)
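The examples on this page are excerpted functions from the VizML feature-loading code, so they rely on module-level imports and constants defined elsewhere in the project (util, paper_tasks, and the logger class are project modules). A minimal sketch of that assumed context is shown below; the concrete values of features_directory, num_datapoints, and RANDOM_STATE are placeholders for illustration. Note also that fit_sample was renamed to fit_resample in newer imbalanced-learn releases, so the resampling calls in these examples may need updating depending on the installed version.

import os
import pickle
from os.path import join

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample

# Assumed module-level configuration (illustrative values only)
features_directory = 'features'   # directory holding the CSV feature/outcome files
num_datapoints = 100000           # cap on rows read from each CSV
RANDOM_STATE = 42                 # shared seed for the resampling calls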
Example #2
def load_features():
    features_array = []
    outcomes_array = []
    features_sizes = [0]

    # As we load each by_field/features_{}.csv, we record the shape of each
    # pandas DataFrame returned and combine the dfs into one giant matrix.

    # Each df shares a common set of aggregate features but differs in the
    # number of field-level features, so we merge the dfs on the aggregate
    # features and impute the missing features with the average value from
    # other examples.

    # After imputing is done, we re-separate our giant df by number of columns,
    # so that all examples with the same number of columns in the original
    # data are saved to the same file.

    for num_fields in range(1, max_fields + 1):
        features_df_file_name = 'by_field/features_{}.csv'.format(num_fields)
        outcomes_df_file_name = 'by_field/outcomes_{}.csv'.format(num_fields)
        features_df = pd.read_csv(os.path.join(features_directory,
                                               features_df_file_name),
                                  nrows=num_datapoints)
        outcomes_df = pd.read_csv(
            os.path.join(features_directory, outcomes_df_file_name))
        features_sizes.append(features_df.shape[0])
        features_array.append(features_df)
        outcomes_array.append(outcomes_df)

    # here we combine features_array and outcomes_array, and delete the original arrays to save memory
    # any missing features are first filled in with N/A
    features_df = pd.concat(features_array, axis=0, ignore_index=True)
    outcomes_df = pd.concat(outcomes_array, axis=0, ignore_index=True)
    del features_array, outcomes_array

    # drop the fid, impute any N/A entries, and re-add fid
    features_id_column = features_df[['fid']]
    features_df = features_df.drop(['fid'],
                                   axis=1,
                                   inplace=False,
                                   errors='ignore')
    features_df = process_features_df(features_df)
    features_df = pd.concat([features_df, features_id_column], axis=1)

    # add feature representing num_fields for each training example
    # and concat it with our features_df
    num_fields_array = []
    for num_fields in range(1, max_fields + 1):
        np_array = np.zeros((features_sizes[num_fields], ), dtype=np.int64)
        np_array.fill(num_fields)
        num_fields_array.append(np_array)
    num_fields_array = np.concatenate(num_fields_array)

    assert num_fields_array.shape[0] == features_df.shape[0]
    num_fields_array = pd.DataFrame(
        {"special_original_num_fields": num_fields_array})
    features_df = pd.concat([features_df, num_fields_array], axis=1)

    # process outcomes
    outcome_variable_name = 'all_one_trace_type'
    outcomes = ['line', 'scatter', 'bar']
    outcomes_df_subset = outcomes_df[outcomes_df[outcome_variable_name].isin(
        outcomes)][['fid', outcome_variable_name]]

    # Join features and outcomes
    final_df = pd.merge(features_df, outcomes_df_subset, on='fid', how='inner')
    final_df = final_df.drop(['fid'], axis=1, inplace=False, errors='ignore')
    del features_df, outcomes_df_subset

    # select the examples with each num_fields value,
    # drop our special_original_num_fields column,
    # and save the matrices to disk
    for num_fields in range(1, max_fields + 1):
        X_with_field = final_df[final_df['special_original_num_fields'] ==
                                num_fields]
        X_with_field = X_with_field.drop(['special_original_num_fields'],
                                         axis=1,
                                         inplace=False,
                                         errors='ignore')
        X = X_with_field.iloc[:, :-1]
        y = X_with_field.iloc[:, -1]
        y = pd.get_dummies(y).values.argmax(1)

        res = RandomOverSampler(random_state=RANDOM_STATE)
        X, y = res.fit_sample(X, y)
        X, y = util.unison_shuffle(X, y)
        util.save_matrices_to_disk(X, y, [0.1, 0.1], saves_directory,
                                   'field_' + str(num_fields), num_datapoints)
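Example #2 calls a helper named process_features_df that is not shown on this page. Based on the comments above (missing field-level features are imputed with the average value from other examples), a mean-imputation sketch could look like the following; this is an assumption about the helper's behavior, not the project's actual implementation.

def process_features_df(features_df):
    # Hypothetical sketch: fill missing numeric entries with the column mean,
    # matching the imputation strategy described in the comments above.
    numeric_cols = features_df.select_dtypes(include=[np.number]).columns
    features_df[numeric_cols] = features_df[numeric_cols].fillna(
        features_df[numeric_cols].mean())
    return features_df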
Example #3
def load_features(task):

    log_file = log_dir + 'loading_task_' + str(task['pref_id']) + '.txt'
    load_logger = logger(log_file, task)

    dataset_prediction_task_to_outcomes = {
        'all_one_trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'pie'],
        },
        'has_single_src': {
            'two': [True, False]
        },
        'num_x_axes': {
            'numeric': [i for i in range(5)]
        },
        'num_y_axes': {
            'numeric': [i for i in range(5)]
        }
    }

    field_prediction_task_to_outcomes = {
        'trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'heatmap'],
        },
        'is_xsrc': {
            'two': [True, False]
        },
        'is_ysrc': {
            'two': [True, False]
        },
        'is_x_or_y': {
            'two': ['x', 'y']
        },
        'is_single_src': {
            'two': [True, False]
        }
    }

    if task['dataset'] == 'dataset':
        task['features_df_file_name'] = 'features_aggregate_single_pairwise.csv'
        task['outcomes_df_file_name'] = 'chart_outcomes.csv'
        task['id_field'] = 'fid'
        prediction_task_to_outcomes = dataset_prediction_task_to_outcomes
    else:
        assert task['dataset'] == 'field'
        task['features_df_file_name'] = 'field_level_features.csv'
        task['outcomes_df_file_name'] = 'field_level_outcomes.csv'
        task['id_field'] = 'field_id'
        prediction_task_to_outcomes = field_prediction_task_to_outcomes


    features_df = pd.read_csv(
        join(features_directory, task['features_df_file_name']),
        nrows=num_datapoints)
    outcomes_df = pd.read_csv(
        join(features_directory, task['outcomes_df_file_name']),
        nrows=num_datapoints)
    feature_names_by_type = pickle.load(
        open(
            join(features_directory, feature_set_lookup_file_name),
            'rb'))

    # print(features_df)
    # print('Initial Features:', features_df.shape)
    # print('Initial Outcomes:', outcomes_df.shape)
    # load_logger.log_dict(feature_names_by_type)
    # load_logger.log('\n')
    # load_logger.log(features_df)
    load_logger.log('Initial Features: ' + str(features_df.shape))
    load_logger.log('Initial Outcomes: ' + str(outcomes_df.shape))

    if task['dataset'] == 'field':
        def is_x_or_y(is_xsrc, is_ysrc):
            if is_xsrc and pd.isnull(is_ysrc): return 'x'
            if is_ysrc and pd.isnull(is_xsrc): return 'y'
            else:                              return None
        outcomes_df['is_x_or_y'] = np.vectorize(is_x_or_y)(outcomes_df['is_xsrc'], outcomes_df['is_ysrc'])
        outcomes_df['is_single_src'] = outcomes_df['is_single_xsrc'] | outcomes_df['is_single_ysrc']

    outcomes_df_subset = format_outcomes_df(
        load_logger, outcomes_df,
        task['outcome_variable_name'],
        prediction_task_to_outcomes[task['outcome_variable_name']][task['prediction_task']],
        id_field=task['id_field'])

    final_df = join_features_and_outcomes(features_df, outcomes_df_subset, on=task['id_field'])
    last_index = final_df.columns.get_loc(task['outcome_variable_name'])

    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]

    # print('Intermediate Outcomes:', y.shape)
    # value_counts = y.value_counts()
    # print('Value counts:')
    # print(value_counts)
    load_logger.log('Final DF Shape: ' + str(final_df.shape))
    load_logger.log('Last Index: ' + str(last_index))

    load_logger.log('Intermediate Outcomes: ' + str(y.shape))
    load_logger.log('Value counts: \n' + str(y.value_counts()))

    # delete variables to save memory!
    del final_df, outcomes_df

    task_types = ['dimensions', 'types', 'values', 'names']
    for task_name in task_types:
        names = get_feature_set_names_by_type(
            feature_names_by_type,
            task_type=task['dataset'],
            feature_set=task_name)
        indices = [X.columns.get_loc(c) for c in names if c in X.columns]
        # print('task is ' + task_name + ' and indices are:')
        #print('names are {}'.format(names) )
        # print(indices)
        # load_logger.log('task is ' + task_name + ' and indices are: ')
        # load_logger.log(indices)


    y = pd.get_dummies(y).values.argmax(1)

    if task['sampling_mode'] == 'over':
        res = RandomOverSampler(random_state=RANDOM_STATE)
        X, y = res.fit_sample(X, y)
    elif task['sampling_mode'] == 'under':
        res = RandomUnderSampler(random_state=RANDOM_STATE)
        X, y = res.fit_sample(X, y)
    elif isinstance(task['sampling_mode'], int):
        X_resampled_arrays, y_resampled_arrays = [], []
        for outcome in np.unique(y):
            outcome_mask = (y == outcome)
            X_resampled_outcome, y_resampled_outcome = resample(
                X[outcome_mask],
                y[outcome_mask],
                n_samples=task['sampling_mode'],
                random_state=RANDOM_STATE
            )
            X_resampled_arrays.append(X_resampled_outcome)
            y_resampled_arrays.append(y_resampled_outcome)

        X = np.concatenate(X_resampled_arrays).astype(np.float64)
        y = np.concatenate(y_resampled_arrays)
    else:
        X, y = X.values.astype(np.float64), y

    # print('Final Features:', X.shape)
    # print('Final Outcomes:', y.shape)
    load_logger.log('Final Features:' + str(X.shape))
    load_logger.log('Final Outcomes:' + str(y.shape))
    unique, counts = np.unique(y, return_counts=True)
    load_logger.log('Value counts after sampling:')
    load_logger.log_dict(dict(zip(unique, counts)))
    load_logger.log('\n')

    del load_logger
    return util.unison_shuffle(X, y)
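Example #3 is driven by a task dictionary whose keys appear throughout the function (pref_id, dataset, outcome_variable_name, prediction_task, sampling_mode). A hedged usage sketch follows, with illustrative values drawn from the outcome tables defined at the top of the function; it also assumes the module-level globals (log_dir, features_directory, feature_set_lookup_file_name) and the project's logger class are in place.

# Illustrative task only; the keys follow the dictionaries defined inside load_features
task = {
    'pref_id': 0,
    'dataset': 'field',                    # 'dataset' or 'field'
    'outcome_variable_name': 'trace_type',
    'prediction_task': 'three',            # -> ['line', 'scatter', 'bar']
    'sampling_mode': 'over',               # 'over', 'under', an integer, or anything else for no resampling
}
X, y = load_features(task)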
Example #4
File: multi_eval.py Project: Bachery/vizml
def load_features_and_save_id(task, logger, use_seperation=False, split='train'):
	
	# settings for tasks
	dataset_prediction_task_to_outcomes = {
		'all_one_trace_type': {
			'two': ['line', 'bar'],
			'three': ['line', 'scatter', 'bar'],
			'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'pie'],
		},
		'has_single_src': {
			'two': [True, False]
		},
		'num_x_axes': {
			'numeric': [i for i in range(5)]
		},
		'num_y_axes': {
			'numeric': [i for i in range(5)]
		}
	}
	field_prediction_task_to_outcomes = {
		'trace_type': {
			'two': ['line', 'bar'],
			'three': ['line', 'scatter', 'bar'],
			'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'heatmap'],
		},
		'is_xsrc': {
			'two': [True, False]
		},
		'is_ysrc': {
			'two': [True, False]
		},
		'is_x_or_y': {
			'two': ['x', 'y']
		},
		'is_single_src': {
			'two': [True, False]
		}
	}
	if task['dataset'] == 'dataset':
		task['features_df_file_name'] = 'features_aggregate_single_pairwise.csv'
		task['outcomes_df_file_name'] = 'chart_outcomes.csv'
		task['id_field'] = 'fid'
		prediction_task_to_outcomes = dataset_prediction_task_to_outcomes
	else:
		assert task['dataset'] == 'field'
		task['features_df_file_name'] = 'field_level_features.csv'
		task['outcomes_df_file_name'] = 'field_level_outcomes.csv'
		task['id_field'] = 'field_id'
		prediction_task_to_outcomes = field_prediction_task_to_outcomes
	
	# read original feature and outcome files
	features_df = pd.read_csv(
		join(features_directory, task['features_df_file_name']),
		nrows=num_datapoints)
	outcomes_df = pd.read_csv(
		join(features_directory, task['outcomes_df_file_name']),
		nrows=num_datapoints)
	# feature_names_by_type = pickle.load(
	# 	open(join(features_directory, feature_set_lookup_file_name), 'rb'))
	logger.log('Initial Features: ' + str(features_df.shape))
	logger.log('Initial Outcomes: ' + str(outcomes_df.shape))
	
	# use seperation by datasets
	if use_seperation:
		# dataset = 'vizml_1k'
		dataset = 'vizml_full'
		sep_folder = '../../VisGen/data/{}/'.format(dataset)
		if task['dataset'] == 'dataset':
			if	 split == 'train':	indexes = pd.read_csv(sep_folder+'all_indexes_train.csv')
			elif split == 'val':	indexes = pd.read_csv(sep_folder+'all_indexes_val.csv')
			elif split == 'test':	indexes = pd.read_csv(sep_folder+'all_indexes_test.csv')
			features_df = features_df.loc[indexes.dataset_f_index]
			outcomes_df = outcomes_df.loc[indexes.dataset_o_index]
		elif task['dataset'] == 'field':
			if	 split == 'train':	indexes = pd.read_csv(sep_folder+'all_indexes_field_train.csv')
			elif split == 'val':	indexes = pd.read_csv(sep_folder+'all_indexes_field_val.csv')
			elif split == 'test':	indexes = pd.read_csv(sep_folder+'all_indexes_field_test.csv')
			features_df = features_df.loc[indexes.field_feature_index]
			outcomes_df = outcomes_df.loc[indexes.field_outcome_index]
		logger.log('split features for ' + split + ': ' + str(features_df.shape))
		logger.log('split outcomes for ' + split + ': ' + str(outcomes_df.shape))

	# deal with outcomes
	if task['dataset'] == 'field':
		def is_x_or_y(is_xsrc, is_ysrc):
			if is_xsrc and pd.isnull(is_ysrc): return 'x'
			if is_ysrc and pd.isnull(is_xsrc): return 'y'
			else:                              return None
		outcomes_df['is_x_or_y'] = np.vectorize(is_x_or_y)(outcomes_df['is_xsrc'], outcomes_df['is_ysrc'])
		outcomes_df['is_single_src'] = outcomes_df['is_single_xsrc'] | outcomes_df['is_single_ysrc']

	outcomes_df_subset = paper_tasks.format_outcomes_df(
		logger, outcomes_df,
		task['outcome_variable_name'],
		prediction_task_to_outcomes[task['outcome_variable_name']][task['prediction_task']],
		id_field=task['id_field'])

	# join features and outcomes by the fid/field_id
	final_df = join_data_and_keep_id(features_df, outcomes_df_subset, on=task['id_field'])
	last_index = final_df.columns.get_loc(task['outcome_variable_name'])
	X = final_df.iloc[:, :last_index]
	y = final_df.iloc[:, last_index]
	logger.log('Final DF Shape: ' + str(final_df.shape))
	logger.log('Last Index: ' + str(last_index))
	logger.log('Intermediate Features: ' + str(X.shape))
	logger.log('Index of fid in X: ' + str(X.columns.get_loc('fid')))
	if task['dataset']=='field': 
		logger.log('Index of field in X: ' + str(X.columns.get_loc('field_id')))
	logger.log('Intermediate Outcomes: ' + str(y.shape))
	logger.log('Value counts: \n' + str(y.value_counts()))
	del final_df, outcomes_df	# delete variables to save memory

	# formatting outputs
	y = pd.get_dummies(y).values.argmax(1)

	# sampling data
	if split == 'test': task['sampling_mode'] = None
	if task['sampling_mode'] == 'over':
		res = RandomOverSampler(random_state=RANDOM_STATE)
		X, y = res.fit_sample(X, y)
	elif task['sampling_mode'] == 'under':
		res = RandomUnderSampler(random_state=RANDOM_STATE)
		X, y = res.fit_sample(X, y)
	elif isinstance(task['sampling_mode'], int):
		X_resampled_arrays, y_resampled_arrays = [], []
		for outcome in np.unique(y):
			outcome_mask = (y == outcome)
			X_resampled_outcome, y_resampled_outcome = resample(
				X[outcome_mask],
				y[outcome_mask],
				n_samples=task['sampling_mode'],
				random_state=RANDOM_STATE)
			X_resampled_arrays.append(X_resampled_outcome)
			y_resampled_arrays.append(y_resampled_outcome)
		X = np.concatenate(X_resampled_arrays)	#.astype(np.float64)
		y = np.concatenate(y_resampled_arrays)
	else:
		# X, y = X.values.astype(np.float64), y
		pass

	logger.log('Final Features:' + str(X.shape))
	logger.log('Final Outcomes:' + str(y.shape))
	unique, counts = np.unique(y, return_counts=True)
	logger.log('Value counts after sampling:')
	logger.log_dict(dict(zip(unique, counts)))
	logger.log('\n')

	if split != 'test':
		X, y = util.unison_shuffle(X, y)
	return X, y
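A possible call pattern for Example #4 is sketched below. It assumes a project logger object with log and log_dict methods, the separation index CSVs generated under ../../VisGen/data/, and a task dictionary shaped like the one in Example #3; this is a sketch of intended usage, not code from the repository.

# Illustrative usage of load_features_and_save_id for a train/test split
task = {
    'dataset': 'dataset',
    'outcome_variable_name': 'all_one_trace_type',
    'prediction_task': 'three',
    'sampling_mode': 'over',
}
X_train, y_train = load_features_and_save_id(task, logger, use_seperation=True, split='train')
X_test, y_test = load_features_and_save_id(task, logger, use_seperation=True, split='test')  # the test split is never resampled or shuffled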