def train_test_split_none(*arrays, **kwargs):
    """Split arrays into train/test parts, tolerating ``None`` placeholders.

    Extends ``train_test_split`` so that ``None`` entries in ``arrays`` are
    left out of the split and reported back as a ``None, None`` pair at the
    matching position of the result. It also supports a deterministic split
    by group name.

    Parameters
    ----------
    *arrays : array-like or None
        Inputs to split. ``None`` entries are not handed to the splitter.
    **kwargs
        Keyword arguments forwarded to ``train_test_split``, except:

        shuffle : optional
            The string ``"None"`` is converted to ``None`` before forwarding.
            A missing ``shuffle`` key is left missing.
        group_names : str, optional
            Comma-separated group names. When non-blank, samples whose entry
            in ``kwargs["labels"]`` matches one of the names form the test
            part and every other sample forms the train part;
            ``train_test_split`` is not called in that case.
        labels : array-like
            Per-sample group labels, read when ``group_names`` is used.

    Returns
    -------
    list
        ``[a0_train, a0_test, a1_train, a1_test, ...]`` in the order of
        ``arrays``, with ``None, None`` standing in for each ``None`` input.

    Raises
    ------
    KeyError
        If ``group_names`` is non-blank but ``labels`` is not in ``kwargs``.
    """
    none_positions = [idx for idx, arr in enumerate(arrays) if arr is None]
    new_arrays = [arr for arr in arrays if arr is not None]

    # The caller may pass the literal string "None" (e.g. from a serialized
    # parameter file); normalize it. `.get` keeps a missing key from raising.
    if kwargs.get("shuffle") == "None":
        kwargs["shuffle"] = None

    group_names = kwargs.pop("group_names", None)

    if group_names is not None and group_names.strip():
        wanted = [name.strip() for name in group_names.split(",")]
        new_arrays = indexable(*new_arrays)
        groups = kwargs["labels"]
        n_samples = new_arrays[0].shape[0]
        index_arr = np.arange(n_samples)

        # Build the membership mask once and use it for both partitions.
        in_test = np.isin(groups, wanted)
        test = index_arr[in_test]
        train = index_arr[~in_test]

        rval = list(
            chain.from_iterable(
                (safe_indexing(a, train), safe_indexing(a, test))
                for a in new_arrays
            )
        )
    else:
        # list() guarantees the slice insertion below works even if the
        # splitter hands back a tuple.
        rval = list(train_test_split(*new_arrays, **kwargs))

    # Re-insert a (None, None) pair for every skipped input. Positions are
    # ascending, so by the time `pos` is handled every earlier input already
    # occupies two slots and `pos * 2` is the correct insertion point.
    # An empty slice [k:k] inserts; the former `[pos * 2: 2]` overwrote the
    # first train/test pair whenever the first input was None.
    for pos in none_positions:
        insert_at = pos * 2
        rval[insert_at:insert_at] = [None, None]

    return rval
def main(inputs, infile_array, outfile_train, outfile_test,
         infile_labels=None, infile_groups=None):
    """Split a tabular dataset into train and test files.

    Parameters
    ----------
    inputs : str
        Path to a JSON file holding the tool parameters. The keys read here
        are ``header0`` and ``mode_selection`` (``selected_mode`` and, for
        the plain split, ``options`` with its ``shuffle_selection`` entry).
    infile_array : str
        Path to the tab-separated dataset to split.
    outfile_train : str
        Path the train part is written to (tab-separated, no index column).
    outfile_test : str
        Path the test part is written to (tab-separated, no index column).
    infile_labels : str, optional
        Path to a tab-separated file containing labels. In
        ``train_test_split`` mode one column of it is passed on as
        ``labels``; otherwise the path is forwarded to the cv splitter.
    infile_groups : str, optional
        Path to a dataset containing groups; only forwarded to the cv
        splitter.

    Returns
    -------
    None
        Results are written to ``outfile_train`` and ``outfile_test`` and
        the shapes are printed to stdout.
    """
    # NOTE(review): `main` appears to be defined a second time later in this
    # file; if so, that later definition is the one callers actually get.
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as fh:
        params = json.load(fh)

    input_header = params['header0']
    array = pd.read_csv(
        infile_array,
        sep='\t',
        header='infer' if input_header else None,
        parse_dates=True,
    )

    mode = params['mode_selection']
    if mode['selected_mode'] != 'train_test_split':
        # cv splitter
        train, test = _get_single_cv_split(
            params,
            array,
            infile_labels=infile_labels,
            infile_groups=infile_groups,
        )
    else:
        # train test split
        split_kwargs = mode['options']
        shuffle_cfg = split_kwargs.pop('shuffle_selection')
        split_kwargs['shuffle'] = shuffle_cfg['shuffle']

        if infile_labels:
            label_header = 'infer' if shuffle_cfg['header1'] else None
            # The configured column number is 1-based; iloc wants 0-based.
            col_index = shuffle_cfg['col'][0] - 1
            label_frame = pd.read_csv(
                infile_labels,
                sep='\t',
                header=label_header,
                parse_dates=True,
            )
            split_kwargs['labels'] = label_frame.iloc[:, col_index].values

        train, test = train_test_split(array, **split_kwargs)

    print("Input shape: %s" % repr(array.shape))
    print("Train shape: %s" % repr(train.shape))
    print("Test shape: %s" % repr(test.shape))

    # Both outputs are written with the same settings as the input was read.
    csv_opts = dict(sep='\t', header=input_header, index=False)
    train.to_csv(outfile_train, **csv_opts)
    test.to_csv(outfile_test, **csv_opts)
def main(
    inputs,
    infile_array,
    outfile_train,
    outfile_test,
    infile_labels=None,
    infile_groups=None,
):
    """Read a dataset, split it into train/test parts and write both out.

    Parameters
    ----------
    inputs : str
        Path to a JSON file with the tool parameters. This function reads
        ``header0`` and ``mode_selection`` from it.
    infile_array : str
        Path to the tab-separated dataset to split.
    outfile_train : str
        Destination path for the train part.
    outfile_test : str
        Destination path for the test part.
    infile_labels : str, optional
        Path to a tab-separated labels file. When the selected mode is
        ``train_test_split`` the configured column is extracted and passed
        to the splitter as ``labels``; in any other mode the path itself is
        forwarded to ``_get_single_cv_split``.
    infile_groups : str, optional
        Path to a groups dataset; forwarded to ``_get_single_cv_split`` only.

    Returns
    -------
    None
        The two parts are written as tab-separated files without an index
        column, and the input/train/test shapes are printed.
    """
    # NOTE(review): an earlier definition of `main` with the same logic
    # appears to precede this one in the file; this later one takes effect.
    warnings.simplefilter("ignore")

    with open(inputs, "r") as handle:
        params = json.load(handle)

    input_header = params["header0"]
    array_header = "infer" if input_header else None
    array = pd.read_csv(
        infile_array, sep="\t", header=array_header, parse_dates=True
    )

    use_plain_split = (
        params["mode_selection"]["selected_mode"] == "train_test_split"
    )

    if use_plain_split:
        # train test split
        options = params["mode_selection"]["options"]
        shuffle_selection = options.pop("shuffle_selection")
        options["shuffle"] = shuffle_selection["shuffle"]

        if infile_labels:
            if shuffle_selection["header1"]:
                labels_header = "infer"
            else:
                labels_header = None
            # "col" holds 1-based column numbers; only the first is used.
            zero_based_col = shuffle_selection["col"][0] - 1
            labels_df = pd.read_csv(
                infile_labels, sep="\t", header=labels_header, parse_dates=True
            )
            options["labels"] = labels_df.iloc[:, zero_based_col].values

        train, test = train_test_split(array, **options)
    else:
        # cv splitter
        train, test = _get_single_cv_split(
            params,
            array,
            infile_labels=infile_labels,
            infile_groups=infile_groups,
        )

    print("Input shape: %s" % repr(array.shape))
    print("Train shape: %s" % repr(train.shape))
    print("Test shape: %s" % repr(test.shape))

    # Train is written before test, each with the input's header setting.
    for part, destination in ((train, outfile_train), (test, outfile_test)):
        part.to_csv(destination, sep="\t", header=input_header, index=False)