# Assumed module-level imports (not shown in this excerpt):
#   import pandas as _pd; import matplotlib.pyplot as _plt; import pickle as _pickle
#   from collections import Counter
#   from sklearn.model_selection import train_test_split as _train_test_split
#   from sklearn.ensemble import RandomForestClassifier as _RandomForestClassifier
#   from imblearn.combine import SMOTETomek as _SMOTETomek
# LABELS and _mostrar_resultados are module-level helpers defined elsewhere.
def TrainingModel(DataSet, OutputPathModel, Verbose=True):
    if isinstance(DataSet, _pd.DataFrame):
        df = DataSet
    else:
        df = _pd.read_csv(DataSet)
    df.index = df['Id Unico']
    df.drop(['Id Unico'], inplace=True, axis=1)
    # Binarize the target: negative values mean "did not drop out" (0), the rest 1.
    df['Abandono'] = df['Ab'].apply(lambda x: 0 if x < 0 else 1)
    df.drop(['Ab'], inplace=True, axis=1)
    if Verbose:
        print("Number of Rows: {}\nNumber Of Columns: {}\n".format(
            df.shape[0], df.shape[1]))
        print("Number of Students that Did Not Drop Out (0) and Dropped Out (1)")
        count_classes = df['Abandono'].value_counts(sort=True)
        print(count_classes)
        count_classes.plot(kind='bar', rot=0)
        _plt.xticks(range(2), LABELS)
        _plt.title("Frequency by observation number")
        _plt.xlabel("Abandono")
        _plt.ylabel("Number of Observations")
    y = df['Abandono']
    X = df.drop('Abandono', axis=1)
    seed = 46

    # Split into training and test sets.
    X_train, X_test, y_train, y_test = _train_test_split(
        X, y, train_size=0.8, random_state=seed)

    # Rebalance the training set with combined over- and under-sampling.
    # (The deprecated `ratio` argument was dropped; `sampling_strategy`
    # replaces it in current imbalanced-learn, and `fit_sample` is now
    # `fit_resample`.)
    os_us = _SMOTETomek(sampling_strategy='auto', random_state=seed)
    X_train_res, y_train_res = os_us.fit_resample(X_train, y_train)
    if Verbose:
        print("\nDistribution before resampling {}".format(Counter(y_train)))
        print("Distribution after resampling {}".format(Counter(y_train_res)))

    num_trees = 50
    rfc = _RandomForestClassifier(
        n_estimators=num_trees,
        #class_weight="balanced",
        random_state=seed,
        max_features=4)
    # Train on the resampled data. (The original called
    # rfc.fit(X_train, y_train), which silently discarded the resampling above.)
    rfc.fit(X_train_res, y_train_res)
    if Verbose:
        pred_y = rfc.predict(X_test)
        print()
        _mostrar_resultados(y_test, pred_y)
    print("Saving the model in the following path: {}".format(OutputPathModel))
    with open(OutputPathModel, 'wb') as model_file:
        _pickle.dump(rfc, model_file)
    if Verbose:
        _plt.show()
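# Minimal usage sketch for TrainingModel. 'students.csv' and 'dropout_model.pkl'
# are hypothetical paths: the CSV is assumed to carry the 'Id Unico' id column,
# the raw 'Ab' column, and numeric feature columns.
TrainingModel('students.csv', 'dropout_model.pkl', Verbose=False)
# The persisted classifier can be restored later with:
#   import pickle
#   with open('dropout_model.pkl', 'rb') as f:
#       rfc = pickle.load(f)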
# Assumes module-level `import numpy as _np`, `import matplotlib.pyplot as _plt`,
# and `from sklearn.model_selection import train_test_split as _train_test_split`.
def test_set1(size=50, plot=False):
    """Generate a 4-class synthetic 2-D dataset of Gaussian blobs and split it.

    Returns X_train, y_train, X_test, y_test (note the order).
    """
    # Four class means spread along x, all sharing one covariance matrix.
    mu = [(1, 100), (2, 100), (3, 100), (4, 100)]
    std = _np.array([[0.05, 0], [0, 50]])
    X = []
    y = []
    for i in range(len(mu)):
        for _ in range(size):
            X.append(_np.random.multivariate_normal(mean=mu[i], cov=std))
            y.append(i)
    X = _np.array(X)
    y = _np.array(y)
    if plot:
        col = ['red', 'blue', 'green', 'orange']
        _plt.figure()
        # Left panel uses equal axis scaling, right panel the default scaling.
        _plt.subplot(1, 2, 1)
        for i in range(X.shape[0]):
            _plt.plot(X[i, 0], X[i, 1], '.', color=col[y[i]])
        _plt.axis('equal')
        _plt.subplot(1, 2, 2)
        for i in range(X.shape[0]):
            _plt.plot(X[i, 0], X[i, 1], '.', color=col[y[i]])
    X_train, X_test, y_train, y_test = _train_test_split(X, y, test_size=0.1)
    return X_train, y_train, X_test, y_test
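# Quick usage sketch for test_set1 (relies on the same module-level aliases
# _np, _plt, and _train_test_split used above):
X_train, y_train, X_test, y_test = test_set1(size=100, plot=True)
assert X_train.shape == (360, 2) and X_test.shape == (40, 2)  # 90/10 split of 400 points
_plt.show()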
# Types and helpers (ACCEPTED_Y_TYPES, FORECASTING_HORIZON_TYPES, SPLIT_TYPE,
# _split_by_fh) are sktime internals imported at module level.
def temporal_train_test_split(
    y: ACCEPTED_Y_TYPES,
    X: Optional[pd.DataFrame] = None,
    test_size: Optional[Union[int, float]] = None,
    train_size: Optional[Union[int, float]] = None,
    fh: Optional[FORECASTING_HORIZON_TYPES] = None,
) -> SPLIT_TYPE:
    """Split arrays or matrices into sequential train and test subsets.

    Creates train/test splits over endogenous arrays and optional exogenous
    arrays. This is a wrapper of scikit-learn's ``train_test_split`` that
    does not shuffle the data.

    Parameters
    ----------
    y : pd.Series
        Target series
    X : pd.DataFrame, optional (default=None)
        Exogenous data
    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.
    train_size : float, int, or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the train split. If int, represents the
        absolute number of train samples. If None, the value is automatically
        set to the complement of the test size.
    fh : ForecastingHorizon, optional (default=None)
        If given, the series is split at the forecasting horizon instead;
        ``test_size`` and ``train_size`` must then be None.

    Returns
    -------
    splitting : tuple, length = 2 * len(arrays)
        Tuple containing the train-test split of `y`, and of `X` if given.

    References
    ----------
    .. [1] adapted from https://github.com/alkaline-ml/pmdarima/
    """
    if fh is not None:
        if test_size is not None or train_size is not None:
            raise ValueError(
                "If `fh` is given, `test_size` and `train_size` cannot "
                "also be specified."
            )
        return _split_by_fh(y, fh, X=X)
    else:
        series = (y,) if X is None else (y, X)
        return _train_test_split(
            *series,
            shuffle=False,
            stratify=None,
            test_size=test_size,
            train_size=train_size,
        )
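# Usage sketch, assuming this is sktime's public temporal_train_test_split
# (load_airline and ForecastingHorizon are sktime's own utilities):
from sktime.datasets import load_airline
from sktime.forecasting.base import ForecastingHorizon

y = load_airline()                                           # monthly series, 144 points
y_train, y_test = temporal_train_test_split(y, test_size=24) # last 24 points become the test set
fh = ForecastingHorizon([1, 2, 3], is_relative=True)
y_train, y_test = temporal_train_test_split(y, fh=fh)        # test set = the 3 steps after training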
# Assumes `from typing import Sequence`, a project-level `config` module, and
# `from sklearn.model_selection import train_test_split as _train_test_split`.
def train_test_split(X: Sequence, y: Sequence) -> tuple:
    """Custom wrapper around Sklearn's `train_test_split` function.

    Using this wrapper throughout our code guarantees that every split uses
    the same parameters: an 80/20 split, a fixed random seed, and
    stratification on the labels.
    """
    return _train_test_split(X,
                             y,
                             test_size=0.2,
                             random_state=config.RANDOM_SEED,
                             stratify=y)
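# Usage sketch on a tiny, class-balanced toy dataset (assumes the project's
# `config` module defines RANDOM_SEED, as the wrapper requires):
X = [[i] for i in range(10)]
y = [0] * 5 + [1] * 5
X_train, X_test, y_train, y_test = train_test_split(X, y)
# 8 training and 2 test samples; stratify=y puts one sample of each class in
# the test set, and reruns give identical splits thanks to RANDOM_SEED.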
# Older, untyped variant of the sktime helper above; _split_y_by_fh is a
# module-level internal in the original source.
def temporal_train_test_split(y, X=None, test_size=None, train_size=None, fh=None):
    """Split arrays or matrices into sequential train and test subsets.

    Creates train/test splits over endogenous arrays and optional exogenous
    arrays. This is a wrapper of scikit-learn's ``train_test_split`` that
    does not shuffle.

    Parameters
    ----------
    y : pd.Series
        Target series
    X : pd.DataFrame, optional (default=None)
        Exogenous data
    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.
    train_size : float, int, or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the train split. If int, represents the
        absolute number of train samples. If None, the value is automatically
        set to the complement of the test size.
    fh : ForecastingHorizon, optional (default=None)
        If given, the series is split at the forecasting horizon instead;
        ``test_size`` and ``train_size`` must then be None.

    Returns
    -------
    splitting : list, length = 2 * len(arrays)
        List containing train-test split of inputs.

    References
    ----------
    .. [1] adapted from https://github.com/alkaline-ml/pmdarima/
    """
    if fh is not None:
        if test_size is not None or train_size is not None:
            raise ValueError(
                "If `fh` is given, `test_size` and `train_size` cannot "
                "also be specified.")
        return _split_y_by_fh(y, fh, X=X)
    else:
        series = (y,) if X is None else (y, X)
        return _train_test_split(
            *series,
            shuffle=False,
            stratify=None,
            test_size=test_size,
            train_size=train_size,
        )
# Assumes module-level `import pandas as pd`, `from typing import Tuple`, and
# `from sklearn.model_selection import train_test_split as _train_test_split`.
def stratified_split(df: pd.DataFrame, frac: float,
                     column: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Stratified train/test split that tolerates singleton labels.

    Rows whose label in `column` occurs only once cannot be stratified, so
    they are kept in the training set; the remaining rows are split with
    sklearn's stratified ``train_test_split``, using `frac` as the test size.
    """
    label_count = df[column].value_counts().to_dict()
    labels_we_can_use = df[column].apply(lambda x: label_count[x] > 1)
    items_with_count_one = df[~labels_we_can_use].copy()
    items_needing_split = df[labels_we_can_use].copy()
    train, test = _train_test_split(items_needing_split,
                                    test_size=frac,
                                    stratify=items_needing_split[column])
    # Fold the singleton-label rows back into the training set.
    train = pd.concat([train, items_with_count_one], axis=0, sort=True)
    #.reset_index(drop = True)
    return train, test
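# Hypothetical usage: the lone "z" row cannot be stratified, so it lands in
# the training set rather than triggering sklearn's "least populated class"
# error.
df = pd.DataFrame({"text": ["a", "b", "c", "d", "e"],
                   "label": ["x", "x", "y", "y", "z"]})
train, test = stratified_split(df, frac=0.5, column="label")
# train -> the "z" row plus one "x" and one "y"; test -> one "x" and one "y"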
def miml(args):
    """
    Fetches and prepares (in a DeepDIVA friendly format) the Multi-Instance
    Multi-Label Image Dataset on the file system.

    Dataset available at: http://lamda.nju.edu.cn/data_MIMLimage.ashx

    Parameters
    ----------
    args : object
        Parsed command-line arguments (e.g., an argparse.Namespace). In
        particular, it is necessary to provide output_folder as a String
        containing the path where the dataset will be downloaded.

    Returns
    -------
    None
    """
    # Download the files
    url = 'http://lamda.nju.edu.cn/files/miml-image-data.rar'
    if not os.path.exists(
            os.path.join(args.output_folder, 'miml-image-data.rar')):
        print('Downloading file!')
        filename = wget.download(url, out=args.output_folder)
    else:
        print('File already downloaded!')
        filename = os.path.join(args.output_folder, 'miml-image-data.rar')

    # Extract the files
    path_to_rar = filename
    path_to_output = os.path.join(args.output_folder, 'tmp_miml')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'original.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    path_to_rar = os.path.join(path_to_output, 'processed.rar')
    rarfile.RarFile(path_to_rar).extractall(path_to_output)
    print('Extracted files...')

    # Load the mat file
    mat = _loadmat(os.path.join(path_to_output, 'miml data.mat'))
    targets = mat['targets'].T
    classes = [item[0][0] for item in mat['class_name']]

    # Add filename at 0-index to correctly format the CSV headers
    classes.insert(0, 'filename')

    # Get list of all image files in the folder
    images = [
        item
        for item in _get_all_files_in_folders_and_subfolders(path_to_output)
        if item.endswith('jpg')
    ]
    images = sorted(images,
                    key=lambda e: int(os.path.basename(e).split('.')[0]))

    # Make splits
    train_data, test_data, train_labels, test_labels = _train_test_split(
        images, targets, test_size=0.2, random_state=42)
    train_data, val_data, train_labels, val_labels = _train_test_split(
        train_data, train_labels, test_size=0.2, random_state=42)

    # print('Size of splits\ntrain:{}\nval:{}\ntest:{}'.format(len(train_data),
    #                                                          len(val_data),
    #                                                          len(test_data)))

    # Make output folders
    dataset_root = os.path.join(args.output_folder, 'MIML')
    train_folder = os.path.join(dataset_root, 'train')
    val_folder = os.path.join(dataset_root, 'val')
    test_folder = os.path.join(dataset_root, 'test')

    make_folder_if_not_exists(dataset_root)
    make_folder_if_not_exists(train_folder)
    make_folder_if_not_exists(val_folder)
    make_folder_if_not_exists(test_folder)

    def _write_data_to_folder(data, labels, folder, classes):
        dest = os.path.join(folder, 'images')
        make_folder_if_not_exists(dest)
        for image, label in zip(data, labels):
            shutil.copy(image, dest)
        rows = np.column_stack(
            ([os.path.join('images', os.path.basename(item)) for item in data],
             labels))
        rows = sorted(rows,
                      key=lambda e: int(e[0].split('/')[1].split('.')[0]))
        output_csv = pd.DataFrame(rows)
        output_csv.to_csv(os.path.join(folder, 'labels.csv'),
                          header=classes,
                          index=False)
        return

    # Write the images to the correct folders
    print('Writing the data to the filesystem')
    _write_data_to_folder(train_data, train_labels, train_folder, classes)
    _write_data_to_folder(val_data, val_labels, val_folder, classes)
    _write_data_to_folder(test_data, test_labels, test_folder, classes)

    os.remove(filename)
    shutil.rmtree(path_to_output)
    print('All done!')
    return
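# Hypothetical invocation sketch: miml() only reads args.output_folder via
# attribute access, so any namespace-like object works. Extracting the .rar
# archives additionally requires a system backend for `rarfile` (e.g. unrar).
from types import SimpleNamespace
miml(SimpleNamespace(output_folder='./data'))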
# Assumes `from typing import Any` and sklearn's train_test_split imported as
# _train_test_split at module level.
def train_val_test_split(X, y, *, split: Any = 0.7):
    """Split datasets into train, validation, and testing sets.

    Given training data X and training labels y, the method will split the
    data into relevant training, validation, and testing sets based on the
    `split` parameter.

    Usage:
       The method can be called directly with training data.

       >>> X = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
       >>> y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
       >>> X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, y, split=[0.6, 0.2, 0.2])

    Parameters:
       - X: The training data, should be lists or arrays.
       - y: The training labels, should be lists or arrays.
       - split: How you want to split the data. Either a float m, which will
         represent the percentage of training data, with the val/test data
         each getting a percentage of (1 - m)/2, or a list of three numbers
         containing the exact float percentages for the train/val/test data.
         Defaults to a 70/15/15 split.

    Returns:
       - Six arrays: training data, validation data, test data,
         train labels, validation labels, test labels.
    """
    # Verify and segment the provided split.
    if isinstance(split, float):
        train_split = split
        val_split = test_split = (1 - split) / 2
    elif isinstance(split, (list, tuple)):
        if not len(split) == 3:
            raise ValueError(
                "If you are providing a list of percentages for the train/val/test split, it "
                f"must contain three numbers, got {len(split)}.")
        train_split = split[0]
        val_split = split[1]
        test_split = split[2]
        # Compare with a tolerance: floating-point rounding can make an exact
        # `== 1` comparison fail for valid inputs.
        if abs(train_split + val_split + test_split - 1) > 1e-9:
            raise ValueError(
                "If you are providing a list of percentages for the train/val/test split, it "
                f"should add up to 1, got {train_split + val_split + test_split}."
            )
    else:
        raise TypeError(
            "Split argument should either be a float representing the training percentage, "
            f"or a list containing the train/val/test percentages, got {type(split)}."
        )

    # Convert the validation/test split to a number relative to the overflow set.
    total_test_val_split = val_split + test_split
    val_split = val_split / total_test_val_split

    # First, split the data into train/overflow.
    X_train, X_overflow, y_train, y_overflow = _train_test_split(
        X, y, train_size=train_split)
    # Then, split the overflow into val/test.
    X_val, X_test, y_val, y_test = _train_test_split(X_overflow,
                                                     y_overflow,
                                                     train_size=val_split)

    # Finally, return the split training, validation, and test data.
    return X_train, X_val, X_test, y_train, y_val, y_test
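# Usage sketch echoing the docstring: an explicit three-way split.
X = list(range(100))
y = [i % 2 for i in range(100)]
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    X, y, split=[0.6, 0.2, 0.2])
# len(X_train), len(X_val), len(X_test) == 60, 20, 20 (shuffled, since the
# underlying sklearn train_test_split shuffles by default)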
# Assumes module-level aliases `import os as _os`, `import numpy as _numpy`,
# `import pandas as _pandas`, sklearn's train_test_split as _train_test_split,
# and the DEFAULT_* path constants defined elsewhere in the module.
def generate_data(extraction_data=DEFAULT_DATA_DIRECTORY,
                  train_output_path=DEFAULT_OUTPUT_TRAIN_PATH,
                  test_output_path=DEFAULT_OUTPUT_TEST_PATH,
                  labels_output_path=DEFAULT_OUTPUT_LABELS_PATH):
    """TODO: Update this
    Generate classification training data from the content extraction dataset.

    This function is highly dependent on the structure of the content
    extraction dataset and should be used accordingly. If changes are made
    there, they will affect the result of this function. Check the output
    accordingly.

    Given the "Corrected" extraction data, create a CSV row for each line of
    those files, assigning a class -- "title", "ingredient", "instruction",
    "other" -- to each line.

    Parameters
    ----------
    extraction_data : string
        The path to the directory containing the "Corrected" extraction
        training data
    train_output_path, test_output_path, labels_output_path : string
        The desired paths of the output train/test CSVs and the labels file
    """
    original_text = _numpy.array([], dtype='object')
    classified_type = _numpy.array([], dtype='object')
    for filename in _os.listdir(extraction_data):
        filename = _os.path.join(extraction_data, filename)
        with open(filename) as text_file:
            if not filename.endswith('.txt'):
                continue
            lines = [line.rstrip() for line in text_file]
            # The first line of every file is the recipe title.
            original_text = _numpy.append(original_text, lines[0])
            classified_type = _numpy.append(classified_type, 'title')
            current_class = ''
            ingredients_done = False
            for line in lines[1:]:
                # A blank line starts a new section; the line right after it
                # is labelled 'other', then the section class takes over:
                # first 'ingredient', then 'instruction'.
                if line == '':
                    current_class = 'other'
                    continue
                original_text = _numpy.append(original_text, line)
                classified_type = _numpy.append(classified_type, current_class)
                if current_class == 'other':
                    if not ingredients_done:
                        current_class = 'ingredient'
                    else:
                        current_class = 'instruction'
                    ingredients_done = True

    data_frame = _pandas.DataFrame(
        columns=['text', 'title', 'ingredient', 'instruction', 'other'])
    data_frame['text'] = original_text
    # One-hot encode the class of each line.
    data_frame['title'] = (classified_type == 'title').astype('int')
    data_frame['ingredient'] = (classified_type == 'ingredient').astype('int')
    data_frame['instruction'] = (classified_type == 'instruction').astype('int')
    data_frame['other'] = (classified_type == 'other').astype('int')

    df_train, df_test = _train_test_split(data_frame)
    df_train.to_csv(train_output_path)
    df_test.to_csv(test_output_path)
    with open(labels_output_path, 'x') as labels_file:
        labels_file.write('\n'.join(
            ['title', 'ingredient', 'instruction', 'other']))
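# Hypothetical bare invocation using the module's DEFAULT_* paths. Note that
# labels_output_path is opened with mode 'x' (exclusive create), so the call
# raises FileExistsError if the labels file already exists.
generate_data()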
# A later variant of the same sktime helper that also handles hierarchical
# data, i.e. a MultiIndex whose last level is time; get_time_index is a
# sktime-internal utility.
def temporal_train_test_split(
    y: ACCEPTED_Y_TYPES,
    X: Optional[pd.DataFrame] = None,
    test_size: Optional[Union[int, float]] = None,
    train_size: Optional[Union[int, float]] = None,
    fh: Optional[FORECASTING_HORIZON_TYPES] = None,
) -> SPLIT_TYPE:
    """Split arrays or matrices into sequential train and test subsets.

    Creates train/test splits over endogenous arrays and optional exogenous
    arrays. This is a wrapper of scikit-learn's ``train_test_split`` that
    does not shuffle the data.

    Parameters
    ----------
    y : pd.Series
        Target series
    X : pd.DataFrame, optional (default=None)
        Exogenous data
    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.
    train_size : float, int, or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the train split. If int, represents the
        absolute number of train samples. If None, the value is automatically
        set to the complement of the test size.
    fh : ForecastingHorizon, optional (default=None)
        If given, the series is split at the forecasting horizon instead;
        ``test_size`` and ``train_size`` must then be None.

    Returns
    -------
    splitting : tuple, length = 2 * len(arrays)
        Tuple containing the train-test split of `y`, and of `X` if given.

    References
    ----------
    .. [1] adapted from https://github.com/alkaline-ml/pmdarima/
    """
    if fh is not None:
        if test_size is not None or train_size is not None:
            raise ValueError(
                "If `fh` is given, `test_size` and `train_size` cannot "
                "also be specified."
            )
        return _split_by_fh(y, fh, X=X)
    else:
        pd_format = isinstance(y, (pd.Series, pd.DataFrame))
        if pd_format and isinstance(y.index, pd.MultiIndex):
            ys = get_time_index(y)
            # Get index to group across (only indices other than timepoints index)
            yi_name = y.index.names
            yi_grp = yi_name[0:-1]

            # Get split into test and train data for timeindex only
            series = (ys,)
            yret = _train_test_split(
                *series,
                shuffle=False,
                stratify=None,
                test_size=test_size,
                train_size=train_size,
            )

            # Convert into list indices
            ysl = ys.to_list()
            yrl1 = yret[0].to_list()
            yrl2 = yret[1].to_list()
            p1 = [index for (index, item) in enumerate(ysl) if item in yrl1]
            p2 = [index for (index, item) in enumerate(ysl) if item in yrl2]

            # Subset by group based on identified indices
            y_train = y.groupby(yi_grp, as_index=False).nth(p1)
            y_test = y.groupby(yi_grp, as_index=False).nth(p2)
            if X is not None:
                X_train = X.groupby(yi_grp, as_index=False).nth(p1)
                X_test = X.groupby(yi_grp, as_index=False).nth(p2)
                return y_train, y_test, X_train, X_test
            else:
                return y_train, y_test
        else:
            series = (y,) if X is None else (y, X)
            return _train_test_split(
                *series,
                shuffle=False,
                stratify=None,
                test_size=test_size,
                train_size=train_size,
            )
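# Hypothetical sketch of the hierarchical branch: when y carries a MultiIndex
# whose last level is time, the split is computed on the unique time points
# and then applied within every group, so each unit contributes its most
# recent observations to the test set.
import pandas as pd

idx = pd.MultiIndex.from_product(
    [["unit_a", "unit_b"], pd.period_range("2000-01", periods=6, freq="M")],
    names=["unit", "time"],
)
y = pd.Series(range(12), index=idx, dtype=float)
y_train, y_test = temporal_train_test_split(y, test_size=2)
# y_test holds the last 2 months of both unit_a and unit_b (4 rows in total)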