def train_and_save(train_file: str, valid_file: str,
                   train_sample: int, valid_sample: int,
                   model: str, feature_model: str, classifier: str):
    """Fit a model on a sample of the training data, score it, and save it.

    Args:
        train_file: Passed to ``get_dataset`` to load the training frame.
        valid_file: Passed to ``get_dataset`` to load the validation frame.
        train_sample: Requested number of training rows. Capped (with a
            warning) at the number of rows in the training frame.
        valid_sample: Requested number of validation rows. Capped (with a
            warning) at the number of rows in the validation frame.
        model: Name of the attribute looked up on ``models`` to obtain the
            model class.
        feature_model: Used as the single entry of the ``steps`` list given
            to the model class.
        classifier: Forwarded unchanged as the ``classifier`` argument of
            the model class.
    """
    Model = getattr(models, model)
    df_train = get_dataset(train_file)
    df_valid = get_dataset(valid_file)

    # Never ask read_data for more rows than the frame actually holds.
    n_train_sample = _cap_sample_size(
        train_sample, df_train.shape[0], 'Training')
    n_valid_sample = _cap_sample_size(
        valid_sample, df_valid.shape[0], 'Validation')

    X_train, Y_train = read_data(df_train, sample_n=n_train_sample)
    X_valid, Y_valid = read_data(df_valid, sample_n=n_valid_sample)

    # A distinct local name keeps the ``model`` argument (a class name)
    # from being shadowed by the fitted estimator instance.
    estimator = Model(classifier=classifier, steps=[feature_model],
                      memory='data/feature_cache')
    # NOTE(review): the collapsed source does not show where the ``with``
    # body ends; fitting and scoring are both kept under the threading
    # backend here -- confirm against the original layout.
    with joblib.parallel_backend('threading', n_jobs=2):
        estimator.fit(X_train, Y_train)
        score = estimator.score(X_valid, Y_valid)

    logging.info('')
    logging.info(f'Overall F1: {score:.4f}')
    logging.info('')
    save_model(estimator)


def _cap_sample_size(requested: int, available: int, label: str) -> int:
    """Return ``requested`` capped at ``available``, warning when capped.

    ``label`` names the dataset in the warning text ('Training' or
    'Validation'); its lower-cased form is used for the dataset mention.
    """
    if requested > available:
        logging.warning(
            f'{label} sample size ({requested}) cannot be '
            f'larger than the {label.lower()} dataset (n={available:,d}).')
        return available
    return requested
def predict_one(dataset, dfs, totals, seed, fm, clf, **kwargs):
    """Predict for a random review and build the payload for the single view.

    Args:
        dataset: Dataset name; a name containing ``'_en'`` selects English
            (``'en'``) highlighting, anything else selects ``'zh'``.
        dfs: Sequence of data frames; only ``dfs[0]`` is used here.
        totals: Sequence of counts; ``totals[0] == 0`` means there is no
            review to sample.
        seed: ``random_state`` used when sampling the review.
        fm: Feature model identifier passed to ``load_model``.
        clf: Classifier identifier passed to ``load_model`` and
            ``predict_proba``.
        **kwargs: Extra values exposed to the ``filter_results`` template
            (local variables take precedence on key clashes).

    Returns:
        A dict with the review, label names, true/predicted labels and
        their per-row counts, probabilities, and two rendered HTML
        fragments. When no review matches, the review is a placeholder and
        every prediction-related value is ``None``.
    """
    # NOTE: ``locals()` is forwarded to the templates below, so the local
    # variable names in this function are part of the template context.
    # Do not rename them without checking the templates.
    lang = 'en' if '_en' in dataset else 'zh'
    # Read the whole frame first so the label columns are available even
    # when there is no review to sample.
    X, y = read_data(dfs[0])
    if totals[0] == 0:
        review = {
            'id': 'N/A',
            'content_html': '-- No matching reviews found. Please remove keyword. --'
        }
        true_labels, probas = None, None
        # Previously these names were only assigned in the ``else`` branch,
        # so building the return value raised NameError when nothing
        # matched. They mirror the ``None`` used for true_labels/probas.
        # NOTE(review): confirm the templates tolerate ``None`` here.
        predict_labels, n_correct_labels = None, None
        true_label_counts, predict_label_counts = None, None
    else:
        # get a random review
        random_review = dfs[0].sample(1, random_state=seed)
        # split to feature and labels
        X, y = read_data(random_review)
        model = load_model(fm, clf)
        review = random_review.to_dict('records')[0]
        review = {
            'id': review['id'],
            'content_html': highlight_subsetence(
                review['content_raw'], lang
            ).replace('\n', '<br>')
        }
        probas = predict_proba(clf, model, X)
        # Missing labels become None instead of NaN.
        true_labels = y.replace({
            np.nan: None
        }).values
        predict_labels = model.predict(X)
        # number of correct predictions
        n_correct_labels = np.sum(true_labels == predict_labels, axis=1).tolist()
        true_labels = true_labels.tolist()
        predict_labels = predict_labels.tolist()
        true_label_counts = [Counter(x) for x in true_labels]
        predict_label_counts = [Counter(x) for x in predict_labels]

    label_names = y.columns.tolist()
    n_total_labels = len(label_names)  # number of labels to predict
    return {
        'review': review,
        'label_names': label_names,
        'n_total_labels': n_total_labels,
        'n_correct_labels': n_correct_labels,
        'n_correct_labels_html': render_template(
            'single/correct_count.jinja', **locals()
        ),
        'true_label_counts': true_label_counts,
        'predict_label_counts': predict_label_counts,
        'true_labels': true_labels,
        'predict_labels': predict_labels,
        'probas': probas,
        'filter_results': render_template(
            'single/filter_results.jinja', **{**kwargs, **locals()})
    }
def predict_df(self, df, save_to=None):
    """Predict labels for a data frame and optionally write them to CSV.

    The frame returned by ``read_data(df, return_df=True)`` has its
    ``content`` column blanked and its label columns overwritten with the
    output of ``self.predict``. When ``save_to`` is truthy the frame is
    also written there as CSV (``utf_8_sig`` encoding, no index column).

    Returns:
        The frame holding the predictions.
    """
    # Per the original note, read_data hands back a copy of ``df``.
    features, labels, output = read_data(df, return_df=True)
    output['content'] = ''
    output[labels.columns] = self.predict(features)

    if not save_to:
        return output

    logger.info(f'Saving predictions to {save_to}...')
    output.to_csv(save_to, index=False, encoding="utf_8_sig")
    return output
'--classifier', default='SVC', choices=classifier_choices, help='Classifier used by the model') parser.add_argument('--train', default=10000, help='Number of training sample to use') parser.add_argument('--valid', default=1000, help='Number of validation sample to use') args = parser.parse_args() logging.info(f'{args}') Model = getattr(models, args.model) Classifier = getattr(classifiers, args.classifier) X_train, Y_train = read_data(get_dataset('train'), sample_n=args.train) X_valid, Y_valid = read_data(get_dataset('valid'), sample_n=args.valid) model = Model(classifier=Classifier, steps=[args.feature_model], memory='data/feature_cache') with joblib.parallel_backend('threading', n_jobs=4): model.fit(X_train, Y_train) score = model.score(X_valid, Y_valid) logging.info('') logging.info(f'Overall F1: {score:.4f}') logging.info('') save_model(model)
default='SVC', choices=classifier_choices, help='Classifier used by the model') parser.add_argument('--train', default=10000, help='Number of training sample to use') parser.add_argument('--valid', default=1000, help='Number of validation sample to use') args = parser.parse_args() logging.info(f'{args}') Model = getattr(models, args.model) Classifier = getattr(classifiers, args.classifier) X_train, Y_train = read_data(config.train_data_path, sample_n=args.train) X_valid, Y_valid = read_data(config.valid_data_path, sample_n=args.valid) model = Model(classifier=Classifier, steps=[args.feature_model], memory='data/feature_cache') with joblib.parallel_backend('threading', n_jobs=4): model.fit(X_train, Y_train) score = model.score(X_valid, Y_valid) logging.info('') logging.info(f'Overall F1: {score:.4f}') logging.info('') filename = f'{args.feature_model}_{model.name}.pkl' save_model(model, os.path.join(config.model_save_path, filename))