def training_data(
    tickets_data_path: str,
    text_column: str,
    label_column: str,
    test_size: float = 0.25,
    subset_size: int = -1,
    max_length: int = 100,
    pad_to_max_length: bool = True,
) -> Tuple[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray], DistilBertTokenizer]:
    """Load a tickets CSV, encode labels and texts, and produce a train/test split.

    :param tickets_data_path: path to the CSV file with texts and labels
    :param text_column: column holding the raw ticket text
    :param label_column: column holding the label for each ticket
    :param test_size: fraction of the (sub)set assigned to the test split
    :param subset_size: cap on the number of rows used; negative means "use all"
    :param max_length: stored on the tokenizer for downstream encoding
    :param pad_to_max_length: stored on the tokenizer for downstream encoding
    :return: ((x_train, x_test, y_train, y_test), tokenizer)
    """
    frame = pd.read_csv(tickets_data_path)
    texts = frame[text_column].tolist()
    raw_labels = frame[label_column].tolist()
    # Deterministic label ordering so encoded ids are reproducible across runs.
    label_vocabulary = sorted(set(raw_labels))
    labels = encode_labels(raw_labels, label_vocabulary)
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")
    tokenizer.max_length = max_length
    tokenizer.pad_to_max_length = pad_to_max_length
    print("tokenizing all texts...")
    texts = encode_texts(tokenizer, texts)
    if subset_size < 0:
        subset_size = len(texts)
    x_train, x_test, y_train, y_test = train_test_split(
        texts[:subset_size],
        labels[:subset_size],
        test_size=test_size,
        random_state=42,
    )
    return (x_train, x_test, y_train, y_test), tokenizer
def score(self, experiment_path: str, result_file: str, **kwargs): """score Scores a given experiemnt path e.g., outputs probability scores for a given dataset passed as: --data features/hdf5/somedata.h5 --label features/labels/somedata.csv :param experiment_path: Path to already trained model using train :type experiment_path: str """ # Update config parameters with new kwargs config = torch.load(glob.glob( "{}/run_config*".format(experiment_path))[0], map_location=lambda storage, loc: storage) config_parameters = dict(config, **kwargs) model = torch.load(glob.glob( "{}/run_model*".format(experiment_path))[0], map_location=lambda storage, loc: storage) encoder = torch.load(glob.glob( '{}/run_encoder*'.format(experiment_path))[0], map_location=lambda storage, loc: storage) testlabel = config_parameters['testlabel'] testdata = config_parameters['testdata'] # Only a single item to evaluate if isinstance(testlabel, list) and len(testlabel) == 1: testlabel = testlabel[0] if isinstance(testdata, list) and len(testdata) == 1: testdata = testdata[0] labels_df = pd.read_csv(testlabel, sep=' ') labels_df['encoded'], encoder = utils.encode_labels( labels=labels_df['bintype'], encoder=encoder) config_parameters.setdefault('colname', ('filename', 'encoded')) dataloader = dataset.getdataloader( data_frame=labels_df, data_file=testdata, num_workers=4, batch_size=1, # do not apply any padding colname=config_parameters[ 'colname'] # For other datasets with different key names ) model = model.to(DEVICE).eval() genuine_label_idx = encoder.transform(['genuine'])[0] with torch.no_grad(), open(result_file, 'w') as wp, tqdm(total=len(dataloader), unit='utts') as pbar: datawriter = csv.writer(wp, delimiter=' ') datawriter.writerow(['filename', 'score']) for batch in dataloader: inputs, _, filenames = batch inputs = inputs.float().to(DEVICE) preds = model(inputs) for pred, filename in zip(preds, filenames): # Single batchsize datawriter.writerow([filename, pred[0].item()]) pbar.update() 
print("Score file can be found at {}".format(result_file))
def img_to_tfrecord(train_imgs, train_labels, output_dir, name):
    """Write images and their encoded labels into a single TFRecord file.

    :param train_imgs: list of image filenames/paths to convert
    :param train_labels: labels aligned with train_imgs
    :param output_dir: output dir
    :param name: output file name
    :return: NULL
    """
    tf_filename = _get_output_filename(output_dir, name)
    imgLists = train_imgs  # return a list
    labels = train_labels
    # encode_labels maps each label string through `alphabet` and also
    # returns the per-label lengths.
    labels_encord, lengths = encode_labels(labels, alphabet)
    # NOTE(review): image_format is unused in this variant — no image/format
    # feature is written below.
    image_format = b'JPEG'
    with tf.python_io.TFRecordWriter(tf_filename) as tfrecord_writer:
        for i, filename in enumerate(imgLists):
            sys.stdout.write('\r>> Converting image %d/%d' %
                             (i + 1, len(imgLists)))
            sys.stdout.flush()
            # NOTE(review): the "image" feature stores the *filename* bytes,
            # not decoded image data (the sibling variant calls load_image
            # and writes "image/encoded") — confirm the reader expects a
            # path here.
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    "label/value": int64_feature(labels_encord[i]),
                    "label/length": int64_feature(lengths[i]),
                    "image": bytes_feature(filename)
                }))
            tfrecord_writer.write(example.SerializeToString())
    print('\nFinished converting the dataset!')
def test_2layer_net():
    """Forward scores and loss of a toy 2-layer net match reference values."""
    toy_params = init_toy_model()
    X, y = init_toy_data()
    y_encoded = ut.encode_labels(y)

    def build_linear(w_key, b_key):
        # Linear layer initialized from the toy parameter dict.
        weights = toy_params[w_key].T
        biases = toy_params[b_key].ravel()
        return layers.Linear(*weights.shape,
                             reg='frob',
                             reg_param=0.05,
                             init_vals=(weights, biases))

    network = nn.Network(
        [build_linear('W1', 'b1'), layers.Relu(), build_linear('W2', 'b2')],
        ls.CrossEntropy(), optim.SGD(lr=1e-5))

    expected_scores = np.asarray([[-1.07260209, 0.05083871, -0.87253915],
                                  [-2.02778743, -0.10832494, -1.52641362],
                                  [-0.74225908, 0.15259725, -0.39578548],
                                  [-0.38172726, 0.10835902, -0.17328274],
                                  [-0.64417314, -0.18886813, -0.41106892]])
    total_abs_diff = np.sum(np.abs(network.forward(X) - expected_scores))
    assert np.isclose(total_abs_diff, 0.0, atol=1e-6)

    expected_loss = 1.071696123862817
    assert np.isclose(network.loss(X, y_encoded), expected_loss, atol=1e-8)
def test_creating_a_single_decision_tree(request, wine_dataset):
    """Train one depth-5 tree on the wine data and render it as a PNG."""
    feature_names, X, y = wine_dataset
    y, encoder, class_names = encode_labels(y.astype(int))
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    fitted_tree = learn_tree(X_train, y_train, max_depth=5)
    accuracy = compute_accuracy(fitted_tree, X_test, y_test)
    print(f'Test set accuracy: {accuracy:2.2%}')
    dot_source = create_graph(fitted_tree,
                              feature_names,
                              class_names,
                              palette=PALETTE)
    # Render under the current test's name so each run leaves its own image.
    graphviz.Source(dot_source, format='png').render(request.node.name)
def train(self, X, y):
    """Optimize the network weights with TNC and return the theta matrices.

    :param X: training inputs (wrapped in np.matrix for the objective)
    :param y: raw class labels, vectorized before optimization
    :return: tuple (theta1, theta2) unraveled from the flat solution vector
    """
    X = np.matrix(X)
    encoded_targets = encode_labels(y)
    initial_weights = self.__random_initialize_weights()
    # jac=True: backpropagate returns (objective, gradient) in one call.
    solution = minimize(fun=self.backpropagate,
                        x0=initial_weights,
                        args=(X, encoded_targets),
                        method='TNC',
                        jac=True,
                        options={'maxiter': 250})
    theta1 = self.__unravel_theta1(solution.x)
    theta2 = self.__unravel_theta2(solution.x)
    return theta1, theta2
def loadFeature(name, idf_threshold=IDF_THRESHOLD, ispretrain=True):
    """Load the feature-and-label table for `name` and split it into parts.

    :param name: identifier used to locate the feature file
    :param idf_threshold: threshold embedded in the feature file path
    :param ispretrain: when False the label sits at column -2 instead of -1
    :return: (float32 feature matrix, pid column, encoded labels)
    """
    # Label column index: the non-pretrain files carry one extra trailing column.
    label_col = -2 if ispretrain is False else -1
    feature_path = getPATH(name, idf_threshold, 'feature_and_label', ispretrain)
    raw_table = np.genfromtxt(feature_path, dtype=np.dtype(str))
    # Column 0 is the pid; everything between it and the label is a feature.
    features = np.array(raw_table[:, 1:label_col], dtype=np.float32)
    encoded_labels = encode_labels(raw_table[:, label_col])
    pids = raw_table[:, 0]
    return features, pids, encoded_labels
def process_text(text_dict, criteria):
    """Turn a dict of documents into tokenized features and binarized labels.

    :param text_dict: mapping whose values carry 'text' and 'tags' entries
    :param criteria: label set passed through to encode_labels
    :return: (X_train, y_train, train_labels)
    """
    train_text = [entry['text'] for entry in text_dict.values()]
    train_tags = [entry['tags'] for entry in text_dict.values()]
    # Encode the tag lists against the supplied criteria.
    train_labels = encode_labels(train_tags, criteria)
    X_train = tokenize(train_text)
    binarizer = MultiLabelBinarizer()
    y_train = binarizer.fit_transform(train_labels)
    return X_train, y_train, train_labels
def test_creating_an_ensemble_of_trees(wine_dataset):
    """Fit a 10-stump random forest on the wine data and print test accuracy."""
    feature_names, X, y = wine_dataset
    y, encoder, class_names = encode_labels(y.astype(int))
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    forest = RandomForestClassifier(
        tree_funcs=(learn_tree, predict_tree),
        n_trees=10,
        max_depth=1,
        min_leaf_size=5,
        min_split_size=10,
        feature_subset_size='sqrt',
    )
    predictions = forest.fit(X_train, y_train).predict(X_test)
    accuracy = np.mean(y_test == predictions)
    print(f'Test set accuracy: {accuracy:2.2%}')
def test_creating_set_of_trees(trial, wine_dataset):
    """Train a depth-2 tree with a per-trial seed and render tree_<trial>.png."""
    np.random.seed(trial)
    feature_names, X, y = wine_dataset
    y, encoder, class_names = encode_labels(y.astype(int))
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    fitted_tree = learn_tree(X_train, y_train, max_depth=2)
    accuracy = compute_accuracy(fitted_tree, X_test, y_test)
    dot_source = create_graph(fitted_tree,
                              feature_names,
                              class_names,
                              palette=PALETTE,
                              title=f'Tree accuracy: {accuracy:2.2%}')
    graphviz.Source(dot_source, format='png').render('tree_%d' % trial)
def test_2layer_grad():
    """Numerically check the analytic gradients of a toy 2-layer ReLU net."""
    params = init_toy_model()
    X, y = init_toy_data()
    Y_enc = ut.encode_labels(y)
    # Make the net
    layer_1 = layers.Linear(*params['W1'].T.shape,
                            reg='frob',
                            reg_param=0.05,
                            init_vals=(params['W1'].T, params['b1'].ravel()))
    act_1 = layers.Relu()
    layer_2 = layers.Linear(*params['W2'].T.shape,
                            reg='frob',
                            reg_param=0.05,
                            init_vals=(params['W2'].T, params['b2'].ravel()))
    net_2 = nn.Network([layer_1, act_1, layer_2], ls.CrossEntropy(),
                       optim.SGD(lr=1e-5))
    # One forward/backward pass populates net_2.grads for comparison below.
    loss = net_2.loss(X, Y_enc)
    net_2.backward()

    # Overwrite the parameter selected by `param_name` with U and re-evaluate
    # the loss. Index mapping (matches net_2.grads ordering used below):
    # 0 -> layers[2] W, 1 -> layers[2] b, 2 -> layers[0] W, 3 -> layers[0] b.
    def f_change_param(param_name, U):
        if param_name == 3:
            net_2.layers[0].params['b'] = U
        if param_name == 2:
            net_2.layers[0].params['W'] = U
        if param_name == 1:
            net_2.layers[2].params['b'] = U
        if param_name == 0:
            net_2.layers[2].params['W'] = U
        return net_2.loss(X, Y_enc)

    rel_errs = np.empty(4)
    for param_name in range(4):
        # Safe despite late binding: f is consumed within this iteration.
        f = lambda U: f_change_param(param_name, U)
        if param_name == 3:
            pass_pars = net_2.layers[0].params['b']
        if param_name == 2:
            pass_pars = net_2.layers[0].params['W']
        if param_name == 1:
            pass_pars = net_2.layers[2].params['b']
        if param_name == 0:
            pass_pars = net_2.layers[2].params['W']
        # Numeric gradient of the loss w.r.t. the selected parameter.
        param_grad_num = dutil.grad_check(f, pass_pars, epsilon=1e-5)
        rel_errs[param_name] = ut.rel_error(param_grad_num,
                                            net_2.grads[param_name])
    assert (np.allclose(rel_errs, np.zeros(4), atol=1e-7))
def main():
    """Quantize the ADL dataset, train a random forest, print test accuracy."""
    n_clusters = 5
    dataset_path = join('datasets', 'adl')
    X, labels = quantize(dataset_path, n_clusters)
    y, encoder, classes = encode_labels(labels)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
    forest = RandomForestClassifier(
        tree_funcs=(learn_tree, predict_tree),
        n_trees=50,
        max_depth=3,
        min_leaf_size=5,
        min_split_size=10,
        feature_subset_size='sqrt',
    )
    forest.fit(X_train, y_train)
    accuracy = np.mean(forest.predict(X_test) == y_test)
    print(f'Test dataset predictions accuracy: {accuracy:2.2%}')
def img_to_tfrecord(train_imgs, train_labels, output_dir, name):
    """Serialize loaded images and encoded labels into one TFRecord file.

    :param train_imgs: list of image file paths to convert
    :param train_labels: labels aligned with ``train_imgs``
    :param output_dir: output dir
    :param name: output file name
    :return: NULL
    """
    record_path = _get_output_filename(output_dir, name)
    encoded_labels, label_lengths = encode_labels(train_labels)
    image_format = b'JPEG'
    with tf.python_io.TFRecordWriter(record_path) as writer:
        for index, image_path in enumerate(train_imgs):
            # In-place progress line on stdout.
            sys.stdout.write('\r>> Converting image %d/%d' %
                             (index + 1, len(train_imgs)))
            sys.stdout.flush()
            image_data = load_image(image_path)
            feature_map = {
                "label/value": int64_feature(encoded_labels[index]),
                "label/length": int64_feature(label_lengths[index]),
                "image/encoded": bytes_feature(image_data),
                'image/format': bytes_feature(image_format),
            }
            example = tf.train.Example(
                features=tf.train.Features(feature=feature_map))
            writer.write(example.SerializeToString())
    print('\nFinished converting the dataset!')
import numpy as np
import utils as ut
import nn
import layers
import loss as ls
import optim
import data_utils as dutil

# Global variables
# Test fixtures: the CIFAR-10 splits are loaded once at import time and
# shared by all tests in this module.
X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = dutil.get_CIFAR10_data(
)
# n: input dimensionality, c: number of classes.
n = X_train.shape[1]
c = 10
Y_dev_enc = ut.encode_labels(y_dev)


def test_CrossEntropyLoss():
    """An untrained (near-zero-weight) linear net should score ~ -log(1/c)."""
    np.random.seed(1)
    W = np.random.randn(c, n) * 0.0001
    b = np.random.randn(c, 1) * 0.0001
    layer_lin = layers.Linear(n, c, init_vals=(W.T, b.ravel()))
    loss_func = ls.CrossEntropy()
    net = nn.Network([layer_lin], loss_func, optimizer=None)
    my_loss = net.loss(X_dev, Y_dev_enc)
    # With 10 classes and tiny weights, expected loss is about -log(0.1).
    assert (np.isclose(my_loss, -np.log(.1), atol=1e-2))


# NOTE(review): this test appears truncated in this chunk of the file —
# the remainder of its body is not visible here.
def test_CrossEntropy_Linear_Grad():
    np.random.seed(1)
    W = np.random.randn(c, n) * 0.0001
    b = np.random.randn(c, 1) * 0.0001
def main():
    """Load the dataset, encode its labels, and fit the ANN model."""
    features, targets = utils.get_data()
    targets = utils.encode_labels(targets)
    # Peek at the tail of the encoded targets as a quick sanity check.
    print(targets[-10:])
    classifier = ANN(200)
    classifier.fit(features, targets, reg=0, show_figure=True)
def main():
    """Load the dataset, encode its labels, and train the TensorFlow ANN."""
    features, targets = utils.get_data()
    targets = utils.encode_labels(targets)
    network = ANN_TF(200)
    network.fit(features, targets)
def evaluate(
        self,
        experiment_path: str,
        pred_file='hard_predictions_{}.txt',
        tag_file='tagging_predictions_{}.txt',
        event_file='event_{}.txt',
        segment_file='segment_{}.txt',
        class_result_file='class_result_{}.txt',
        time_ratio=10. / 500,
        postprocessing='double',
        threshold=None,
        window_size=None,
        save_seq=False,
        sed_eval=True,  # Do evaluation on sound event detection (time stamps, segment/event based)
        **kwargs):
    """Evaluate a trained experiment on a labelled dataset.

    Restores the model/encoder from ``experiment_path``, runs inference,
    applies the chosen frame-level postprocessing, and writes prediction,
    tagging, event and segment result files into the experiment directory.

    :param experiment_path: Path to already trained model using train
    :type experiment_path: str
    :param pred_file: Prediction output file, put into experiment dir
    :param time_ratio: Scale factor from frame indices to seconds
    :param postprocessing: one of 'median', 'cATP-SDS', 'double', 'triple'
    :param threshold: threshold(s) for the selected postprocessing;
        None selects per-method defaults
    :param window_size: median-filter window(s); None selects defaults
    :param save_seq: also dump per-frame probabilities to probabilities.csv
    :param sed_eval: compute event/segment based SED metrics
    :param **kwargs: Overwrite standard args, please pass `data` and `label`
    """
    # Update config parameters with new kwargs
    config = torch.load(list(Path(f'{experiment_path}').glob("run_config*"))[0],
                        map_location='cpu')
    # Use previous config, but update data such as kwargs
    config_parameters = dict(config, **kwargs)
    # Default columns to search for in data
    config_parameters.setdefault('colname', ('filename', 'encoded'))
    model_parameters = torch.load(
        glob.glob("{}/run_model*".format(experiment_path))[0],
        map_location=lambda storage, loc: storage)
    encoder = torch.load(glob.glob(
        '{}/run_encoder*'.format(experiment_path))[0],
                         map_location=lambda storage, loc: storage)
    strong_labels_df = pd.read_csv(config_parameters['label'], sep='\t')
    # Evaluation is done via the filenames, not full paths
    if not np.issubdtype(strong_labels_df['filename'].dtype, np.number):
        strong_labels_df['filename'] = strong_labels_df['filename'].apply(
            os.path.basename)
    if 'audiofilepath' in strong_labels_df.columns:
        # In case of ave dataset, the audiofilepath column is the main column
        strong_labels_df['audiofilepath'] = strong_labels_df[
            'audiofilepath'].apply(os.path.basename)
        colname = 'audiofilepath'  # AVE
    else:
        colname = 'filename'  # Dcase etc.
    # Problem is that we iterate over the strong_labels_df, which is ambigious
    # In order to conserve some time and resources just reduce strong_label
    # to weak_label format
    weak_labels_df = strong_labels_df.groupby(
        colname)['event_label'].unique().apply(
            tuple).to_frame().reset_index()
    if "event_labels" in strong_labels_df.columns:
        assert False, "Data with the column event_labels are used to train not to evaluate"
    weak_labels_array, encoder = utils.encode_labels(
        labels=weak_labels_df['event_label'], encoder=encoder)
    dataloader = dataset.getdataloader(
        {
            'filename': weak_labels_df['filename'].values,
            'encoded': weak_labels_array,
        },
        config_parameters['data'],
        batch_size=1,
        shuffle=False,
        colname=config_parameters[
            'colname']  # For other datasets with different key names
    )
    # Rebuild the model class from config and load the stored weights.
    model = getattr(models, config_parameters['model'])(
        inputdim=dataloader.dataset.datadim,
        outputdim=len(encoder.classes_),
        **config_parameters['model_args'])
    model.load_state_dict(model_parameters)
    model = model.to(DEVICE).eval()
    time_predictions, clip_predictions = [], []
    sequences_to_save = []
    mAP_pred, mAP_tar = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, unit='file', leave=False):
            _, target, filenames = batch
            clip_pred, pred, _ = self._forward(model, batch)
            clip_pred = clip_pred.cpu().detach().numpy()
            mAP_tar.append(target.numpy().squeeze(0))
            mAP_pred.append(clip_pred.squeeze(0))
            pred = pred.cpu().detach().numpy()
            if postprocessing == 'median':
                if threshold is None:
                    thres = 0.5
                else:
                    thres = threshold
                if window_size is None:
                    window_size = 1
                filtered_pred = utils.median_filter(
                    pred, window_size=window_size, threshold=thres)
                decoded_pred = utils.decode_with_timestamps(
                    encoder, filtered_pred)
            elif postprocessing == 'cATP-SDS':
                # cATP-SDS postprocessing uses an "Optimal" configurations, assumes we have a prior
                # Values are taken from the Surface Disentange paper
                # Classes are (DCASE2018 only)
                # ['Alarm_bell_ringing' 'Blender' 'Cat' 'Dishes' 'Dog'
                # 'Electric_shaver_toothbrush' 'Frying' 'Running_water'
                # 'Speech' 'Vacuum_cleaner']
                assert pred.shape[
                    -1] == 10, "Only supporting DCASE2018 for now"
                if threshold is None:
                    thres = 0.5
                else:
                    thres = threshold
                if window_size is None:
                    # Per-class window sizes, aligned with the class list above.
                    window_size = [17, 42, 17, 9, 16, 74, 85, 64, 18, 87]
                # P(y|x) > alpha
                clip_pred = utils.binarize(clip_pred, threshold=thres)
                pred = pred * clip_pred
                filtered_pred = np.zeros_like(pred)
                # class specific filtering via median filter
                for cl in range(pred.shape[-1]):
                    # Median filtering also applies thresholding
                    filtered_pred[..., cl] = utils.median_filter(
                        pred[..., cl],
                        window_size=window_size[cl],
                        threshold=thres)
                decoded_pred = utils.decode_with_timestamps(
                    encoder, filtered_pred)
            elif postprocessing == 'double':
                # Double thresholding as described in
                # https://arxiv.org/abs/1904.03841
                if threshold is None:
                    hi_thres, low_thres = (0.75, 0.2)
                else:
                    hi_thres, low_thres = threshold
                filtered_pred = utils.double_threshold(pred,
                                                       high_thres=hi_thres,
                                                       low_thres=low_thres)
                decoded_pred = utils.decode_with_timestamps(
                    encoder, filtered_pred)
            elif postprocessing == 'triple':
                # Triple thresholding as described in
                # Using frame level + clip level predictions
                if threshold is None:
                    clip_thres, hi_thres, low_thres = (0.5, 0.75, 0.2)
                else:
                    clip_thres, hi_thres, low_thres = threshold
                clip_pred = utils.binarize(clip_pred, threshold=clip_thres)
                # Apply threshold to
                pred = clip_pred * pred
                filtered_pred = utils.double_threshold(pred,
                                                       high_thres=hi_thres,
                                                       low_thres=low_thres)
                decoded_pred = utils.decode_with_timestamps(
                    encoder, filtered_pred)
            for num_batch in range(len(decoded_pred)):
                filename = filenames[num_batch]
                cur_pred = pred[num_batch]
                cur_clip = clip_pred[num_batch].reshape(1, -1)
                # Clip predictions, independent of per-frame predictions
                bin_clips = utils.binarize(cur_clip)
                # Binarize with default threshold 0.5 For clips
                bin_clips = encoder.inverse_transform(
                    bin_clips.reshape(1, -1))[0]  # 0 since only single sample
                # Add each label individually into list
                for clip_label in bin_clips:
                    clip_predictions.append({
                        'filename': filename,
                        'event_label': clip_label,
                    })
                # Save each frame output, for later visualization
                if save_seq:
                    labels = weak_labels_df.loc[weak_labels_df['filename'] ==
                                                filename]['event_label']
                    to_save_df = pd.DataFrame(pred[num_batch],
                                              columns=encoder.classes_)
                    # True labels
                    to_save_df.rename({'variable': 'event'},
                                      axis='columns',
                                      inplace=True)
                    to_save_df['filename'] = filename
                    to_save_df['pred_labels'] = np.array(labels).repeat(
                        len(to_save_df))
                    sequences_to_save.append(to_save_df)
                label_prediction = decoded_pred[num_batch]
                for event_label, onset, offset in label_prediction:
                    time_predictions.append({
                        'filename': filename,
                        'event_label': event_label,
                        'onset': onset,
                        'offset': offset
                    })
    assert len(time_predictions) > 0, "No outputs, lower threshold?"
    pred_df = pd.DataFrame(
        time_predictions,
        columns=['filename', 'event_label', 'onset', 'offset'])
    clip_pred_df = pd.DataFrame(
        clip_predictions,
        columns=['filename', 'event_label', 'probability'])
    test_data_filename = os.path.splitext(
        os.path.basename(config_parameters['label']))[0]
    if save_seq:
        pd.concat(sequences_to_save).to_csv(os.path.join(
            experiment_path, 'probabilities.csv'),
                                            index=False,
                                            sep='\t',
                                            float_format="%.4f")
    # Convert frame indices to time stamps before writing/scoring.
    pred_df = utils.predictions_to_time(pred_df, ratio=time_ratio)
    if pred_file:
        pred_df.to_csv(os.path.join(experiment_path,
                                    pred_file.format(test_data_filename)),
                       index=False,
                       sep="\t")
    tagging_df = metrics.audio_tagging_results(strong_labels_df, pred_df)
    clip_tagging_df = metrics.audio_tagging_results(strong_labels_df,
                                                    clip_pred_df)
    print("Tagging Classwise Result: \n{}".format(
        tabulate(clip_tagging_df,
                 headers='keys',
                 showindex=False,
                 tablefmt='github')))
    print("mAP: {}".format(
        metrics.mAP(np.array(mAP_tar), np.array(mAP_pred))))
    if tag_file:
        clip_tagging_df.to_csv(os.path.join(
            experiment_path, tag_file.format(test_data_filename)),
                               index=False,
                               sep='\t')
    if sed_eval:
        event_result, segment_result = metrics.compute_metrics(
            strong_labels_df, pred_df, time_resolution=1.0)
        print("Event Based Results:\n{}".format(event_result))
        event_results_dict = event_result.results_class_wise_metrics()
        class_wise_results_df = pd.DataFrame().from_dict({
            f: event_results_dict[f]['f_measure']
            for f in event_results_dict.keys()
        }).T
        class_wise_results_df.to_csv(os.path.join(
            experiment_path, class_result_file.format(test_data_filename)),
                                     sep='\t')
        print("Class wise F1-Macro:\n{}".format(
            tabulate(class_wise_results_df,
                     headers='keys',
                     tablefmt='github')))
        if event_file:
            with open(
                    os.path.join(experiment_path,
                                 event_file.format(test_data_filename)),
                    'w') as wp:
                wp.write(event_result.__str__())
        print("=" * 100)
        print(segment_result)
        if segment_file:
            with open(
                    os.path.join(experiment_path,
                                 segment_file.format(test_data_filename)),
                    'w') as wp:
                wp.write(segment_result.__str__())
        event_based_results = pd.DataFrame(
            event_result.results_class_wise_average_metrics()['f_measure'],
            index=['event_based'])
        segment_based_results = pd.DataFrame(
            segment_result.results_class_wise_average_metrics()
            ['f_measure'],
            index=['segment_based'])
        result_quick_report = pd.concat((
            event_based_results,
            segment_based_results,
        ))
        # Add two columns
        tagging_macro_f1, tagging_macro_pre, tagging_macro_rec = tagging_df.loc[
            tagging_df['label'] == 'macro'].values[0][1:]
        static_tagging_macro_f1, static_tagging_macro_pre, static_tagging_macro_rec = clip_tagging_df.loc[
            clip_tagging_df['label'] == 'macro'].values[0][1:]
        result_quick_report.loc['Time Tagging'] = [
            tagging_macro_f1, tagging_macro_pre, tagging_macro_rec
        ]
        result_quick_report.loc['Clip Tagging'] = [
            static_tagging_macro_f1, static_tagging_macro_pre,
            static_tagging_macro_rec
        ]
        with open(
                os.path.join(
                    experiment_path,
                    'quick_report_{}.md'.format(test_data_filename)),
                'w') as wp:
            print(tabulate(result_quick_report,
                           headers='keys',
                           tablefmt='github'),
                  file=wp)
        print("Quick Report: \n{}".format(
            tabulate(result_quick_report,
                     headers='keys',
                     tablefmt='github')))
def audio_tagging_results(reference, estimated):
    """audio_tagging_results. Returns clip-level F1 Scores

    :param reference: The ground truth dataframe as pd.DataFrame
    :param estimated: Predicted labels by the model ( thresholded )
    :return: DataFrame with columns ['label', 'f1', 'precision', 'recall'];
        one row per class, plus 'macro' and 'micro' summary rows
    """
    if "event_label" in reference.columns:
        # Fit the binarizer on the union of reference and predicted classes.
        classes = reference.event_label.dropna().unique().tolist(
        ) + estimated.event_label.dropna().unique().tolist()
        encoder = MultiLabelBinarizer().fit([classes])
        reference = get_audio_tagging_df(reference)
        estimated = get_audio_tagging_df(estimated)
        ref_labels, _ = utils.encode_labels(reference['event_label'],
                                            encoder=encoder)
        reference['event_label'] = ref_labels.tolist()
        est_labels, _ = utils.encode_labels(estimated['event_label'],
                                            encoder=encoder)
        estimated['event_label'] = est_labels.tolist()
    # Outer join so files present in only one frame still produce a row.
    matching = reference.merge(estimated,
                               how='outer',
                               on="filename",
                               suffixes=["_ref", "_pred"])

    def na_values(val):
        # Normalize one cell to an ndarray; NaN cells from unmatched files
        # become an all-zero label vector.
        # Fixed: use isinstance instead of `type(val) is np.ndarray`.
        if isinstance(val, np.ndarray):
            return val
        if isinstance(val, list):
            return np.array(val)
        if pd.isna(val):
            # NOTE(review): relies on `encoder` defined in the branch above;
            # without an 'event_label' column this raises NameError
            # (pre-existing behavior, kept unchanged).
            return np.zeros(len(encoder.classes_))
        return val

    ret_df = pd.DataFrame(columns=['label', 'f1', 'precision', 'recall'])
    if not estimated.empty:
        matching['event_label_pred'] = matching.event_label_pred.apply(
            na_values)
        matching['event_label_ref'] = matching.event_label_ref.apply(na_values)
        y_true = np.vstack(matching['event_label_ref'].values)
        y_pred = np.vstack(matching['event_label_pred'].values)
        ret_df.loc[:, 'label'] = encoder.classes_
        for avg in [None, 'macro', 'micro']:
            avg_f1 = skmetrics.f1_score(y_true, y_pred, average=avg)
            avg_pre = skmetrics.precision_score(y_true, y_pred, average=avg)
            avg_rec = skmetrics.recall_score(y_true, y_pred, average=avg)
            # Fixed: identity comparison with None (`avg is None`).
            if avg is None:
                # Add for each label non pooled stats
                ret_df.loc[:, 'precision'] = avg_pre
                ret_df.loc[:, 'recall'] = avg_rec
                ret_df.loc[:, 'f1'] = avg_f1
            else:
                # Append macro and micro results in last 2 rows.
                # Fixed: DataFrame.append was removed in pandas 2.0 — use
                # pd.concat with a one-row frame instead.
                ret_df = pd.concat([
                    ret_df,
                    pd.DataFrame([{
                        'label': avg,
                        'precision': avg_pre,
                        'recall': avg_rec,
                        'f1': avg_f1,
                    }])
                ],
                                   ignore_index=True)
    return ret_df
def train(self, config, **kwargs):
    """Trains a given model specified in the config file or passed as the --model parameter.
    All options in the config file can be overwritten as needed by passing --PARAM
    Options with variable lengths ( e.g., kwargs can be passed by --PARAM '{"PARAM1":VAR1, "PARAM2":VAR2}'

    :param config: yaml config file
    :param **kwargs: parameters to overwrite yaml config
    :return: path of the output directory holding checkpoints and logs
    """
    config_parameters = utils.parse_config_or_kwargs(config, **kwargs)
    # NOTE(review): '%m' at the end repeats the month — '%S' (seconds) was
    # probably intended; confirm before changing, existing runs use this.
    outputdir = Path(
        config_parameters['outputpath'], config_parameters['model'],
        "{}_{}".format(
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%m'),
            uuid.uuid1().hex[:8]))
    # Early init because of creating dir
    checkpoint_handler = ModelCheckpoint(
        outputdir,
        'run',
        n_saved=1,
        require_empty=False,
        create_dir=True,
        score_function=lambda engine: -engine.state.metrics['Loss'],
        save_as_state_dict=False,
        score_name='loss')
    logger = utils.getfile_outlogger(Path(outputdir, 'train.log'))
    logger.info("Storing files in {}".format(outputdir))
    # utils.pprint_dict
    utils.pprint_dict(config_parameters, logger.info)
    logger.info("Running on device {}".format(DEVICE))
    labels_df = pd.read_csv(config_parameters['trainlabel'], sep=' ')
    # Fit a fresh label encoder on the training labels.
    labels_df['encoded'], encoder = utils.encode_labels(
        labels=labels_df['bintype'])
    train_df, cv_df = utils.split_train_cv(labels_df)

    transform = utils.parse_transforms(config_parameters['transforms'])
    utils.pprint_dict({'Classes': encoder.classes_},
                      logger.info,
                      formatter='pretty')
    utils.pprint_dict(transform, logger.info, formatter='pretty')
    if 'sampler' in config_parameters and config_parameters[
            'sampler'] == 'MinimumOccupancySampler':
        # Asserts that each "batch" contains at least one instance
        train_sampler = dataset.MinimumOccupancySampler(
            np.stack(train_df['encoded'].values))
        sampling_kwargs = {"sampler": train_sampler, "shuffle": False}
    elif 'shuffle' in config_parameters and config_parameters['shuffle']:
        sampling_kwargs = {"shuffle": True}
    else:
        sampling_kwargs = {"shuffle": False}
    logger.info("Using Sampler {}".format(sampling_kwargs))
    colname = config_parameters.get('colname', ('filename', 'encoded'))  #
    trainloader = dataset.getdataloader(
        train_df,
        config_parameters['traindata'],
        transform=transform,
        batch_size=config_parameters['batch_size'],
        colname=colname,  # For other datasets with different key names
        num_workers=config_parameters['num_workers'],
        **sampling_kwargs)
    cvdataloader = dataset.getdataloader(
        cv_df,
        config_parameters['traindata'],
        transform=None,
        shuffle=False,
        colname=colname,  # For other datasets with different key names
        batch_size=config_parameters['batch_size'],
        num_workers=config_parameters['num_workers'])
    if 'pretrained' in config_parameters and config_parameters[
            'pretrained'] is not None:
        model = models.load_pretrained(config_parameters['pretrained'],
                                       outputdim=len(encoder.classes_))
    else:
        # NOTE(review): the getattr default is the *string* 'LightCNN';
        # if the configured model name is missing, calling it raises
        # TypeError — models.LightCNN was probably intended.
        model = getattr(models, config_parameters['model'],
                        'LightCNN')(inputdim=trainloader.dataset.datadim,
                                    outputdim=len(encoder.classes_),
                                    **config_parameters['model_args'])
    if config_parameters['optimizer'] == 'AdaBound':
        try:
            import adabound
            optimizer = adabound.AdaBound(
                model.parameters(), **config_parameters['optimizer_args'])
        except ImportError:
            # NOTE(review): after this fallback `optimizer` is never
            # assigned — the else branch below is not re-entered, so a
            # NameError will occur when `optimizer` is first used. Confirm
            # and construct the Adam optimizer here.
            logger.info(
                "Adabound package not found, install via pip install adabound. Using Adam instead"
            )
            config_parameters['optimizer'] = 'Adam'
            config_parameters['optimizer_args'] = {
            }  # Default adam is adabount not found
    else:
        optimizer = getattr(
            torch.optim,
            config_parameters['optimizer'],
        )(model.parameters(), **config_parameters['optimizer_args'])
    utils.pprint_dict(optimizer, logger.info, formatter='pretty')
    utils.pprint_dict(model, logger.info, formatter='pretty')
    if DEVICE.type != 'cpu' and torch.cuda.device_count() > 1:
        logger.info("Using {} GPUs!".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    criterion = torch.nn.CrossEntropyLoss().to(DEVICE)
    model = model.to(DEVICE)
    precision = Precision()
    recall = Recall()
    f1_score = (precision * recall * 2 / (precision + recall)).mean()
    metrics = {
        'Loss': Loss(criterion),
        'Precision': precision.mean(),
        'Recall': recall.mean(),
        'Accuracy': Accuracy(),
        'F1': f1_score,
    }

    # batch contains 3 elements, X,Y and filename. Filename is only used
    # during evaluation
    def _prep_batch(batch, device=DEVICE, non_blocking=False):
        x, y, _ = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    train_engine = create_supervised_trainer(model,
                                             optimizer=optimizer,
                                             loss_fn=criterion,
                                             prepare_batch=_prep_batch,
                                             device=DEVICE)
    inference_engine = create_supervised_evaluator(
        model, metrics=metrics, prepare_batch=_prep_batch, device=DEVICE)
    RunningAverage(output_transform=lambda x: x).attach(
        train_engine, 'run_loss')
    # Showing progressbar during training
    pbar = ProgressBar(persist=False)
    pbar.attach(train_engine, ['run_loss'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=3,
                                                           factor=0.1)

    @inference_engine.on(Events.COMPLETED)
    def update_reduce_on_plateau(engine):
        # Step the LR scheduler on validation loss after each evaluation.
        val_loss = engine.state.metrics['Loss']
        if 'ReduceLROnPlateau' == scheduler.__class__.__name__:
            scheduler.step(val_loss)
        else:
            scheduler.step()

    early_stop_handler = EarlyStopping(
        patience=5,
        score_function=lambda engine: -engine.state.metrics['Loss'],
        trainer=train_engine)
    inference_engine.add_event_handler(Events.EPOCH_COMPLETED,
                                       early_stop_handler)
    inference_engine.add_event_handler(Events.EPOCH_COMPLETED,
                                       checkpoint_handler, {
                                           'model': model,
                                           'encoder': encoder,
                                           'config': config_parameters,
                                       })

    @train_engine.on(Events.EPOCH_COMPLETED)
    def compute_validation_metrics(engine):
        # Run validation after every training epoch and log all metrics.
        inference_engine.run(cvdataloader)
        results = inference_engine.state.metrics
        output_str_list = [
            "Validation Results - Epoch : {:<5}".format(engine.state.epoch)
        ]
        for metric in metrics:
            output_str_list.append("{} {:<5.3f}".format(
                metric, results[metric]))
        logger.info(" ".join(output_str_list))
        # Reset the progress bar counter for the next epoch.
        pbar.n = pbar.last_print_n = 0

    train_engine.run(trainloader, max_epochs=config_parameters['epochs'])
    return outputdir
def train(self, config, **kwargs):
    """Trains a given model specified in the config file or passed as the --model parameter.
    All options in the config file can be overwritten as needed by passing --PARAM
    Options with variable lengths ( e.g., kwargs can be passed by --PARAM '{"PARAM1":VAR1, "PARAM2":VAR2}'

    :param config: yaml config file
    :param **kwargs: parameters to overwrite yaml config
    :return: path to the output directory containing logs, encoder, config and checkpoints
    """
    config_parameters = utils.parse_config_or_kwargs(config, **kwargs)
    outputdir = os.path.join(
        config_parameters['outputpath'], config_parameters['model'],
        # Fixed format code: was '%H-%M-%m', repeating the month instead of seconds.
        "{}_{}".format(
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
            uuid.uuid1().hex))
    # Create base dir
    Path(outputdir).mkdir(exist_ok=True, parents=True)

    logger = utils.getfile_outlogger(os.path.join(outputdir, 'train.log'))
    logger.info("Storing files in {}".format(outputdir))
    # utils.pprint_dict
    utils.pprint_dict(config_parameters, logger.info)
    logger.info("Running on device {}".format(DEVICE))
    # Raw string avoids the invalid '\s' escape warning; sep is a regex.
    labels_df = pd.read_csv(config_parameters['label'],
                            sep=r'\s+').convert_dtypes()
    # In case of ave dataset where index is int, we change the
    # absolute name to relname
    if not np.all(labels_df['filename'].str.isnumeric()):
        labels_df.loc[:, 'filename'] = labels_df['filename'].apply(
            os.path.basename)
    encoder = utils.train_labelencoder(labels=labels_df['event_labels'])
    # These labels are useless, only for mode == stratified
    label_array, _ = utils.encode_labels(labels_df['event_labels'], encoder)
    if 'cv_label' in config_parameters:
        cv_df = pd.read_csv(config_parameters['cv_label'],
                            sep=r'\s+').convert_dtypes()
        if not np.all(cv_df['filename'].str.isnumeric()):
            cv_df.loc[:, 'filename'] = cv_df['filename'].apply(
                os.path.basename)
        train_df = labels_df
        logger.info(
            f"Using CV labels from {config_parameters['cv_label']}")
    else:
        train_df, cv_df = utils.split_train_cv(
            labels_df, y=label_array, **config_parameters['data_args'])

    if 'cv_data' in config_parameters:
        cv_data = config_parameters['cv_data']
        logger.info(f"Using CV data {config_parameters['cv_data']}")
    else:
        cv_data = config_parameters['data']

    train_label_array, _ = utils.encode_labels(train_df['event_labels'],
                                               encoder)
    cv_label_array, _ = utils.encode_labels(cv_df['event_labels'], encoder)

    transform = utils.parse_transforms(config_parameters['transforms'])
    utils.pprint_dict({'Classes': encoder.classes_},
                      logger.info,
                      formatter='pretty')
    # Persist encoder + config so score/evaluate can reload them later.
    torch.save(encoder, os.path.join(outputdir, 'run_encoder.pth'))
    torch.save(config_parameters, os.path.join(outputdir, 'run_config.pth'))
    logger.info("Transforms:")
    utils.pprint_dict(transform, logger.info, formatter='pretty')
    # For Unbalanced Audioset, this is true
    if 'sampler' in config_parameters and config_parameters[
            'sampler'] == 'MultiBalancedSampler':
        # Training sampler that oversamples the dataset to be roughly equally sized
        # Calculates mean over multiple instances, rather useful when number of classes
        # is large
        train_sampler = dataset.MultiBalancedSampler(
            train_label_array,
            num_samples=1 * train_label_array.shape[0],
            replacement=True)
        sampling_kwargs = {"shuffle": False, "sampler": train_sampler}
    elif 'sampler' in config_parameters and config_parameters[
            'sampler'] == 'MinimumOccupancySampler':
        # Asserts that each "batch" contains at least one instance
        train_sampler = dataset.MinimumOccupancySampler(
            train_label_array, sampling_mode='same')
        sampling_kwargs = {"shuffle": False, "sampler": train_sampler}
    else:
        sampling_kwargs = {"shuffle": True}

    logger.info("Using Sampler {}".format(sampling_kwargs))

    trainloader = dataset.getdataloader(
        {
            'filename': train_df['filename'].values,
            'encoded': train_label_array
        },
        config_parameters['data'],
        transform=transform,
        batch_size=config_parameters['batch_size'],
        colname=config_parameters['colname'],
        num_workers=config_parameters['num_workers'],
        **sampling_kwargs)

    cvdataloader = dataset.getdataloader(
        {
            'filename': cv_df['filename'].values,
            'encoded': cv_label_array
        },
        cv_data,
        transform=None,
        shuffle=False,
        colname=config_parameters['colname'],
        batch_size=config_parameters['batch_size'],
        num_workers=config_parameters['num_workers'])
    # Fixed fallback: previously the *string* 'CRNN' was the getattr default,
    # which is not callable and would raise TypeError instead of falling back.
    model = getattr(models, config_parameters['model'],
                    models.CRNN)(inputdim=trainloader.dataset.datadim,
                                 outputdim=len(encoder.classes_),
                                 **config_parameters['model_args'])
    if 'pretrained' in config_parameters and config_parameters[
            'pretrained'] is not None:
        models.load_pretrained(model,
                               config_parameters['pretrained'],
                               outputdim=len(encoder.classes_))
        logger.info("Loading pretrained model {}".format(
            config_parameters['pretrained']))

    model = model.to(DEVICE)
    # Fixed: on ImportError the original left `optimizer` unassigned (NameError
    # at first use). Now the Adam fallback is actually constructed.
    optimizer = None
    if config_parameters['optimizer'] == 'AdaBound':
        try:
            import adabound
            optimizer = adabound.AdaBound(
                model.parameters(), **config_parameters['optimizer_args'])
        except ImportError:
            logger.info("AdaBound not installed, falling back to Adam")
            config_parameters['optimizer'] = 'Adam'
            config_parameters['optimizer_args'] = {}
    if optimizer is None:
        optimizer = getattr(
            torch.optim,
            config_parameters['optimizer'],
        )(model.parameters(), **config_parameters['optimizer_args'])

    utils.pprint_dict(optimizer, logger.info, formatter='pretty')
    utils.pprint_dict(model, logger.info, formatter='pretty')
    if DEVICE.type != 'cpu' and torch.cuda.device_count() > 1:
        logger.info("Using {} GPUs!".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    criterion = getattr(losses, config_parameters['loss'])().to(DEVICE)

    def _train_batch(_, batch):
        # One optimization step; returns the scalar loss for logging.
        model.train()
        with torch.enable_grad():
            optimizer.zero_grad()
            output = self._forward(
                model, batch)  # output is tuple (clip, frame, target)
            loss = criterion(*output)
            loss.backward()
            # Single loss
            optimizer.step()
            return loss.item()

    def _inference(_, batch):
        # Forward pass only; metrics consume the raw (clip, frame, target) tuple.
        model.eval()
        with torch.no_grad():
            return self._forward(model, batch)

    def thresholded_output_transform(output):
        # Output is (clip, frame, target)
        y_pred, _, y = output
        y_pred = torch.round(y_pred)
        return y_pred, y

    precision = Precision(thresholded_output_transform, average=False)
    recall = Recall(thresholded_output_transform, average=False)
    f1_score = (precision * recall * 2 / (precision + recall)).mean()
    metrics = {
        'Loss': losses.Loss(
            criterion),  #reimplementation of Loss, supports 3 way loss
        'Precision': Precision(thresholded_output_transform),
        'Recall': Recall(thresholded_output_transform),
        'Accuracy': Accuracy(thresholded_output_transform),
        'F1': f1_score,
    }
    train_engine = Engine(_train_batch)
    inference_engine = Engine(_inference)
    for name, metric in metrics.items():
        metric.attach(inference_engine, name)

    def compute_metrics(engine):
        # Run a full validation pass and log all attached metrics.
        inference_engine.run(cvdataloader)
        results = inference_engine.state.metrics
        output_str_list = [
            "Validation Results - Epoch : {:<5}".format(engine.state.epoch)
        ]
        for metric in metrics:
            output_str_list.append("{} {:<5.2f}".format(
                metric, results[metric]))
        logger.info(" ".join(output_str_list))

    pbar = ProgressBar(persist=False)
    pbar.attach(train_engine)

    if 'itercv' in config_parameters and config_parameters[
            'itercv'] is not None:
        train_engine.add_event_handler(
            Events.ITERATION_COMPLETED(every=config_parameters['itercv']),
            compute_metrics)
    train_engine.add_event_handler(Events.EPOCH_COMPLETED, compute_metrics)

    # Default scheduler is using patience=3, factor=0.1
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, **config_parameters['scheduler_args'])

    @inference_engine.on(Events.EPOCH_COMPLETED)
    def update_reduce_on_plateau(engine):
        logger.info(f"Scheduling epoch {engine.state.epoch}")
        val_loss = engine.state.metrics['Loss']
        if 'ReduceLROnPlateau' == scheduler.__class__.__name__:
            scheduler.step(val_loss)
        else:
            scheduler.step()

    early_stop_handler = EarlyStopping(
        patience=config_parameters['early_stop'],
        score_function=self._negative_loss,
        trainer=train_engine)
    inference_engine.add_event_handler(Events.EPOCH_COMPLETED,
                                       early_stop_handler)
    if config_parameters['save'] == 'everyepoch':
        checkpoint_handler = ModelCheckpoint(outputdir,
                                             'run',
                                             n_saved=5,
                                             require_empty=False)
        train_engine.add_event_handler(Events.EPOCH_COMPLETED,
                                       checkpoint_handler, {
                                           'model': model,
                                       })
        # Fixed: guard itercv like above — the original would crash with
        # every=None when 'itercv' was unset while save == 'everyepoch'.
        if 'itercv' in config_parameters and config_parameters[
                'itercv'] is not None:
            train_engine.add_event_handler(
                Events.ITERATION_COMPLETED(
                    every=config_parameters['itercv']),
                checkpoint_handler, {
                    'model': model,
                })
    else:
        checkpoint_handler = ModelCheckpoint(
            outputdir,
            'run',
            n_saved=1,
            require_empty=False,
            score_function=self._negative_loss,
            global_step_transform=global_step_from_engine(
                train_engine),  # Just so that model is saved with epoch...
            score_name='loss')
        inference_engine.add_event_handler(Events.EPOCH_COMPLETED,
                                           checkpoint_handler, {
                                               'model': model,
                                           })

    train_engine.run(trainloader, max_epochs=config_parameters['epochs'])
    return outputdir
# NOTE(review): fragment of a larger evaluation script — `cur_author`, `pids`,
# `labels`, `rawFeature`, `tripletFeature`, `lc_emb`, `name` and the helper
# functions are defined/initialized before this excerpt; verify against the
# full file.
rf = []          # raw features, one entry per accepted paper id
tf = []          # triplet features, aligned with rf
attentionf = []  # local-context embeddings, aligned with rf
for aid in cur_author:
    # Skip authors with fewer than 5 papers — too small to cluster meaningfully.
    if len(cur_author[aid]) < 5:
        continue
    for pid in cur_author[aid]:
        pids.append(pid)
        labels.append(aid)  # ground-truth label = author id
        rf.append(rawFeature.get(pid))
        tf.append(tripletFeature.get(pid))
        attentionf.append(lc_emb.get(pid))

# Rebind `labels` to its integer-encoded form; `clusterTest` below closes over
# this encoded version, so the ordering of these statements matters.
labels = encode_labels(labels)
numberofLabels = len(set(labels))

def clusterTest(embedding, numberofLabels):
    # Cluster `embedding` into `numberofLabels` clusters and score the
    # prediction against the closed-over ground-truth `labels`.
    # Returns [precision, recall, f1] as a list.
    clusters_pred = clustering(embedding, num_clusters=numberofLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
    return [prec, rec, f1]

# t-SNE visualisations of each feature space, written under PIC_DIR/FINALResult.
tSNEAnanlyse(rf, labels, join(settings.PIC_DIR, "FINALResult", "%s_rawFeature.png" % (name)))
tSNEAnanlyse(tf, labels, join(settings.PIC_DIR, "FINALResult", "%s_tripletFeature.png" % (name)))
tSNEAnanlyse(attentionf, labels, join(settings.PIC_DIR, "FINALResult", "%s_lcmbFeature.png" % (name)))

# Collect clustering scores per feature space (only 'rawfeature' is visible
# in this excerpt; the dict presumably gains more keys below).
Res = {}
Res['rawfeature'] = clusterTest(rf, numberofLabels=numberofLabels)
def evaluate_tagging(self,
                     experiment_path: str,
                     tag_file='tagging_predictions_{}.txt',
                     **kwargs):
    """Evaluates clip-level tagging (mAP / AuC) of a trained experiment.

    :param experiment_path: directory of a trained run, or a direct path to
        a saved model file (then its parent directory is used for configs)
    :param tag_file: output filename template, formatted with the label
        file's basename
    :param **kwargs: parameters overriding the stored run config
    """
    exppath = Path(experiment_path)
    if exppath.is_file():  # Best model passed!
        model_parameters = torch.load(
            str(exppath), map_location=lambda storage, loc: storage)
        experiment_path = exppath.parent  # Just set upper path as default
    else:
        model_parameters = torch.load(
            glob.glob("{}/run_model*".format(experiment_path))[0],
            map_location=lambda storage, loc: storage)
    config = torch.load(glob.glob(
        "{}/run_config*".format(experiment_path))[0],
                        map_location=lambda storage, loc: storage)
    logger = utils.getfile_outlogger(None)
    # Use previous config, but update data such as kwargs
    config_parameters = dict(config, **kwargs)
    # Default columns to search for in data
    config_parameters.setdefault('colname', ('filename', 'encoded'))
    encoder = torch.load(glob.glob(
        '{}/run_encoder*'.format(experiment_path))[0],
                         map_location=lambda storage, loc: storage)
    test_data_filename = os.path.splitext(
        os.path.basename(config_parameters['label']))[0]
    # Raw string avoids the invalid '\s' escape warning; sep is a regex.
    strong_labels_df = pd.read_csv(config_parameters['label'], sep=r'\s+')
    # Fixed: this guard ran *after* groupby('event_label') in the original, so
    # a training-style file would die with a KeyError before ever reaching it.
    # Raise explicitly instead of `assert` (asserts are stripped under -O).
    if "event_labels" in strong_labels_df.columns:
        raise ValueError(
            "Data with the column event_labels are used to train not to evaluate"
        )
    # Evaluation is done via the filenames, not full paths
    if not np.issubdtype(strong_labels_df['filename'].dtype, np.number):
        strong_labels_df['filename'] = strong_labels_df['filename'].apply(
            os.path.basename)
    if 'audiofilepath' in strong_labels_df.columns:
        # In case of ave dataset, the audiofilepath column is the main column
        strong_labels_df['audiofilepath'] = strong_labels_df[
            'audiofilepath'].apply(os.path.basename)
        colname = 'audiofilepath'  # AVE
    else:
        colname = 'filename'  # Dcase etc.
    # Collapse strong (per-segment) labels into one weak label tuple per file.
    weak_labels_df = strong_labels_df.groupby(
        colname)['event_label'].unique().apply(
            tuple).to_frame().reset_index()
    weak_labels_array, encoder = utils.encode_labels(
        labels=weak_labels_df['event_label'], encoder=encoder)
    # assert (weak_labels_df['encoded'].apply(lambda x: sum(x)) >
    # 0).all(), "No targets found, is the encoder maybe not right?"
    for k, v in config_parameters.items():
        logger.info(f"{k}:{v}")
    dataloader = dataset.getdataloader(
        {
            # Fixed: the grouped frame's key column is `colname` — indexing
            # 'filename' raised KeyError for AVE data (colname='audiofilepath').
            'filename': weak_labels_df[colname].values,
            'encoded': weak_labels_array
        },
        config_parameters['data'],
        batch_size=1,
        shuffle=False,
        colname=config_parameters[
            'colname'],  # For other datasets with different key names
        num_workers=3,
    )
    model = getattr(models, config_parameters['model'])(
        inputdim=dataloader.dataset.datadim,
        outputdim=len(encoder.classes_),
        **config_parameters['model_args'])
    model.load_state_dict(model_parameters)
    model = model.to(DEVICE).eval()
    y_pred, y_true = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, unit='file', leave=False):
            _, target, filenames = batch
            clip_pred, _, _ = self._forward(model, batch)
            clip_pred = clip_pred.cpu().detach().numpy()
            y_pred.append(clip_pred)
            y_true.append(target.numpy())
    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    # nan_to_num: classes absent from the test set yield NaN scores.
    mAP = np.nan_to_num(metrics.mAP(y_true, y_pred))
    auc = np.nan_to_num(metrics.roc(y_true, y_pred))
    with open(
            os.path.join(experiment_path,
                         tag_file.format(test_data_filename)), 'w') as wp:
        print(f"mAP:{mAP.mean():.3f}", file=wp)
        print(f"mAP:\n{mAP.mean():.3f}")
        print(f"AuC:{auc.mean():.3f}", file=wp)
        print(f"AuC:\n{auc.mean():.3f}")