def _get_shared_data():  # pragma: no cover
    """Multiprocessing helper: reconstruct numpy arrays from the shared
    ``RawArray`` buffers stored in the module-global ``mp_params``.

    Intended to run inside worker processes, where ``mp_params`` has been
    populated by the pool initializer.

    Returns:
        tuple: ``(s, s_counts, prune_count_matrix, psx, multi_label)`` where
            - ``s`` is the label array; when ``multi_label`` is truthy the
              shared buffer is a one-hot matrix of shape
              ``(psx.shape[0], psx.shape[1])`` and is converted back to
              per-example label lists via ``onehot2int``,
            - ``s_counts`` are the per-class label counts,
            - ``prune_count_matrix`` has shape ``mp_params['pcm_shape']``,
            - ``psx`` are predicted probabilities (float32,
              shape ``mp_params['psx_shape']``),
            - ``multi_label`` is the flag read from ``mp_params``.
    """
    s_counts = _to_np_array(mp_params['s_counts'])
    prune_count_matrix = _to_np_array(
        mp_arr=mp_params['prune_count_matrix'],
        shape=mp_params['pcm_shape'],
    )
    psx = _to_np_array(
        mp_arr=mp_params['psx'],
        dtype='float32',
        shape=mp_params['psx_shape'],
    )
    multi_label = mp_params['multi_label']
    if multi_label:
        # Shared data is passed as a one-hot encoded matrix; decode it back
        # to lists of integer labels. (Debug prints removed — they polluted
        # worker-process stdout.)
        s = onehot2int(
            _to_np_array(
                mp_arr=mp_params['s'],
                shape=(psx.shape[0], psx.shape[1]),
            ))
    else:
        s = _to_np_array(mp_params['s'])
    return s, s_counts, prune_count_matrix, psx, multi_label
def find_noisy_labels(prediction, labels, out, multi_label):
    """Detect likely label errors by comparing model predictions to labels.

    Reads two CSVs (each with an ``Images`` column followed by per-class
    columns), aligns them by image name, binarizes the ground-truth labels at
    0.5, and runs cleanlab's confident-learning pruning to rank suspected
    label errors.

    Args:
        prediction: Path to a CSV of predicted probabilities per class.
        labels: Path to a CSV of ground-truth labels per class.
        out: Output directory for result CSVs (created if missing).
        multi_label: If truthy, treat rows as multi-label and write a single
            ``noise_labels_multi.csv``; otherwise run one binary noise check
            per class column and write one ``<class>.csv`` each.

    Raises:
        AssertionError: If the two CSVs have different row counts.
    """
    # Create the output folder if it does not exist.
    os.makedirs(out, exist_ok=True)
    # Sort both files by image name so rows line up positionally.
    pred_df = pd.read_csv(prediction)
    pred_df = pred_df.sort_values(by=['Images'])
    label_df = pd.read_csv(labels)
    label_df = label_df.sort_values(by=['Images'])
    # Binarize soft/uncertain labels at the 0.5 threshold.
    label_df[list(label_df)[1:]] = 1 * (label_df[list(label_df)[1:]] >= 0.5)
    assert len(pred_df) == len(
        label_df), "Mismatch between predictions and labels"
    psx = np.array(pred_df[list(pred_df)[1:]])
    _labels = np.array(label_df[list(label_df)[1:]])
    if multi_label:
        # cleanlab's multi-label mode expects lists of int labels per row.
        correctly_formatted_labels = onehot2int(_labels)
        label_errors_bool = cleanlab.pruning.get_noise_indices(
            s=correctly_formatted_labels,
            psx=psx,
            prune_method='prune_by_noise_rate',
            sorted_index_method=None,
            multi_label=True)
        label_errors_idx = cleanlab.pruning.order_label_errors(
            label_errors_bool=label_errors_bool,
            psx=psx,
            labels=correctly_formatted_labels,
            sorted_index_method='normalized_margin')
        nb_dict = {
            'Images': list(),
            'Labels': list(),
            'Prediction': list(),
            'Indices': list()
        }
        for i in label_errors_idx:
            nb_dict['Images'].append(label_df.iloc[i]['Images'])
            nb_dict['Labels'].append(','.join(list(map(str, _labels[i, :]))))
            nb_dict['Prediction'].append(','.join(list(map(str, psx[i, :]))))
            nb_dict['Indices'].append(i)
        nb_df = pd.DataFrame(nb_dict)
        # BUGFIX: write into the output folder like the per-class branch does,
        # instead of the current working directory.
        nb_df.to_csv(os.path.join(out, 'noise_labels_multi.csv'), index=False)
    else:
        # One independent binary noise check per class column.
        header = list(pred_df.keys())[1:]
        for t in range(len(header)):
            pred_t = np.array(pred_df[header[t]])
            disease = header[t]
            label_t = np.array(label_df[disease])
            # Expand per-class probability p into the two-column [1-p, p]
            # form cleanlab expects for binary problems (vectorized).
            binary = np.stack([1 - pred_t, pred_t], axis=1)
            label_errors_bool = cleanlab.pruning.get_noise_indices(
                s=copy.deepcopy(label_t),
                psx=binary,
                prune_method='prune_by_noise_rate',
                sorted_index_method=None)
            label_errors_idx = cleanlab.pruning.order_label_errors(
                label_errors_bool=label_errors_bool,
                psx=binary,
                labels=copy.deepcopy(label_t),
                sorted_index_method='normalized_margin')
            nb_dict = {
                'Images': list(),
                header[t]: list(),
                'Prob': list(),
                'Indices': list()
            }
            for i in label_errors_idx:
                nb_dict['Images'].append(label_df.iloc[i]['Images'])
                nb_dict[header[t]].append(label_t[i])
                nb_dict['Prob'].append(float(pred_t[i]))
                nb_dict['Indices'].append(i)
            nb_df = pd.DataFrame(nb_dict)
            nb_df.to_csv(os.path.join(out, '{}.csv'.format(header[t])),
                         index=False)