def __init__(self, base_work_dir, descriptor):
    """
    Set up a fresh IQR session with its own working directory.

    :param base_work_dir: Root directory under which this session's working
        files are created.
    :type base_work_dir: str

    :param descriptor: FeatureDescriptor whose data files back this
        session's feature memory.
    :type descriptor: masir.search.FeatureDescriptor.FeatureDescriptor

    """
    self._session_lock = multiprocessing.RLock()

    self.uuid = uuid.uuid1()

    # Adjudication state starts out empty.
    self.positive_ids = set()
    self.negative_ids = set()

    # Session files live under <base_work_dir>/iqr/session-<uuid>, with the
    # base directory user-expanded and made absolute.
    base_dir = osp.abspath(osp.expanduser(base_work_dir))
    session_dir_name = 'session-%s' % str(self.uuid)
    self.work_dir = osp.join(base_dir, 'iqr', session_dir_name)
    if not osp.isdir(self.work_dir):
        os.makedirs(self.work_dir)

    #: :type: masir.search.FeatureDescriptor.FeatureDescriptor
    self.descriptor = descriptor

    # Feature memory is loaded from the four files named by the descriptor.
    # noinspection PyTypeChecker
    self.feature_memory = FeatureMemory.construct_from_files(
        descriptor.ids_file,
        descriptor.bg_flags_file,
        descriptor.feature_data_file,
        descriptor.kernel_data_file
    )

    # Remember the background clip IDs of the freshly loaded feature memory.
    # This reads a protected attribute and no copy is made here.
    # noinspection PyProtectedMember
    self._original_fm_bgid_set = self.feature_memory._bg_clip_ids

    # Clip ID -> probability of that clip being associated with the positive
    # adjudications. Stays None until a refinement has been performed.
    #: :type: None or dict of (int, float)
    self.results = None

    # Parameter string used when training the SVM.
    self.svm_train_params = '-q -t 4 -b 1 -w1 50 -c 100'

    # Ingest where extension images are placed. The manager's second
    # argument is one past the largest ID currently in the feature memory
    # (presumably the first ID available to new images -- confirm against
    # IngestManager).
    extension_ingest_dir = osp.join(self.work_dir, "extension_ingest")
    next_free_id = self.feature_memory.get_ids().max() + 1
    self.extension_ingest = IngestManager(extension_ingest_dir, next_free_id)
def reset(self):
    """
    Return this IQR session to its initial search state.

    Drops all positive and negative adjudications, reloads the feature
    memory from the descriptor's files, discards any results and empties
    the session working directory. Everything happens while holding the
    session lock.

    """
    with self._session_lock:
        # Forget every adjudication made so far.
        for adjudicated_ids in (self.positive_ids, self.negative_ids):
            adjudicated_ids.clear()

        # Reload the original feature data from the descriptor's files.
        source = self.descriptor
        # noinspection PyUnresolvedReferences
        self.feature_memory = FeatureMemory.construct_from_files(
            source.ids_file,
            source.bg_flags_file,
            source.feature_data_file,
            source.kernel_data_file
        )

        self.results = None

        # Empty the working directory by removing the whole tree and then
        # recreating the (now empty) directory.
        # NOTE(review): this removes everything stored under work_dir;
        # objects still pointing into it are not refreshed here -- confirm
        # that is intended.
        session_dir = self.work_dir
        shutil.rmtree(session_dir)
        os.makedirs(session_dir)
def _masir_cv_fold_slices(n_items, k_folds):
    """
    Partition ``n_items`` sequence positions into ``k_folds`` contiguous
    slices.

    Interior boundaries are ``int(n_items / float(k_folds) * f)`` for
    ``f = 1 .. k_folds - 1``; the first slice starts at 0 and the last slice
    ends at ``n_items``. Slices may be empty when ``n_items < k_folds``.

    :param n_items: Number of items to partition.
    :type n_items: int

    :param k_folds: Number of slices to produce (must be >= 1).
    :type k_folds: int

    :return: One slice per fold, in fold order.
    :rtype: tuple of slice

    """
    interval = n_items / float(k_folds)
    bounds = [0]
    bounds.extend(int(interval * f) for f in range(1, k_folds))
    bounds.append(n_items)
    return tuple(slice(bounds[f], bounds[f + 1]) for f in range(k_folds))


def masir_svm_cross_validate(k_folds, parameter_sets, fm, positive_ids,
                             negative_ids, metric='ap_prime'):
    """
    Perform K-fold cross validation over the given data and positive set.

    Only uses one fold for testing per fold iteration (K1 CV). For every
    parameter set, an SVM model is trained on the training folds, applied to
    the testing fold, and the chosen precision metric is averaged over all
    folds. The parameter set with the highest average is returned; on a tie
    the earliest such parameter set wins.

    :param k_folds: Number of folds to perform (converted with ``int``).
    :type k_folds: int

    :param parameter_sets: Iterable of parameter strings for libSVM. Any
        iterable is accepted; it is copied into a list internally.
    :type parameter_sets: Iterable of str

    :param fm: Dataset feature memory object
    :type fm: masir.FeatureMemory.FeatureMemory

    :param positive_ids: Iterable of positive image IDs in given dataset.
        Copied into a list internally; fold membership follows its order.
    :type positive_ids: Iterable of int

    :param negative_ids: Iterable of negative image IDs in given dataset.
        Copied into a list internally; fold membership follows its order.
    :type negative_ids: Iterable of int

    :param metric: Average precision metric flavor to use. Must be one of:
        [ "ap_prime", "ap", "R0_star" ]. A false-y value falls back to
        "ap_prime".
    :type metric: str

    :raises AssertionError: If an ID is present in both the positive and
        negative ID collections.
    :raises ValueError: If ``k_folds`` is less than 1 or no parameter sets
        were given.

    :return: Optimal parameter set
    :rtype: str

    """
    log = logging.getLogger("masir_SVM_CV")

    # The inputs are measured, sliced, indexed and traversed more than once
    # below, so copy them into lists up front. This lets callers pass any
    # iterable (generators, sets, arrays) as the documentation promises.
    parameter_sets = list(parameter_sets)
    positive_ids = list(positive_ids)
    negative_ids = list(negative_ids)

    # Input checks
    assert not set(positive_ids).intersection(negative_ids), \
        "Common IDs in positive and negative ID sets!"

    k_folds = int(k_folds)
    if k_folds < 1:
        raise ValueError("k_folds must be at least 1 (given %d)" % k_folds)
    if not parameter_sets:
        raise ValueError("No parameter sets given to cross validate over")

    #
    # Partition the pos/neg IDs into k slices for positive and negative IDs
    #
    fold_index = range(k_folds)
    pos_fold_slices = _masir_cv_fold_slices(len(positive_ids), k_folds)
    neg_fold_slices = _masir_cv_fold_slices(len(negative_ids), k_folds)
    log.debug("Pos fold slices: %s", pos_fold_slices)
    log.debug("Neg fold slices: %s", neg_fold_slices)

    #
    # CV vars
    #
    # Collection of average CV k-fold precisions per parameter set tested,
    # parallel to ``parameter_sets``.
    #: :type: list of float
    p_set_avg_precision = []

    # Average precision metric flavor to use
    # must be one of: [ "ap_prime", "ap", "R0_star" ]
    metric = metric or "ap_prime"

    #
    # Train, test and score for each parameter set
    #
    for param_set in parameter_sets:
        # For each fold, create an SVM model with the training folds, test
        # on the testing fold, then compute average precision.
        msg = "===== Parameters: %s " % param_set
        log.info(msg + '=' * (80 - len(msg)))

        # List entries parallel to fold range
        #: :type: list of dict
        fold_results = []

        for test_fold in fold_index:
            msg = '---------- Fold %d ' % test_fold
            log.info(msg + '-' * (80 - len(msg)))

            train_folds = tuple(i for i in fold_index if i != test_fold)

            # Train positive/negative IDs: everything outside the test fold.
            # Fresh sets are built for every model so nothing is shared
            # between the FeatureMemory objects created below.
            fold_train_positive_ids = set(
                uid
                for fidx in train_folds
                for uid in positive_ids[pos_fold_slices[fidx]]
            )
            fold_train_negative_ids = set(
                uid
                for fidx in train_folds
                for uid in negative_ids[neg_fold_slices[fidx]]
            )

            # Test positive/negative IDs
            fold_test_positive_ids = \
                set(positive_ids[pos_fold_slices[test_fold]])
            fold_test_negative_ids = \
                set(negative_ids[neg_fold_slices[test_fold]])

            # FeatureMemory for this fold: same IDs, features and kernel as
            # the given dataset, but with the fold's training negatives as
            # its background set.
            fold_fm = FeatureMemory(fm.get_ids(), fold_train_negative_ids,
                                    fm.get_feature_matrix(),
                                    fm.get_kernel_matrix())
            fold_dk = fold_fm.get_distance_kernel()

            #
            # Training SVM model for current fold
            #
            # symmetric_submatrix call automatically includes the DK's BG
            # set, which was initialized above to be the fold's training
            # negatives.
            idx2id_map, idx_bg_flags, m = \
                fold_dk.symmetric_submatrix(*fold_train_positive_ids)
            # Label is True for every entry that is NOT flagged background.
            svm_train_labels = numpy.array(tuple(not b for b in idx_bg_flags))
            # Train the model with the current parameter set
            train_d = iqr_modules.iqr_model_train(m, svm_train_labels,
                                                  idx2id_map, param_set)
            fold_svm_model = train_d['model']
            fold_svm_svids = train_d['clipids_SVs']

            #
            # Model application to test fold
            #
            # Need testing kernel to include both testing positive and
            # negative IDs. Merging sets.
            fold_test_ids = set.union(fold_test_positive_ids,
                                      fold_test_negative_ids)
            # Rows are the model's support vector IDs, columns the test IDs.
            idx2id_row, idx2id_col, kernel_test = \
                fold_dk.extract_rows(fold_svm_svids, col_ids=fold_test_ids)

            # ``.A`` passes the kernel on as a plain array (assumes
            # kernel_test is a numpy matrix -- TODO confirm).
            test_d = iqr_modules.iqr_model_test(fold_svm_model, kernel_test.A,
                                                idx2id_col)

            # Rank test IDs by descending probability.
            ordered_results = sorted(zip(test_d['clipids'], test_d['probs']),
                                     key=lambda x: x[1], reverse=True)

            # Store ordered scores and label lists for precision calculation
            # later.
            fold_results.append({
                'scores': [prob for _, prob in ordered_results],
                'labels': [1 if (cid in fold_test_positive_ids) else 0
                           for cid, _ in ordered_results]
            })

        fold_precisions = perf_estimation.average_precision_R0(fold_results)
        # Average chosen precision metric across folds. Divide by a float so
        # the mean is never truncated by integer division.
        p_set_avg_precision.append(
            sum(fp[metric] for fp in fold_precisions) / float(k_folds)
        )

    # Choose the best performing parameter set. ``list`` keeps the logged
    # pairs readable where ``zip`` returns an iterator.
    log.debug("Parameter precisions:\n%s",
              list(zip(parameter_sets, p_set_avg_precision)))
    p_best_idx = p_set_avg_precision.index(max(p_set_avg_precision))
    p_best = parameter_sets[p_best_idx]
    log.info("Best chosen: %s", p_best)
    return p_best
if __name__ == '__main__':
    # MANUAL TESTING
    #
    # Runs a 5-fold cross validation over a fixed list of parameter strings
    # using the IDs found in ``positive_ids.txt`` / ``negative_ids.txt`` in
    # the current working directory, then prints the chosen parameter set.
    from masir.search.colordescriptor import ColorDescriptor
    import masir_config

    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)

    # TODO: Add random search for base-line comparison

    pos_ids = list(numpy.loadtxt("positive_ids.txt", dtype=int))
    neg_ids = list(numpy.loadtxt("negative_ids.txt", dtype=int))

    cd_csift = ColorDescriptor.CSIFT(masir_config.DIR_DATA,
                                     masir_config.DIR_WORK)
    fm = FeatureMemory.construct_from_descriptor(cd_csift)

    # Candidate parameter strings; only the ``-c`` value varies.
    parameter_set = [
        '-w1 50 -t 4 -b 1 -c 0.1',
        '-w1 50 -t 4 -b 1 -c 0.5',
        '-w1 50 -t 4 -b 1 -c 1',
        '-w1 50 -t 4 -b 1 -c 5',
        '-w1 50 -t 4 -b 1 -c 10',
        '-w1 50 -t 4 -b 1 -c 50',
        '-w1 50 -t 4 -b 1 -c 100',
        '-w1 50 -t 4 -b 1 -c 500',
    ]
    b = masir_svm_cross_validate(5, parameter_set, fm, pos_ids, neg_ids)

    # Single-argument parenthesised prints give the same output whether
    # ``print`` is a statement (Python 2) or a function (Python 3).
    print("")
    print("Best set: %s" % (b,))