def _prepare_classifiers(self):
    """Instantiate the base classifier and the feature selectors.

    Fills ``self.first_selector`` (applied once to the whole training
    matrix) and ``self.classifier_`` (the classifier template that is
    cloned for every substring; wrapped in a selection ``Pipeline``
    when feature selection is enabled).

    Returns
    -------
    self
    """
    if self.classifier_params is None:
        self.classifier_params = dict()
    self.classifier_name.set_params(**self.classifier_params)
    # BUG FIX: the original tested ``self.select_features is not None``,
    # so the default value ``select_features=False`` still entered the
    # selection branch and passed ``method=False`` to the selector.
    # Testing truthiness disables selection for both None and False,
    # while any selection-method name still enables it.
    if self.select_features:
        if self.selection_params is None:
            self.selection_params = {'nfeatures': 0.1, 'min_count': 2,
                                     'minfeatures': 100}
        self.selection_params['method'] = self.select_features
        # coarse preliminary selector for the whole training matrix
        self.first_selector = MulticlassFeatureSelector(
            local=True, method=self.select_features,
            min_count=self.min_count, nfeatures=-1)
        # per-substring selector, refitted inside each pipeline clone
        feature_selector = MulticlassFeatureSelector(**self.selection_params)
        self.classifier_ = Pipeline([('selector', feature_selector),
                                     ('classifier', self.classifier_name)])
    else:
        self.first_selector = ZeroFeatureRemover()
        self.classifier_ = self.classifier_name
    return self
def _prepare_classifiers(self):
    """Build the first-stage selector and the classifier template."""
    params = self.classifier_params
    if params is None:
        params = dict()
        self.classifier_params = params
    self.classifier_name.set_params(**params)
    # No selection method configured: keep the raw classifier and only
    # strip all-zero columns from the input.
    if self.select_features is None:
        self.first_selector = ZeroFeatureRemover()
        self.classifier_ = self.classifier_name
        return self
    # Selection enabled: fill in default selection parameters if needed.
    if self.selection_params is None:
        self.selection_params = {'nfeatures': 0.1,
                                 'min_count': 2,
                                 'minfeatures': 100}
    self.selection_params['method'] = self.select_features
    self.first_selector = MulticlassFeatureSelector(local=True,
                                                    method=self.select_features,
                                                    min_count=self.min_count,
                                                    nfeatures=-1)
    selector = MulticlassFeatureSelector(**self.selection_params)
    steps = [('selector', selector), ('classifier', self.classifier_name)]
    self.classifier_ = Pipeline(steps)
    return self
class LocalTransformationClassifier(BaseEstimator, ClassifierMixin):
    """Classifier of local transformations, keyed by the substring being
    transformed and using its left/right character contexts as features.

    A separate sub-classifier is trained for every distinct substring seen
    in the training data; substrings with a single training class get a
    predefined answer instead of a classifier.
    """

    def __init__(self, left_context_length=2, right_context_length=3,
                 has_central_context=True,
                 classifier_name=sklm.LogisticRegression(),
                 classifier_params=None, multiclass=False,
                 select_features=False, selection_params=None,
                 min_count=3, smallest_prob=0.01, min_probs_ratio=0.75):
        self.left_context_length = left_context_length
        self.right_context_length = right_context_length
        self.has_central_context = has_central_context
        # NOTE(review): immediately overwritten by the parameter
        # assignment below — looks like a leftover
        self.select_features = True
        self.substrs = []
        self.classifier_name = classifier_name
        self.classifier_params = classifier_params
        self.multiclass = multiclass
        self.select_features = select_features
        self.selection_params = selection_params
        self.smallest_prob = smallest_prob
        self.min_probs_ratio = min_probs_ratio
        self.min_count = min_count
        # initialization of internal state
        self.classifiers = dict()         # substring -> fitted classifier
        self.classifier_classes = dict()  # substring -> classes_ array
        self.predefined_answers = dict()  # substrings with one training class
        self.selector_name = None
        self.selectors = dict()
        self.feature_codes = dict()       # feature string -> column index
        self.features = []
        self.features_number = 0

    def _prepare_classifiers(self):
        # Instantiate the shared classifier template and the selectors.
        if self.classifier_params is None:
            self.classifier_params = dict()
        self.classifier_name.set_params(**self.classifier_params)
        # NOTE(review): the default select_features=False is not None,
        # so this branch also runs when selection is "disabled" and
        # passes method=False to the selector — confirm intent
        if self.select_features is not None:
            if self.selection_params is None:
                self.selection_params = {'nfeatures': 0.1, 'min_count': 2,
                                         'minfeatures': 100}
            self.selection_params['method'] = self.select_features
            self.first_selector = MulticlassFeatureSelector(
                local=True, method=self.select_features,
                min_count=self.min_count, nfeatures=-1)
            feature_selector = MulticlassFeatureSelector(**self.selection_params)
            self.classifier_ = Pipeline([('selector', feature_selector),
                                         ('classifier', self.classifier_name)])
        else:
            self.first_selector = ZeroFeatureRemover()
            self.classifier_ = self.classifier_name
        return self

    def fit(self, X, y):
        """Fits the classifier to the data.

        X --- a collection of tuples of the form
        (fragment, paradigm, left context, right context)
        """
        self._prepare_classifiers()
        substrs, X_train, y = self._preprocess_input(
            X, y, create_features=True, retain_multiple=self.multiclass,
            return_y=True)
        X_train = self.first_selector.fit_transform(X_train, y)
        self._fit_classifiers(substrs, X_train, y)
        return self

    def _fit_classifiers(self, substrs, X_train, y):
        """Trains a separate classifier for every substring it is going
        to be applied to.
        """
        if issparse(X_train):
            X_train = X_train.tocsr()
        substr_indexes_in_train = _find_positions(substrs)
        for substr, indexes in substr_indexes_in_train.items():
            X_curr, y_curr = X_train[indexes, :], [y[i] for i in indexes]
            X_curr.sort_indices()
            max_index = max(y_curr)
            # if y_curr contains only one class, do not train a
            # classifier — remember the fixed answer instead
            if min(y_curr) == max_index:
                self.predefined_answers[substr] = max_index
                continue
            self.classifiers[substr] = clone(self.classifier_)
            self.classifiers[substr].fit(X_curr, y_curr)
            self.classifier_classes[substr] = self.classifiers[substr].classes_
        return

    def predict(self, X):
        # Returns, for each element of X, a list of predicted labels
        # (one label unless multiclass prediction is enabled).
        probs = self._predict_proba(X)
        if not self.multiclass:
            class_indexes = [indices[0] for indices, _ in probs]
            answer = [[x] for x in np.take(self.classes_, class_indexes)]
        else:
            answer = [np.take(self.classes_, indices)
                      for indices in extract_classes_from_sparse_probs(
                          probs, self.min_probs_ratio)]
        return answer

    def predict_proba(self, X):
        # probs = self._predict_proba(X)
        # answer = np.zeros(dtype=np.float64, shape=(len(X), len(self.classes_)))
        # for i, (indices, elem_probs) in enumerate(probs):
        #     answer[i][indices] = elem_probs
        # return answer
        return self._predict_proba(X, return_arrays=True)

    def _predict_proba(self, X, return_arrays=False):
        """
        Arguments:
        ----------
        X: list of tuples,
            a list of triples
            (substring to transform, left context, right context)

        Returns:
        --------
        answer: list of tuples,
            a list of pairs (indices, probs), where
            indices: array, shape=(len(self.classes),), dtype=int,
                transformation codes in order of decreasing probability,
            probs: array, shape=(len(self.classes),), dtype=float,
                transformation probabilities in decreasing order.
        """
        substrs, X_test = self._preprocess_input(X, create_features=False,
                                                 retain_multiple=False)
        X_test = self.first_selector.transform(X_test)
        substr_indexes = _find_positions(substrs)
        if return_arrays:
            answer = np.zeros(shape=(len(substrs), len(self.classes_)))
        else:
            answer = [None for _ in substrs]
        for substr, indexes in substr_indexes.items():
            # print(substr, end=" ")
            cls = self.classifiers.get(substr, None)
            if cls is not None:
                # X_curr = self.selectors[substr].transform(X_test[indexes])
                curr_smallest_prob = self.smallest_prob / len(cls.classes_)
                X_curr = X_test[indexes, :]
                probs = cls.predict_proba(X_curr)
                # zero out negligible probabilities before normalization
                # probs[np.where(probs < curr_smallest_prob)] = 0.0
                probs[np.where(probs < self.smallest_prob)] = 0.0
                if return_arrays:
                    answer[np.ix_(indexes, cls.classes_)] = probs
                else:
                    for index, row in zip(indexes, probs):
                        nonzero_indices = row.nonzero()[0]
                        row = row[nonzero_indices]
                        indices_order = np.flipud(np.argsort(row))
                        current_indices = nonzero_indices[indices_order]
                        answer[index] = (np.take(cls.classes_, current_indices),
                                         row[indices_order] / row.sum())
                # for index, row in zip(indexes, probs):
                #     nonzero_indices = row.nonzero()[0]
                #     row = row[nonzero_indices]
                #     indices_order = np.flipud(np.argsort(row))
                #     current_indices = nonzero_indices[indices_order]
                #     elem_probs = row[indices_order]
                #     probs_sum, last_index = elem_probs[0], 1
                #     while probs_sum < 1.0 - self.smallest_prob:
                #         probs_sum += elem_probs[last_index]
                #         last_index += 1
                #     class_indexes = np.take(cls.classes_, current_indices[:(last_index)])
                #     answer[index] = (class_indexes, elem_probs[:(last_index)] / probs_sum)
            else:
                # the substring was absent from the training data,
                # so we return "no replacement" for it
                code = self.predefined_answers.get(substr, 0)
                if return_arrays:
                    answer[indexes, code] = 1.0
                else:
                    for index in indexes:
                        answer[index] = ([code], 1.0)
        return answer

    def _preprocess_input(self, X, y=None, create_features=False,
                          retain_multiple=False, return_y=False,
                          sparse_type='csr'):
        """
        Arguments:
        ----------
        y: list of lists or None,
            a list whose i-th element contains the classes of X[i]
            from the training data
        create_features: bool, optional(default=False),
            whether new features are created, or unknown ones are mapped
            to the "unknown suffix/prefix" value
        retain_multiple: bool, optional(default=False),
            whether several class labels may be returned for one object
        return_y: whether y is returned (True when processing the training
            data, since y is then flattened from a list of lists into a
            single list)
        """
        ## TODO: shorten
        if create_features:
            # create new features; invoked when processing training data
            self._create_basic_features()
            self.features_number = len(self.features)
        # create the handler of unknown features
        ## TODO: rewrite as in the ordinary classifier
        # _process_unknown_feature = self._create_unknown_feature_processor(create_features)
        if y is None:
            y = [[None]] * len(X)
            retain_multiple = False
            y_new = list(chain(*y))
        else:
            self.classes_, y_new = np.unique(list(chain(*y)),
                                             return_inverse=True)
        X_with_temporary_codes = []
        for _, left_context, right_context in X:
            active_features_codes = []
            # process left contexts (suffixes of the left context)
            for length in range(1, self.left_context_length + 1):
                feature = left_context[-length:]
                feature_code, to_break =\
                    self._get_feature_code(feature, (len(feature) != length),
                                           'left', create_new=create_features)
                active_features_codes.append(feature_code)
                if to_break:
                    break
            # right contexts (prefixes of the right context)
            for length in range(1, self.right_context_length + 1):
                feature = right_context[:length]
                feature_code, to_break =\
                    self._get_feature_code(feature, (len(feature) != length),
                                           'right', create_new=create_features)
                active_features_codes.append(feature_code)
                if to_break:
                    break
            # central context: one char on each side of the '#' marker
            if self.has_central_context:
                feature = left_context[-1:] + '#' + right_context[:1]
                if feature.startswith('#'):
                    feature = '^' + feature
                if feature.endswith('#'):
                    feature += '$'
                feature_code, to_break =\
                    self._get_feature_code(feature, False, 'center',
                                           create_new=create_features)
                if feature_code > 0:
                    active_features_codes.append(feature_code)
            X_with_temporary_codes.append(active_features_codes)
        if create_features:
            # for equal lengths the order is x$, x#, $x, #x
            feature_order = sorted(enumerate(self.features),
                                   key=(lambda x: (len(x[1]),
                                                   x[1].startswith('#'),
                                                   x[1].startswith('$'),
                                                   x[1].endswith('#'))))
            self.features = [x[1] for x in feature_order]
            # recode the temporary feature codes
            temporary_features_recoding = [None] * self.features_number
            for new_code, (code, _) in enumerate(feature_order):
                temporary_features_recoding[code] = new_code
            for i, elem in enumerate(X_with_temporary_codes):
                X_with_temporary_codes[i] = [temporary_features_recoding[code]
                                             for code in elem]
            self.feature_codes = {feature: code
                                  for code, feature in enumerate(self.features)}
        # save the data for classification
        rows, cols, curr_row = [], [], 0
        for codes, labels in zip(X_with_temporary_codes, y):
            # each training object is replicated k times, where k is the
            # number of classes it belongs to; e.g. the pair
            # X[i]=x, y[i]=[1, 2, 6] becomes <x, 1>, <x, 2>, <x, 6>
            number_of_rows_to_add = 1 if not(retain_multiple) else len(labels)
            for i in range(number_of_rows_to_add):
                rows.extend([curr_row for _ in codes])
                cols.extend(codes)
                curr_row += 1
        # store the transformed data in a sparse matrix
        data = np.ones(shape=(len(rows,)), dtype=float)
        if sparse_type == 'csr':
            sparse_matrix = csr_matrix
        elif sparse_type == 'csc':
            sparse_matrix = csc_matrix
        X_train = sparse_matrix((data, (rows, cols)),
                                shape=(curr_row, self.features_number))
        # keep the substrings that actually have to be classified
        substrs = [x[0] for x in X]
        if return_y:
            return substrs, X_train, y_new
        else:
            return substrs, X_train

    def _create_basic_features(self):
        """Creates the basic features before reading the input

        Returns:
        --------
        features: list, the list of created features
        """
        self.features, self.feature_codes = [], dict()
        # features for contexts unseen in the training data
        # right contexts
        self.features.extend(("-" * length + '#')
                             for length in range(1, self.right_context_length + 1))
        # short right contexts
        self.features.extend(("-" * length + '$')
                             for length in range(self.right_context_length))
        # left contexts
        # NOTE(review): uses right_context_length here too — possibly a
        # copy-paste bug; confirm left_context_length was intended
        self.features.extend(('#' + "-" * length)
                             for length in range(1, self.right_context_length + 1))
        # short left contexts
        self.features.extend(('^' + "-" * length)
                             for length in range(self.right_context_length))
        if self.has_central_context:
            self.features.append('-#-')
        for code, feature in enumerate(self.features):
            self.feature_codes[feature] = code
        return

    # def _create_new_feature(self, feature, is_full, side):
    #     if is_full:
    #         delim = '^' if side == 'left' else '$'
    #         to_break = True
    #     else:
    #         delim, to_break = '#', False
    #     if side == 'left':
    #         feature = delim + feature
    #     elif side == 'right':
    #         feature += delim
    #     feature_code = self.feature_codes.get(feature, None)
    #     if feature_code is None:
    #         feature_code = self._process_unknown_feature(feature, side)
    #     return feature_code, to_break

    def _get_feature_code(self, feature, is_full, side, create_new=False):
        """Returns the code of the feature ``feature``"""
        if is_full:
            delim = '^' if side == 'left' else '$'
            to_break = True
        else:
            delim, to_break = '#', False
        if side == 'left':
            feature = delim + feature
        elif side == 'right':
            feature += delim
        code = self.feature_codes.get(feature, -1)
        if code < 0:
            if create_new:
                self.features.append(feature)
                self.feature_codes[feature] = code = self.features_number
                self.features_number += 1
            else:
                # back off to partially-masked variants of the context
                if side == 'right':
                    partial_features = ['-' * start + feature[start:]
                                        for start in range(1, len(feature))]
                elif side == 'left':
                    partial_features = [feature[:(len(feature) - start)] + '-' * start
                                        for start in range(1, len(feature))][::-1]
                else:  # side == 'center'
                    partial_features = []
                    code, to_break = -1, None
                for partial_feature in partial_features:
                    code = self.feature_codes.get(partial_feature, -1)
                    if code > 0:
                        break
                if code > 0:
                    # cache the back-off result for subsequent lookups
                    self.feature_codes[feature] = code
        return code, to_break
class LocalTransformationClassifier(BaseEstimator, ClassifierMixin):
    """Classifier of local transformations, keyed by the substring being
    transformed and using its left/right character contexts as features.

    A separate sub-classifier is trained for every distinct substring seen
    in the training data; substrings with a single training class get a
    predefined answer instead of a classifier.
    """

    def __init__(self, left_context_length=2, right_context_length=3,
                 has_central_context=True,
                 classifier_name=sklm.LogisticRegression(),
                 classifier_params=None, multiclass=False,
                 select_features=False, selection_params=None,
                 min_count=3, smallest_prob=0.01, min_probs_ratio=0.75):
        self.left_context_length = left_context_length
        self.right_context_length = right_context_length
        self.has_central_context = has_central_context
        # NOTE(review): immediately overwritten by the parameter
        # assignment below — looks like a leftover
        self.select_features = True
        self.substrs = []
        self.classifier_name = classifier_name
        self.classifier_params = classifier_params
        self.multiclass = multiclass
        self.select_features = select_features
        self.selection_params = selection_params
        self.smallest_prob = smallest_prob
        self.min_probs_ratio = min_probs_ratio
        self.min_count = min_count
        # initialization of internal state
        self.classifiers = dict()         # substring -> fitted classifier
        self.classifier_classes = dict()  # substring -> classes_ array
        self.predefined_answers = dict()  # substrings with one training class
        self.selector_name = None
        self.selectors = dict()
        self.feature_codes = dict()       # feature string -> column index
        self.features = []
        self.features_number = 0

    def _prepare_classifiers(self):
        # Instantiate the shared classifier template and the selectors.
        if self.classifier_params is None:
            self.classifier_params = dict()
        self.classifier_name.set_params(**self.classifier_params)
        # NOTE(review): the default select_features=False is not None,
        # so this branch also runs when selection is "disabled" and
        # passes method=False to the selector — confirm intent
        if self.select_features is not None:
            if self.selection_params is None:
                self.selection_params = {'nfeatures': 0.1, 'min_count': 2,
                                         'minfeatures': 100}
            self.selection_params['method'] = self.select_features
            self.first_selector = MulticlassFeatureSelector(
                local=True, method=self.select_features,
                min_count=self.min_count, nfeatures=-1)
            feature_selector = MulticlassFeatureSelector(**self.selection_params)
            self.classifier_ = Pipeline([('selector', feature_selector),
                                         ('classifier', self.classifier_name)])
        else:
            self.first_selector = ZeroFeatureRemover()
            self.classifier_ = self.classifier_name
        return self

    def fit(self, X, y):
        """Fits the classifier to the data.

        X --- a collection of tuples of the form
        (fragment, paradigm, left context, right context)
        """
        self._prepare_classifiers()
        substrs, X_train, y = self._preprocess_input(
            X, y, create_features=True, retain_multiple=self.multiclass,
            return_y=True)
        X_train = self.first_selector.fit_transform(X_train, y)
        self._fit_classifiers(substrs, X_train, y)
        return self

    def _fit_classifiers(self, substrs, X_train, y):
        """Trains a separate classifier for every substring it is going
        to be applied to.
        """
        if issparse(X_train):
            X_train = X_train.tocsr()
        substr_indexes_in_train = _find_positions(substrs)
        for substr, indexes in substr_indexes_in_train.items():
            X_curr, y_curr = X_train[indexes, :], [y[i] for i in indexes]
            X_curr.sort_indices()
            max_index = max(y_curr)
            # if y_curr contains only one class, do not train a
            # classifier — remember the fixed answer instead
            if min(y_curr) == max_index:
                self.predefined_answers[substr] = max_index
                continue
            self.classifiers[substr] = clone(self.classifier_)
            self.classifiers[substr].fit(X_curr, y_curr)
            self.classifier_classes[substr] = self.classifiers[substr].classes_
        return

    def predict(self, X):
        # Returns, for each element of X, a list of predicted labels
        # (one label unless multiclass prediction is enabled).
        probs = self._predict_proba(X)
        if not self.multiclass:
            class_indexes = [indices[0] for indices, _ in probs]
            answer = [[x] for x in np.take(self.classes_, class_indexes)]
        else:
            answer = [np.take(self.classes_, indices)
                      for indices in extract_classes_from_sparse_probs(
                          probs, self.min_probs_ratio)]
        return answer

    def predict_proba(self, X):
        # probs = self._predict_proba(X)
        # answer = np.zeros(dtype=np.float64, shape=(len(X), len(self.classes_)))
        # for i, (indices, elem_probs) in enumerate(probs):
        #     answer[i][indices] = elem_probs
        # return answer
        return self._predict_proba(X, return_arrays=True)

    def _predict_proba(self, X, return_arrays=False):
        """
        Arguments:
        ----------
        X: list of tuples,
            a list of triples
            (substring to transform, left context, right context)

        Returns:
        --------
        answer: list of tuples,
            a list of pairs (indices, probs), where
            indices: array, shape=(len(self.classes),), dtype=int,
                transformation codes in order of decreasing probability,
            probs: array, shape=(len(self.classes),), dtype=float,
                transformation probabilities in decreasing order.
        """
        substrs, X_test = self._preprocess_input(X, create_features=False,
                                                 retain_multiple=False)
        X_test = self.first_selector.transform(X_test)
        substr_indexes = _find_positions(substrs)
        if return_arrays:
            answer = np.zeros(shape=(len(substrs), len(self.classes_)))
        else:
            answer = [None for _ in substrs]
        for substr, indexes in substr_indexes.items():
            # print(substr, end=" ")
            cls = self.classifiers.get(substr, None)
            if cls is not None:
                # X_curr = self.selectors[substr].transform(X_test[indexes])
                curr_smallest_prob = self.smallest_prob / len(cls.classes_)
                X_curr = X_test[indexes, :]
                probs = cls.predict_proba(X_curr)
                # zero out negligible probabilities before normalization
                # probs[np.where(probs < curr_smallest_prob)] = 0.0
                probs[np.where(probs < self.smallest_prob)] = 0.0
                if return_arrays:
                    answer[np.ix_(indexes, cls.classes_)] = probs
                else:
                    for index, row in zip(indexes, probs):
                        nonzero_indices = row.nonzero()[0]
                        row = row[nonzero_indices]
                        indices_order = np.flipud(np.argsort(row))
                        current_indices = nonzero_indices[indices_order]
                        answer[index] = (np.take(cls.classes_, current_indices),
                                         row[indices_order] / row.sum())
                # for index, row in zip(indexes, probs):
                #     nonzero_indices = row.nonzero()[0]
                #     row = row[nonzero_indices]
                #     indices_order = np.flipud(np.argsort(row))
                #     current_indices = nonzero_indices[indices_order]
                #     elem_probs = row[indices_order]
                #     probs_sum, last_index = elem_probs[0], 1
                #     while probs_sum < 1.0 - self.smallest_prob:
                #         probs_sum += elem_probs[last_index]
                #         last_index += 1
                #     class_indexes = np.take(cls.classes_, current_indices[:(last_index)])
                #     answer[index] = (class_indexes, elem_probs[:(last_index)] / probs_sum)
            else:
                # the substring was absent from the training data,
                # so we return "no replacement" for it
                code = self.predefined_answers.get(substr, 0)
                if return_arrays:
                    answer[indexes, code] = 1.0
                else:
                    for index in indexes:
                        answer[index] = ([code], 1.0)
        return answer

    def _preprocess_input(self, X, y=None, create_features=False,
                          retain_multiple=False, return_y=False,
                          sparse_type='csr'):
        """
        Arguments:
        ----------
        y: list of lists or None,
            a list whose i-th element contains the classes of X[i]
            from the training data
        create_features: bool, optional(default=False),
            whether new features are created, or unknown ones are mapped
            to the "unknown suffix/prefix" value
        retain_multiple: bool, optional(default=False),
            whether several class labels may be returned for one object
        return_y: whether y is returned (True when processing the training
            data, since y is then flattened from a list of lists into a
            single list)
        """
        ## TODO: shorten
        if create_features:
            # create new features; invoked when processing training data
            self._create_basic_features()
            self.features_number = len(self.features)
        # create the handler of unknown features
        ## TODO: rewrite as in the ordinary classifier
        # _process_unknown_feature = self._create_unknown_feature_processor(create_features)
        if y is None:
            y = [[None]] * len(X)
            retain_multiple = False
            y_new = list(chain(*y))
        else:
            self.classes_, y_new = np.unique(list(chain(*y)),
                                             return_inverse=True)
        X_with_temporary_codes = []
        for _, left_context, right_context in X:
            active_features_codes = []
            # process left contexts (suffixes of the left context)
            for length in range(1, self.left_context_length + 1):
                feature = left_context[-length:]
                feature_code, to_break =\
                    self._get_feature_code(feature, (len(feature) != length),
                                           'left', create_new=create_features)
                active_features_codes.append(feature_code)
                if to_break:
                    break
            # right contexts (prefixes of the right context)
            for length in range(1, self.right_context_length + 1):
                feature = right_context[:length]
                feature_code, to_break =\
                    self._get_feature_code(feature, (len(feature) != length),
                                           'right', create_new=create_features)
                active_features_codes.append(feature_code)
                if to_break:
                    break
            # central context: one char on each side of the '#' marker
            if self.has_central_context:
                feature = left_context[-1:] + '#' + right_context[:1]
                if feature.startswith('#'):
                    feature = '^' + feature
                if feature.endswith('#'):
                    feature += '$'
                feature_code, to_break =\
                    self._get_feature_code(feature, False, 'center',
                                           create_new=create_features)
                if feature_code > 0:
                    active_features_codes.append(feature_code)
            X_with_temporary_codes.append(active_features_codes)
        if create_features:
            # for equal lengths the order is x$, x#, $x, #x
            feature_order = sorted(enumerate(self.features),
                                   key=(lambda x: (len(x[1]),
                                                   x[1].startswith('#'),
                                                   x[1].startswith('$'),
                                                   x[1].endswith('#'))))
            self.features = [x[1] for x in feature_order]
            # recode the temporary feature codes
            temporary_features_recoding = [None] * self.features_number
            for new_code, (code, _) in enumerate(feature_order):
                temporary_features_recoding[code] = new_code
            for i, elem in enumerate(X_with_temporary_codes):
                X_with_temporary_codes[i] = [temporary_features_recoding[code]
                                             for code in elem]
            self.feature_codes = {feature: code
                                  for code, feature in enumerate(self.features)}
        # save the data for classification
        rows, cols, curr_row = [], [], 0
        for codes, labels in zip(X_with_temporary_codes, y):
            # each training object is replicated k times, where k is the
            # number of classes it belongs to; e.g. the pair
            # X[i]=x, y[i]=[1, 2, 6] becomes <x, 1>, <x, 2>, <x, 6>
            number_of_rows_to_add = 1 if not (retain_multiple) else len(labels)
            for i in range(number_of_rows_to_add):
                rows.extend([curr_row for _ in codes])
                cols.extend(codes)
                curr_row += 1
        # store the transformed data in a sparse matrix
        data = np.ones(shape=(len(rows, )), dtype=float)
        if sparse_type == 'csr':
            sparse_matrix = csr_matrix
        elif sparse_type == 'csc':
            sparse_matrix = csc_matrix
        X_train = sparse_matrix((data, (rows, cols)),
                                shape=(curr_row, self.features_number))
        # keep the substrings that actually have to be classified
        substrs = [x[0] for x in X]
        if return_y:
            return substrs, X_train, y_new
        else:
            return substrs, X_train

    def _create_basic_features(self):
        """Creates the basic features before reading the input

        Returns:
        --------
        features: list, the list of created features
        """
        self.features, self.feature_codes = [], dict()
        # features for contexts unseen in the training data
        # right contexts
        self.features.extend(
            ("-" * length + '#')
            for length in range(1, self.right_context_length + 1))
        # short right contexts
        self.features.extend(("-" * length + '$')
                             for length in range(self.right_context_length))
        # left contexts
        # NOTE(review): uses right_context_length here too — possibly a
        # copy-paste bug; confirm left_context_length was intended
        self.features.extend(
            ('#' + "-" * length)
            for length in range(1, self.right_context_length + 1))
        # short left contexts
        self.features.extend(('^' + "-" * length)
                             for length in range(self.right_context_length))
        if self.has_central_context:
            self.features.append('-#-')
        for code, feature in enumerate(self.features):
            self.feature_codes[feature] = code
        return

    # def _create_new_feature(self, feature, is_full, side):
    #     if is_full:
    #         delim = '^' if side == 'left' else '$'
    #         to_break = True
    #     else:
    #         delim, to_break = '#', False
    #     if side == 'left':
    #         feature = delim + feature
    #     elif side == 'right':
    #         feature += delim
    #     feature_code = self.feature_codes.get(feature, None)
    #     if feature_code is None:
    #         feature_code = self._process_unknown_feature(feature, side)
    #     return feature_code, to_break

    def _get_feature_code(self, feature, is_full, side, create_new=False):
        """Returns the code of the feature ``feature``"""
        if is_full:
            delim = '^' if side == 'left' else '$'
            to_break = True
        else:
            delim, to_break = '#', False
        if side == 'left':
            feature = delim + feature
        elif side == 'right':
            feature += delim
        code = self.feature_codes.get(feature, -1)
        if code < 0:
            if create_new:
                self.features.append(feature)
                self.feature_codes[feature] = code = self.features_number
                self.features_number += 1
            else:
                # back off to partially-masked variants of the context
                if side == 'right':
                    partial_features = [
                        '-' * start + feature[start:]
                        for start in range(1, len(feature))
                    ]
                elif side == 'left':
                    partial_features = [
                        feature[:(len(feature) - start)] + '-' * start
                        for start in range(1, len(feature))
                    ][::-1]
                else:  # side == 'center'
                    partial_features = []
                    code, to_break = -1, None
                for partial_feature in partial_features:
                    code = self.feature_codes.get(partial_feature, -1)
                    if code > 0:
                        break
                if code > 0:
                    # cache the back-off result for subsequent lookups
                    self.feature_codes[feature] = code
        return code, to_break