def _prepare_classifiers(self):
    """Set up the base classifier and the feature-selection stages.

    NOTE(review): this is a stray module-level duplicate of
    ``LocalTransformationClassifier._prepare_classifiers``; it expects a
    classifier instance as ``self``.
    """
    if self.classifier_params is None:
        self.classifier_params = {}
    self.classifier_name.set_params(**self.classifier_params)
    if self.select_features is None:
        # no feature selection configured: only drop all-zero columns
        self.first_selector = ZeroFeatureRemover()
        self.classifier_ = self.classifier_name
        return self
    if self.selection_params is None:
        self.selection_params = dict(nfeatures=0.1, min_count=2, minfeatures=100)
    self.selection_params['method'] = self.select_features
    # coarse pre-selector applied to the full training matrix
    self.first_selector = MulticlassFeatureSelector(
        local=True,
        method=self.select_features,
        min_count=self.min_count,
        nfeatures=-1)
    # per-classifier selection chained in front of the estimator
    self.classifier_ = Pipeline([
        ('selector', MulticlassFeatureSelector(**self.selection_params)),
        ('classifier', self.classifier_name),
    ])
    return self
 def _prepare_classifiers(self):
     if self.classifier_params is None:
         self.classifier_params = dict()
     self.classifier_name.set_params(**self.classifier_params)
     if self.select_features is not None:
         if self.selection_params is None:
             self.selection_params = {'nfeatures': 0.1, 'min_count': 2, 'minfeatures': 100}
         self.selection_params['method'] = self.select_features
         self.first_selector = MulticlassFeatureSelector(
             local=True, method=self.select_features, min_count=self.min_count, nfeatures=-1)
         feature_selector = MulticlassFeatureSelector(**self.selection_params)
         self.classifier_ = Pipeline([('selector', feature_selector),
                                      ('classifier', self.classifier_name)])
     else:
         self.first_selector = ZeroFeatureRemover()
         self.classifier_ = self.classifier_name
     return self
class LocalTransformationClassifier(BaseEstimator, ClassifierMixin):
    """Predicts a local transformation for a substring given its contexts.

    For every distinct substring observed during training, a separate copy of
    the base classifier is fitted on sparse context features (left, right and
    central context n-grams).  Substrings whose training labels contain a
    single class get a precomputed answer instead of a classifier.
    """

    def __init__(self, left_context_length=2, right_context_length=3, has_central_context=True,
                 classifier_name=sklm.LogisticRegression(), classifier_params=None,
                 multiclass=False, select_features=False, selection_params = None,
                 min_count=3, smallest_prob=0.01, min_probs_ratio=0.75):
        # NOTE(review): classifier_name defaults to a shared mutable estimator
        # instance; all instances created without an explicit classifier share
        # the same LogisticRegression object.
        self.left_context_length = left_context_length
        self.right_context_length = right_context_length
        self.has_central_context = has_central_context
        # NOTE(review): dead assignment -- overwritten by the parameter value below.
        self.select_features = True
        self.substrs = []
        self.classifier_name = classifier_name
        self.classifier_params = classifier_params
        self.multiclass = multiclass
        self.select_features = select_features
        self.selection_params = selection_params
        self.smallest_prob = smallest_prob
        self.min_probs_ratio = min_probs_ratio
        self.min_count = min_count
        # initialization of internal state
        self.classifiers = dict()  # substring -> fitted classifier
        self.classifier_classes = dict()  # substring -> classes_ of its classifier
        self.predefined_answers = dict()  # substring -> the single known class code
        self.selector_name = None
        self.selectors = dict()
        self.feature_codes = dict()  # feature string -> column index
        self.features = []
        self.features_number = 0

    def _prepare_classifiers(self):
        """Instantiate the base classifier and the feature selectors.

        NOTE(review): ``self.select_features is not None`` is also true for the
        default ``select_features=False``, so feature selection is configured
        (with ``method=False``) even when seemingly disabled; confirm whether a
        truthiness test was intended.
        """
        if self.classifier_params is None:
            self.classifier_params = dict()
        self.classifier_name.set_params(**self.classifier_params)
        if self.select_features is not None:
            if self.selection_params is None:
                self.selection_params = {'nfeatures': 0.1, 'min_count': 2, 'minfeatures': 100}
            self.selection_params['method'] = self.select_features
            # coarse pre-selector applied to the whole training matrix
            self.first_selector = MulticlassFeatureSelector(
                local=True, method=self.select_features, min_count=self.min_count, nfeatures=-1)
            feature_selector = MulticlassFeatureSelector(**self.selection_params)
            self.classifier_ = Pipeline([('selector', feature_selector),
                                         ('classifier', self.classifier_name)])
        else:
            self.first_selector = ZeroFeatureRemover()
            self.classifier_ = self.classifier_name
        return self

    def fit(self, X, y):
        """
        Fits the classifier to the data.

        X --- a collection of tuples (fragment, paradigm, left context, right context)
        """
        self._prepare_classifiers()
        substrs, X_train, y = self._preprocess_input(X, y, create_features=True,
                                                     retain_multiple=self.multiclass, return_y=True)
        X_train = self.first_selector.fit_transform(X_train, y)
        self._fit_classifiers(substrs, X_train, y)
        return self

    def _fit_classifiers(self, substrs, X_train, y):
        """
        Trains a separate classifier for each substring it is going to be applied to
        """
        if issparse(X_train):
            X_train = X_train.tocsr()
        substr_indexes_in_train = _find_positions(substrs)
        for substr, indexes in substr_indexes_in_train.items():
            X_curr, y_curr = X_train[indexes,:], [y[i] for i in indexes]
            X_curr.sort_indices()
            max_index = max(y_curr)
            # if y_curr contains only a single class, we do not train a classifier
            if min(y_curr) == max_index:
                self.predefined_answers[substr] = max_index
                continue
            self.classifiers[substr] = clone(self.classifier_)
            self.classifiers[substr].fit(X_curr, y_curr)
            self.classifier_classes[substr] = self.classifiers[substr].classes_
        return

    def predict(self, X):
        """Return the predicted class labels (one list of labels per object)."""
        probs = self._predict_proba(X)
        if not self.multiclass:
            # single-label mode: take the most probable class only
            class_indexes = [indices[0] for indices, _ in probs]
            answer = [[x] for x in np.take(self.classes_, class_indexes)]
        else:
            answer = [np.take(self.classes_, indices) for indices
                      in extract_classes_from_sparse_probs(probs, self.min_probs_ratio)]
        return answer

    def predict_proba(self, X):
        """Return a dense (len(X), n_classes) array of class probabilities."""
        # probs = self._predict_proba(X)
        # answer = np.zeros(dtype=np.float64, shape=(len(X), len(self.classes_)))
        # for i, (indices, elem_probs) in enumerate(probs):
        #     answer[i][indices] = elem_probs
        # return answer
        return self._predict_proba(X, return_arrays=True)


    def _predict_proba(self, X, return_arrays=False):
        """
        Arguments:
        ------------
        X: list of tuples, a list of triples (substring to transform, left context, right context)

        Returns:
        ------------
        answer: list of tuples, a list of pairs (indices, probs),
            indices: array, shape=(len(self.classes), dtype=int,
                transformation codes in order of decreasing probability,
            probs: array, shape=(len(self.classes), dtype=float,
                transformation probabilities in decreasing order.
        """
        substrs, X_test = self._preprocess_input(X, create_features=False, retain_multiple=False)
        X_test = self.first_selector.transform(X_test)
        substr_indexes = _find_positions(substrs)
        if return_arrays:
            answer = np.zeros(shape=(len(substrs), len(self.classes_)))
        else:
            answer = [None for _ in substrs]
        for substr, indexes in substr_indexes.items():
            # print(substr, end=" ")
            cls = self.classifiers.get(substr, None)
            if cls is not None:
                # X_curr = self.selectors[substr].transform(X_test[indexes])
                # NOTE(review): curr_smallest_prob is computed but unused (its
                # only use is the commented-out thresholding line below).
                curr_smallest_prob = self.smallest_prob / len(cls.classes_)
                X_curr = X_test[indexes,:]
                probs = cls.predict_proba(X_curr)
                # probs[np.where(probs < curr_smallest_prob)] = 0.0
                # zero out negligible probabilities before normalization
                probs[np.where(probs < self.smallest_prob)] = 0.0
                if return_arrays:
                    answer[np.ix_(indexes, cls.classes_)] = probs
                else:
                    for index, row in zip(indexes, probs):
                        nonzero_indices = row.nonzero()[0]
                        row = row[nonzero_indices]
                        # sort the surviving classes by decreasing probability
                        indices_order = np.flipud(np.argsort(row))
                        current_indices = nonzero_indices[indices_order]
                        answer[index] = (np.take(cls.classes_, current_indices),
                                         row[indices_order] / row.sum())
                # for index, row in zip(indexes, probs):
                #     nonzero_indices = row.nonzero()[0]
                #     row = row[nonzero_indices]
                #     indices_order = np.flipud(np.argsort(row))
                #     current_indices = nonzero_indices[indices_order]
                #     elem_probs = row[indices_order]
                #     probs_sum, last_index = elem_probs[0], 1
                #     while probs_sum < 1.0 - self.smallest_prob:
                #         probs_sum += elem_probs[last_index]
                #         last_index += 1
                #     class_indexes = np.take(cls.classes_, current_indices[:(last_index)])
                #     answer[index] = (class_indexes, elem_probs[:(last_index)] / probs_sum)
            else:
                # this substring did not occur in the training data,
                # so we return "no replacement" for it
                code = self.predefined_answers.get(substr, 0)
                if return_arrays:
                    answer[indexes, code] = 1.0
                else:
                    for index in indexes:
                        answer[index] = ([code], 1.0)
        return answer


    def _preprocess_input(self, X, y=None, create_features=False,
                          retain_multiple=False, return_y=False, sparse_type='csr'):
        """
        Arguments:
        ----------
        y: list of lists or None, a list whose i-th element contains the classes
            of X[i] in the training data
        create_features: bool, optional(default=False), whether new features are
            created, or unknown ones are mapped to the ``unknown suffix/prefix'' value
        retain_multiple: bool, optional(default=False), whether multiple class
            labels may be returned for a single object
        return_y: whether y is returned (True when processing the training data,
            since in that case y is converted from a list of lists into a flat list)
        """
        ## TODO: shorten
        if create_features:
            # create new features; invoked when processing the training data
            self._create_basic_features()
            self.features_number = len(self.features)
        # create a function to handle unknown features
        ## TODO: rewrite as in the ordinary classifier
        # _process_unknown_feature = self._create_unknown_feature_processor(create_features)
        if y is None:
            # prediction mode: dummy labels; self.classes_ is left untouched
            y = [[None]] * len(X)
            retain_multiple = False
            y_new = list(chain(*y))
        else:
            self.classes_, y_new = np.unique(list(chain(*y)), return_inverse=True)
        X_with_temporary_codes = []
        for _, left_context, right_context in X:
            active_features_codes = []
            # process left contexts
            for length in range(1, self.left_context_length + 1):
                feature = left_context[-length: ]
                feature_code, to_break =\
                    self._get_feature_code(feature, (len(feature) != length),
                                           'left', create_new=create_features)
                active_features_codes.append(feature_code)
                if to_break:
                    break
            # right contexts
            for length in range(1, self.right_context_length + 1):
                feature = right_context[:length]
                feature_code, to_break =\
                    self._get_feature_code(feature, (len(feature) != length),
                                           'right', create_new=create_features)
                active_features_codes.append(feature_code)
                if to_break:
                    break
            # central context
            if self.has_central_context:
                feature = left_context[-1:] + '#' + right_context[:1]
                if feature.startswith('#'):
                    feature = '^' + feature
                if feature.endswith('#'):
                    feature += '$'
                feature_code, to_break =\
                    self._get_feature_code(feature, False, 'center', create_new=create_features)
                # NOTE(review): `> 0` also discards feature code 0 -- confirm intent
                if feature_code > 0:
                    active_features_codes.append(feature_code)
            X_with_temporary_codes.append(active_features_codes)
        if create_features:
            # for equal lengths the order is x$, x#, $x, #x
            feature_order = sorted(enumerate(self.features),
                                   key=(lambda x:(len(x[1]), x[1].startswith('#'),
                                                  x[1].startswith('$'), x[1].endswith('#'))))
            self.features = [x[1] for x in feature_order]
            # recode the temporary feature codes
            temporary_features_recoding = [None] * self.features_number
            for new_code, (code, _) in enumerate(feature_order):
                temporary_features_recoding[code] = new_code
            for i, elem in enumerate(X_with_temporary_codes):
                X_with_temporary_codes[i] = [temporary_features_recoding[code] for code in elem]
            self.feature_codes = {feature: code for code, feature in enumerate(self.features)}
        # assemble the data for classification
        rows, cols, curr_row = [], [], 0
        for codes, labels in zip(X_with_temporary_codes, y):
            # every training object is replicated k times,
            # where k is the number of classes it belongs to; e.g. the pair
            # X[i]=x, y[i]=[1, 2, 6] is converted into <x, 1>, <x, 2>, <x, 6>
            number_of_rows_to_add = 1 if not(retain_multiple) else len(labels)
            for i in range(number_of_rows_to_add):
                rows.extend([curr_row for _ in codes])
                cols.extend(codes)
                curr_row += 1
        # store the transformed data in a sparse matrix
        data = np.ones(shape=(len(rows,)), dtype=float)
        # NOTE(review): an unsupported sparse_type leaves sparse_matrix
        # undefined and raises NameError below -- confirm intended values.
        if sparse_type == 'csr':
            sparse_matrix = csr_matrix
        elif sparse_type == 'csc':
            sparse_matrix = csc_matrix
        X_train = sparse_matrix((data, (rows, cols)), shape=(curr_row, self.features_number))
        # keep the substrings that actually have to be classified
        substrs = [x[0] for x in X]
        if return_y:
            return substrs, X_train, y_new
        else:
            return substrs, X_train

    def _create_basic_features(self):
        """
        Creates the basic features before reading any input

        Returns:
        -----------
        features: list, the list of created features
        """
        self.features, self.feature_codes = [], dict()
        #  features for contexts that never occurred in the training data
        # right contexts
        self.features.extend(("-" * length + '#')
                             for length in range(1, self.right_context_length + 1))
        # short right contexts
        self.features.extend(("-" * length + '$')
                             for length in range(self.right_context_length))
        # left contexts
        # NOTE(review): the bound here is right_context_length, not
        # left_context_length -- looks like a copy-paste slip; confirm.
        self.features.extend(('#' + "-" * length)
                             for length in range(1, self.right_context_length + 1))
        # short left contexts
        self.features.extend(('^' + "-" * length)
                             for length in range(self.right_context_length))
        if self.has_central_context:
            self.features.append('-#-')
        for code, feature in enumerate(self.features):
            self.feature_codes[feature] = code
        return

    # def _create_new_feature(self, feature, is_full, side):
    #     if is_full:
    #         delim = '^' if side == 'left' else '$'
    #         to_break = True
    #     else:
    #         delim, to_break = '#', False
    #     if side == 'left':
    #         feature = delim + feature
    #     elif side == 'right':
    #         feature += delim
    #     feature_code = self.feature_codes.get(feature, None)
    #     if feature_code is None:
    #         feature_code = self._process_unknown_feature(feature, side)
    #     return feature_code, to_break

    def _get_feature_code(self, feature, is_full, side, create_new=False):
        """
        Returns the code of the feature ``feature``
        """
        if is_full:
            # a complete context is wrapped with a boundary marker
            delim = '^' if side == 'left' else '$'
            to_break = True
        else:
            delim, to_break = '#', False
        if side == 'left':
            feature = delim + feature
        elif side == 'right':
            feature += delim
        code = self.feature_codes.get(feature, -1)
        if code < 0:
            if create_new:
                # training mode: register the feature under a fresh code
                self.features.append(feature)
                self.feature_codes[feature] = code = self.features_number
                self.features_number += 1
            else:
                # prediction mode: back off to progressively '-'-masked variants
                if side == 'right':
                    partial_features = ['-' * start + feature[start:]
                                        for start in range(1, len(feature))]
                elif side == 'left':
                    partial_features = [feature[:(len(feature)-start)] + '-' * start
                                        for start in range(1, len(feature))][::-1]
                else:  # side == 'center'
                    partial_features = []
                    code, to_break = -1, None
                for partial_feature in partial_features:
                    code = self.feature_codes.get(partial_feature, -1)
                    # NOTE(review): `code > 0` treats feature code 0 as "not
                    # found"; confirm whether `code >= 0` was intended.
                    if code > 0:
                        break
                if code > 0:
                    # cache the resolved code for subsequent lookups
                    self.feature_codes[feature] = code
        return code, to_break
class LocalTransformationClassifier(BaseEstimator, ClassifierMixin):
    """Predicts a local transformation for a substring given its contexts.

    For every distinct substring observed during training, a separate copy of
    the base classifier is fitted on sparse context features (left, right and
    central context n-grams).  Substrings whose training labels contain a
    single class get a precomputed answer instead of a classifier.

    NOTE(review): this is a reformatted duplicate of the class of the same
    name defined earlier in the file; the later definition shadows the former.
    """
    def __init__(self,
                 left_context_length=2,
                 right_context_length=3,
                 has_central_context=True,
                 classifier_name=sklm.LogisticRegression(),
                 classifier_params=None,
                 multiclass=False,
                 select_features=False,
                 selection_params=None,
                 min_count=3,
                 smallest_prob=0.01,
                 min_probs_ratio=0.75):
        # NOTE(review): classifier_name defaults to a shared mutable estimator
        # instance; all instances created without an explicit classifier share
        # the same LogisticRegression object.
        self.left_context_length = left_context_length
        self.right_context_length = right_context_length
        self.has_central_context = has_central_context
        # NOTE(review): dead assignment -- overwritten by the parameter value below.
        self.select_features = True
        self.substrs = []
        self.classifier_name = classifier_name
        self.classifier_params = classifier_params
        self.multiclass = multiclass
        self.select_features = select_features
        self.selection_params = selection_params
        self.smallest_prob = smallest_prob
        self.min_probs_ratio = min_probs_ratio
        self.min_count = min_count
        # initialization of internal state
        self.classifiers = dict()  # substring -> fitted classifier
        self.classifier_classes = dict()  # substring -> classes_ of its classifier
        self.predefined_answers = dict()  # substring -> the single known class code
        self.selector_name = None
        self.selectors = dict()
        self.feature_codes = dict()  # feature string -> column index
        self.features = []
        self.features_number = 0

    def _prepare_classifiers(self):
        """Instantiate the base classifier and the feature selectors.

        NOTE(review): ``self.select_features is not None`` is also true for the
        default ``select_features=False``, so feature selection is configured
        (with ``method=False``) even when seemingly disabled; confirm whether a
        truthiness test was intended.
        """
        if self.classifier_params is None:
            self.classifier_params = dict()
        self.classifier_name.set_params(**self.classifier_params)
        if self.select_features is not None:
            if self.selection_params is None:
                self.selection_params = {
                    'nfeatures': 0.1,
                    'min_count': 2,
                    'minfeatures': 100
                }
            self.selection_params['method'] = self.select_features
            # coarse pre-selector applied to the whole training matrix
            self.first_selector = MulticlassFeatureSelector(
                local=True,
                method=self.select_features,
                min_count=self.min_count,
                nfeatures=-1)
            feature_selector = MulticlassFeatureSelector(
                **self.selection_params)
            self.classifier_ = Pipeline([('selector', feature_selector),
                                         ('classifier', self.classifier_name)])
        else:
            self.first_selector = ZeroFeatureRemover()
            self.classifier_ = self.classifier_name
        return self

    def fit(self, X, y):
        """
        Fits the classifier to the data.

        X --- a collection of tuples (fragment, paradigm, left context, right context)
        """
        self._prepare_classifiers()
        substrs, X_train, y = self._preprocess_input(
            X,
            y,
            create_features=True,
            retain_multiple=self.multiclass,
            return_y=True)
        X_train = self.first_selector.fit_transform(X_train, y)
        self._fit_classifiers(substrs, X_train, y)
        return self

    def _fit_classifiers(self, substrs, X_train, y):
        """
        Trains a separate classifier for each substring it is going to be applied to
        """
        if issparse(X_train):
            X_train = X_train.tocsr()
        substr_indexes_in_train = _find_positions(substrs)
        for substr, indexes in substr_indexes_in_train.items():
            X_curr, y_curr = X_train[indexes, :], [y[i] for i in indexes]
            X_curr.sort_indices()
            max_index = max(y_curr)
            # if y_curr contains only a single class, we do not train a classifier
            if min(y_curr) == max_index:
                self.predefined_answers[substr] = max_index
                continue
            self.classifiers[substr] = clone(self.classifier_)
            self.classifiers[substr].fit(X_curr, y_curr)
            self.classifier_classes[substr] = self.classifiers[substr].classes_
        return

    def predict(self, X):
        """Return the predicted class labels (one list of labels per object)."""
        probs = self._predict_proba(X)
        if not self.multiclass:
            # single-label mode: take the most probable class only
            class_indexes = [indices[0] for indices, _ in probs]
            answer = [[x] for x in np.take(self.classes_, class_indexes)]
        else:
            answer = [
                np.take(self.classes_, indices)
                for indices in extract_classes_from_sparse_probs(
                    probs, self.min_probs_ratio)
            ]
        return answer

    def predict_proba(self, X):
        """Return a dense (len(X), n_classes) array of class probabilities."""
        # probs = self._predict_proba(X)
        # answer = np.zeros(dtype=np.float64, shape=(len(X), len(self.classes_)))
        # for i, (indices, elem_probs) in enumerate(probs):
        #     answer[i][indices] = elem_probs
        # return answer
        return self._predict_proba(X, return_arrays=True)

    def _predict_proba(self, X, return_arrays=False):
        """
        Arguments:
        ------------
        X: list of tuples, a list of triples (substring to transform, left context, right context)

        Returns:
        ------------
        answer: list of tuples, a list of pairs (indices, probs),
            indices: array, shape=(len(self.classes), dtype=int,
                transformation codes in order of decreasing probability,
            probs: array, shape=(len(self.classes), dtype=float,
                transformation probabilities in decreasing order.
        """
        substrs, X_test = self._preprocess_input(X,
                                                 create_features=False,
                                                 retain_multiple=False)
        X_test = self.first_selector.transform(X_test)
        substr_indexes = _find_positions(substrs)
        if return_arrays:
            answer = np.zeros(shape=(len(substrs), len(self.classes_)))
        else:
            answer = [None for _ in substrs]
        for substr, indexes in substr_indexes.items():
            # print(substr, end=" ")
            cls = self.classifiers.get(substr, None)
            if cls is not None:
                # X_curr = self.selectors[substr].transform(X_test[indexes])
                # NOTE(review): curr_smallest_prob is computed but unused (its
                # only use is the commented-out thresholding line below).
                curr_smallest_prob = self.smallest_prob / len(cls.classes_)
                X_curr = X_test[indexes, :]
                probs = cls.predict_proba(X_curr)
                # probs[np.where(probs < curr_smallest_prob)] = 0.0
                # zero out negligible probabilities before normalization
                probs[np.where(probs < self.smallest_prob)] = 0.0
                if return_arrays:
                    answer[np.ix_(indexes, cls.classes_)] = probs
                else:
                    for index, row in zip(indexes, probs):
                        nonzero_indices = row.nonzero()[0]
                        row = row[nonzero_indices]
                        # sort the surviving classes by decreasing probability
                        indices_order = np.flipud(np.argsort(row))
                        current_indices = nonzero_indices[indices_order]
                        answer[index] = (np.take(cls.classes_,
                                                 current_indices),
                                         row[indices_order] / row.sum())
                # for index, row in zip(indexes, probs):
                #     nonzero_indices = row.nonzero()[0]
                #     row = row[nonzero_indices]
                #     indices_order = np.flipud(np.argsort(row))
                #     current_indices = nonzero_indices[indices_order]
                #     elem_probs = row[indices_order]
                #     probs_sum, last_index = elem_probs[0], 1
                #     while probs_sum < 1.0 - self.smallest_prob:
                #         probs_sum += elem_probs[last_index]
                #         last_index += 1
                #     class_indexes = np.take(cls.classes_, current_indices[:(last_index)])
                #     answer[index] = (class_indexes, elem_probs[:(last_index)] / probs_sum)
            else:
                # this substring did not occur in the training data,
                # so we return "no replacement" for it
                code = self.predefined_answers.get(substr, 0)
                if return_arrays:
                    answer[indexes, code] = 1.0
                else:
                    for index in indexes:
                        answer[index] = ([code], 1.0)
        return answer

    def _preprocess_input(self,
                          X,
                          y=None,
                          create_features=False,
                          retain_multiple=False,
                          return_y=False,
                          sparse_type='csr'):
        """
        Arguments:
        ----------
        y: list of lists or None, a list whose i-th element contains the classes
            of X[i] in the training data
        create_features: bool, optional(default=False), whether new features are
            created, or unknown ones are mapped to the ``unknown suffix/prefix'' value
        retain_multiple: bool, optional(default=False), whether multiple class
            labels may be returned for a single object
        return_y: whether y is returned (True when processing the training data,
            since in that case y is converted from a list of lists into a flat list)
        """
        ## TODO: shorten
        if create_features:
            # create new features; invoked when processing the training data
            self._create_basic_features()
            self.features_number = len(self.features)
        # create a function to handle unknown features
        ## TODO: rewrite as in the ordinary classifier
        # _process_unknown_feature = self._create_unknown_feature_processor(create_features)
        if y is None:
            # prediction mode: dummy labels; self.classes_ is left untouched
            y = [[None]] * len(X)
            retain_multiple = False
            y_new = list(chain(*y))
        else:
            self.classes_, y_new = np.unique(list(chain(*y)),
                                             return_inverse=True)
        X_with_temporary_codes = []
        for _, left_context, right_context in X:
            active_features_codes = []
            # process left contexts
            for length in range(1, self.left_context_length + 1):
                feature = left_context[-length:]
                feature_code, to_break =\
                    self._get_feature_code(feature, (len(feature) != length),
                                           'left', create_new=create_features)
                active_features_codes.append(feature_code)
                if to_break:
                    break
            # right contexts
            for length in range(1, self.right_context_length + 1):
                feature = right_context[:length]
                feature_code, to_break =\
                    self._get_feature_code(feature, (len(feature) != length),
                                           'right', create_new=create_features)
                active_features_codes.append(feature_code)
                if to_break:
                    break
            # central context
            if self.has_central_context:
                feature = left_context[-1:] + '#' + right_context[:1]
                if feature.startswith('#'):
                    feature = '^' + feature
                if feature.endswith('#'):
                    feature += '$'
                feature_code, to_break =\
                    self._get_feature_code(feature, False, 'center', create_new=create_features)
                # NOTE(review): `> 0` also discards feature code 0 -- confirm intent
                if feature_code > 0:
                    active_features_codes.append(feature_code)
            X_with_temporary_codes.append(active_features_codes)
        if create_features:
            # for equal lengths the order is x$, x#, $x, #x
            feature_order = sorted(enumerate(self.features),
                                   key=(lambda x:
                                        (len(x[1]), x[1].startswith('#'), x[1].
                                         startswith('$'), x[1].endswith('#'))))
            self.features = [x[1] for x in feature_order]
            # recode the temporary feature codes
            temporary_features_recoding = [None] * self.features_number
            for new_code, (code, _) in enumerate(feature_order):
                temporary_features_recoding[code] = new_code
            for i, elem in enumerate(X_with_temporary_codes):
                X_with_temporary_codes[i] = [
                    temporary_features_recoding[code] for code in elem
                ]
            self.feature_codes = {
                feature: code
                for code, feature in enumerate(self.features)
            }
        # assemble the data for classification
        rows, cols, curr_row = [], [], 0
        for codes, labels in zip(X_with_temporary_codes, y):
            # every training object is replicated k times,
            # where k is the number of classes it belongs to; e.g. the pair
            # X[i]=x, y[i]=[1, 2, 6] is converted into <x, 1>, <x, 2>, <x, 6>
            number_of_rows_to_add = 1 if not (retain_multiple) else len(labels)
            for i in range(number_of_rows_to_add):
                rows.extend([curr_row for _ in codes])
                cols.extend(codes)
                curr_row += 1
        # store the transformed data in a sparse matrix
        data = np.ones(shape=(len(rows, )), dtype=float)
        # NOTE(review): an unsupported sparse_type leaves sparse_matrix
        # undefined and raises NameError below -- confirm intended values.
        if sparse_type == 'csr':
            sparse_matrix = csr_matrix
        elif sparse_type == 'csc':
            sparse_matrix = csc_matrix
        X_train = sparse_matrix((data, (rows, cols)),
                                shape=(curr_row, self.features_number))
        # keep the substrings that actually have to be classified
        substrs = [x[0] for x in X]
        if return_y:
            return substrs, X_train, y_new
        else:
            return substrs, X_train

    def _create_basic_features(self):
        """
        Creates the basic features before reading any input

        Returns:
        -----------
        features: list, the list of created features
        """
        self.features, self.feature_codes = [], dict()
        #  features for contexts that never occurred in the training data
        # right contexts
        self.features.extend(
            ("-" * length + '#')
            for length in range(1, self.right_context_length + 1))
        # short right contexts
        self.features.extend(("-" * length + '$')
                             for length in range(self.right_context_length))
        # left contexts
        # NOTE(review): the bound here is right_context_length, not
        # left_context_length -- looks like a copy-paste slip; confirm.
        self.features.extend(
            ('#' + "-" * length)
            for length in range(1, self.right_context_length + 1))
        # short left contexts
        self.features.extend(('^' + "-" * length)
                             for length in range(self.right_context_length))
        if self.has_central_context:
            self.features.append('-#-')
        for code, feature in enumerate(self.features):
            self.feature_codes[feature] = code
        return

    # def _create_new_feature(self, feature, is_full, side):
    #     if is_full:
    #         delim = '^' if side == 'left' else '$'
    #         to_break = True
    #     else:
    #         delim, to_break = '#', False
    #     if side == 'left':
    #         feature = delim + feature
    #     elif side == 'right':
    #         feature += delim
    #     feature_code = self.feature_codes.get(feature, None)
    #     if feature_code is None:
    #         feature_code = self._process_unknown_feature(feature, side)
    #     return feature_code, to_break

    def _get_feature_code(self, feature, is_full, side, create_new=False):
        """
        Returns the code of the feature ``feature``
        """
        if is_full:
            # a complete context is wrapped with a boundary marker
            delim = '^' if side == 'left' else '$'
            to_break = True
        else:
            delim, to_break = '#', False
        if side == 'left':
            feature = delim + feature
        elif side == 'right':
            feature += delim
        code = self.feature_codes.get(feature, -1)
        if code < 0:
            if create_new:
                # training mode: register the feature under a fresh code
                self.features.append(feature)
                self.feature_codes[feature] = code = self.features_number
                self.features_number += 1
            else:
                # prediction mode: back off to progressively '-'-masked variants
                if side == 'right':
                    partial_features = [
                        '-' * start + feature[start:]
                        for start in range(1, len(feature))
                    ]
                elif side == 'left':
                    partial_features = [
                        feature[:(len(feature) - start)] + '-' * start
                        for start in range(1, len(feature))
                    ][::-1]
                else:  # side == 'center'
                    partial_features = []
                    code, to_break = -1, None
                for partial_feature in partial_features:
                    code = self.feature_codes.get(partial_feature, -1)
                    # NOTE(review): `code > 0` treats feature code 0 as "not
                    # found"; confirm whether `code >= 0` was intended.
                    if code > 0:
                        break
                if code > 0:
                    # cache the resolved code for subsequent lookups
                    self.feature_codes[feature] = code
        return code, to_break