示例#1
0
def cv_mode(testing_mode, multiclass, predict_lemmas, paradigm_file, infile,
            fraction, nfolds=0, order=3):
    """Fit and test an ``LMParadigmClassifier`` on train/test splits of a lemma file.

    Parameters
    ----------
    testing_mode : not read by any statement of this function.
    multiclass : forwarded to ``read_lemmas``.
    predict_lemmas : when truthy, a ``ParadigmSubstitutor`` is built for each
        paradigm code and, for ``nfolds != 0``, ``make_lemmas`` is applied to
        the test labels of every fold.
    paradigm_file : path handed to ``process_codes_file``.
    infile : path handed to ``process_lemmas_file``.
    fraction : share of the data that goes to the training part.
    nfolds : number of random splits; ``0`` means a single split where the
        first ``int(fraction * len(data))`` items form the training part.
    order : passed to ``LMParadigmClassifier`` as ``lm_order``.
    """
    # Load the labelled sample and the paradigm inventory.
    lemma_records = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(
        lemma_records, multiclass=multiclass, return_joint=True)
    paradigm_records = process_codes_file(paradigm_file)
    paradigm_table, pattern_counts = read_paradigms(paradigm_records)
    if not predict_lemmas:
        test_lemmas = pred_lemmas = None
    else:
        paradigm_handlers = {}
        for descr, code in paradigm_table.items():
            paradigm_handlers[code] = ParadigmSubstitutor(descr)
    # Preparation for cross-validation: collect every label that occurs.
    classes = sorted({label[0]
                      for item_labels in labels_with_vars
                      for label in item_labels})
    if nfolds == 0:
        # Deterministic split: head of the file trains, tail tests.
        split_at = int(fraction * len(data))
        train_data = [data[:split_at]]
        test_data = [data[split_at:]]
        train_labels_with_vars = [labels_with_vars[:split_at]]
        test_labels_with_vars = [labels_with_vars[split_at:]]
        nfolds = 1
    else:
        train_data, test_data = [], []
        train_labels_with_vars, test_labels_with_vars = [], []
        for fold in range(nfolds):
            # The seed depends on the fold index, so every fold is reproducible.
            fold_train, fold_test, fold_train_labels, fold_test_labels = \
                skcv.train_test_split(data, labels_with_vars,
                                      test_size=1.0 - fraction,
                                      random_state=100 * fold + 13)
            train_data.append(fold_train)
            test_data.append(fold_test)
            train_labels_with_vars.append(fold_train_labels)
            test_labels_with_vars.append(fold_test_labels)
        if predict_lemmas:
            test_lemmas = [make_lemmas(paradigm_handlers, fold_labels)
                           for fold_labels in test_labels_with_vars]
    # Per-fold result slots.
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    classes_by_cls = [None] * nfolds
    if predict_lemmas:
        pred_lemmas = [None] * nfolds
    # Set up the classifier and run it on every fold.
    lm_classifier = LMParadigmClassifier(
        paradigm_table, pattern_counts, lm_order=order)
    for train_sample, train_labels_sample in zip(train_data,
                                                 train_labels_with_vars):
        lm_classifier.fit(train_sample, train_labels_sample)
        lm_classifier.test()
def cv_mode(testing_mode, multiclass, predict_lemmas, paradigm_file, infile,
            fraction, nfolds=0, order=3):
    """Train and test an ``LMParadigmClassifier`` over one or several data splits.

    Parameters
    ----------
    testing_mode : not read by any statement of this function.
    multiclass : forwarded to ``read_lemmas``.
    predict_lemmas : if truthy, ``ParadigmSubstitutor`` handlers are created
        for all paradigm codes and, when ``nfolds != 0``, the test labels of
        each fold are turned into lemmas with ``make_lemmas``.
    paradigm_file : path passed to ``process_codes_file``.
    infile : path passed to ``process_lemmas_file``.
    fraction : share of the data used as the training part.
    nfolds : number of random splits; with ``0`` a single split is made that
        takes the first ``int(fraction * len(data))`` items for training.
    order : forwarded to ``LMParadigmClassifier`` as ``lm_order``.
    """
    # Read the input files.
    data, labels_with_vars = read_lemmas(process_lemmas_file(infile),
                                         multiclass=multiclass,
                                         return_joint=True)
    paradigm_table, pattern_counts = read_paradigms(
        process_codes_file(paradigm_file))
    if predict_lemmas:
        paradigm_handlers = dict(
            (code, ParadigmSubstitutor(descr))
            for descr, code in paradigm_table.items())
    else:
        test_lemmas = None
        pred_lemmas = None
    # Preparation for cross-validation.
    observed_labels = chain.from_iterable(
        (variant[0] for variant in item) for item in labels_with_vars)
    classes = sorted(set(observed_labels))
    if nfolds != 0:
        # One seeded random split per fold.
        splits = [
            skcv.train_test_split(data, labels_with_vars,
                                  test_size=1.0 - fraction,
                                  random_state=100 * fold + 13)
            for fold in range(nfolds)
        ]
        train_data = [split[0] for split in splits]
        test_data = [split[1] for split in splits]
        train_labels_with_vars = [split[2] for split in splits]
        test_labels_with_vars = [split[3] for split in splits]
        if predict_lemmas:
            test_lemmas = []
            for fold_test_labels in test_labels_with_vars:
                test_lemmas.append(
                    make_lemmas(paradigm_handlers, fold_test_labels))
    else:
        # No folds requested: train on the head of the data, test on the tail.
        boundary = int(fraction * len(data))
        train_data, test_data = [data[:boundary]], [data[boundary:]]
        train_labels_with_vars = [labels_with_vars[:boundary]]
        test_labels_with_vars = [labels_with_vars[boundary:]]
        nfolds = 1
    # Per-fold result slots.
    predictions, prediction_probs, classes_by_cls = (
        [None] * nfolds, [None] * nfolds, [None] * nfolds)
    if predict_lemmas:
        pred_lemmas = [None] * nfolds
    # Set up the classifier and run it on each fold.
    lm_classifier = LMParadigmClassifier(paradigm_table, pattern_counts,
                                         lm_order=order)
    fold_inputs = zip(train_data, train_labels_with_vars)
    for train_sample, train_labels_sample in fold_inputs:
        lm_classifier.fit(train_sample, train_labels_sample)
        lm_classifier.test()
def cv_mode(testing_mode, multiclass, predict_lemmas, find_flection,
            paradigm_file, infile, max_length, fraction, nfolds=0,
            selection_method=None, feature_fraction=None,
            output_train_dir=None, output_pred_dir=None):
    '''
    Estimate the quality of the classifier with the given parameters
    by cross-validation on the training sample and print the mean precisions
    as one tab-separated line.

    Parameters:
    -----------
    testing_mode: str ('predict' or 'predict_proba'), usage mode;
        for any other value no predictions are produced by the fold loop
    multiclass: bool, whether one word may have several paradigms
        ('predict_proba' with multiclass=True raises NotImplementedError)
    predict_lemmas: bool, whether lemmas are built with make_lemmas from the
        gold and the predicted labels and lemma precision is reported as well
    find_flection: bool, whether a preliminary search for the inflection
        is performed in order to use suffixes of the stem, not of the whole
        word, as features. It turned out that this lowers the quality
    paradigm_file: str, path to the file with paradigms
    infile: str, path to the file with the training sample
    max_length: passed to the paradigm classifier as 'max_length' and
        echoed in the printed line
    fraction: float, share of the training sample
    nfolds: int, optional (default=0)
        number of splits that are averaged over during cross-validation
        nfolds=0 --- the training sample takes the corresponding number
                     of lexemes from the beginning of the file
    selection_method: str or None, optional (default=None),
        feature selection method; None is replaced by 'ambiguity'
    feature_fraction: float or None, optional (default=None),
        share of features to keep during feature selection
        (nfeatures must be left unset in that case)
    output_train_dir: str or None, optional (default=None),
        directory for saving the training splits; accepted for interface
        compatibility, the current code does not read it
    output_pred_dir: str or None, optional (default=None),
        destination for the classification results,
        nothing is saved when it is None (or otherwise falsy)

    Returns None.
    '''
    # read the input files
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(lemma_descriptions_list,
                                         multiclass=multiclass, return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, pattern_counts = read_paradigms(paradigm_descriptions_list)
    if predict_lemmas:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}
    # preparation for cross-validation
    classes = sorted(set(chain.from_iterable(
        (x[0] for x in elem) for elem in labels_with_vars)))
    if selection_method is None:
        selection_method = 'ambiguity'
    if nfolds == 0:
        # single deterministic split: head of the file trains, tail tests
        train_data_length = int(fraction * len(data))
        train_data, test_data = [data[:train_data_length]], [data[train_data_length:]]
        train_labels_with_vars, test_labels_with_vars =\
            [labels_with_vars[:train_data_length]], [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        test_data, train_data = [None] * nfolds, [None] * nfolds
        test_labels_with_vars, train_labels_with_vars = [None] * nfolds, [None] * nfolds
        for fold in range(nfolds):
            # the seed depends on the fold index, so the splits are reproducible
            train_data[fold], test_data[fold], train_labels_with_vars[fold], test_labels_with_vars[fold] =\
                skcv.train_test_split(data, labels_with_vars, test_size=1.0 - fraction,
                                      random_state=100 * fold + 13)
    if predict_lemmas:
        # Gold lemmas are required for every split, including the single
        # split of the nfolds == 0 mode; building them only in the random-split
        # branch left test_lemmas unbound when it is passed to
        # output_accuracies below.
        test_lemmas = [make_lemmas(paradigm_handlers, fold_labels)
                       for fold_labels in test_labels_with_vars]
        pred_lemmas = [None] * nfolds
    else:
        test_lemmas, pred_lemmas = None, None
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    classes_by_cls = [None] * nfolds
    # set up the classifier
    paradigm_classifier_params = {'multiclass': multiclass, 'find_flection': find_flection,
                                  'max_length': max_length, 'use_prefixes': True,
                                  'classifier_params': None, 'selection_method': selection_method,
                                  'nfeatures': feature_fraction, 'smallest_prob': 0.01}
    transformation_handler = TransformationsHandler(paradigm_table, pattern_counts)
    transformation_classifier_params = {'select_features': 'ambiguity',
                                        'selection_params': {'nfeatures': 0.1, 'min_count': 2}}
    cls = JointParadigmClassifier(paradigm_table, paradigm_classifier_params,
                                  transformation_handler, transformation_classifier_params)
    # apply the classifier to the data
    for i, (train_sample, train_labels_sample, test_sample) in\
            enumerate(zip(train_data, train_labels_with_vars, test_data)):
        cls.fit(train_sample, train_labels_sample)
        classes_by_cls[i] = cls.classes_
        if testing_mode == 'predict':
            predictions[i] = cls.predict(test_sample)
        elif testing_mode == 'predict_proba':
            prediction_probs[i] = cls.predict_probs(test_sample)
            # when probabilities were returned,
            # the classes still have to be extracted from them
            if not multiclass:
                predictions[i] = [[elem[0][0]] for elem in prediction_probs[i]]
            else:
                raise NotImplementedError()
        if predict_lemmas:
            pred_lemmas[i] = make_lemmas(paradigm_handlers, predictions[i])
    descrs_by_codes = {code: descr for descr, code in paradigm_table.items()}
    if output_pred_dir:
        test_words = test_data
        if testing_mode == 'predict_proba':
            prediction_probs_for_output = prediction_probs
        else:
            prediction_probs_for_output = None
    else:
        test_words, prediction_probs_for_output = None, None
    # compute the precisions and report them
    if not predict_lemmas:
        label_precisions, variable_precisions, form_precisions =\
            output_accuracies(classes, test_labels_with_vars, predictions, multiclass,
                              outfile=output_pred_dir, paradigm_descrs=descrs_by_codes,
                              test_words=test_words, predicted_probs=prediction_probs_for_output)
        print("{0}\t{1:<.2f}\t{2}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}".format(
            max_length, fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions), 100 * np.mean(variable_precisions),
            100 * np.mean(form_precisions)))
    else:
        label_precisions, variable_precisions, lemma_precisions, form_precisions =\
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, test_lemmas, pred_lemmas,
                              outfile=output_pred_dir, paradigm_descrs=descrs_by_codes,
                              test_words=test_words, predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0}\t{1:<.2f}\t{2}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}\t{6:<.2f}".format(
            max_length, fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions), 100 * np.mean(variable_precisions),
            100 * np.mean(lemma_precisions), 100 * np.mean(form_precisions)))
            simple_score = measure(curr_test_binary, curr_pred_binary)
        else:
            score, simple_score = 1.0, 1.0
        accuracies[substr] = (score, simple_score, np.count_nonzero(curr_pred_binary),
                              np.count_nonzero(curr_test_binary), len(curr_test_binary))
    return accuracies

def output_results(outfile, accuracies):
    """Write the accuracy table to ``outfile`` as UTF-8, tab-separated text.

    ``accuracies`` maps a key to a 5-item sequence. Rows are written in
    sorted item order as: key, the first two values with two decimals,
    then the remaining three values unchanged.
    """
    row_format = "{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t{5}\n"
    with open(outfile, "w", encoding="utf8") as fout:
        fout.writelines(row_format.format(substr, *score)
                        for substr, score in sorted(accuracies.items()))

# Script entry point: trains a LocalTransformationClassifier on data produced
# by a TransformationsHandler and predicts on a held-out quarter of it.
if __name__ == "__main__":
    # Exactly four command-line arguments are expected; any other count makes
    # the tuple unpacking below raise ValueError.
    args = sys.argv[1:]
    lemmas_file, codes_file, trans_codes_outfile, results_outfile = args
    lemmas_data = process_lemmas_file(lemmas_file)
    codes_data = process_codes_file(codes_file)
    paradigm_codes, paradigm_counts = read_paradigms(codes_data)
    transformations_handler = TransformationsHandler(paradigm_codes, paradigm_counts)
    # The transformations are written to the third command-line argument.
    transformations_handler.output_transformations(trans_codes_outfile)
    ltc = LocalTransformationClassifier(
        left_context_length=2, right_context_length=3, select_features='ambiguity',
        selection_params={'nfeatures': 0.1, 'min_count': 3, 'min_features': 100})
    X, y = transformations_handler.create_training_data(lemmas_data)
    # Fixed seed: the 75/25 train/test split is the same on every run.
    X_train, X_test, Y_train, Y_test = skcv.train_test_split(X, y, test_size=0.25, random_state=156)
    # Wrap every label in a one-element list
    # (presumably the classifier expects a list of labels per item -- TODO confirm).
    Y_train, Y_test = [[x] for x in Y_train], [[x] for x in Y_test]
    # print("Fitting...")
    ltc.fit(X_train, Y_train)
    # print("Predicting...")
    Y_pred = ltc.predict(X_test)
    # First component of every test item.
    substrs = [elem[0] for elem in X_test]
def cv_mode(testing_mode, language_code, multiclass, predict_lemmas,
            paradigm_file, counts_file, infile,
            train_fraction, feature_fraction, paradigm_counts_threshold,
            nfolds, selection_method, binarization_method, max_feature_length,
            output_train_dir=None, output_pred_dir=None):
    '''
    Estimate the quality of a corpus-based joint paradigm classifier
    by cross-validation and print the mean precisions as one tab-separated line.

    Parameters:
    -----------
    testing_mode: str ('predict' or 'predict_proba'), usage mode;
        for any other value no predictions are produced by the fold loop
    language_code: passed to get_categories_marks
    multiclass: bool, forwarded to read_lemmas and to the classifier
        ('predict_proba' with multiclass=True raises NotImplementedError)
    predict_lemmas: bool, whether lemmas are built with make_lemmas from the
        gold and the predicted labels and lemma precision is reported as well
    paradigm_file: path passed to process_codes_file
    counts_file: path passed to read_counts
    infile: path passed to process_lemmas_file
    train_fraction: float, share of the data used for training
    feature_fraction: passed to the classifier as inner_feature_fraction
    paradigm_counts_threshold: paradigms whose count is at least this value
        are passed to the classifier as active_paradigm_codes
    nfolds: int, number of random splits; nfolds=0 makes a single split that
        takes the first int(train_fraction * len(data)) items for training
    selection_method: str or None, feature selection method;
        None is replaced by 'ambiguity'
    binarization_method: passed to the classifier unchanged
    max_feature_length: passed to JointParadigmClassifier as 'max_length'
    output_train_dir: accepted for interface compatibility,
        the current code does not read it
    output_pred_dir: destination for the classification results,
        nothing is saved when it is None (or otherwise falsy)

    Returns None.
    '''
    # read the input files
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(lemma_descriptions_list,
                                         multiclass=multiclass, return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, paradigm_counts = read_paradigms(paradigm_descriptions_list)
    word_counts_table = read_counts(counts_file)
    if predict_lemmas:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}
    # preparation for cross-validation
    classes = sorted(set(chain.from_iterable(
        (x[0] for x in elem) for elem in labels_with_vars)))
    active_paradigm_codes = [code for code, count in paradigm_counts.items()
                             if count >= paradigm_counts_threshold]
    # NOTE(review): what are the marks needed for? They are not consumed below;
    # the call is kept in case get_categories_marks validates language_code.
    marks = [LEMMA_KEY] + [",".join(x) for x in get_categories_marks(language_code)]
    if selection_method is None:
        selection_method = 'ambiguity'
    if nfolds == 0:
        # single deterministic split: head of the file trains, tail tests
        train_data_length = int(train_fraction * len(data))
        train_data, test_data = [data[:train_data_length]], [data[train_data_length:]]
        train_labels_with_vars, test_labels_with_vars =\
            [labels_with_vars[:train_data_length]], [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        test_data, train_data = [None] * nfolds, [None] * nfolds
        test_labels_with_vars, train_labels_with_vars = [None] * nfolds, [None] * nfolds
        for fold in range(nfolds):
            # the seed depends on the fold index, so the splits are reproducible
            train_data[fold], test_data[fold], train_labels_with_vars[fold], test_labels_with_vars[fold] =\
                skcv.train_test_split(data, labels_with_vars, test_size=1.0 - train_fraction,
                                      random_state=100 * fold + 13)
    if predict_lemmas:
        # Gold lemmas are required for every split, including the single
        # split of the nfolds == 0 mode; building them only in the random-split
        # branch left test_lemmas unbound when it is passed to
        # output_accuracies below.
        test_lemmas = [make_lemmas(paradigm_handlers, fold_labels)
                       for fold_labels in test_labels_with_vars]
        pred_lemmas = [None] * nfolds
    else:
        test_lemmas, pred_lemmas = None, None
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    classes_by_cls = [None] * nfolds
    # set up the classifier
    corpora_classifier = ParadigmCorporaClassifier(
        paradigm_table, word_counts_table,
        multiclass=multiclass, selection_method=selection_method,
        binarization_method=binarization_method,
        inner_feature_fraction=feature_fraction,
        active_paradigm_codes=active_paradigm_codes,
        paradigm_counts=paradigm_counts, smallest_prob=0.001)
    cls_params = {'max_length': max_feature_length}
    transformation_handler = TransformationsHandler(paradigm_table, paradigm_counts)
    transformation_classifier_params = {'select_features': 'ambiguity',
                                        'selection_params': {'nfeatures': 0.1, 'min_count': 2}}
    cls = JointParadigmClassifier(corpora_classifier, transformation_handler, cls_params,
                                  transformation_classifier_params)
    # apply the classifier to the data
    for i, (train_sample, train_labels_sample, test_sample) in\
            enumerate(zip(train_data, train_labels_with_vars, test_data)):
        cls.fit(train_sample, train_labels_sample)
        classes_by_cls[i] = cls.classes_
        if testing_mode == 'predict':
            predictions[i] = cls.predict(test_sample)
        elif testing_mode == 'predict_proba':
            prediction_probs[i] = cls.predict_probs(test_sample)
            # when probabilities were returned,
            # the classes still have to be extracted from them
            if not multiclass:
                predictions[i] = [[elem[0][0]] for elem in prediction_probs[i]]
            else:
                raise NotImplementedError()
        if predict_lemmas:
            pred_lemmas[i] = make_lemmas(paradigm_handlers, predictions[i])
    if output_pred_dir:
        descrs_by_codes = {code: descr for descr, code in paradigm_table.items()}
        test_words = test_data
        if testing_mode == 'predict_proba':
            prediction_probs_for_output = prediction_probs
        else:
            prediction_probs_for_output = None
    else:
        descrs_by_codes, test_words, prediction_probs_for_output = None, None, None
    # compute the precisions and report them
    if not predict_lemmas:
        label_precisions, variable_precisions, form_precisions =\
            output_accuracies(classes, test_labels_with_vars, predictions, multiclass,
                              outfile=output_pred_dir, paradigm_descrs=descrs_by_codes,
                              test_words=test_words, predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0:<.2f}\t{1}\t{2:<.2f}\t{3:<.2f}\t{4:<.2f}".format(
            train_fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions), 100 * np.mean(variable_precisions),
            100 * np.mean(form_precisions)))
    else:
        label_precisions, variable_precisions, lemma_precisions, form_precisions =\
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, test_lemmas, pred_lemmas,
                              outfile=output_pred_dir, paradigm_descrs=descrs_by_codes,
                              test_words=test_words, predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0:<.2f}\t{1}\t{2:<.2f}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}".format(
            train_fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions), 100 * np.mean(variable_precisions),
            100 * np.mean(lemma_precisions), 100 * np.mean(form_precisions)))
                              np.count_nonzero(curr_test_binary),
                              len(curr_test_binary))
    return accuracies


def output_results(outfile, accuracies):
    """Dump per-key accuracy statistics to a UTF-8, tab-separated file.

    Every value of ``accuracies`` is a 5-item sequence. One row is written
    per item, in sorted item order: the key, the first two values rounded
    to two decimals, then the three remaining values as they are.
    """
    line_template = "\t".join(
        ["{0}", "{1:.2f}", "{2:.2f}", "{3}", "{4}", "{5}"]) + "\n"
    with open(outfile, "w", encoding="utf8") as fout:
        ordered_rows = sorted(accuracies.items())
        for substr, score in ordered_rows:
            line = line_template.format(substr, *score)
            fout.write(line)


if __name__ == "__main__":
    args = sys.argv[1:]
    lemmas_file, codes_file, trans_codes_outfile, results_outfile = args
    lemmas_data = process_lemmas_file(lemmas_file)
    codes_data = process_codes_file(codes_file)
    paradigm_codes, paradigm_counts = read_paradigms(codes_data)
    transformations_handler = TransformationsHandler(paradigm_codes,
                                                     paradigm_counts)
    transformations_handler.output_transformations(trans_codes_outfile)
    ltc = LocalTransformationClassifier(left_context_length=2,
                                        right_context_length=3,
                                        select_features='ambiguity',
                                        selection_params={
                                            'nfeatures': 0.1,
                                            'min_count': 3,
                                            'min_features': 100
                                        })
    X, y = transformations_handler.create_training_data(lemmas_data)
    X_train, X_test, Y_train, Y_test = skcv.train_test_split(X,