Пример #1
0
def cv_mode(testing_mode,
            multiclass,
            predict_lemmas,
            paradigm_file,
            infile,
            fraction,
            nfolds=0,
            order=3):
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(lemma_descriptions_list,
                                         multiclass=multiclass,
                                         return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, pattern_counts = read_paradigms(paradigm_descriptions_list)
    if predict_lemmas:
        paradigm_handlers = {
            code: ParadigmSubstitutor(descr)
            for descr, code in paradigm_table.items()
        }
    else:
        test_lemmas, pred_lemmas = None, None
    # подготовка для кросс-валидации
    classes = sorted(
        set(chain(*((x[0] for x in elem) for elem in labels_with_vars))))
    if nfolds == 0:
        train_data_length = int(fraction * len(data))
        train_data, test_data = [data[:train_data_length]
                                 ], [data[train_data_length:]]
        train_labels_with_vars, test_labels_with_vars =\
            [labels_with_vars[:train_data_length]], [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        test_data, train_data = [None] * nfolds, [None] * nfolds
        test_labels_with_vars, train_labels_with_vars = [None] * nfolds, [
            None
        ] * nfolds
        for fold in range(nfolds):
            train_data[fold], test_data[fold], train_labels_with_vars[fold], test_labels_with_vars[fold] =\
                skcv.train_test_split(data, labels_with_vars, test_size = 1.0 - fraction,
                                      random_state = 100 * fold + 13)
        if predict_lemmas:
            test_lemmas = [None] * nfolds
            for fold in range(nfolds):
                test_lemmas[fold] = make_lemmas(paradigm_handlers,
                                                test_labels_with_vars[fold])
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    classes_by_cls = [None] * nfolds
    if predict_lemmas:
        pred_lemmas = [None] * nfolds
    # задаём классификатор
    lm_classifier = LMParadigmClassifier(paradigm_table,
                                         pattern_counts,
                                         lm_order=order)
    for i, (train_sample, train_labels_sample, test_sample, test_labels_sample) in\
            enumerate(zip(train_data, train_labels_with_vars, test_data, test_labels_with_vars)):
        lm_classifier.fit(train_sample, train_labels_sample)
        lm_classifier.test()
def cv_mode(testing_mode, multiclass, predict_lemmas, paradigm_file, infile,
            fraction, nfolds=0, order=3):
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(lemma_descriptions_list, multiclass=multiclass, return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, pattern_counts = read_paradigms(paradigm_descriptions_list)
    if predict_lemmas:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}
    else:
        test_lemmas, pred_lemmas = None, None
    # подготовка для кросс-валидации
    classes = sorted(set(chain(*((x[0] for x in elem) for elem in labels_with_vars))))
    if nfolds == 0:
        train_data_length = int(fraction * len(data))
        train_data, test_data = [data[:train_data_length]], [data[train_data_length:]]
        train_labels_with_vars, test_labels_with_vars =\
            [labels_with_vars[:train_data_length]], [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        test_data, train_data = [None] * nfolds, [None] * nfolds
        test_labels_with_vars, train_labels_with_vars = [None] * nfolds, [None] * nfolds
        for fold in range(nfolds):
            train_data[fold], test_data[fold], train_labels_with_vars[fold], test_labels_with_vars[fold] =\
                skcv.train_test_split(data, labels_with_vars, test_size = 1.0 - fraction,
                                      random_state = 100 * fold + 13)
        if predict_lemmas:
            test_lemmas = [None] * nfolds
            for fold in range(nfolds):
                test_lemmas[fold] = make_lemmas(paradigm_handlers, test_labels_with_vars[fold])
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    classes_by_cls = [None] * nfolds
    if predict_lemmas:
        pred_lemmas = [None] * nfolds
    # задаём классификатор
    lm_classifier = LMParadigmClassifier(paradigm_table, pattern_counts, lm_order=order)
    for i, (train_sample, train_labels_sample, test_sample, test_labels_sample) in\
            enumerate(zip(train_data, train_labels_with_vars, test_data, test_labels_with_vars)):
        lm_classifier.fit(train_sample, train_labels_sample)
        lm_classifier.test()
def cv_mode(testing_mode, language_code, multiclass, predict_lemmas,
            paradigm_file, counts_file, infile,
            train_fraction, feature_fraction, paradigm_counts_threshold,
            nfolds, selection_method, binarization_method, max_feature_length,
            output_train_dir=None, output_pred_dir=None):
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(lemma_descriptions_list, multiclass=multiclass, return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, paradigm_counts = read_paradigms(paradigm_descriptions_list)
    word_counts_table = read_counts(counts_file)
    if predict_lemmas:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}
    else:
        test_lemmas, pred_lemmas = None, None
    # подготовка для кросс-валидации
    classes = sorted(set(chain(*((x[0] for x in elem) for elem in labels_with_vars))))
    active_paradigm_codes = [i for i, count in paradigm_counts.items()
                             if count >= paradigm_counts_threshold]
    # зачем нужны метки???
    marks = [LEMMA_KEY] + [",".join(x) for x in get_categories_marks(language_code)]
    paradigms_by_codes = {code: descr for descr, code in paradigm_table.items()}
    # подготовка данных для локальных трансформаций
    if selection_method is None:
        selection_method = 'ambiguity'
    if nfolds == 0:
        train_data_length = int(train_fraction * len(data))
        train_data, test_data = [data[:train_data_length]], [data[train_data_length:]]
        train_labels_with_vars, test_labels_with_vars =\
            [labels_with_vars[:train_data_length]], [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        test_data, train_data = [None] * nfolds, [None] * nfolds
        test_labels_with_vars, train_labels_with_vars = [None] * nfolds, [None] * nfolds
        for fold in range(nfolds):
            train_data[fold], test_data[fold], train_labels_with_vars[fold], test_labels_with_vars[fold] =\
                skcv.train_test_split(data, labels_with_vars, test_size = 1.0 - train_fraction,
                                      random_state = 100 * fold + 13)
        if predict_lemmas:
            test_lemmas = [None] * nfolds
            for fold in range(nfolds):
                test_lemmas[fold] = make_lemmas(paradigm_handlers, test_labels_with_vars[fold])
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    classes_by_cls = [None] * nfolds
    if predict_lemmas:
        pred_lemmas = [None] * nfolds
    # задаём классификатор
    # cls = ParadigmCorporaClassifier(marks, paradigm_table, word_counts_table,
    #                                 multiclass=multiclass, selection_method=selection_method,
    #                                 binarization_method=binarization_method,
    #                                 inner_feature_fraction=feature_fraction,
    #                                 active_paradigm_codes=active_paradigm_codes,
    #                                 paradigm_counts=paradigm_counts , smallest_prob=0.01)
    cls = ParadigmCorporaClassifier(paradigm_table, word_counts_table,
                                    multiclass=multiclass, selection_method=selection_method,
                                    binarization_method=binarization_method,
                                    inner_feature_fraction=feature_fraction,
                                    active_paradigm_codes=active_paradigm_codes,
                                    paradigm_counts=paradigm_counts , smallest_prob=0.001)
    cls_params = {'max_length': max_feature_length}
    transformation_handler = TransformationsHandler(paradigm_table, paradigm_counts)
    transformation_classifier_params = {'select_features': 'ambiguity',
                                        'selection_params': {'nfeatures': 0.1, 'min_count': 2}}
    # statprof.start()
    cls = JointParadigmClassifier(cls, transformation_handler, cls_params,
                                  transformation_classifier_params)
    # cls = CombinedParadigmClassifier(cls, transformation_handler, cls_params,
    #                                  transformation_classifier_params)
    # сохраняем тестовые данные
    # if output_train_dir is not None:
    #     if not os.path.exists(output_train_dir):
    #         os.makedirs(output_train_dir)
    #     for i, (train_sample, train_labels_sample) in\
    #             enumerate(zip(train_data, train_labels_with_vars), 1):
    #         write_joint_data(os.path.join(output_train_dir, "{0}.data".format(i)),
    #                          train_sample, train_labels_sample)
    # применяем классификатор к данным
    for i, (train_sample, train_labels_sample, test_sample, test_labels_sample) in\
            enumerate(zip(train_data, train_labels_with_vars, test_data, test_labels_with_vars)):
        cls.fit(train_sample, train_labels_sample)
        classes_by_cls[i] = cls.classes_
        if testing_mode == 'predict':
            predictions[i] = cls.predict(test_sample)
        elif testing_mode == 'predict_proba':
            prediction_probs[i] = cls.predict_probs(test_sample)
            # в случае, если мы вернули вероятности,
            # то надо ещё извлечь классы
            if not multiclass:
                predictions[i] = [[elem[0][0]] for elem in prediction_probs[i]]
            else:
                raise NotImplementedError()
        if predict_lemmas:
            pred_lemmas[i] = make_lemmas(paradigm_handlers, predictions[i])
    # statprof.stop()
    # with open("statprof_{0:.1f}_{1:.1f}.stat".format(train_fraction,
    #                                                  feature_fraction), "w") as fout:
    #     with redirect_stdout(fout):
    #         statprof.display()
    if output_pred_dir:
        descrs_by_codes = {code: descr for descr, code in paradigm_table.items()}
        test_words = test_data
        if testing_mode == 'predict_proba':
            prediction_probs_for_output = prediction_probs
        else:
            prediction_probs_for_output = None
    else:
        descrs_by_codes, test_words, prediction_probs_for_output = None, None, None
    if not predict_lemmas:
        label_precisions, variable_precisions, form_precisions =\
            output_accuracies(classes, test_labels_with_vars, predictions, multiclass,
                              outfile=output_pred_dir, paradigm_descrs=descrs_by_codes,
                              test_words=test_words, predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0:<.2f}\t{1}\t{2:<.2f}\t{3:<.2f}\t{4:<.2f}".format(
            train_fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions), 100 * np.mean(variable_precisions),
            100 * np.mean(form_precisions)))
    else:
        label_precisions, variable_precisions, lemma_precisions, form_precisions =\
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, test_lemmas, pred_lemmas,
                              outfile=output_pred_dir, paradigm_descrs=descrs_by_codes,
                              test_words=test_words, predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0:<.2f}\t{1}\t{2:<.2f}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}".format(
            train_fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions), 100 * np.mean(variable_precisions),
            100 * np.mean(lemma_precisions), 100 * np.mean(form_precisions)))

    # statprof.stop()
    # with open("statprof_{0:.1f}_{1:.1f}.stat".format(fraction, feature_fraction), "w") as fout:
    #     with redirect_stdout(fout):
    #         statprof.display()
    # вычисляем точность и обрабатываем результаты
    # for curr_test_values, curr_pred_values in zip(test_values_with_codes, pred_values_with_codes):
    #     for first, second in zip(curr_test_values, curr_pred_values):
    #         first_code, first_vars = first[0].split('_')[0], tuple(first[0].split('_')[1:])
    #         second_code, second_vars = second[0].split('_')[0], tuple(second[0].split('_')[1:])
    #         if first_code == second_code and first_vars != second_vars:
    #             print('{0}\t{1}'.format(first, second))
    # if not multiclass:
    #     confusion_matrices = [skm.confusion_matrix(first, second, labels=classes)
    #                           for first, second in zip(firsts, seconds)]
    # сохраняем результаты классификации
    # if output_pred_dir is not None:
    #     if not os.path.exists(output_pred_dir):
    #         os.makedirs(output_pred_dir)
    #     if testing_mode == 'predict':
    #         for i, (test_sample, pred_labels_sample, true_labels_sample) in\
    #                 enumerate(zip(test_data, predictions, test_labels), 1):
    #             write_data(os.path.join(output_pred_dir, "{0}.data".format(i)),
    #                        test_sample, pred_labels_sample, true_labels_sample)
    #     elif testing_mode == 'predict_proba':
    #         for i, (test_sample, pred_probs_sample, labels_sample, cls_classes) in\
    #                 enumerate(zip(test_data, prediction_probs, test_labels, classes_by_cls), 1):
    #             write_probs_data(os.path.join(output_pred_dir, "{0}.prob".format(i)),
    #                              test_sample, pred_probs_sample, cls_classes, labels_sample)
    # сохраняем матрицы ошибок классификации
    # if not multiclass and nfolds <= 1:
    #     confusion_matrices_folder = "confusion_matrices"
    #     if not os.path.exists(confusion_matrices_folder):
    #         os.makedirs(confusion_matrices_folder)
    #     dest = os.path.join(confusion_matrices_folder,
    #                         "confusion_matrix_{0}_{1:<.2f}_{2:<.2f}.out".format(
    #                             max_length, fraction, feature_fraction))
    #     with open(dest, "w", encoding="utf8") as fout:
    #         fout.write("{0:<4}".format("") +
    #                    "".join("{0:>4}".format(label) for label in classes) + "\n")
    #         for label, elem in zip(cls.classes_, confusion_matrices[0]):
    #             nonzero_positions = np.nonzero(elem)
    #             nonzero_counts = np.take(elem, nonzero_positions)[0]
    #             nonzero_labels = np.take(classes, nonzero_positions)[0]
    #             fout.write("{0:<4}\t".format(label))
    #             fout.write("\t".join("{0}:{1}".format(*pair)
    #                                  for pair in sorted(zip(nonzero_labels, nonzero_counts),
    #                                                     key=(lambda x: x[1]), reverse=True))
    #                        + "\n")
    return