def cv_mode(testing_mode, multiclass, predict_lemmas, paradigm_file, infile,
            fraction, nfolds=0, order=3):
    """Fit and test an LM paradigm classifier on train/test splits of a lemma file.

    Lemma descriptions are read from ``infile`` and paradigm descriptions from
    ``paradigm_file``.  The data is divided into train/test parts and, for every
    split, an ``LMParadigmClassifier`` is fitted on the training part and its
    ``test()`` method is called.

    Parameters
    ----------
    testing_mode : not used by this variant of the function.
    multiclass : forwarded to ``read_lemmas``.
    predict_lemmas : when true, a ``ParadigmSubstitutor`` is created for every
        paradigm code and reference lemmas are built for each test part.
    paradigm_file : passed to ``process_codes_file``.
    infile : passed to ``process_lemmas_file``.
    fraction : share of the data that goes to the training part.
    nfolds : number of random splits; ``0`` means a single split in which the
        first ``int(fraction * len(data))`` items form the training part.
    order : passed to ``LMParadigmClassifier`` as ``lm_order``.
    """
    # read the input files
    lemma_descriptions = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(
        lemma_descriptions, multiclass=multiclass, return_joint=True)
    paradigm_descriptions = process_codes_file(paradigm_file)
    paradigm_table, pattern_counts = read_paradigms(paradigm_descriptions)
    if predict_lemmas:
        paradigm_handlers = {}
        for descr, code in paradigm_table.items():
            paradigm_handlers[code] = ParadigmSubstitutor(descr)
    else:
        test_lemmas, pred_lemmas = None, None

    # preparation for cross-validation
    classes = sorted({label[0] for labels in labels_with_vars for label in labels})
    if nfolds == 0:
        # deterministic split: the head of the file is the training part
        split_at = int(fraction * len(data))
        train_data = [data[:split_at]]
        test_data = [data[split_at:]]
        train_labels_with_vars = [labels_with_vars[:split_at]]
        test_labels_with_vars = [labels_with_vars[split_at:]]
        nfolds = 1
    else:
        train_data, test_data = [], []
        train_labels_with_vars, test_labels_with_vars = [], []
        for fold in range(nfolds):
            # a different fixed seed per fold keeps the splits reproducible
            fold_train, fold_test, fold_train_labels, fold_test_labels = \
                skcv.train_test_split(data, labels_with_vars,
                                      test_size=1.0 - fraction,
                                      random_state=100 * fold + 13)
            train_data.append(fold_train)
            test_data.append(fold_test)
            train_labels_with_vars.append(fold_train_labels)
            test_labels_with_vars.append(fold_test_labels)
    if predict_lemmas:
        test_lemmas = [make_lemmas(paradigm_handlers, fold_labels)
                       for fold_labels in test_labels_with_vars]

    # per-fold result holders (this variant never fills them in)
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    classes_by_cls = [None] * nfolds
    if predict_lemmas:
        pred_lemmas = [None] * nfolds

    # set up the classifier and run it on every split
    lm_classifier = LMParadigmClassifier(paradigm_table, pattern_counts,
                                         lm_order=order)
    for train_sample, train_labels_sample in zip(train_data,
                                                 train_labels_with_vars):
        lm_classifier.fit(train_sample, train_labels_sample)
        lm_classifier.test()
def cv_mode(testing_mode, multiclass, predict_lemmas, paradigm_file, infile,
            fraction, nfolds=0, order=3):
    """Train and test an ``LMParadigmClassifier`` on splits of the lemma data.

    The function reads the lemma file and the paradigm file, splits the lemma
    data into training and test parts, and for each split calls ``fit`` on the
    training part followed by ``test()`` on the classifier.

    Parameters
    ----------
    testing_mode : accepted but not used in this variant.
    multiclass : forwarded to ``read_lemmas``.
    predict_lemmas : if true, one ``ParadigmSubstitutor`` per paradigm code is
        created and ``make_lemmas`` is applied to the labels of each test part.
    paradigm_file : passed to ``process_codes_file``.
    infile : passed to ``process_lemmas_file``.
    fraction : training share of the data.
    nfolds : number of random splits.  With ``0`` a single split is made that
        takes the first ``int(fraction * len(data))`` items for training.
    order : language-model order, passed as ``lm_order``.
    """
    # read the input files
    data, labels_with_vars = read_lemmas(process_lemmas_file(infile),
                                         multiclass=multiclass,
                                         return_joint=True)
    paradigm_table, pattern_counts = read_paradigms(
        process_codes_file(paradigm_file))
    if not predict_lemmas:
        test_lemmas, pred_lemmas = None, None
    else:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}

    # preparation for cross-validation
    first_components = ((item[0] for item in elem) for elem in labels_with_vars)
    classes = sorted(set(chain.from_iterable(first_components)))
    if nfolds != 0:
        folds = range(nfolds)
        train_data = [None for _ in folds]
        test_data = [None for _ in folds]
        train_labels_with_vars = [None for _ in folds]
        test_labels_with_vars = [None for _ in folds]
        for fold in folds:
            # fold-dependent fixed seed: reproducible but distinct splits
            (train_data[fold], test_data[fold],
             train_labels_with_vars[fold], test_labels_with_vars[fold]) = \
                skcv.train_test_split(data, labels_with_vars,
                                      test_size=1.0 - fraction,
                                      random_state=100 * fold + 13)
    else:
        # single split: the beginning of the data is the training part
        boundary = int(fraction * len(data))
        train_data, test_data = [data[:boundary]], [data[boundary:]]
        train_labels_with_vars = [labels_with_vars[:boundary]]
        test_labels_with_vars = [labels_with_vars[boundary:]]
        nfolds = 1
    if predict_lemmas:
        test_lemmas = [None] * nfolds
        for fold, fold_labels in enumerate(test_labels_with_vars[:nfolds]):
            test_lemmas[fold] = make_lemmas(paradigm_handlers, fold_labels)

    # result holders, one slot per fold (left unfilled by this variant)
    predictions, prediction_probs, classes_by_cls = (
        [None] * nfolds, [None] * nfolds, [None] * nfolds)
    if predict_lemmas:
        pred_lemmas = [None] * nfolds

    # set up the classifier
    lm_classifier = LMParadigmClassifier(paradigm_table, pattern_counts,
                                         lm_order=order)
    fold_samples = zip(train_data, train_labels_with_vars,
                       test_data, test_labels_with_vars)
    for train_sample, train_labels_sample, _, _ in fold_samples:
        lm_classifier.fit(train_sample, train_labels_sample)
        lm_classifier.test()
def cv_mode(testing_mode, multiclass, predict_lemmas, find_flection,
            paradigm_file, infile, max_length, fraction, nfolds=0,
            selection_method=None, feature_fraction=None,
            output_train_dir=None, output_pred_dir=None):
    '''
    Estimate the quality of a classifier with the given parameters
    by cross-validation on the training sample.

    Parameters:
    -----------
    testing_mode: str ('predict' or 'predict_proba'), prediction mode
    multiclass: bool, whether a single word may have several paradigms
    predict_lemmas: bool, whether lemmas are also built (via make_lemmas)
        from the true and predicted labels and scored
    find_flection: bool, whether a preliminary search for the flection
        is performed so that suffixes of the stem, rather than of the whole
        word, are used as features. It turned out that this lowers quality
    paradigm_file: str, path to the file with paradigms
    infile: str, path to the file with the training sample
    max_length: passed to the paradigm classifier as 'max_length' and
        echoed in the printed result line
    fraction: float, fraction of the data used for training
    nfolds: int, optional (default=0),
        number of random splits to average over during cross-validation;
        nfolds=0 --- a single split where the training part is the
        corresponding number of lexemes from the beginning of the file
    selection_method: str or None, optional (default=None),
        feature selection method; None is replaced by 'ambiguity'
    feature_fraction: float or None, optional (default=None),
        fraction of features to keep during feature selection
        (passed to the paradigm classifier as 'nfeatures')
    output_train_dir: str or None, optional (default=None),
        currently unused: saving of the training folds is disabled
    output_pred_dir: str or None, optional (default=None),
        passed to output_accuracies as ``outfile``; when it is empty or None,
        test words and predicted probabilities are not passed along

    Raises:
    -------
    ValueError: if testing_mode is neither 'predict' nor 'predict_proba'
    NotImplementedError: if testing_mode='predict_proba' and multiclass=True
    '''
    # Fail fast: previously an unknown mode left the predictions empty and a
    # multiclass 'predict_proba' run failed only after a full fit on fold 0.
    if testing_mode not in ('predict', 'predict_proba'):
        raise ValueError(
            "testing_mode must be 'predict' or 'predict_proba', "
            "got {0!r}".format(testing_mode))
    if testing_mode == 'predict_proba' and multiclass:
        raise NotImplementedError(
            "extracting classes from probabilities is not implemented "
            "for multiclass=True")

    # read the input files
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(
        lemma_descriptions_list, multiclass=multiclass, return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, pattern_counts = read_paradigms(paradigm_descriptions_list)
    if predict_lemmas:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}
    else:
        test_lemmas, pred_lemmas = None, None

    # preparation for cross-validation
    classes = sorted(set(chain(*((x[0] for x in elem)
                                 for elem in labels_with_vars))))
    if selection_method is None:
        selection_method = 'ambiguity'
    if nfolds == 0:
        # single deterministic split: the head of the file is the training part
        train_data_length = int(fraction * len(data))
        train_data = [data[:train_data_length]]
        test_data = [data[train_data_length:]]
        train_labels_with_vars = [labels_with_vars[:train_data_length]]
        test_labels_with_vars = [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        test_data, train_data = [None] * nfolds, [None] * nfolds
        test_labels_with_vars = [None] * nfolds
        train_labels_with_vars = [None] * nfolds
        for fold in range(nfolds):
            # a fold-dependent fixed seed keeps the splits reproducible
            (train_data[fold], test_data[fold],
             train_labels_with_vars[fold], test_labels_with_vars[fold]) = \
                skcv.train_test_split(data, labels_with_vars,
                                      test_size=1.0 - fraction,
                                      random_state=100 * fold + 13)
    if predict_lemmas:
        test_lemmas = [make_lemmas(paradigm_handlers, fold_labels)
                       for fold_labels in test_labels_with_vars]
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    classes_by_cls = [None] * nfolds
    if predict_lemmas:
        pred_lemmas = [None] * nfolds

    # set up the classifier
    # NOTE(review): paradigm_classifier is not used below (only a disabled
    # CombinedParadigmClassifier variant needed it); it is kept in case the
    # constructor has side effects -- confirm and drop.
    paradigm_classifier = ParadigmClassifier(paradigm_table)
    paradigm_classifier_params = {
        'multiclass': multiclass, 'find_flection': find_flection,
        'max_length': max_length, 'use_prefixes': True,
        'classifier_params': None, 'selection_method': selection_method,
        'nfeatures': feature_fraction, 'smallest_prob': 0.01}
    transformation_handler = TransformationsHandler(paradigm_table,
                                                    pattern_counts)
    transformation_classifier_params = {
        'select_features': 'ambiguity',
        'selection_params': {'nfeatures': 0.1, 'min_count': 2}}
    cls = JointParadigmClassifier(paradigm_table, paradigm_classifier_params,
                                  transformation_handler,
                                  transformation_classifier_params)

    # apply the classifier to every fold
    for i, (train_sample, train_labels_sample, test_sample) in enumerate(
            zip(train_data, train_labels_with_vars, test_data)):
        cls.fit(train_sample, train_labels_sample)
        classes_by_cls[i] = cls.classes_
        if testing_mode == 'predict':
            predictions[i] = cls.predict(test_sample)
        else:
            # 'predict_proba' with multiclass=False (validated above):
            # the predicted class is taken from the first entry of each
            # probability list
            prediction_probs[i] = cls.predict_probs(test_sample)
            predictions[i] = [[elem[0][0]] for elem in prediction_probs[i]]
        if predict_lemmas:
            pred_lemmas[i] = make_lemmas(paradigm_handlers, predictions[i])

    # compute the accuracies and print a tab-separated summary line
    descrs_by_codes = {code: descr for descr, code in paradigm_table.items()}
    if output_pred_dir:
        test_words = test_data
        if testing_mode == 'predict_proba':
            prediction_probs_for_output = prediction_probs
        else:
            prediction_probs_for_output = None
    else:
        test_words, prediction_probs_for_output = None, None
    if not predict_lemmas:
        label_precisions, variable_precisions, form_precisions = \
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, outfile=output_pred_dir,
                              paradigm_descrs=descrs_by_codes,
                              test_words=test_words,
                              predicted_probs=prediction_probs_for_output)
        print("{0}\t{1:<.2f}\t{2}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}".format(
            max_length, fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions),
            100 * np.mean(variable_precisions),
            100 * np.mean(form_precisions)))
    else:
        (label_precisions, variable_precisions,
         lemma_precisions, form_precisions) = \
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, test_lemmas, pred_lemmas,
                              outfile=output_pred_dir,
                              paradigm_descrs=descrs_by_codes,
                              test_words=test_words,
                              predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0}\t{1:<.2f}\t{2}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}\t{6:<.2f}".format(
            max_length, fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions),
            100 * np.mean(variable_precisions),
            100 * np.mean(lemma_precisions),
            100 * np.mean(form_precisions)))
def cv_mode(testing_mode, language_code, multiclass, predict_lemmas,
            paradigm_file, counts_file, infile, train_fraction,
            feature_fraction, paradigm_counts_threshold, nfolds,
            selection_method, binarization_method, max_feature_length,
            output_train_dir=None, output_pred_dir=None):
    '''
    Cross-validate a joint paradigm classifier built on top of a
    ParadigmCorporaClassifier and print a tab-separated accuracy summary.

    Parameters:
    -----------
    testing_mode: str ('predict' or 'predict_proba'), prediction mode
    language_code: passed to get_categories_marks
    multiclass: bool, forwarded to read_lemmas and to the classifier
    predict_lemmas: bool, whether lemmas are also built (via make_lemmas)
        from the true and predicted labels and scored
    paradigm_file: passed to process_codes_file
    counts_file: passed to read_counts
    infile: passed to process_lemmas_file
    train_fraction: float, fraction of the data used for training
    feature_fraction: passed to the classifier as inner_feature_fraction
    paradigm_counts_threshold: only paradigm codes whose count is at least
        this value are passed to the classifier as active_paradigm_codes
    nfolds: int, number of random splits; nfolds=0 --- a single split where
        the first int(train_fraction * len(data)) items form the training part
    selection_method: str or None, feature selection method;
        None is replaced by 'ambiguity'
    binarization_method: forwarded to the classifier
    max_feature_length: passed to the classifier parameters as 'max_length'
    output_train_dir: str or None, optional (default=None),
        currently unused: saving of the training folds is disabled
    output_pred_dir: str or None, optional (default=None),
        passed to output_accuracies as ``outfile``; when it is empty or None,
        paradigm descriptions, test words and predicted probabilities
        are not passed along

    Raises:
    -------
    ValueError: if testing_mode is neither 'predict' nor 'predict_proba'
    NotImplementedError: if testing_mode='predict_proba' and multiclass=True
    '''
    # Fail fast: previously an unknown mode left the predictions empty and a
    # multiclass 'predict_proba' run failed only after a full fit on fold 0.
    if testing_mode not in ('predict', 'predict_proba'):
        raise ValueError(
            "testing_mode must be 'predict' or 'predict_proba', "
            "got {0!r}".format(testing_mode))
    if testing_mode == 'predict_proba' and multiclass:
        raise NotImplementedError(
            "extracting classes from probabilities is not implemented "
            "for multiclass=True")

    # read the input files
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(
        lemma_descriptions_list, multiclass=multiclass, return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, paradigm_counts = read_paradigms(paradigm_descriptions_list)
    word_counts_table = read_counts(counts_file)
    if predict_lemmas:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}
    else:
        test_lemmas, pred_lemmas = None, None

    # preparation for cross-validation
    classes = sorted(set(chain(*((x[0] for x in elem)
                                 for elem in labels_with_vars))))
    active_paradigm_codes = [code for code, count in paradigm_counts.items()
                             if count >= paradigm_counts_threshold]
    # NOTE(review): marks is not used below (the original author also asked
    # why it is needed); the call is kept because get_categories_marks may
    # reject an unknown language_code -- confirm and drop.
    marks = [LEMMA_KEY] + [",".join(x)
                           for x in get_categories_marks(language_code)]
    if selection_method is None:
        selection_method = 'ambiguity'
    if nfolds == 0:
        # single deterministic split: the head of the file is the training part
        train_data_length = int(train_fraction * len(data))
        train_data = [data[:train_data_length]]
        test_data = [data[train_data_length:]]
        train_labels_with_vars = [labels_with_vars[:train_data_length]]
        test_labels_with_vars = [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        test_data, train_data = [None] * nfolds, [None] * nfolds
        test_labels_with_vars = [None] * nfolds
        train_labels_with_vars = [None] * nfolds
        for fold in range(nfolds):
            # a fold-dependent fixed seed keeps the splits reproducible
            (train_data[fold], test_data[fold],
             train_labels_with_vars[fold], test_labels_with_vars[fold]) = \
                skcv.train_test_split(data, labels_with_vars,
                                      test_size=1.0 - train_fraction,
                                      random_state=100 * fold + 13)
    if predict_lemmas:
        test_lemmas = [make_lemmas(paradigm_handlers, fold_labels)
                       for fold_labels in test_labels_with_vars]
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    classes_by_cls = [None] * nfolds
    if predict_lemmas:
        pred_lemmas = [None] * nfolds

    # set up the classifier: the corpus-based paradigm classifier is wrapped
    # into a joint classifier together with the transformations handler
    cls = ParadigmCorporaClassifier(
        paradigm_table, word_counts_table, multiclass=multiclass,
        selection_method=selection_method,
        binarization_method=binarization_method,
        inner_feature_fraction=feature_fraction,
        active_paradigm_codes=active_paradigm_codes,
        paradigm_counts=paradigm_counts, smallest_prob=0.001)
    cls_params = {'max_length': max_feature_length}
    transformation_handler = TransformationsHandler(paradigm_table,
                                                    paradigm_counts)
    transformation_classifier_params = {
        'select_features': 'ambiguity',
        'selection_params': {'nfeatures': 0.1, 'min_count': 2}}
    cls = JointParadigmClassifier(cls, transformation_handler, cls_params,
                                  transformation_classifier_params)

    # apply the classifier to every fold
    for i, (train_sample, train_labels_sample, test_sample) in enumerate(
            zip(train_data, train_labels_with_vars, test_data)):
        cls.fit(train_sample, train_labels_sample)
        classes_by_cls[i] = cls.classes_
        if testing_mode == 'predict':
            predictions[i] = cls.predict(test_sample)
        else:
            # 'predict_proba' with multiclass=False (validated above):
            # the predicted class is taken from the first entry of each
            # probability list
            prediction_probs[i] = cls.predict_probs(test_sample)
            predictions[i] = [[elem[0][0]] for elem in prediction_probs[i]]
        if predict_lemmas:
            pred_lemmas[i] = make_lemmas(paradigm_handlers, predictions[i])

    # compute the accuracies and print a tab-separated summary line
    if output_pred_dir:
        descrs_by_codes = {code: descr
                           for descr, code in paradigm_table.items()}
        test_words = test_data
        if testing_mode == 'predict_proba':
            prediction_probs_for_output = prediction_probs
        else:
            prediction_probs_for_output = None
    else:
        descrs_by_codes, test_words, prediction_probs_for_output = \
            None, None, None
    if not predict_lemmas:
        label_precisions, variable_precisions, form_precisions = \
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, outfile=output_pred_dir,
                              paradigm_descrs=descrs_by_codes,
                              test_words=test_words,
                              predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0:<.2f}\t{1}\t{2:<.2f}\t{3:<.2f}\t{4:<.2f}".format(
            train_fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions),
            100 * np.mean(variable_precisions),
            100 * np.mean(form_precisions)))
    else:
        (label_precisions, variable_precisions,
         lemma_precisions, form_precisions) = \
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, test_lemmas, pred_lemmas,
                              outfile=output_pred_dir,
                              paradigm_descrs=descrs_by_codes,
                              test_words=test_words,
                              predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0:<.2f}\t{1}\t{2:<.2f}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}".format(
            train_fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions),
            100 * np.mean(variable_precisions),
            100 * np.mean(lemma_precisions),
            100 * np.mean(form_precisions)))