def fit(self, data):
    """Train one joint paradigm classifier per problem code.

    When ``self.fit_lm`` is set, a language model file is required: either
    ``self.lm_file`` already points to one, or a new model is trained from
    ``data`` (the sequence ``x[2][0]`` of every item) and stored under
    ``self.save_lm_file``.

    Args:
        data: training items; each item is indexed as ``x[2][0]`` when the
            language model corpus is built and is otherwise passed as is to
            the affix remover and to ``make_paradigms_from_data``.

    Returns:
        self

    Raises:
        ValueError: if a language model is requested but neither
            ``lm_file`` nor ``save_lm_file`` is set.
    """
    if not self.fit_lm:
        lm_options = {'has_language_model': False,
                      'has_joint_classifier': False}
    else:
        if self.lm_file is None:
            if self.save_lm_file is None:
                raise ValueError(
                    "Either lm_file or save_lm_file should be given")
            lm_corpus = [list(item[2][0]) for item in data]
            save_language_model(lm_corpus, self.lm_order, self.save_lm_file)
            self.lm_file = self.save_lm_file
        lm_options = {'has_language_model': True,
                      'lm_file': self.lm_file,
                      'has_joint_classifier': True,
                      'max_lm_coeff': 2.0}

    self.affixes_remover.train(data)
    joint_data = self.make_paradigms_from_data(data)
    handler = TransformationsHandler(self.transform_codes)
    transformation_options = {
        'select_features': 'ambiguity',
        'selection_params': {'nfeatures': 0.25, 'min_count': 2},
    }

    self.classifiers = [None] * self.problems_number
    base_options = get_classifier_params(self.language, self.task_type)
    base_options.update(min_feature_count=3, nfeatures=0.1)

    # Every problem gets its own copy of the shared options, extended with
    # the affixes the remover selected for that problem.
    for index, problem in enumerate(self.problem_codes):
        options = copy.copy(base_options)
        prefixes, suffixes = self.affixes_remover.get_affixes(problem)
        options['prefixes_to_remove'] = prefixes
        options['suffixes_to_remove'] = suffixes
        self.classifiers[index] = JointParadigmClassifier(
            self.transform_codes, options, handler,
            transformation_options, **lm_options)

    grouped = arrange_data_by_problems(
        joint_data, self.problems_number, has_answers=True)
    for index, (_, features, answers) in enumerate(grouped):
        if not index % 20:
            # progress report for every twentieth classifier
            print("Classifier {} fitting...".format(index + 1))
        # each answer is wrapped into a one-element list before fitting
        self.classifiers[index].fit(
            features, [[answer] for answer in answers])
    return self
def fit(self, data):
    """Train one joint paradigm classifier per problem.

    The base paradigm classifier settings are hard-coded; the active preset
    is the one labelled GEORGIAN below, the other language presets are kept
    as comments for reference.

    Args:
        data: training items, passed to ``make_paradigms_from_data``.

    Returns:
        self
    """
    joint_data = self.make_paradigms_from_data(data)
    handler = TransformationsHandler(self.transform_codes)
    transformation_options = {
        'select_features': 'ambiguity',
        'selection_params': {'nfeatures': 0.25, 'min_count': 2},
    }
    self.classifiers = [None] * self.problems_number

    # Presets for other languages (inactive):
    # GERMAN
    # {'paradigm_table': self.transform_codes, 'use_prefixes': True,
    #  'max_prefix_length': 2, 'suffixes_to_delete': ['en'],
    #  'has_letter_classifiers': False}
    # SPANISH
    # {'paradigm_table': self.transform_codes, 'use_prefixes': False}
    # ARABIC
    # {'paradigm_table': self.transform_codes, 'use_prefixes': True,
    #  'max_prefix_length': 4, 'max_length': 4,
    #  'has_letter_classifiers': None, 'to_memorize_affixes': 0}
    # FINNISH
    # {'paradigm_table': self.transform_codes, 'use_prefixes': False}
    # RUSSIAN
    # {'paradigm_table': self.transform_codes, 'use_prefixes': False,
    #  'max_prefix_length': 3, 'suffixes_to_delete': ['ся', 'сь'],
    #  'to_memorize_affixes': 2, 'has_letter_classifiers': False}
    # NAVAJO
    # {'paradigm_table': self.transform_codes, 'use_prefixes': True,
    #  'max_prefix_length': 5, 'max_length': 3,
    #  'has_letter_classifiers': 'prefix', 'to_memorize_affixes': -3}
    # TURKISH
    # {'paradigm_table': self.transform_codes, 'use_prefixes': False}

    # GEORGIAN (active preset), with the feature-selection settings that
    # are shared by all presets appended at the end.
    base_options = {
        'paradigm_table': self.transform_codes,
        'use_prefixes': True,
        'max_prefix_length': 4,
        'has_letter_classifiers': 'suffix',
        'to_memorize_affixes': 0,
        'min_feature_count': 3,
        'nfeatures': 0.1,
    }

    for problem_index in range(self.problems_number):
        base_classifier = ParadigmClassifier(**base_options)
        self.classifiers[problem_index] = JointParadigmClassifier(
            base_classifier, handler, dict(), transformation_options)

    grouped = arrange_data_by_problems(
        joint_data, self.problems_number, has_answers=True)
    for problem_index, (_, features, answers) in enumerate(grouped):
        # each answer is wrapped into a one-element list before fitting
        self.classifiers[problem_index].fit(
            features, [[answer] for answer in answers])
    return self
def fit(self, data):
    """Train direct (lemma -> form) and reversed (form -> lemma) classifiers.

    For every problem code two joint paradigm classifiers are fitted: one
    on ``data`` and one on a "reversed" copy of it in which the first
    element of each item and ``item[2][0]`` are swapped. Only the reversed
    classifiers receive the language model options.

    Args:
        data: training items; each item is indexed as ``item[0]``,
            ``item[1]``, ``item[2][0]`` and sliced as ``item[2:]``.

    Returns:
        self

    Raises:
        ValueError: if a language model is requested but neither
            ``lm_file`` nor ``save_lm_file`` is set.
    """
    if not self.fit_lm:
        lm_options = {'has_language_model': False,
                      'has_joint_classifier': False}
    else:
        if self.lm_file is None:
            if self.save_lm_file is None:
                raise ValueError(
                    "Either lm_file or save_lm_file should be given")
            lm_corpus = [list(item[0]) for item in data]
            save_language_model(lm_corpus, self.lm_order, self.save_lm_file)
            self.lm_file = self.save_lm_file
        lm_options = {'has_language_model': True,
                      'lm_file': self.lm_file,
                      'has_joint_classifier': True,
                      'max_lm_coeff': 2.0}

    self._initialize_affix_removal_params()
    # NOTE(review): the tail is item[2:], so item[2] itself is carried over
    # after the swapped triple - confirm this is intended (vs. item[3:]).
    reversed_data = [
        (item[2][0], item[1], [item[0]]) + item[2:] for item in data]
    self.form_affix_remover.train(reversed_data)
    self.lemma_affix_remover.train(data)
    joint_data = self.make_paradigms_from_data(data)
    joint_reversed_data = self.make_paradigms_from_data(reversed_data)

    handler = TransformationsHandler(self.transform_codes)
    transformation_options = {
        'select_features': 'ambiguity',
        'selection_params': {'nfeatures': 0.25, 'min_count': 2},
    }

    direct_base = get_classifier_params(self.language)
    direct_base['paradigm_table'] = self.transform_codes
    direct_base['min_feature_count'] = 3
    direct_base['nfeatures'] = 0.25

    reversed_base = copy.copy(direct_base)
    reversed_base['max_length'] = 6
    if not reversed_base.get('use_prefixes', False):
        # the reversed task always uses prefix features
        reversed_base['use_prefixes'] = True
        reversed_base['max_prefix_length'] = 3

    self.direct_classifiers = [None] * self.problems_number
    self.reversed_classifiers = [None] * self.problems_number
    direct_grouped = arrange_data_by_problems(
        joint_data, self.problems_number, has_answers=True)
    reversed_grouped = arrange_data_by_problems(
        joint_reversed_data, self.problems_number, has_answers=True)

    def options_for(base_options, affix_remover, problem):
        """Copy base_options and add the affixes to strip for problem."""
        options = copy.copy(base_options)
        prefixes, suffixes = affix_remover.get_affixes(problem)
        options['prefixes_to_remove'] = prefixes
        options['suffixes_to_remove'] = suffixes
        return options

    for index, problem in enumerate(self.problem_codes):
        # lemma -> word form classifiers
        direct_options = options_for(
            direct_base, self.lemma_affix_remover, problem)
        self.direct_classifiers[index] = JointParadigmClassifier(
            self.transform_codes, direct_options, handler,
            transformation_options)
        _, features, answers = direct_grouped[index]
        self.direct_classifiers[index].fit(
            features, [[answer] for answer in answers])
        print("Classifiers {} fitted".format(index + 1))

        # word form -> lemma classifiers;
        # suffixes are removed, which is especially important here (???)
        reversed_options = options_for(
            reversed_base, self.form_affix_remover, problem)
        self.reversed_classifiers[index] = JointParadigmClassifier(
            self.transform_codes, reversed_options, handler,
            transformation_options, **lm_options)
        _, features, answers = reversed_grouped[index]
        self.reversed_classifiers[index].fit(
            features, [[answer] for answer in answers])
        print("Classifiers {} fitted".format(index + 1))
    return self
def _initialize(self):
    """Create the transformations handler and the temporary folder.

    Builds ``self.transformations_handler`` from ``self.paradigm_codes`` and
    ``self.paradigm_counts`` and makes sure ``self.tmp_folder`` exists.

    Raises:
        FileExistsError: if ``self.tmp_folder`` exists but is not a
            directory.
    """
    self.transformations_handler = TransformationsHandler(
        self.paradigm_codes, self.paradigm_counts)
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first (another process may create the folder in between).
    os.makedirs(self.tmp_folder, exist_ok=True)
class LMParadigmClassifier(BaseEstimator, ClassifierMixin):
    """
    Tries to classify paradigms with the help of language models.

    An n-gram model is trained with the external ``lmplz`` tool on the
    sequences produced by the transformations handler and is then loaded
    either through ``Model`` (lm_type="kenlm") or through
    ``ARPALanguageModel`` (lm_type="pynlpl").

    Parameters:
    -----------
    paradigm_codes, paradigm_counts: passed to TransformationsHandler
    lm_order: int, n-gram order passed to lmplz via "-o"
    lm_type: str, "kenlm" or "pynlpl"
    multiclass: bool, stored as is
    tmp_folder: str, folder for the training text and the ARPA model
    lmplz_path: str, path to the lmplz executable
    """

    def __init__(self, paradigm_codes, paradigm_counts, lm_order=3,
                 lm_type="kenlm", multiclass=False, tmp_folder="saved_models",
                 lmplz_path="/data/sorokin/tools/kenlm/bin/lmplz"):
        self.paradigm_codes = paradigm_codes
        self.paradigm_counts = paradigm_counts
        self.lm_order = lm_order
        self.lm_type = lm_type
        self.tmp_folder = tmp_folder
        self.multiclass = multiclass
        self.lmplz_path = lmplz_path
        self.lm = None
        self.filename_count = 1
        self._initialize()

    def _initialize(self):
        """Create the transformations handler and the temporary folder."""
        self.transformations_handler = TransformationsHandler(
            self.paradigm_codes, self.paradigm_counts)
        # exist_ok avoids the race between an existence check and creation
        os.makedirs(self.tmp_folder, exist_ok=True)

    def _check_lm_type(self):
        """Raise ValueError unless lm_type is one of the supported backends."""
        if self.lm_type not in ("kenlm", "pynlpl"):
            raise ValueError(
                "lm_type should be 'kenlm' or 'pynlpl', got {0!r}".format(
                    self.lm_type))

    def fit(self, X, y):
        """Train the language model on transformation sequences.

        Args:
            X: lemmas
            y: labels; every label is an iterable of (code, values) pairs

        Returns:
            self

        Raises:
            ValueError: if ``lm_type`` is not supported.
            subprocess.CalledProcessError: if lmplz exits with an error.
        """
        # fail before any file is written or an external tool is started
        self._check_lm_type()
        lemmas_with_codes_and_vars = [
            (lemma, code, values)
            for lemma, label in zip(X, y)
            for code, values in label]
        strings_for_lm_learning = \
            self.transformations_handler._extract_transformations_for_lm_learning(
                lemmas_with_codes_and_vars)
        self.infile = os.path.join(
            self.tmp_folder,
            "saved_models_{0}.sav".format(self.filename_count))
        # explicit encoding: the default one is platform dependent and may
        # fail on non-ASCII symbols
        with open(self.infile, "w", encoding="utf8") as fout:
            for seq in strings_for_lm_learning:
                fout.write(" ".join(map(str, seq)) + "\n")
        self.outfile = os.path.join(
            self.tmp_folder,
            "saved_models_{0}.arpa".format(self.filename_count))
        # the files are only handed over to the child process, so they are
        # opened in binary mode; check_call raises if lmplz fails instead
        # of silently leaving a broken model file
        with open(self.infile, "rb") as fin, open(self.outfile, "wb") as fout:
            subprocess.check_call(
                [self.lmplz_path, "-o", str(self.lm_order), "-S", "4G"],
                stdin=fin, stdout=fout)
        if self.lm_type == "pynlpl":
            self.lm = ARPALanguageModel(self.outfile, base_e=False)
        else:
            self.lm = Model(self.outfile)
        return self

    @property
    def transformation_codes(self):
        return self.transformations_handler.transformation_codes

    @property
    def transformations_by_strings(self):
        return self.transformations_handler.transformations_by_strings

    @property
    def transformations(self):
        return self.transformations_handler.transformations

    def _score_with_code(self, total_score, code, context):
        """Score of the symbols read so far followed by code and sentence end.

        context is the kenlm state or the pynlpl history tuple reached
        after the already scored symbols; it is not modified.
        """
        token = str(code)
        if self.lm_type == "kenlm":
            # per the original author's note BaseScore changes its input
            # state, therefore a copy is passed
            in_state, out_state = copy.copy(context), State()
            code_score = self.lm.BaseScore(in_state, token, out_state)
            end_score = self.lm.EndSentenceBaseScore(out_state, State())
        else:
            code_score = self.lm.scoreword(token, context)
            end_score = self.lm.scoreword('</s>', context + (token,))
        return total_score + code_score + end_score

    def get_best_continuous_score(self, word):
        """Find and print the best "prefix + transformation code" reading.

        For every split of word into prefix and suffix, each code listed
        for that suffix in ``transformations_by_strings`` is scored as
        "prefix symbols, code, end of sentence"; codes listed for the
        empty string are scored after the whole word. Every improvement
        of the best score is printed, then the forms built from the best
        variant are printed joined by "#".

        Returns None. Nothing is printed at the end if no code applies.

        Raises:
            ValueError: if ``lm_type`` is not supported.
        """
        self._check_lm_type()
        total_score = 0
        best_variant, best_score = None, -np.inf
        if self.lm_type == "kenlm":
            context = State()
            self.lm.BeginSentenceWrite(context)
        else:
            context = ('<s>',)
        for i, symbol in enumerate(word):
            prefix, suffix = word[:i], word[i:]
            for code in self.transformations_by_strings.get(suffix, []):
                score = self._score_with_code(total_score, code, context)
                if score > best_score:
                    best_variant, best_score = list(prefix) + [code], score
                    print("{3} {0} {1} {2:.2f}".format(
                        " ".join(prefix), self.transformations[code],
                        score, code))
            # consume the next symbol of the word
            if self.lm_type == "kenlm":
                next_state = State()
                symbol_score = self.lm.BaseScore(context, symbol, next_state)
                context = next_state
            else:
                symbol_score = self.lm.scoreword(symbol, context)
                context += (symbol,)
            total_score += symbol_score
        # codes attached to the empty suffix, i.e. applied after the word;
        # context is valid here for both backends and for an empty word
        for code in sorted(self.transformations_by_strings.get("", [])):
            score = self._score_with_code(total_score, code, context)
            if score > best_score:
                best_variant, best_score = list(word) + [code], score
                print("{3} {0} {1} {2:.2f}".format(
                    " ".join(word), self.transformations[code], score, code))
        if best_variant is None:
            # no transformation is applicable to this word
            return
        answer = []
        for elem in best_variant:
            if isinstance(elem, str):
                # NOTE(review): 13 presumably equals the number of forms in
                # a paradigm (the length of .trans) - confirm
                answer.append([elem] * 13)
            else:
                answer.append(self.transformations[elem].trans)
        print("#".join("".join(elem) for elem in zip(*answer)))

    def test(self):
        """
        Tests the language model interface.
        """
        word = "лыжня"
        self.get_best_continuous_score(word)
def cv_mode(testing_mode, multiclass, predict_lemmas, find_flection,
            paradigm_file, infile, max_length, fraction, nfolds=0,
            selection_method=None, feature_fraction=None,
            output_train_dir=None, output_pred_dir=None):
    """
    Measures the quality of a classifier with the given parameters
    by cross-validation on the training sample and prints one
    tab-separated line with the averaged precisions.

    Parameters:
    -----------
    testing_mode: str ('predict' or 'predict_proba'), usage mode
    multiclass: bool, whether a word may have several paradigms
    predict_lemmas: bool, whether lemmas are also built from the labels
        and compared
    find_flection: bool, whether the inflection is searched beforehand
        in order to use suffixes of the stem, not of the whole word,
        as features. It turned out that this decreases the quality
    paradigm_file: str, path to the file with paradigms
    infile: str, path to the file with the training sample
    max_length: passed to the paradigm classifier as 'max_length'
    fraction: float, fraction of the data used for training
    nfolds: int, optional(default=0)
        number of random splits to average over,
        nfolds=0 --- a single split where the training part is taken
        from the beginning of the file
    selection_method: str or None, optional (default=None),
        feature selection method, None means 'ambiguity'
    feature_fraction: float or None, optional (default=None),
        fraction of features to keep after feature selection
    output_train_dir: str or None, optional(default=None),
        currently ignored
    output_pred_dir: str or None, optional(default=None),
        passed to output_accuracies as outfile; test words and
        probabilities are passed only when it is set

    Raises:
    -------
    ValueError: if testing_mode is not 'predict' or 'predict_proba'
    NotImplementedError: for testing_mode='predict_proba' with multiclass
    """
    # Fail before reading files and fitting: an unknown mode used to leave
    # the predictions empty and crash only during evaluation, and the
    # unsupported combination was detected only after a full fit.
    if testing_mode not in ('predict', 'predict_proba'):
        raise ValueError(
            "testing_mode should be 'predict' or 'predict_proba', "
            "got {0!r}".format(testing_mode))
    if testing_mode == 'predict_proba' and multiclass:
        raise NotImplementedError()

    # reading input files
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(
        lemma_descriptions_list, multiclass=multiclass, return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, pattern_counts = read_paradigms(paradigm_descriptions_list)

    # preparation for cross-validation
    classes = sorted(set(chain(*((x[0] for x in elem)
                                 for elem in labels_with_vars))))
    if selection_method is None:
        selection_method = 'ambiguity'
    if nfolds == 0:
        train_data_length = int(fraction * len(data))
        train_data = [data[:train_data_length]]
        test_data = [data[train_data_length:]]
        train_labels_with_vars = [labels_with_vars[:train_data_length]]
        test_labels_with_vars = [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        test_data, train_data = [None] * nfolds, [None] * nfolds
        test_labels_with_vars = [None] * nfolds
        train_labels_with_vars = [None] * nfolds
        for fold in range(nfolds):
            (train_data[fold], test_data[fold],
             train_labels_with_vars[fold], test_labels_with_vars[fold]) = \
                skcv.train_test_split(data, labels_with_vars,
                                      test_size=1.0 - fraction,
                                      random_state=100 * fold + 13)
    if predict_lemmas:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}
        test_lemmas = [make_lemmas(paradigm_handlers, fold_labels)
                       for fold_labels in test_labels_with_vars]
        pred_lemmas = [None] * nfolds
    else:
        test_lemmas, pred_lemmas = None, None
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds

    # defining the classifier
    paradigm_classifier_params = {
        'multiclass': multiclass, 'find_flection': find_flection,
        'max_length': max_length, 'use_prefixes': True,
        'classifier_params': None, 'selection_method': selection_method,
        'nfeatures': feature_fraction, 'smallest_prob': 0.01}
    transformation_handler = TransformationsHandler(paradigm_table,
                                                    pattern_counts)
    transformation_classifier_params = {
        'select_features': 'ambiguity',
        'selection_params': {'nfeatures': 0.1, 'min_count': 2}}
    classifier = JointParadigmClassifier(
        paradigm_table, paradigm_classifier_params,
        transformation_handler, transformation_classifier_params)

    # applying the classifier to the data
    for fold, (train_sample, train_labels_sample, test_sample) in enumerate(
            zip(train_data, train_labels_with_vars, test_data)):
        classifier.fit(train_sample, train_labels_sample)
        if testing_mode == 'predict':
            predictions[fold] = classifier.predict(test_sample)
        else:
            prediction_probs[fold] = classifier.predict_probs(test_sample)
            # probabilities were returned, so the classes have to be
            # extracted from them as well
            predictions[fold] = [[elem[0][0]]
                                 for elem in prediction_probs[fold]]
        if predict_lemmas:
            pred_lemmas[fold] = make_lemmas(paradigm_handlers,
                                            predictions[fold])

    descrs_by_codes = {code: descr for descr, code in paradigm_table.items()}
    test_words, prediction_probs_for_output = None, None
    if output_pred_dir:
        test_words = test_data
        if testing_mode == 'predict_proba':
            prediction_probs_for_output = prediction_probs
    output_params = {'outfile': output_pred_dir,
                     'paradigm_descrs': descrs_by_codes,
                     'test_words': test_words,
                     'predicted_probs': prediction_probs_for_output}
    if not predict_lemmas:
        label_precisions, variable_precisions, form_precisions = \
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, **output_params)
        print("{0}\t{1:<.2f}\t{2}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}".format(
            max_length, fraction, classifier.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions),
            100 * np.mean(variable_precisions),
            100 * np.mean(form_precisions)))
    else:
        (label_precisions, variable_precisions,
         lemma_precisions, form_precisions) = \
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, test_lemmas, pred_lemmas,
                              save_confusion_matrices=True, **output_params)
        print("{0}\t{1:<.2f}\t{2}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}\t{6:<.2f}".format(
            max_length, fraction, classifier.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions),
            100 * np.mean(variable_precisions),
            100 * np.mean(lemma_precisions),
            100 * np.mean(form_precisions)))
def cv_mode(testing_mode, language_code, multiclass, predict_lemmas,
            paradigm_file, counts_file, infile, train_fraction,
            feature_fraction, paradigm_counts_threshold, nfolds,
            selection_method, binarization_method, max_feature_length,
            output_train_dir=None, output_pred_dir=None):
    """
    Measures the quality of the corpus-based paradigm classifier
    by cross-validation and prints one tab-separated line with
    the averaged precisions.

    Parameters:
    -----------
    testing_mode: str ('predict' or 'predict_proba'), usage mode
    language_code: currently unused
    multiclass: bool, passed to read_lemmas and to the classifier
    predict_lemmas: bool, whether lemmas are also built from the labels
        and compared
    paradigm_file: str, path to the file with paradigms
    counts_file: str, path to the file read by read_counts
    infile: str, path to the file with the training sample
    train_fraction: float, fraction of the data used for training
    feature_fraction: passed to the classifier as inner_feature_fraction
    paradigm_counts_threshold: paradigms whose count is not less than
        this value are passed to the classifier as active
    nfolds: int, number of random splits to average over,
        nfolds=0 --- a single split where the training part is taken
        from the beginning of the data
    selection_method: str or None, None means 'ambiguity'
    binarization_method: passed to the classifier
    max_feature_length: passed to the joint classifier as 'max_length'
    output_train_dir: str or None, optional(default=None),
        currently ignored
    output_pred_dir: str or None, optional(default=None),
        passed to output_accuracies as outfile; paradigm descriptions,
        test words and probabilities are passed only when it is set

    Raises:
    -------
    ValueError: if testing_mode is not 'predict' or 'predict_proba'
    NotImplementedError: for testing_mode='predict_proba' with multiclass
    """
    # Fail before reading files and fitting: an unknown mode used to leave
    # the predictions empty and crash only during evaluation, and the
    # unsupported combination was detected only after a full fit.
    if testing_mode not in ('predict', 'predict_proba'):
        raise ValueError(
            "testing_mode should be 'predict' or 'predict_proba', "
            "got {0!r}".format(testing_mode))
    if testing_mode == 'predict_proba' and multiclass:
        raise NotImplementedError()

    # reading input files
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(
        lemma_descriptions_list, multiclass=multiclass, return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, paradigm_counts = read_paradigms(paradigm_descriptions_list)
    word_counts_table = read_counts(counts_file)

    # preparation for cross-validation
    classes = sorted(set(chain(*((x[0] for x in elem)
                                 for elem in labels_with_vars))))
    active_paradigm_codes = [code for code, count in paradigm_counts.items()
                             if count >= paradigm_counts_threshold]
    if selection_method is None:
        selection_method = 'ambiguity'
    if nfolds == 0:
        train_data_length = int(train_fraction * len(data))
        train_data = [data[:train_data_length]]
        test_data = [data[train_data_length:]]
        train_labels_with_vars = [labels_with_vars[:train_data_length]]
        test_labels_with_vars = [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        test_data, train_data = [None] * nfolds, [None] * nfolds
        test_labels_with_vars = [None] * nfolds
        train_labels_with_vars = [None] * nfolds
        for fold in range(nfolds):
            (train_data[fold], test_data[fold],
             train_labels_with_vars[fold], test_labels_with_vars[fold]) = \
                skcv.train_test_split(data, labels_with_vars,
                                      test_size=1.0 - train_fraction,
                                      random_state=100 * fold + 13)
    if predict_lemmas:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}
        test_lemmas = [make_lemmas(paradigm_handlers, fold_labels)
                       for fold_labels in test_labels_with_vars]
        pred_lemmas = [None] * nfolds
    else:
        test_lemmas, pred_lemmas = None, None
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds

    # defining the classifier
    corpora_classifier = ParadigmCorporaClassifier(
        paradigm_table, word_counts_table, multiclass=multiclass,
        selection_method=selection_method,
        binarization_method=binarization_method,
        inner_feature_fraction=feature_fraction,
        active_paradigm_codes=active_paradigm_codes,
        paradigm_counts=paradigm_counts, smallest_prob=0.001)
    classifier_params = {'max_length': max_feature_length}
    transformation_handler = TransformationsHandler(paradigm_table,
                                                    paradigm_counts)
    transformation_classifier_params = {
        'select_features': 'ambiguity',
        'selection_params': {'nfeatures': 0.1, 'min_count': 2}}
    classifier = JointParadigmClassifier(
        corpora_classifier, transformation_handler, classifier_params,
        transformation_classifier_params)

    # applying the classifier to the data
    for fold, (train_sample, train_labels_sample, test_sample) in enumerate(
            zip(train_data, train_labels_with_vars, test_data)):
        classifier.fit(train_sample, train_labels_sample)
        if testing_mode == 'predict':
            predictions[fold] = classifier.predict(test_sample)
        else:
            prediction_probs[fold] = classifier.predict_probs(test_sample)
            # probabilities were returned, so the classes have to be
            # extracted from them as well
            predictions[fold] = [[elem[0][0]]
                                 for elem in prediction_probs[fold]]
        if predict_lemmas:
            pred_lemmas[fold] = make_lemmas(paradigm_handlers,
                                            predictions[fold])

    descrs_by_codes, test_words = None, None
    prediction_probs_for_output = None
    if output_pred_dir:
        descrs_by_codes = {code: descr
                           for descr, code in paradigm_table.items()}
        test_words = test_data
        if testing_mode == 'predict_proba':
            prediction_probs_for_output = prediction_probs
    output_params = {'outfile': output_pred_dir,
                     'paradigm_descrs': descrs_by_codes,
                     'test_words': test_words,
                     'predicted_probs': prediction_probs_for_output,
                     'save_confusion_matrices': True}
    if not predict_lemmas:
        label_precisions, variable_precisions, form_precisions = \
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, **output_params)
        print("{0:<.2f}\t{1}\t{2:<.2f}\t{3:<.2f}\t{4:<.2f}".format(
            train_fraction, classifier.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions),
            100 * np.mean(variable_precisions),
            100 * np.mean(form_precisions)))
    else:
        (label_precisions, variable_precisions,
         lemma_precisions, form_precisions) = \
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, test_lemmas, pred_lemmas,
                              **output_params)
        print("{0:<.2f}\t{1}\t{2:<.2f}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}".format(
            train_fraction, classifier.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions),
            100 * np.mean(variable_precisions),
            100 * np.mean(lemma_precisions),
            100 * np.mean(form_precisions)))
def _initialize(self):
    """Create the transformations handler and the temporary folder.

    Builds ``self.transformations_handler`` from ``self.paradigm_codes`` and
    ``self.paradigm_counts`` and makes sure ``self.tmp_folder`` exists.

    Raises:
        FileExistsError: if ``self.tmp_folder`` exists but is not a
            directory.
    """
    self.transformations_handler = TransformationsHandler(
        self.paradigm_codes, self.paradigm_counts)
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first (another process may create the folder in between).
    os.makedirs(self.tmp_folder, exist_ok=True)
class LMParadigmClassifier(BaseEstimator, ClassifierMixin):
    """
    Tries to classify paradigms with the help of language models.

    An n-gram model is trained with the external ``lmplz`` tool on the
    sequences produced by the transformations handler and is then loaded
    either through ``Model`` (lm_type="kenlm") or through
    ``ARPALanguageModel`` (lm_type="pynlpl").

    Parameters:
    -----------
    paradigm_codes, paradigm_counts: passed to TransformationsHandler
    lm_order: int, n-gram order passed to lmplz via "-o"
    lm_type: str, "kenlm" or "pynlpl"
    multiclass: bool, stored as is
    tmp_folder: str, folder for the training text and the ARPA model
    lmplz_path: str, path to the lmplz executable
    """

    def __init__(self, paradigm_codes, paradigm_counts, lm_order=3,
                 lm_type="kenlm", multiclass=False, tmp_folder="saved_models",
                 lmplz_path="/data/sorokin/tools/kenlm/bin/lmplz"):
        self.paradigm_codes = paradigm_codes
        self.paradigm_counts = paradigm_counts
        self.lm_order = lm_order
        self.lm_type = lm_type
        self.tmp_folder = tmp_folder
        self.multiclass = multiclass
        self.lmplz_path = lmplz_path
        self.lm = None
        self.filename_count = 1
        self._initialize()

    def _initialize(self):
        """Create the transformations handler and the temporary folder."""
        self.transformations_handler = TransformationsHandler(
            self.paradigm_codes, self.paradigm_counts)
        # exist_ok avoids the race between an existence check and creation
        os.makedirs(self.tmp_folder, exist_ok=True)

    def _check_lm_type(self):
        """Raise ValueError unless lm_type is one of the supported backends."""
        if self.lm_type not in ("kenlm", "pynlpl"):
            raise ValueError(
                "lm_type should be 'kenlm' or 'pynlpl', got {0!r}".format(
                    self.lm_type))

    def fit(self, X, y):
        """Train the language model on transformation sequences.

        Args:
            X: lemmas
            y: labels; every label is an iterable of (code, values) pairs

        Returns:
            self

        Raises:
            ValueError: if ``lm_type`` is not supported.
            subprocess.CalledProcessError: if lmplz exits with an error.
        """
        # fail before any file is written or an external tool is started
        self._check_lm_type()
        lemmas_with_codes_and_vars = [
            (lemma, code, values)
            for lemma, label in zip(X, y)
            for code, values in label]
        strings_for_lm_learning = \
            self.transformations_handler._extract_transformations_for_lm_learning(
                lemmas_with_codes_and_vars)
        self.infile = os.path.join(
            self.tmp_folder,
            "saved_models_{0}.sav".format(self.filename_count))
        # explicit encoding: the default one is platform dependent and may
        # fail on non-ASCII symbols
        with open(self.infile, "w", encoding="utf8") as fout:
            for seq in strings_for_lm_learning:
                fout.write(" ".join(map(str, seq)) + "\n")
        self.outfile = os.path.join(
            self.tmp_folder,
            "saved_models_{0}.arpa".format(self.filename_count))
        # the files are only handed over to the child process, so they are
        # opened in binary mode; check_call raises if lmplz fails instead
        # of silently leaving a broken model file
        with open(self.infile, "rb") as fin, open(self.outfile, "wb") as fout:
            subprocess.check_call(
                [self.lmplz_path, "-o", str(self.lm_order), "-S", "4G"],
                stdin=fin, stdout=fout)
        if self.lm_type == "pynlpl":
            self.lm = ARPALanguageModel(self.outfile, base_e=False)
        else:
            self.lm = Model(self.outfile)
        return self

    @property
    def transformation_codes(self):
        return self.transformations_handler.transformation_codes

    @property
    def transformations_by_strings(self):
        return self.transformations_handler.transformations_by_strings

    @property
    def transformations(self):
        return self.transformations_handler.transformations

    def _score_with_code(self, total_score, code, context):
        """Score of the symbols read so far followed by code and sentence end.

        context is the kenlm state or the pynlpl history tuple reached
        after the already scored symbols; it is not modified.
        """
        token = str(code)
        if self.lm_type == "kenlm":
            # per the original author's note BaseScore changes its input
            # state, therefore a copy is passed
            in_state, out_state = copy.copy(context), State()
            code_score = self.lm.BaseScore(in_state, token, out_state)
            end_score = self.lm.EndSentenceBaseScore(out_state, State())
        else:
            code_score = self.lm.scoreword(token, context)
            end_score = self.lm.scoreword('</s>', context + (token,))
        return total_score + code_score + end_score

    def get_best_continuous_score(self, word):
        """Find and print the best "prefix + transformation code" reading.

        For every split of word into prefix and suffix, each code listed
        for that suffix in ``transformations_by_strings`` is scored as
        "prefix symbols, code, end of sentence"; codes listed for the
        empty string are scored after the whole word. Every scored
        candidate is printed, then the forms built from the best variant
        are printed joined by "#".

        Returns None. Nothing is printed at the end if no code applies.

        Raises:
            ValueError: if ``lm_type`` is not supported.
        """
        self._check_lm_type()
        total_score = 0
        best_variant, best_score = None, -np.inf
        if self.lm_type == "kenlm":
            context = State()
            self.lm.BeginSentenceWrite(context)
        else:
            context = ('<s>',)
        for i, symbol in enumerate(word):
            prefix, suffix = word[:i], word[i:]
            for code in self.transformations_by_strings.get(suffix, []):
                score = self._score_with_code(total_score, code, context)
                print("{3} {0} {1} {2:.2f}".format(
                    " ".join(prefix), self.transformations[code],
                    score, code))
                if score > best_score:
                    best_variant, best_score = list(prefix) + [code], score
            # consume the next symbol of the word
            if self.lm_type == "kenlm":
                next_state = State()
                symbol_score = self.lm.BaseScore(context, symbol, next_state)
                context = next_state
            else:
                symbol_score = self.lm.scoreword(symbol, context)
                context += (symbol,)
            total_score += symbol_score
        # codes attached to the empty suffix, i.e. applied after the word;
        # context is valid here for both backends and for an empty word
        for code in sorted(self.transformations_by_strings.get("", [])):
            score = self._score_with_code(total_score, code, context)
            print("{0} {1} {2:.2f}".format(
                " ".join(word),
                "#".join(self.transformations[code].trans), score))
            if score > best_score:
                best_variant, best_score = list(word) + [code], score
        if best_variant is None:
            # no transformation is applicable to this word
            return
        answer = []
        for elem in best_variant:
            if isinstance(elem, str):
                # NOTE(review): 13 presumably equals the number of forms in
                # a paradigm (the length of .trans) - confirm
                answer.append([elem] * 13)
            else:
                answer.append(self.transformations[elem].trans)
        print("#".join("".join(elem) for elem in zip(*answer)))

    def test(self):
        """
        Tests the language model interface.
        """
        word = "лыжня"
        self.get_best_continuous_score(word)