def get_features(model_file, output_file):
    """\
    Dump all non-zero classifier feature weights to a text file,
    grouped by class label.

    For each label, features with a non-zero coefficient are sorted by
    weight in descending order and written as ``feature - weight`` lines,
    with blank lines separating the per-label sections.

    @param model_file: path to the saved model to inspect
    @param output_file: path of the UTF-8 text file to write the listing to
    """
    m = Model.load_from_file(model_file)
    labels = m.data_headers.get_attrib(m.class_attr).labels
    feats = m.vectorizer.get_feature_names()
    fh = codecs.open(output_file, 'w', 'UTF-8')
    try:
        for i, label in enumerate(labels):
            log_info('Enumerating features for label %d (\'%s\')' % (i, label))
            # one row of coefficients per label in a one-vs-rest classifier
            coefs = m.classifier.coef_[i]
            # keep only features that actually contribute to this label
            nonzero = [(f, c) for (f, c) in zip(feats, coefs) if c != 0]
            fh.write('LABEL == %s\n' % label)
            for f, c in sorted(nonzero, key=itemgetter(1), reverse=True):
                fh.write('%s - %f\n' % (f, c))
            # same output as the former `print >> fh, "\n\n"` (3 newlines),
            # but portable across Python 2 and 3
            fh.write('\n\n\n')
    finally:
        # close the handle even if writing fails midway
        fh.close()
def run_training(work_dir, config_file, train_file, model_file,
                 test_file=None, classif_file=None, memory=MEMORY,
                 name='train'):
    """\
    Run the model training.

    Loads the configuration either from a pickle (when already inside the
    working directory) or by executing a Python config file. If the config
    contains an ``unfold_pattern``, it is expanded into several configs and
    one training job is spawned per expanded config instead of training
    directly. Otherwise a (Split)Model is trained, optionally evaluated,
    and saved.

    @param work_dir: working directory; non-pickle paths are made relative to it
    @param config_file: path to a ``.pickle`` or Python configuration file
    @param train_file: training data file passed to the model
    @param model_file: destination path for the trained model
    @param test_file: optional test data file for evaluation
    @param classif_file: optional output file for test classifications
    @param memory: memory limit handed to spawned jobs
    @param name: base name for spawned unfold jobs
    """
    # initialization from the configuration file
    _, ext = os.path.splitext(config_file)
    if ext == '.pickle':
        # load configuration from a pickle (we're already in the working
        # directory). NOTE: unpickling executes arbitrary code -- only use
        # with trusted configuration files.
        with open(config_file, mode='rb') as fh:
            cfg = pickle.load(fh)
        # lambdas cannot be pickled directly; restore them from their
        # marshaled form
        demarshal_lambda(cfg, 'filter_attr')
        demarshal_lambda(cfg, 'postprocess')
    else:
        # load by running Python code (make paths relative to working directory)
        config_file = os.path.join(work_dir, config_file)
        cfg = Config(config_file)
    # training
    if cfg.get('unfold_pattern'):
        # expand the config into several variants and spawn one job each
        pattern = cfg['unfold_pattern']
        del cfg['unfold_pattern']
        unfold_key = cfg.get('unfold_key', 'unfold_key')
        cfgs = cfg.unfold_lists(pattern, unfold_key)
        for cfg in cfgs:
            # sanitize the variant key so it is safe in a job name
            key = re.sub(r'[^A-Za-z0-9_]', '', cfg[unfold_key])
            create_job(cfg, name + '-' + key, work_dir, train_file,
                       model_file, test_file, classif_file, memory)
        return
    if cfg.get('divide_func'):
        # split model: trains several sub-models in parallel jobs
        model = SplitModel(cfg)
        model.train(train_file, work_dir, memory)
    else:
        model = Model(cfg)
        model.train(train_file)
    # evaluation
    if test_file is not None and classif_file is not None:
        if ext != '.pickle':
            # this means we're not in the working directory
            classif_file = os.path.join(work_dir, classif_file)
        log_info('Evaluation on file: ' + test_file)
        score = model.evaluate(test_file, classif_file=classif_file)
        log_info('Score: ' + str(score))
    # save the model
    if ext != '.pickle':
        # we need to make the path relative to work_dir
        model_file = os.path.join(work_dir, model_file)
    model.save_to_file(model_file)
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Test all the given models on the selected file and save the target.

    If oov_test_file is set, performs also OOV evaluation.
    If test_pos is True, prints detailed results for various POSs.
    """
    # read the evaluation data set
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    word_forms = data[source_attr]
    # run the models one after another, chaining their outputs
    num = 0
    for model_path in model_files:
        num += 1
        model = Model.load_from_file(model_path)
        log_info('Applying model: ' + model_path)
        predicted = model.classify(data)
        rules_attr = 'OUTPUT_M' + str(num)
        data.add_attrib(Attribute(rules_attr, 'string'), predicted)
        if test_indiv:
            # report this model's accuracy on its own class attribute
            correct = count_correct(data, model.class_attr, rules_attr)
            print_score(correct, len(data), 'Model accuracy')
        # apply the predicted rules to obtain the next round of word forms
        word_forms = [inflect(w, r) for w, r in zip(word_forms, predicted)]
        forms_attr = 'FORMS_M' + str(num)
        data.add_attrib(Attribute(forms_attr, 'string'), word_forms)
    # overall accuracy of the whole model chain
    log_info('Evaluating...')
    correct = count_correct(data, target_attr, forms_attr)
    print_score(correct, len(data), 'ALL')
    # accuracy with punctuation tokens excluded
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # accuracy on forms that differ from their lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # out-of-vocabulary evaluation against the training data, if requested
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr,
                     oov_test_file, oov_part)
    # per-POS breakdown, if requested
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)
    # write out the data including all intermediate predictions
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)