def test_with_remainder(self):
    objects = [0, 1, 2, 3, 4, 5, 6, 7]
    n_folds = 3
    split_0 = [3, 4, 5, 6, 7], [0, 1, 2]
    split_1 = [0, 1, 2, 6, 7], [3, 4, 5]
    split_2 = [0, 1, 2, 3, 4, 5], [6, 7]
    for idx, split in enumerate((split_0, split_1, split_2)):
        self.assertEqual(split, get_fold(objects, n_folds, idx))
def test_no_remainder(self):
    objects = [0, 1, 2, 3, 4, 5]
    n_folds = 3
    split_0 = [2, 3, 4, 5], [0, 1]
    split_1 = [0, 1, 4, 5], [2, 3]
    split_2 = [0, 1, 2, 3], [4, 5]
    for idx, split in enumerate((split_0, split_1, split_2)):
        self.assertEqual(split, get_fold(objects, n_folds, idx))
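# The tests above (and test_single_fold below) pin down get_fold's contract:
# the test part is a contiguous slice of ceil(len(objects) / n_folds) items
# starting at fold_idx * fold_size, and the train part is everything else
# with order preserved. A minimal sketch consistent with those expectations
# (the project's actual implementation may differ):

import math
from typing import List, Sequence, Tuple


def get_fold(objects: Sequence, n_folds: int,
             fold_idx: int) -> Tuple[List, List]:
    # Each test fold is a contiguous slice of ceil(n / n_folds) objects.
    fold_size = math.ceil(len(objects) / n_folds)
    start = fold_idx * fold_size
    end = start + fold_size
    # Train part = everything outside the slice, order preserved;
    # with n_folds == 1 the train part is empty (see test_single_fold).
    return (list(objects[:start]) + list(objects[end:]),
            list(objects[start:end]))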
def main():
    args = create_argparser().parse_args()
    docs = load(args.docs_path)
    evaluator = evaluator_for(args.task_name)

    # Optionally pre-process the documents with a transformer built
    # from a JSON properties file.
    if args.transformers_props_path is not None:
        with open(args.transformers_props_path, "r", encoding="utf-8") as f, \
                transformer_from_props(json.load(f)) as transformer:
            docs = [transformer.transform(doc) for doc in docs]

    # Holdout evaluates a single model on all documents (get_fold with
    # folds_num == 1 puts everything into the test part); otherwise one
    # model is evaluated per cross-validation split.
    if args.strategy == "holdout":
        folds_num = 1
        models = [args.model_path]
    else:
        folds_num = len(args.splits_model_paths)
        models = args.splits_model_paths

    main_scores = []
    for split_idx, model_path in enumerate(models):
        _, test_docs = get_fold(docs, folds_num, split_idx)
        with classifier_for(args.task_name)(model_path) as clf:
            main_score, scores, stats_generator = evaluator(
                clf, test_docs, args.stats_path is not None)
        main_scores.append(main_score)

        print("Split {}, Main score={:.4f}".format(split_idx, main_score))
        print(f"Scores: \n{json.dumps(scores, indent=4, sort_keys=True)}\n")

        # If requested, write per-document statistics for this split.
        if stats_generator is not None:
            stats_path = join(args.stats_path, f"split_{split_idx}")
            makedirs(stats_path, exist_ok=True)
            for doc_idx, doc in enumerate(test_docs):
                with open(join(stats_path, doc.name + '_stats.txt'),
                          'w', encoding='utf-8') as f:
                    f.write(stats_generator(doc_idx))

    print("\nMean splits score={:.4f}".format(
        sum(main_scores) / len(main_scores)))
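# main() above reads docs_path, task_name, transformers_props_path, strategy,
# model_path, splits_model_paths and stats_path from the parsed arguments.
# A hypothetical create_argparser consistent with that usage (the option
# names follow the attributes accessed above; types, defaults and help
# strings are assumptions, not the project's actual CLI):

import argparse


def create_argparser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Evaluate saved models")
    parser.add_argument('docs_path',
                        help="path to the evaluation documents")
    parser.add_argument('task_name',
                        help="task to select the evaluator and classifier for")
    parser.add_argument('--strategy',
                        choices=['holdout', 'cross_validation'],
                        default='holdout')
    parser.add_argument('--model_path',
                        help="single model path (holdout strategy)")
    parser.add_argument('--splits_model_paths', nargs='+', default=[],
                        help="one model path per fold (cross-validation)")
    parser.add_argument('--transformers_props_path', default=None,
                        help="optional JSON file with transformer props")
    parser.add_argument('--stats_path', default=None,
                        help="if set, per-document stats are written here")
    return parser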
def train(self, props: dict, params: dict, working_dir: str):
    docs = load(params['data_path'])
    serializer = CoNLLSerializer()
    n_folds = params['n_folds']
    fold_num, seed = props['fold_num'], props['seed']

    out_path = os.path.join(working_dir, str(fold_num) + '_fold', str(seed))
    param_str = params_to_str(props)
    print(param_str)
    os.makedirs(out_path, exist_ok=True)

    train_docs, dev_docs = get_fold(docs, n_folds, fold_num)
    print("Fold:", fold_num)

    # The dev fold is serialized as the gold standard for evaluation.
    with open(os.path.join(out_path, 'gold.conll'), 'w',
              encoding="utf-8") as f:
        serializer.serialize_docs(dev_docs, f)

    # For the pronoun-based sampling strategies, pre-compute relations
    # already known to an existing classifier on the dev documents.
    classifier_path = props.get('classifier_path')
    if classifier_path is not None and props.get(
            'sampling_strategy', 'coref') in [
                'coref_pron', 'coref_pron_cluster',
                'coref_pron_cluster_strict']:
        with Classifier(classifier_path) as clf:
            print("Applying known model")
            known_rels = get_known_rels(clf, dev_docs)
            strategies = ['pron_rank', 'pron_vote_rank']
    else:
        known_rels = None
        strategies = ['easy_first']

    hook, evaluation_result = get_evaluating_hook(
        serializer, dev_docs, out_path, seed, fold_num,
        params.get('save_models', False), strategies, known_rels)

    with Trainer(props) as trainer:
        trainer.train(train_docs, hook)

    return dict(evaluation_result)
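# train() above expects specific keys in props and params. A hypothetical
# call illustrating them (the owning class is not shown here, and all
# values below are placeholders, not the project's actual configuration):

props = {
    'fold_num': 0,
    'seed': 42,
    'classifier_path': None,        # optional: enables the "known model" path
    'sampling_strategy': 'coref',   # optional, defaults to 'coref'
}
params = {
    'data_path': 'data/train',
    'n_folds': 5,
    'save_models': False,           # optional, defaults to False
}
# evaluation_result = trainer.train(props, params, working_dir='runs/exp1')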
def get_splits(self):
    for i in range(args.folds):
        yield get_fold(self.__docs, args.folds, i)
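# Hypothetical usage of the generator above, assuming an instance of the
# containing class (name assumed): one (train, test) pair per fold.
for train_docs, test_docs in splitter.get_splits():
    print(len(train_docs), len(test_docs))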
def test_single_fold(self):
    objects = [0, 1, 2, 3]
    # With a single fold, every object goes into the test part.
    expected = [], [0, 1, 2, 3]
    self.assertEqual(expected, get_fold(objects, 1, 0))