Пример #1
0
    def test_with_remainder(self):
        """Check every fold when the input does not divide evenly.

        Eight items are split into three folds; the expected data shows
        folds of size 3, 3 and 2, i.e. the last fold is the short one.
        """
        items = list(range(8))
        fold_count = 3

        # One (everything outside the fold, the fold itself) pair per index.
        expected_by_fold = (
            ([3, 4, 5, 6, 7], [0, 1, 2]),
            ([0, 1, 2, 6, 7], [3, 4, 5]),
            ([0, 1, 2, 3, 4, 5], [6, 7]),
        )

        for fold_idx, expected in enumerate(expected_by_fold):
            actual = get_fold(items, fold_count, fold_idx)
            self.assertEqual(expected, actual)
Пример #2
0
    def test_no_remainder(self):
        """Check every fold when the input divides evenly.

        Six items are split into three folds of two items each.
        """
        items = list(range(6))
        fold_count = 3

        # One (everything outside the fold, the fold itself) pair per index.
        expected_by_fold = [
            ([2, 3, 4, 5], [0, 1]),
            ([0, 1, 4, 5], [2, 3]),
            ([0, 1, 2, 3], [4, 5]),
        ]

        for fold_idx, expected in enumerate(expected_by_fold):
            actual = get_fold(items, fold_count, fold_idx)
            self.assertEqual(expected, actual)
Пример #3
0
def main():
    """Evaluate one or more models on their folds and print the scores.

    Steps, in order:

    1. Parse the command line, load the documents from ``args.docs_path``
       and obtain the evaluator for ``args.task_name``.
    2. If ``args.transformers_props_path`` is given, build a transformer
       from the JSON stored there and apply it to every document.
    3. With the ``"holdout"`` strategy a single model (``args.model_path``)
       is evaluated with one fold; otherwise one model per entry of
       ``args.splits_model_paths`` is evaluated, fold ``i`` for model ``i``.
    4. For each model print its main score and the full score dictionary;
       when the evaluator returns a stats generator, write one
       ``<doc.name>_stats.txt`` per document under
       ``<args.stats_path>/split_<idx>``.
    5. Print the mean of the main scores.

    NOTE(review): an empty ``args.splits_model_paths`` makes the final
    mean raise ``ZeroDivisionError``; confirm the argument parser rules
    that out.
    """
    args = create_argparser().parse_args()
    docs = load(args.docs_path)
    evaluator = evaluator_for(args.task_name)

    props_path = args.transformers_props_path
    if props_path is not None:
        # The props file stays open for as long as the transformer context.
        with open(props_path, "r", encoding="utf-8") as props_file, \
                transformer_from_props(json.load(props_file)) as transformer:
            docs = [transformer.transform(document) for document in docs]

    holdout = args.strategy == "holdout"
    models = [args.model_path] if holdout else args.splits_model_paths
    folds_num = 1 if holdout else len(models)

    # Stats are requested from the evaluator only when there is a place
    # to write them.
    collect_stats = args.stats_path is not None

    main_scores = []
    for split_idx, model_path in enumerate(models):
        _rest_docs, test_docs = get_fold(docs, folds_num, split_idx)

        with classifier_for(args.task_name)(model_path) as clf:
            main_score, scores, stats_generator = evaluator(
                clf, test_docs, collect_stats)
            main_scores.append(main_score)

            print("Split {}, Main score={:.4f}".format(split_idx, main_score))
            scores_json = json.dumps(scores, indent=4, sort_keys=True)
            print(f"Scores: \n{scores_json}\n")

            if stats_generator is None:
                continue

            stats_path = join(args.stats_path, f"split_{split_idx}")
            makedirs(stats_path, exist_ok=True)

            for doc_idx, doc in enumerate(test_docs):
                doc_stats_path = join(stats_path, doc.name + '_stats.txt')
                with open(doc_stats_path, 'w', encoding='utf-8') as f:
                    f.write(stats_generator(doc_idx))

    mean_score = sum(main_scores) / len(main_scores)
    print("\nMean splits score={:.4f}".format(mean_score))
Пример #4
0
    def train(self, props: dict, params: dict, working_dir: str):
        """Train on one fold and return the collected evaluation result.

        Args:
            props: Per-run properties. ``fold_num`` and ``seed`` are
                required; ``classifier_path`` and ``sampling_strategy``
                are optional. The whole dict is also passed to ``Trainer``.
            params: Shared parameters. ``data_path`` and ``n_folds`` are
                required; ``save_models`` is optional (default ``False``).
            working_dir: Root directory; output goes to
                ``<working_dir>/<fold_num>_fold/<seed>``.

        Returns:
            A ``dict`` copy of the evaluation result object produced by
            ``get_evaluating_hook`` (presumably filled in by the hook
            during training; confirm against that helper).
        """
        docs = load(params['data_path'])
        serializer = CoNLLSerializer()

        n_folds = params['n_folds']
        fold_num = props['fold_num']
        seed = props['seed']

        fold_dir = str(fold_num) + '_fold'
        out_path = os.path.join(working_dir, fold_dir, str(seed))

        print(params_to_str(props))

        os.makedirs(out_path, exist_ok=True)

        train_docs, dev_docs = get_fold(docs, n_folds, fold_num)
        print("Fold:", fold_num)

        # Dump the held-out documents next to the run's other outputs.
        gold_path = os.path.join(out_path, 'gold.conll')
        with open(gold_path, 'w', encoding="utf-8") as gold_file:
            serializer.serialize_docs(dev_docs, gold_file)

        # Sampling strategies for which an already trained classifier is
        # applied to the dev documents before training.
        known_model_samplings = (
            'coref_pron',
            'coref_pron_cluster',
            'coref_pron_cluster_strict',
        )
        classifier_path = props.get('classifier_path')
        use_known_model = (
            classifier_path is not None
            and props.get('sampling_strategy', 'coref')
            in known_model_samplings)

        if use_known_model:
            with Classifier(classifier_path) as clf:
                print("Applying known model")
                known_rels = get_known_rels(clf, dev_docs)
            strategies = ['pron_rank', 'pron_vote_rank']
        else:
            known_rels = None
            strategies = ['easy_first']

        save_models = params.get('save_models', False)
        hook, evaluation_result = get_evaluating_hook(
            serializer, dev_docs, out_path, seed, fold_num,
            save_models, strategies, known_rels)

        with Trainer(props) as trainer:
            trainer.train(train_docs, hook)

        return dict(evaluation_result)
Пример #5
0
 def get_splits(self):
     """Lazily yield the ``get_fold`` result for every fold index.

     Produces ``args.folds`` items, one per index in ``range(args.folds)``.

     NOTE(review): ``args`` is a free name here, not instance state;
     presumably the module-level parsed command-line arguments. Confirm
     it is defined wherever this class lives.
     """
     yield from (
         get_fold(self.__docs, args.folds, fold_idx)
         for fold_idx in range(args.folds)
     )
Пример #6
0
    def test_single_fold(self):
        """With a single fold, the fold is the whole input and the rest is empty."""
        items = list(range(4))

        actual = get_fold(items, 1, 0)

        self.assertEqual(([], [0, 1, 2, 3]), actual)