예제 #1
0
    def __call__(self, config):
        """Run one active-learning evaluation round for *config*.

        Splits the labeled data for ``config["relation"]`` into a small
        training set (answered through the oracle) and a test set, trains
        an ActiveLearningCore, and returns a result dict with dataset
        sizes, timing, prediction metrics, and ranking metrics (auROC,
        average precision).

        Raises:
            NotEnoughData: if there is no labeled data at all, or if less
                than 10% of it would remain for testing.
        """
        # JSON has no boolean dict keys, so class_weight arrives with the
        # string keys "true"/"false"; convert them to real booleans here.
        if u"class_weight" in config[u"classifier_args"]:
            d = config[u"classifier_args"][u"class_weight"]
            assert "true" in d and "false" in d and len(d) == 2
            config[u"classifier_args"][u"class_weight"] = {
                True: d["true"],
                False: d["false"]
            }

        # Prepare data; cached on self so consecutive runs for the same
        # relation skip the expensive candidate/label queries.
        if self.data is None or self.relname != config["relation"]:
            relation = iepy.data.models.Relation.objects.get(
                name=config["relation"])
            c_evidences = CEM.candidates_for_relation(relation)
            self.data = CEM.labels_for(relation, c_evidences,
                                       CEM.conflict_resolution_newest_wins)
            self.data = [(x, label) for x, label in self.data.items()
                         if label is not None]
            self.relname = config["relation"]
        data = self.data
        # Check emptiness before building the derived dicts below.
        if not data:
            raise NotEnoughData("There is no labeled data for training")
        testset = {x: label for x, label in data}
        candidate_evidences = {x: None for x, _ in data}
        oracle_answers = config["oracle_answers"]
        N = len(data)
        M = N - oracle_answers  # test set size
        if M / N < 0.1:  # if there is less than 10% left for testing
            raise NotEnoughData("There is not enough data for evaluation")

        result = {
            "train_size": oracle_answers,
            "test_size": M,
            "dataset_size": N,
            "start_time": time.time(),
        }

        # Interact with oracle: answer the core's questions from the gold
        # labels, removing each answered evidence from the test set.
        alcore = ActiveLearningCore(config["relation"],
                                    candidate_evidences,
                                    extractor_config=config,
                                    performance_tradeoff=config["tradeoff"])
        alcore.start()
        for _ in range(oracle_answers):
            q = alcore.questions[0]
            alcore.add_answer(q, testset[q])
            del testset[q]  # Once given for training cannot be part of testset
            alcore.process()

        extractor = alcore.relation_classifier

        # Evaluate prediction quality on everything left in the test set
        predicted_dict = alcore.predict()
        test_evidences = list(testset)
        test_labels = [testset[x] for x in test_evidences]
        predicted_labels = [predicted_dict[x] for x in test_evidences]
        result.update(
            result_dict_from_predictions(test_evidences, test_labels,
                                         predicted_labels))

        # Evaluate ranking quality using the classifier's raw scores
        predicted_scores = extractor.decision_function(test_evidences)
        auroc = roc_auc_score(test_labels, predicted_scores)
        avgprec = average_precision_score(test_labels, predicted_scores)

        result.update({
            "auROC": auroc,
            "average_precision": avgprec,
        })
        return result
예제 #2
0
파일: iepy_runner.py 프로젝트: lowks/iepy
        relation = Relation.objects.get(name=relation)
    except Relation.DoesNotExist:
        # Unknown relation name: report it, list valid relations, abort.
        print("Relation {!r} non existent".format(relation))
        print_all_relations()
        exit(1)

    # Optional path to a JSON file with the extractor configuration;
    # when provided, the path string is replaced by its parsed contents.
    extractor_config = opts.get("--extractor-config")
    if extractor_config:
        with open(extractor_config) as filehandler:
            extractor_config = json.load(filehandler)

    # Candidate evidences for the relation, with any stored labels attached.
    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)
    iextractor = ActiveLearningCore(relation, labeled_evidences, extractor_config,
                                    performance_tradeoff=tuning_mode)
    iextractor.start()

    # Sentinel option offered to the human labeler to end the session.
    STOP = u'STOP'
    term = TerminalAdministration(relation,
                                  extra_options=[(STOP, u'Stop execution')])
    was_ever_trained = False
    while iextractor.questions:
        questions = list(iextractor.questions)  # copying the list
        term.update_candidate_evidences_to_label(questions)
        result = term()
        # NOTE(review): `result` is never compared against STOP in this
        # visible fragment, so the "Stop execution" option appears to have
        # no effect here — possibly truncated; compare with the variant
        # that does `if result == STOP: break`.
        i = 0
        # Feed every newly labeled evidence back into the extractor core.
        for c, label_value in load_labeled_evidences(relation, questions).items():
            if label_value is not None:
                iextractor.add_answer(c, label_value)
                i += 1
        print ('Added %s new human labels to the extractor core' % i)
예제 #3
0
def run_from_command_line():
    """Command-line entry point: build an extractor and dump predictions.

    Reads CLI options, resolves the target relation, and either loads a
    previously stored classifier or starts a fresh ActiveLearningCore
    (optionally configured from a JSON file).  May then run the
    interactive labeling loop before storing predictions and the
    resulting classifier.
    """
    arguments = docopt(__doc__, version=iepy.__version__)
    relation_name = arguments['<relation_name>']
    classifier_path = arguments.get('--classifier')

    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logging.getLogger("featureforge").setLevel(logging.WARN)

    # Translate the tuning flag into a tradeoff constant; any other
    # value aborts with usage information.
    tradeoff_by_flag = {'high-prec': HIPREC, 'high-recall': HIREC}
    tune_flag = arguments['--tune-for']
    if tune_flag not in tradeoff_by_flag:
        print('Invalid tuning mode')
        print(__doc__)
        exit(1)
    tuning_mode = tradeoff_by_flag[tune_flag]

    try:
        relation = Relation.objects.get(name=relation_name)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(relation_name))
        print_all_relations()
        exit(1)

    evidence_candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, evidence_candidates)

    if classifier_path:
        # Reuse a stored classifier; no (re)training needed.
        try:
            loaded_classifier = output.load_classifier(classifier_path)
        except ValueError:
            print("Error: unable to load classifier, invalid file")
            exit(1)

        iextractor = ActiveLearningCore(
            relation,
            labeled_evidences,
            performance_tradeoff=tuning_mode,
            classifier=loaded_classifier,
        )
        was_ever_trained = True
    else:
        # Train from scratch, optionally with a JSON extractor config.
        extractor_config = arguments.get("--extractor-config")
        if extractor_config:
            with open(extractor_config) as config_file:
                extractor_config = json.load(config_file)

        iextractor = ActiveLearningCore(
            relation,
            labeled_evidences,
            extractor_config,
            performance_tradeoff=tuning_mode,
        )
        iextractor.start()
        was_ever_trained = False

    if not arguments.get("--no-questions", False):
        questions_loop(iextractor, relation, was_ever_trained)

    # Predict and store output
    predictions = iextractor.predict()
    if predictions:
        output.dump_output_loop(predictions)
        output.dump_classifier_loop(iextractor)
예제 #4
0
        relation = Relation.objects.get(name=relation)
    except Relation.DoesNotExist:
        # Unknown relation name: report it, list valid relations, abort.
        print("Relation {!r} non existent".format(relation))
        print_all_relations()
        exit(1)

    # Optional path to a JSON extractor configuration; when given, the
    # path string is replaced by the parsed JSON contents.
    extractor_config = opts.get("--extractor-config")
    if extractor_config:
        with open(extractor_config) as filehandler:
            extractor_config = json.load(filehandler)

    # Candidate evidences for the relation, with any stored labels attached.
    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)
    iextractor = ActiveLearningCore(relation, labeled_evidences,
                                    extractor_config)
    iextractor.start()

    # Sentinel option the human labeler can pick to end the session.
    STOP = u'STOP'
    term = TerminalAdministration(relation,
                                  extra_options=[(STOP, u'Stop execution ASAP')
                                                 ])

    while iextractor.questions:
        questions = list(iextractor.questions)  # copying the list
        term.update_candidate_evidences_to_label(questions)
        result = term()
        if result == STOP:
            break

        i = 0
        # NOTE(review): the source fragment is truncated mid-statement below.
        for c, label_value in load_labeled_evidences(relation,
예제 #5
0
def run_from_command_line():
    """Command-line entry point: train or load an extractor, dump output.

    Parses CLI options, resolves the relation, builds labeled candidate
    evidences, then either loads a stored classifier or trains a new
    ActiveLearningCore from a JSON extractor config (defaulting to the
    instance-wide ``extractor_config.json``), optionally runs the
    interactive questions loop, and finally stores predictions.
    """
    cli_opts = docopt(__doc__, version=iepy.__version__)
    relation_name = cli_opts['<relation_name>']
    classifier_path = cli_opts.get('--classifier')

    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logging.getLogger("featureforge").setLevel(logging.WARN)

    # Resolve the tuning flag into a tradeoff constant.
    tune_flag = cli_opts['--tune-for']
    if tune_flag == 'high-prec':
        tuning_mode = HIPREC
    elif tune_flag == 'high-recall':
        tuning_mode = HIREC
    else:
        print('Invalid tuning mode')
        print(__doc__)
        exit(1)

    try:
        relation = Relation.objects.get(name=relation_name)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(relation_name))
        print_all_relations()
        exit(1)

    evidence_candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, evidence_candidates)

    if classifier_path:
        # Reuse a previously stored classifier instead of training.
        try:
            loaded_classifier = output.load_classifier(classifier_path)
        except ValueError:
            print("Error: unable to load classifier, invalid file")
            exit(1)

        iextractor = ActiveLearningCore(
            relation,
            labeled_evidences,
            performance_tradeoff=tuning_mode,
            classifier=loaded_classifier,
        )
        was_ever_trained = True
    else:
        # Fall back to the instance-wide config when no path was given.
        config_filepath = (cli_opts.get("--extractor-config") or
                           os.path.join(INSTANCE_PATH, "extractor_config.json"))

        if not os.path.exists(config_filepath):
            print("Error: extractor config does not exists, please create the "
                  "file extractor_config.json or use the --extractor-config")
            exit(1)

        with open(config_filepath) as config_file:
            try:
                extractor_config = json.load(config_file)
            except Exception as error:
                print(
                    "Error: unable to load extractor config: {}".format(error))
                exit(1)

        iextractor = ActiveLearningCore(
            relation,
            labeled_evidences,
            extractor_config,
            performance_tradeoff=tuning_mode,
        )
        iextractor.start()
        was_ever_trained = False

    if not cli_opts.get("--no-questions", False):
        questions_loop(iextractor, relation, was_ever_trained)

    # Predict and store output
    predictions = iextractor.predict()
    if predictions:
        output.dump_output_loop(predictions)
        output.dump_classifier_loop(iextractor)