Пример #1
0
def build_test_data(model: Model, dataset: str, train_sms: List[SemanticModel],
                    discover_sources: List[SemanticModel], output_dir: Path,
                    n_iter):
    """Build and serialize the test examples for one discovery iteration.

    Seeds each discovered source with its ground-truth example, merges in the
    newly generated candidate semantic models (keeping earlier entries on key
    collision), and writes the sorted result to
    ``output_dir / "examples" / f"test.{n_iter}.json"``.
    """
    data: Dict[str, Dict[bytes,
                         Example]] = {sm.id: {}
                                      for sm in discover_sources}
    discover_sids = {sm.id for sm in discover_sources}
    (output_dir / "examples").mkdir(exist_ok=True, parents=True)

    # Hoisted loop invariant: the original rebuilt this list for every source
    # and shadowed the loop variable `sm` inside the comprehension.
    train_sids = [sm.id for sm in train_sms]

    # default should have ground-truth
    for sm in discover_sources:
        data[sm.id][graph_to_hashable_string(sm.graph)] = make_example(
            sm, sm.graph, Example.generate_example_id(sm.id, 0, 0), train_sids)

    new_data = generate_data(model, dataset, train_sms, discover_sources, 1)
    for sm in discover_sources:
        # Only add candidates not already present (ground truth wins on ties).
        for key, example in new_data[sm.id].items():
            data[sm.id].setdefault(key, example)

    test_examples = [
        example for sid in discover_sids for example in data[sid].values()
    ]
    test_examples.sort(key=lambda e: e.example_id)

    serializeJSON(test_examples,
                  output_dir / "examples" / f"test.{n_iter}.json")
Пример #2
0
def generate_data(model: Model, dataset: str, train_sms: List[SemanticModel],
                  discover_sources: List[SemanticModel], n_iter):
    """Generate candidate semantic models for each source, in parallel.

    Dispatches one ``generate_candidate_sm`` task per discovered source to a
    worker pool, then converts every returned candidate graph into an
    ``Example``. Returns ``{source_id: {graph_key: Example}}``.
    """
    stat = Statistic.get_instance(train_sms)
    train_sids = [sm.id for sm in train_sms]
    model_bundle = (model.dataset, model.model, model.tf_domain,
                    model.pairwise_domain)
    data = {}

    with get_pool(Settings.get_instance().parallel_n_process) as pool:
        # Fan out all tasks first so the workers run concurrently.
        pending = [
            pool.apply_async(
                generate_candidate_sm,
                (dataset, source, stat, model_bundle, train_sids))
            for source in discover_sources
        ]

        for source, task in zip(discover_sources, pending):
            candidate_sms = task.get()
            # Replace each candidate graph with a fully-built Example in place
            # (keys are untouched, so iterating while assigning is safe).
            for idx, key in enumerate(candidate_sms):
                eid = Example.generate_example_id(source.id, idx, n_iter)
                candidate_sms[key] = make_example(source, candidate_sms[key],
                                                  eid, train_sids)
            data[source.id] = candidate_sms

    return data
Пример #3
0
def online_learning(model: Model,
                    dataset: str,
                    train_sms: List[SemanticModel],
                    discover_sources: List[SemanticModel],
                    output_dir: Path,
                    training_args,
                    iter_range=(1, 3)):
    """Iteratively grow the training set with new candidates and retrain.

    Each iteration generates candidate semantic models for the discovered
    sources, adds the previously unseen ones to the example pool, serializes
    the pool, and retrains the model on it. Sources that stop producing new
    candidates are skipped in subsequent iterations. Returns the model from
    the final iteration.
    """
    data: Dict[str, Dict[bytes,
                         Example]] = {sm.id: {}
                                      for sm in discover_sources}
    discover_sids = {sm.id for sm in discover_sources}
    ignore_sids = set(
    )  # those should not include in the discovery_helper process because of no new sources
    logger = get_logger("app")
    (output_dir / "examples").mkdir(exist_ok=True, parents=True)

    # Hoisted loop invariant: the original rebuilt this list repeatedly and
    # shadowed the loop variable `sm` inside the comprehension.
    train_sids = [sm.id for sm in train_sms]

    # default should have ground-truth
    for sm in discover_sources:
        data[sm.id][graph_to_hashable_string(sm.graph)] = make_example(
            sm, sm.graph, Example.generate_example_id(sm.id, 0, 0), train_sids)

    for n_iter in range(*iter_range):
        logger.info("==================================> Iter: %s", n_iter)
        new_data = generate_data(model, dataset, train_sms, discover_sources,
                                 n_iter)
        for sm in discover_sources:
            if sm.id in ignore_sids:
                continue

            new_candidate_sms = [
                key for key in new_data[sm.id] if key not in data[sm.id]
            ]
            if not new_candidate_sms:
                # no new candidate sms
                logger.info("No new candidate for source: %s", sm.id)
                ignore_sids.add(sm.id)
            else:
                for key in new_candidate_sms:
                    data[sm.id][key] = new_data[sm.id][key]

        # NOTE(review): trains only on sources that appear in BOTH train_sms
        # and discover_sources — presumably intentional; confirm with callers.
        train_examples = [
            example for sm in train_sms if sm.id in discover_sids
            for example in data[sm.id].values()
        ]
        train_examples.sort(key=lambda e: e.example_id)

        serializeJSON(train_examples,
                      output_dir / "examples" / f"train.{n_iter}.json")
        # Keep "train.json" pointing at the latest iteration's examples.
        shutil.copyfile(output_dir / "examples" / f"train.{n_iter}.json",
                        output_dir / "examples" / "train.json")

        raw_model, tf_domain, pairwise_domain, __ = train_model(
            dataset, train_sids, 120, train_examples, [],
            training_args, output_dir / "models")
        model = Model(dataset, raw_model, tf_domain, pairwise_domain)

    return model
Пример #4
0
def create_default_model(dataset: str, train_sms: List[SemanticModel],
                         training_args, basedir: Path) -> Model:
    """Train an initial model from ground-truth examples only.

    Builds one perfect example per training source (predicted graph equals the
    gold graph, every link marked correct, every node mapped to itself), then
    trains and returns a ``Model``.
    """
    # Hoisted loop invariant: the original rebuilt this list on every
    # iteration and shadowed the loop variable `sm` inside the comprehension.
    train_sids = [sm.id for sm in train_sms]

    train_examples = []
    for sm in train_sms:
        example = Example(sm.graph, sm.graph,
                          {e.id: True
                           for e in sm.graph.iter_links()},
                          {n.id: n.id
                           for n in sm.graph.iter_nodes()})
        example.set_meta(Example.generate_example_id(sm.id, 0, 0), train_sids)
        train_examples.append(example)

    raw_model, tf_domain, pairwise_domain, __ = train_model(
        dataset, train_sids, 120, train_examples, [], training_args, basedir)
    return Model(dataset, raw_model, tf_domain, pairwise_domain)
Пример #5
0
def make_test_from_prediction(train_sms: List[SemanticModel],
                              evaluate_sms: List[SemanticModel], workdir: Path,
                              model_dir: Path):
    """Rebuild test examples from a saved search history.

    Reads ``model_dir / "search_history.json"`` (``{source_id: [[graph dict,
    ...], ...]}``), turns every recorded graph into an ``Example`` against the
    corresponding evaluation source, writes them to
    ``workdir / "examples" / "test.json"``, and returns the list.
    """
    search_history: Dict[str, List[List[dict]]] = deserializeJSON(
        model_dir / "search_history.json")
    # Keep a separate lookup instead of rebinding the `evaluate_sms` parameter
    # to a different type (list -> dict), as the original did.
    sm_by_id = {sm.id: sm for sm in evaluate_sms}
    train_sm_ids = [sm.id for sm in train_sms]

    # The sibling builders create this directory before serializing; do the
    # same so a fresh workdir does not make the final write fail.
    (workdir / "examples").mkdir(exist_ok=True, parents=True)

    test_examples = []
    for sid, iterations in search_history.items():
        for i, gs in enumerate(iterations):
            for j, g in enumerate(gs):
                eid = Example.generate_example_id(sid, j, i)
                example = make_example(sm_by_id[sid], Graph.from_dict(g),
                                       eid, train_sm_ids)
                test_examples.append(example)

    serializeJSON(test_examples, workdir / "examples" / "test.json")
    return test_examples