def main():
    parser = ArgumentParser()

    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument('hypotheses_path',
                        help='Path to the raw hypotheses file, or a directory with multiple files')
    parser.add_argument('output_dir',
                        help='Directory to write the filtered hypothesis file(s)')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    output_dir = util.get_output_dir(args.output_dir, overwrite_warning=not args.force)

    json_graph = JsonGraph.from_dict(util.read_json_file(args.graph_path, 'JSON graph'))
    hypotheses_file_paths = util.get_file_list(args.hypotheses_path, suffix='.json', sort=True)
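
    # When hypotheses_path is a directory, each hypotheses JSON file is
    # filtered independently and written to output_dir under the same name.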

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(hypotheses_file_path, 'hypotheses')
        hypothesis_collection = AidaHypothesisCollection.from_json(hypotheses_json, json_graph)

        hypothesis_collection.expand()

        # create the filter
        hypothesis_filter = AidaHypothesisFilter(json_graph)

        filtered_hyplist = [
            hypothesis_filter.filtered(hypothesis)
            for hypothesis in hypothesis_collection
            if not hypothesis_too_short(hypothesis, json_graph)
        ]

        filtered_hypothesis_collection = AidaHypothesisCollection(compactify(filtered_hyplist, json_graph))

        filtered_hypotheses_json = filtered_hypothesis_collection.to_json()

        # add graph filename and queries, if they were there before
        if 'graph' in hypotheses_json:
            filtered_hypotheses_json['graph'] = hypotheses_json['graph']
        if "queries" in hypotheses_json:
            filtered_hypotheses_json['queries'] = hypotheses_json['queries']

        output_path = output_dir / hypotheses_file_path.name
        logging.info('Writing filtered hypotheses to {} ...'.format(output_path))

        with open(str(output_path), 'w') as fout:
            json.dump(filtered_hypotheses_json, fout, indent=1)

# Example 2

def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument(
        'raw_seeds_path',
        help='Path to the raw hypothesis seeds file, or a directory with '
        'multiple seeds files')
    parser.add_argument(
        'output_dir', help='Directory to write the reranked hypothesis seeds')
    parser.add_argument('--plausibility_model_path',
                        help='Path to a hypothesis plausibility model')
    parser.add_argument('--indexer_path', help="Path to the indexers file")
    parser.add_argument('-n',
                        '--max_num_seeds',
                        type=int,
                        default=None,
                        help='Only output up to n hypothesis seeds')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    raw_seeds_file_paths = util.get_file_list(args.raw_seeds_path,
                                              suffix='.json',
                                              sort=True)

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    for raw_seeds_file_path in raw_seeds_file_paths:
        raw_seeds_json = util.read_json_file(raw_seeds_file_path,
                                             'seeds by facet')
        seeds_by_facet = {}
        for facet_label, seeds_json in raw_seeds_json.items():
            if facet_label != 'graph':
                seeds_by_facet[facet_label] = [
                    HypothesisSeed.from_json(seed_json, json_graph)
                    for seed_json in seeds_json
                ]
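
        # Each raw seeds file maps facet labels to lists of seed JSON objects;
        # the extra 'graph' entry (the source graph identifier) is skipped here
        # and copied over to the output below.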

        if args.plausibility_model_path is not None and args.indexer_path is not None:
            seeds_by_facet = rerank_seeds_by_plausibility(
                seeds_by_facet, args.graph_path, args.plausibility_model_path,
                args.indexer_path)

        seeds = select_seeds_by_novelty(seeds_by_facet, args.max_num_seeds)

        hypotheses_to_export = []

        # Turn ranks into log weights for the seed hypotheses. The exact values
        # are meaningless; just assign log(1/(rank + 1)) for ranks 0, 1, 2, ...
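        # e.g. rank 0 -> log(1.0) = 0.0, rank 1 -> log(0.5) ~= -0.69, ...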
        for rank, seed in enumerate(seeds):
            seed.hypothesis.update_weight(math.log(1.0 / (rank + 1)))
            hypotheses_to_export.append(seed.finalize())

        hypothesis_collection = AidaHypothesisCollection(hypotheses_to_export)

        seeds_json = hypothesis_collection.to_json()
        seeds_json['graph'] = raw_seeds_json['graph']

        output_path = output_dir / (raw_seeds_file_path.name.split('_')[0] +
                                    '_seeds.json')
        logging.info(
            'Writing re-ranked hypothesis seeds to {} ...'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(seeds_json, fout, indent=1)

# Example 3

def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='Path to the input graph JSON file')
    parser.add_argument(
        'query_path',
        help=
        'Path to the input query file, or a directory with multiple queries')
    parser.add_argument('output_dir',
                        help='Directory to write the raw hypothesis seeds')
    parser.add_argument(
        '-n',
        '--max_num_seeds_per_facet',
        type=int,
        default=None,
        help='If provided, only save up to <arg> seeds per facet')
    parser.add_argument(
        '-d',
        '--discard_failed_core_constraints',
        action='store_true',
        help='If specified, discard hypotheses with failed core constraints. '
        'Try not to use this one during evaluation at first, so that we '
        'do not discard hypotheses we might still need. If we have too many '
        'hypotheses and the script runs too slowly, then use this.')
    parser.add_argument(
        '-r',
        '--rank_cutoff',
        type=int,
        default=100,
        help=
        'If specified, discard hypotheses early if there are at least <arg> '
        'other hypotheses that have the same fillers for a certain number '
        '(default = 3) of their non-entrypoint query variables. We might '
        'need this in the evaluation if some facets have many variables '
        'that lead to combinatorial explosion.')
    parser.add_argument(
        '--frame_grouping',
        action='store_true',
        help=
        'If specified, group query constraints by frames instead of by facets')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    query_file_paths = util.get_file_list(args.query_path,
                                          suffix='.json',
                                          sort=True)

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    for query_file_path in query_file_paths:
        query_json = util.read_json_file(query_file_path, 'query')

        raw_seeds_json = make_cluster_seeds(
            json_graph=json_graph,
            query_json=query_json,
            max_num_seeds_per_facet=args.max_num_seeds_per_facet,
            frame_grouping=args.frame_grouping,
            discard_failed_core_constraints=args.discard_failed_core_constraints,
            rank_cutoff=args.rank_cutoff)
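
        # The returned JSON maps each facet label to a list of hypothesis
        # seeds, plus a 'graph' entry; this is the seeds-by-facet format read
        # by the re-ranking script above.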

        # write hypotheses out in json format.
        output_path = output_dir / (query_file_path.name.split('_')[0] +
                                    '_seeds.json')
        logging.info(
            'Writing raw hypothesis seeds of each facet to {} ...'.format(
                output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(raw_seeds_json, fout, indent=1)

# Example 4

def main():
    parser = ArgumentParser()
    parser.add_argument(
        'hypotheses_path',
        help='path to the input json file for hypotheses, or a directory with '
        'a list of hypotheses files')
    parser.add_argument(
        'output_dir', help='directory to write the coref-recovered hypotheses')
    parser.add_argument('original_graph_path',
                        help='path to the original graph json file')
    parser.add_argument('compressed_graph_path',
                        help='path to the compressed graph json file')
    parser.add_argument('input_log_path',
                        help='path to log file from coref compression')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json',
                                               sort=True)

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    original_graph_json = util.read_json_file(args.original_graph_path,
                                              'original JSON graph')
    compressed_graph_json = util.read_json_file(args.compressed_graph_path,
                                                'compressed JSON graph')
    input_log_json = util.read_json_file(args.input_log_path, 'coref log')

    for hypotheses_file_path in hypotheses_file_paths:
        input_hypotheses_json = util.read_json_file(hypotheses_file_path,
                                                    'hypotheses')

        # probs do not change
        output_hypotheses_json = {
            'probs': input_hypotheses_json['probs'],
            'support': []
        }

        for compressed_hypothesis in input_hypotheses_json['support']:
            original_hypothesis = {'statements': [], 'statementWeights': []}

            # The mapping from each original statement (before coref-compression) to its weight
            original_stmt_weight_mapping = {}

            # Set of cluster membership nodes to include in the original hypothesis
            cluster_membership_set = set()
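
            # Each compressed statement corresponds to one or more original
            # statements (via the coref log's new_stmt_to_old_stmts mapping).
            # Recover those statements, keep the highest weight seen for each,
            # and record cluster memberships linking each original subject (and
            # object, for edge statements) to the clusters represented by the
            # prototypes in the compressed statement.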

            for compressed_stmt, stmt_weight in zip(
                    compressed_hypothesis['statements'],
                    compressed_hypothesis['statementWeights']):
                # Get the statement entry from the compressed graph
                compressed_stmt_entry = compressed_graph_json['theGraph'][
                    compressed_stmt]
                # Get the cluster(s) from the subject of the compressed statement
                stmt_subj_clusters = \
                    input_log_json['prototype_to_clusters'][compressed_stmt_entry['subject']]
                # Whether this is a type statement
                is_type_stmt = (compressed_stmt_entry['predicate'] == 'type')
                # Get the cluster(s) from the object of the compressed statement if it is an edge
                # statement
                if is_type_stmt:
                    stmt_obj_clusters = None
                else:
                    stmt_obj_clusters = \
                        input_log_json['prototype_to_clusters'][compressed_stmt_entry['object']]

                for original_stmt in input_log_json['new_stmt_to_old_stmts'][
                        compressed_stmt]:
                    # Resolve the statements and weights before coref-compression
                    if original_stmt not in original_stmt_weight_mapping:
                        original_stmt_weight_mapping[
                            original_stmt] = stmt_weight
                    elif original_stmt_weight_mapping[
                            original_stmt] < stmt_weight:
                        original_stmt_weight_mapping[
                            original_stmt] = stmt_weight

                    # Get the statement entry from the original graph
                    original_stmt_entry = original_graph_json['theGraph'][
                        original_stmt]

                    # Add cluster membership between the original subject and each subject cluster
                    stmt_subj = original_stmt_entry['subject']
                    for stmt_subj_cluster in stmt_subj_clusters:
                        cluster_membership_set.add(
                            (stmt_subj, stmt_subj_cluster))

                    if is_type_stmt:
                        assert original_stmt_entry['predicate'] == 'type'
                    else:
                        assert original_stmt_entry['predicate'] != 'type'

                        # Add cluster membership between the original object and each object cluster
                        stmt_obj = original_stmt_entry['object']
                        for stmt_obj_cluster in stmt_obj_clusters:
                            cluster_membership_set.add(
                                (stmt_obj, stmt_obj_cluster))

            for original_stmt, stmt_weight in \
                    original_stmt_weight_mapping.items():
                original_hypothesis['statements'].append(original_stmt)
                original_hypothesis['statementWeights'].append(stmt_weight)

            original_hypothesis['clusterMemberships'] = list(
                cluster_membership_set)

            original_hypothesis['failedQueries'] = compressed_hypothesis[
                'failedQueries']

            original_query_stmts = set()
            for compressed_query_stmt in compressed_hypothesis[
                    'queryStatements']:
                original_query_stmts.update(
                    input_log_json['new_stmt_to_old_stmts']
                    [compressed_query_stmt])
            original_hypothesis['queryStatements'] = list(original_query_stmts)

            output_hypotheses_json['support'].append(original_hypothesis)

        if 'graph' in input_hypotheses_json:
            output_hypotheses_json['graph'] = input_hypotheses_json['graph']
        if 'queries' in input_hypotheses_json:
            output_hypotheses_json['queries'] = input_hypotheses_json[
                'queries']

        output_path = util.get_output_path(output_dir /
                                           hypotheses_file_path.name,
                                           overwrite_warning=not args.force)
        print('Writing coref-recovered hypotheses to {}'.format(output_path))
        with open(str(output_path), 'w') as fout:
            json.dump(output_hypotheses_json, fout, indent=2)

# Example 5

def main():
    parser = ArgumentParser()
    parser.add_argument('graph_path', help='path to the graph json file')
    parser.add_argument('hypotheses_path',
                        help='path to the hypotheses json directory')
    parser.add_argument('kb_path', help='path to the TA2 KB file (in AIF)')
    parser.add_argument('output_dir', help='path to output directory')
    parser.add_argument('run_id', help='TA3 run ID')
    parser.add_argument('sin_id_prefix',
                        help='prefix of SIN IDs to name the final hypotheses')
    parser.add_argument('--top',
                        default=50,
                        type=int,
                        help='number of top hypotheses to output')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    json_graph = JsonGraph.from_dict(
        util.read_json_file(args.graph_path, 'JSON graph'))

    graph_mappings = json_graph.build_cluster_member_mappings()

    hypotheses_file_paths = util.get_file_list(args.hypotheses_path,
                                               suffix='.json',
                                               sort=True)

    # TODO: there is a known bug in rdflib that
    #  rdflib.Literal("2008", datatype=rdflib.XSD.gYear) would be parsed into
    #  rdflib.term.Literal(u'2008-01-01', datatype=rdflib.XSD.gYear) automatically,
    #  because a `parse_date` function is invoked for all rdflib.XSD.gYear literals.
    #  This is a temporary workaround to patch the _toPythonMapping locally.
    #  c.f.: https://github.com/RDFLib/rdflib/issues/806
    # noinspection PyProtectedMember
    rdflib.term._toPythonMapping.pop(rdflib.XSD['gYear'])

    print('Reading kb from {}'.format(args.kb_path))
    kb_graph = Graph()
    kb_graph.parse(args.kb_path, format='ttl')

    kb_nodes_by_category = catalogue_kb_nodes(kb_graph)

    kb_stmt_key_mapping = index_statement_nodes(
        kb_graph, kb_nodes_by_category['Statement'])
    kb_cm_key_mapping = index_cluster_membership_nodes(
        kb_graph, kb_nodes_by_category['ClusterMembership'])
    kb_type_stmt_key_mapping = index_type_statement_nodes(
        kb_graph, kb_nodes_by_category['TypeStatement'])
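
    # These key-based indexes are built once up front, presumably so that
    # build_subgraph_for_hypothesis below can look up statement,
    # ClusterMembership and type-statement nodes without rescanning the KB
    # graph for every hypothesis.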

    output_dir = util.get_output_dir(args.output_dir,
                                     overwrite_warning=not args.force)

    run_id = args.run_id
    sin_id_prefix = args.sin_id_prefix

    for hypotheses_file_path in hypotheses_file_paths:
        hypotheses_json = util.read_json_file(hypotheses_file_path,
                                              'hypotheses')

        print('Found {} hypotheses with probabilities {}'.format(
            len(hypotheses_json['probs']), hypotheses_json['probs']))

        soin_id = sin_id_prefix + '_' + hypotheses_file_path.stem.split('_')[0]
        frame_id = soin_id + '_F1'

        top_count = 0
        for hypothesis_idx, prob in sorted(enumerate(hypotheses_json['probs']),
                                           key=itemgetter(1),
                                           reverse=True):
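            # probs appear to be log-probabilities (hence usually <= 0): e.g.
            # prob = -2.0 gives weight exp(-1.0) ~= 0.37, while a positive prob
            # falls back to a small constant weight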
            if prob <= 0.0:
                hypothesis_weight = math.exp(prob / 2.0)
            else:
                hypothesis_weight = 0.0001

            hypothesis = hypotheses_json['support'][hypothesis_idx]

            top_count += 1
            hypothesis_id = '{}_hypothesis_{:0>3d}'.format(frame_id, top_count)
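            # e.g. with sin_id_prefix 'AIDA' and a hypotheses file whose stem
            # starts with 'Q001', the first hypothesis gets the id
            # 'AIDA_Q001_F1_hypothesis_001' (hypothetical values)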

            subgraph = build_subgraph_for_hypothesis(
                kb_graph=kb_graph,
                kb_nodes_by_category=kb_nodes_by_category,
                kb_stmt_key_mapping=kb_stmt_key_mapping,
                kb_cm_key_mapping=kb_cm_key_mapping,
                kb_type_stmt_key_mapping=kb_type_stmt_key_mapping,
                json_graph=json_graph,
                graph_mappings=graph_mappings,
                hypothesis=hypothesis,
                hypothesis_id=hypothesis_id,
                hypothesis_weight=hypothesis_weight)

            output_path = output_dir / '{}.{}.{}.H{:0>3d}.ttl'.format(
                run_id, soin_id, frame_id, top_count)
            print('Writing hypothesis #{:>2d} with prob {:>6.2f} to {}'.format(
                top_count, prob, output_path))
            with open(output_path, 'w') as fout:
                fout.write(print_graph(subgraph))

            if top_count >= args.top:
                break

# Example 6

def main():
    parser = ArgumentParser(
        description=
        'Read in a TA2 KB and a (list of) XML-based Statement of Information Need '
        'definition, convert the KB to JSON format, then convert each SoIN to a JSON '
        'query by identifying and ranking entry points.')
    parser.add_argument('kb_path', help='Path to the input TA2 KB')
    parser.add_argument('graph_output_path',
                        help='Path to write the JSON graph')
    parser.add_argument(
        '-s',
        '--soin_path',
        help=
        'Path to the input SoIN file, or a directory containing multiple SoIN '
        'files; if not provided, will only transform the graph')
    parser.add_argument(
        '-q',
        '--query_output_dir',
        help=
        'Directory to write the JSON queries, used when soin_path is provided')
    parser.add_argument(
        '-m',
        '--max_matches',
        type=int,
        default=50,
        help='The maximum number of EPs *per entry point description*')
    parser.add_argument(
        '-d',
        '--dup_kb',
        default=duplicate_kb_file,
        help='Path to the json file with duplicate KB ID mappings')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='If specified, overwrite existing output files without warning')

    args = parser.parse_args()

    kb_path = util.get_input_path(args.kb_path)
    graph_output_path = util.get_output_path(args.graph_output_path,
                                             overwrite_warning=not args.force)
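
    # Two-step conversion: the AIF TTL KB is first loaded into an AidaGraph,
    # then converted into the JSON-serializable JsonGraph that gets written to
    # graph_output_path below.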

    aida_graph = AidaGraph()
    aida_graph.build_graph(str(kb_path), fmt='ttl')

    json_graph = JsonGraph()
    json_graph.build_graph(aida_graph)

    logging.info('Writing JSON graph to {} ...'.format(graph_output_path))
    with open(str(graph_output_path), 'w') as fout:
        json.dump(json_graph.as_dict(), fout, indent=1)
    logging.info('Done.')

    if args.soin_path is not None:
        assert args.query_output_dir is not None, 'Must provide query_output_dir'
        soin_path = util.get_input_path(args.soin_path)
        query_output_dir = util.get_output_dir(
            args.query_output_dir, overwrite_warning=not args.force)

        soin_file_paths = util.get_file_list(soin_path,
                                             suffix='.xml',
                                             sort=True)

        dup_kb_id_mapping = None
        if args.dup_kb is not None:
            dup_kb_id_mapping = util.read_json_file(args.dup_kb,
                                                    'duplicate KB ID mapping')

        logging.info('Getting Cluster Mappings ...')
        ere_to_prototypes = get_cluster_mappings(aida_graph)
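        # ere_to_prototypes presumably maps each ERE to the prototype(s) of the
        # cluster(s) it belongs to; soin.resolve below uses it when matching
        # entry point descriptions against the graph.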

        for soin_file_path in soin_file_paths:
            query_output_path = query_output_dir / (soin_file_path.stem +
                                                    '_query.json')

            logging.info('Processing SOIN {} ...'.format(soin_file_path))
            soin = SOIN.parse(str(soin_file_path),
                              dup_kbid_mapping=dup_kb_id_mapping)

            logging.info('Resolving all entrypoints ...')
            soin.resolve(aida_graph,
                         ere_to_prototypes,
                         max_matches=args.max_matches)

            query_json = {'graph': kb_path.stem}
            query_json.update(soin.to_json())
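            # The query JSON records the source graph (kb_path.stem) under its
            # 'graph' key; the downstream seed and hypothesis scripts above
            # carry this value through to their own outputs.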

            logging.info(
                'Writing JSON query to {} ...'.format(query_output_path))
            with open(str(query_output_path), 'w') as fout:
                json.dump(query_json, fout, indent=1)