def __init__(self,
              mode=None,
              task_id=None,
              shard_id=None,
              question_id=None):
     """Load the CSR data and set up the table of frequent relations.

     Args:
       mode: Forwarded unchanged to ``CsrData.load_csr_data``.
       task_id: Forwarded unchanged to ``CsrData.load_csr_data``.
       shard_id: Forwarded unchanged to ``CsrData.load_csr_data``.
       question_id: Forwarded unchanged to ``CsrData.load_csr_data``.

     The ``full_wiki`` and ``files_dir`` options are not parameters; they
     are read from the global ``FLAGS`` object.
     """
     self.data = CsrData()
     # Caller-supplied selectors, passed through as keyword arguments.
     selectors = dict(mode=mode,
                      task_id=task_id,
                      shard_id=shard_id,
                      question_id=question_id)
     self.data.load_csr_data(full_wiki=FLAGS.full_wiki,
                             files_dir=FLAGS.apr_files_dir,
                             **selectors)

     # Relation id -> human-readable label.
     # NOTE(review): the ids look like Wikidata property codes - confirm.
     self.high_freq_relations = dict([
         ('P31', 'instance of'),
         ('P17', 'country'),
         ('P131', 'located in the administrative territorial entity'),
         ('P106', 'occupation'),
         ('P21', 'sex or gender'),
         ('P735', 'given name'),
         ('P27', 'country of citizenship'),
         ('P19', 'place of birth'),
     ])
    # Build and save the CSR data for the single (split, task, shard) job
    # selected through FLAGS.
    #
    # Exclusive upper bounds of a full sweep per split. They are only
    # referenced by the "full sweep" notes on the loops below and are kept
    # for reference.
    max_tasks = {"train": 50, "dev": 5}
    max_shards = {"train": 7, "dev": 17}
    apr = ApproximatePageRank()
    # Every loop iterates over a one-element list, so one run handles exactly
    # one combination. The loop shape is kept so the work can later be swept
    # over all tasks/shards (sequentially or in parallel).
    for mode in [FLAGS.split]:
        # Full sweep would be: range(0, max_tasks[mode])
        for task_id in [FLAGS.task_id]:
            # Full sweep would be: range(0, max_shards[mode])
            for shard_id in [FLAGS.shard_split_id]:
                nq_data, entities = get_examples(FLAGS.nq_dir, mode, task_id,
                                                 shard_id)
                if nq_data is None:
                    print("No examples here")
                    continue
                # print() does not interpolate %-placeholders from extra
                # arguments, so the message is formatted explicitly.
                print("Size of all entities: %d" % len(entities))
                # The number of hops comes from FLAGS.csr_num_hops; the
                # "two hop" naming is historical.
                two_hop_entities = apr.get_khop_entities(
                    entities, FLAGS.csr_num_hops)
                print("Size of two hop entities: %d" % len(two_hop_entities))
                csr_data = CsrData()
                csr_data.create_and_save_csr_data(
                    full_wiki=FLAGS.full_wiki,
                    decompose_ppv=FLAGS.decompose_ppv,
                    files_dir=FLAGS.apr_files_dir,
                    sub_entities=two_hop_entities,
                    mode=mode,
                    task_id=task_id,
                    shard_id=shard_id)
Пример #3
0
 def __init__(self):
     """Create the CSR data container and load it using settings from FLAGS."""
     self.data = CsrData()
     # Both options come from the global FLAGS object.
     load_options = {
         'full_wiki': FLAGS.full_wiki,
         'files_dir': FLAGS.apr_files_dir,
     }
     self.data.load_csr_data(**load_options)