Exemplo n.º 1
0
def readSampleQueries(dataDir,
                      input_subdir1, sample_prob1,
                      input_subdir2, sample_prob2):
    """Read queries from two sub-directories and randomly down-sample each set.

    :param dataDir:        root data directory
    :param input_subdir1:  first input sub-directory (relative to dataDir)
    :param sample_prob1:   fraction of the first query set to keep (0..1)
    :param input_subdir2:  second input sub-directory (relative to dataDir)
    :param sample_prob2:   fraction of the second query set to keep (0..1)
    :return: a tuple (sample from the first set, sample from the second set)
    """
    # The original read/sample/print logic was duplicated verbatim for the
    # two directories; it now lives in a single helper.
    return (_readAndSampleQueries(dataDir, input_subdir1, sample_prob1),
            _readAndSampleQueries(dataDir, input_subdir2, sample_prob2))


def _readAndSampleQueries(dataDir, input_subdir, sample_prob):
    """Read one query file and return a random sample without replacement."""
    qpath = os.path.join(dataDir, input_subdir, QUESTION_FILE_JSON)
    fullQueryList = readQueries(qpath)
    # int() truncates, so the sample can be slightly smaller than prob * size
    sampleQueryList = np.random.choice(fullQueryList,
                                       size=int(len(fullQueryList) * sample_prob),
                                       replace=False)
    print('Read %d queries from %s sampled %d' %
          (len(fullQueryList), qpath, len(sampleQueryList)))
    return sampleQueryList
Exemplo n.º 2
0
def main():
    """Partition queries/qrels and write each partition to the destination dir."""
    cli = parse_args()
    print(cli.raw_args)

    print("Start reading input files...")
    source_dir = cli.src_dir
    query_list = readQueries(os.path.join(source_dir, QUESTION_FILE_JSON))
    id_list = [entry[DOCID_FIELD] for entry in query_list]
    assert len(id_list) == len(set(id_list)), "Non-unique queries ids are forbidden!"
    qrel_data = readQrels(os.path.join(source_dir, QREL_FILE))
    print("Done reading input files.")

    # Deterministic shuffle: the partitioning is reproducible for a fixed seed
    random.seed(cli.seed)
    random.shuffle(id_list)

    part_sizes = cli.partitions_sizes(len(id_list))
    assert len(part_sizes) == len(cli.partitions_names)
    print("Final partitions sizes:", list(zip(cli.partitions_names, part_sizes)))

    id_to_partition = build_query_id_to_partition(id_list, part_sizes)

    write_queries_files(query_list, id_to_partition, cli.dst_dir, cli.partitions_names)
    write_qrels_files(qrel_data, id_to_partition, cli.dst_dir, cli.partitions_names)
                    metavar='1st output data subirectory',
                    help='1st output data subdirectory',
                    type=str,
                    required=True)
parser.add_argument('--out_subdir2',
                    metavar='2d output data subdirectory',
                    help='2d output data subdirectory',
                    type=str,
                    required=True)

args = parser.parse_args()
print(args)

dataDir = args.data_dir

# Queries from the main input directory
fullQueryList = readQueries(
    os.path.join(dataDir, args.input_subdir, QUESTION_FILE_JSON))
# Set comprehension avoids building a throwaway intermediate list
fullQueryIdSet = {data[DOCID_FIELD] for data in fullQueryList}

print('Read all the queries from the main dir')

qrelList = readQrels(os.path.join(dataDir, args.input_subdir, QREL_FILE))

print('Read all the QRELs from the main dir')

queryIdSet = set()

partSubDirs = [args.out_subdir1, args.out_subdir2]

# Process each output partition; `part` is the partition index
for part in range(len(partSubDirs)):
    outDir = os.path.join(dataDir, partSubDirs[part])
    # NOTE(review): this clobbers the qrelList read from the main dir above —
    # looks intentional for per-partition processing, but verify downstream use
    qrelList = readQrels(os.path.join(outDir, QREL_FILE))
Exemplo n.º 4
0
                    default=None,
                    type=str)
parser.add_argument('--fwd_index_file', metavar='forward index catalog file',
                    help='the "catalog" file of the forward index',
                    type=str, required=True)
parser.add_argument('--min_exp_doc_qty',
                    metavar='min # of docs per query to expect',
                    help='min # of docs per query to expect',
                    type=int, required=True)

args = parser.parse_args()

print('Reading document IDs from the index')
allDocIds = readDocIdsFromForwardFileHeader(args.fwd_index_file)
print('Reading queries')
queries = readQueries(args.query_file)

# Query IDs in file order; a comprehension replaces the manual append loop
query_ids = [e[DOCID_FIELD] for e in queries]
# Per-query document counts — presumably filled in further down, past this chunk
query_doc_qtys = {}

# Some copy-paste from common_eval.readRunDict, but ok for now
fileName = args.run_file
with FileWrapper(fileName) as f:
    prevQueryId = None

    # Check for repeating document IDs and improperly sorted entries
    for ln, line in enumerate(f):
Exemplo n.º 5
0
                    type=str, required=True)
parser.add_argument('--min_query_token_qty', type=int, default=0,
                    metavar='min # of query tokens', help='ignore queries that have smaller # of tokens')
parser.add_argument('--' + BERT_TOK_OPT, action='store_true', help=BERT_TOK_OPT_HELP)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

inpFile = FileWrapper(args.input)

# Texts of queries that must be excluded from the output
ignoreQueries = set()

for qfile_dir in args.filter_query_dir:
    qfile_name = os.path.join(qfile_dir, QUESTION_FILE_JSON)
    for e in readQueries(qfile_name):
        # Queries without a text field cannot be matched: skip them
        # (consistent with the sibling filtering script in this code base)
        if TEXT_FIELD_NAME not in e:
            continue
        ignoreQueries.add(e[TEXT_FIELD_NAME])
    print('Read queries from: ' + qfile_name)

print('A list of queries to ignore has %d entries' % (len(ignoreQueries)))


# exist_ok avoids the race between the existence check and directory creation
os.makedirs(args.out_dir, exist_ok=True)

outFileQueries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')
outFileQrelsName = os.path.join(args.out_dir, QREL_FILE)

minQueryTokQty = args.min_query_token_qty

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
Exemplo n.º 6
0
                    type=float,
                    default=1.0)
parser.add_argument('-k', metavar='k-NN k', type=int, default=1)
parser.add_argument('--min_jacc',
                    metavar='min jaccard to compare answers',
                    type=float,
                    default=1.0)
parser.add_argument("--use_hnsw", action="store_true")

args = parser.parse_args()
print(args)

dataDir = args.data_dir


def _readAndSample(input_subdir, sample_prob):
    """Read one query file and return (path, full list, sample w/o replacement).

    Factored out: the original repeated this logic verbatim for both inputs.
    """
    qpath = os.path.join(dataDir, input_subdir, QUESTION_FILE_JSON)
    fullList = readQueries(qpath)
    # int() truncates, so the sample can be slightly smaller than prob * size
    sample = np.random.choice(fullList,
                              size=int(len(fullList) * sample_prob),
                              replace=False)
    print('Read %d queries from %s sampled %d' %
          (len(fullList), qpath, len(sample)))
    return qpath, fullList, sample


# Keep all the original module-level names: code past this chunk may use them
qpath1, fullQueryList1, sampleQueryList1 = _readAndSample(args.input_subdir1, args.sample_prob1)
qpath2, fullQueryList2, sampleQueryList2 = _readAndSample(args.input_subdir2, args.sample_prob2)
parser.add_argument('--data_dir',
                    metavar='data directory',
                    help='data directory',
                    type=str,
                    required=True)
parser.add_argument('--out_dir',
                    metavar='output directory',
                    help='output directory',
                    type=str,
                    required=True)

args = parser.parse_args()
print(args)

out_dir = args.out_dir

# exist_ok avoids the race between the existence check and directory creation
os.makedirs(out_dir, exist_ok=True)

# For every sub-directory that has a query file, dump its query IDs as <subDir>.npy
for subDir in os.listdir(args.data_dir):
    qf = os.path.join(args.data_dir, subDir, QUESTION_FILE_JSON)
    if os.path.exists(qf):
        print('Reading:', qf)
        # A comprehension replaces the manual append loop
        res = [e[DOCID_FIELD] for e in readQueries(qf)]

        print('Read', len(res), 'queries')
        np.save(os.path.join(out_dir, subDir + '.npy'), np.array(res))
Exemplo n.º 8
0
parser.add_argument('--filter_query_dir', metavar='filtering query dir',
                    default=[],
                    help=f'all queries found in {QUESTION_FILE_JSON} files from these directories are ignored',
                    nargs='*')
parser.add_argument('--out_dir', metavar='output directory', help='output directory',
                    type=str, required=True)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

# Texts of queries that must be excluded from the output
ignoreQueries = set()

for qfile_dir in args.filter_query_dir:
    qfile_name = os.path.join(qfile_dir, QUESTION_FILE_JSON)
    for e in readQueries(qfile_name):
        # Queries without a text field cannot be matched: skip them
        if TEXT_FIELD_NAME not in e:
            continue
        ignoreQueries.add(e[TEXT_FIELD_NAME])
    print('Read queries from: ' + qfile_name)

print('A list of queries to ignore has %d entries' % (len(ignoreQueries)))

# exist_ok avoids the race between the existence check and directory creation
os.makedirs(args.out_dir, exist_ok=True)

outFileQueries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')

# Read/write counters — presumably updated by a copy loop past this chunk
readQty = 0
wroteQty = 0